mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-08 11:28:38 +03:00
Compare commits
87 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e5fba712c | ||
|
|
f522f891cb | ||
|
|
743ca9c165 | ||
|
|
2130a33829 | ||
|
|
3d5b6de557 | ||
|
|
b9a6b2b747 | ||
|
|
d948044ae9 | ||
|
|
49c254a413 | ||
|
|
d840b280d7 | ||
|
|
e6c98c73ea | ||
|
|
e6fdd54518 | ||
|
|
f61204c4b1 | ||
|
|
ea834c37e2 | ||
|
|
3e22709430 | ||
|
|
9c7eadc716 | ||
|
|
5d30d18b7b | ||
|
|
b2d462441e | ||
|
|
73f17407c0 | ||
|
|
95d249f5d0 | ||
|
|
ccb8c1f5c7 | ||
|
|
e525ff24b1 | ||
|
|
699b4ebdd8 | ||
|
|
8d1a86a7fa | ||
|
|
b01dbe6299 | ||
|
|
5b0f034c12 | ||
|
|
a1c098335c | ||
|
|
12a5d22f64 | ||
|
|
ab602e5d31 | ||
|
|
67bc8b5569 | ||
|
|
021e7c2304 | ||
|
|
91b6dcf291 | ||
|
|
2a1a4e2cae | ||
|
|
fc6b914e2d | ||
|
|
d155c1364a | ||
|
|
8882a87048 | ||
|
|
a95c675e9c | ||
|
|
8864e7c87d | ||
|
|
db9b613ae4 | ||
|
|
37828b4be4 | ||
|
|
1a3a7dc0f3 | ||
|
|
f67707a740 | ||
|
|
798684a334 | ||
|
|
a556b237e9 | ||
|
|
283bc35658 | ||
|
|
cef70fb80d | ||
|
|
e66ef4f477 | ||
|
|
1f6a8368fd | ||
|
|
9a046fd1cb | ||
|
|
aae2bb5999 | ||
|
|
9e30b81d16 | ||
|
|
72bc355606 | ||
|
|
7f59eefb73 | ||
|
|
30003c524e | ||
|
|
d1b27bef1d | ||
|
|
e5e9e08ee6 | ||
|
|
3a8a3f54c0 | ||
|
|
4d73864dbb | ||
|
|
ceaa8e45f3 | ||
|
|
007c8e07a8 | ||
|
|
a515b2c3de | ||
|
|
54540cd132 | ||
|
|
cfb13e5d82 | ||
|
|
926c3cb8a4 | ||
|
|
15ebe2e66c | ||
|
|
eaff88b2d9 | ||
|
|
a6d066a192 | ||
|
|
9078a17400 | ||
|
|
17b516bd7f | ||
|
|
8bd182b041 | ||
|
|
0b8abfb5cb | ||
|
|
cf5fb91c84 | ||
|
|
5c965102a4 | ||
|
|
df10e6f55f | ||
|
|
863dd44463 | ||
|
|
578ec81443 | ||
|
|
c32caec442 | ||
|
|
80b43f7c95 | ||
|
|
90b72b6d22 | ||
|
|
d96e0c0a3a | ||
|
|
db03a4c0f6 | ||
|
|
cf100ee69e | ||
|
|
a09cf32b3e | ||
|
|
e1eb3ed620 | ||
|
|
72bd951d9c | ||
|
|
59d53be68b | ||
|
|
24a1313a65 | ||
|
|
64df4eec28 |
3
.env.example
Normal file
3
.env.example
Normal file
@@ -0,0 +1,3 @@
|
||||
VK_USERNAME="your username"
|
||||
VK_PASSWORD="your password"
|
||||
VK_TOKEN="optional token"
|
||||
7
.github/actions/setup-venv/action.yml
vendored
7
.github/actions/setup-venv/action.yml
vendored
@@ -16,6 +16,11 @@ runs:
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# install ffmpeg
|
||||
sudo apt install ffmpeg
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# Install prerequisites.
|
||||
@@ -26,7 +31,7 @@ runs:
|
||||
# Get the exact Python version to use in the cache key.
|
||||
echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
|
||||
|
||||
- uses: actions/cache@v2
|
||||
- uses: actions/cache@v4
|
||||
id: virtualenv-cache
|
||||
with:
|
||||
path: .venv
|
||||
|
||||
11
.github/dependabot.yml
vendored
11
.github/dependabot.yml
vendored
@@ -1,11 +0,0 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
open-pull-requests-limit: 10
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
12
.github/workflows/main.yml
vendored
12
.github/workflows/main.yml
vendored
@@ -20,6 +20,7 @@ env:
|
||||
PYTHONPATH: ./
|
||||
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
||||
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
||||
VK_TOKEN: ${{ secrets.VK_TOKEN }}
|
||||
|
||||
jobs:
|
||||
checks:
|
||||
@@ -29,11 +30,11 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python: ['3.7', '3.10']
|
||||
task: # --show-capture=no on purpose
|
||||
python: ['3.10']
|
||||
task: # --show-capture=no on purpose, -s for captchas
|
||||
- name: Test
|
||||
run: |
|
||||
pytest --show-capture=no --color=yes tests/
|
||||
pytest -s --show-capture=no --color=yes tests/
|
||||
|
||||
include:
|
||||
- python: '3.10'
|
||||
@@ -78,10 +79,11 @@ jobs:
|
||||
run: |
|
||||
. .venv/bin/activate
|
||||
${{ matrix.task.run }}
|
||||
continue-on-error: ${{ matrix.task.name != 'Build' }}
|
||||
|
||||
- name: Upload package distribution files
|
||||
if: matrix.task.name == 'Build'
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: package
|
||||
path: dist
|
||||
@@ -116,7 +118,7 @@ jobs:
|
||||
echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||
|
||||
- name: Download package distribution files
|
||||
uses: actions/download-artifact@v3
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: package
|
||||
path: dist
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
.env
|
||||
vk_config.v2.json
|
||||
output/
|
||||
tmp*/
|
||||
# build artifacts
|
||||
|
||||
.eggs/
|
||||
|
||||
@@ -4,8 +4,12 @@ sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
fail_on_warning: false
|
||||
|
||||
build:
|
||||
os: "ubuntu-22.04"
|
||||
tools:
|
||||
python: "3.10"
|
||||
|
||||
python:
|
||||
version: "3.8"
|
||||
install:
|
||||
- requirements: requirements.txt
|
||||
- requirements: dev-requirements.txt
|
||||
|
||||
2
Makefile
2
Makefile
@@ -13,4 +13,4 @@ run-checks :
|
||||
black .
|
||||
flake8 .
|
||||
mypy .
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes .
|
||||
|
||||
35
Pipfile
35
Pipfile
@@ -4,8 +4,32 @@ verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
vk-api = "*"
|
||||
yt-dlp = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
certifi = ">=2023.7.22"
|
||||
charset-normalizer = ">=3.0.1"
|
||||
idna = ">=3.4"
|
||||
mutagen = ">=1.46.0"
|
||||
pycryptodomex = ">=3.17"
|
||||
requests = ">=2.28.2"
|
||||
urllib3 = ">=1.26.14"
|
||||
websockets = ">=10.4"
|
||||
vk-api = {ref = "b99dac0ec2f832a6c4b20bde49869e7229ce4742", git = "git+https://github.com/python273/vk_api.git"}
|
||||
flake8 = "*"
|
||||
mypy = ">=0.961"
|
||||
black = ">=22.3.0"
|
||||
isort = ">=5.10.1"
|
||||
pytest = "*"
|
||||
pytest-sphinx = "*"
|
||||
pytest-cov = "*"
|
||||
twine = ">=1.11.0"
|
||||
sphinx = "<5.1.0,>=4.3.0"
|
||||
furo = ">=2022.6.4.1"
|
||||
myst-parser = "<0.19.0,>=0.15.2"
|
||||
sphinx-copybutton = ">=0.5.0"
|
||||
sphinx-autobuild = ">=2021.3.14"
|
||||
sphinx-autodoc-typehints = "*"
|
||||
packaging = "*"
|
||||
python-dotenv = ">=0.21.1"
|
||||
|
||||
[dev-packages]
|
||||
sphinx-copybutton = "==0.5.0"
|
||||
@@ -18,11 +42,14 @@ pytest-sphinx = "*"
|
||||
pytest-cov = "*"
|
||||
twine = ">=1.11.0"
|
||||
sphinx = ">=4.3.0,<5.1.0"
|
||||
furo = "==2022.6.4.1"
|
||||
furo = "==2022.6.21"
|
||||
myst-parser = ">=0.15.2,<0.19.0"
|
||||
sphinx-autobuild = "==2021.3.14"
|
||||
sphinx-autodoc-typehints = "*"
|
||||
python-dotenv = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.11"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
2814
Pipfile.lock
generated
2814
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
39
README.md
39
README.md
@@ -1,11 +1,24 @@
|
||||
# vk-url-scraper
|
||||
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
|
||||
Python library to scrape data, and especially media links like videos and photos, from vk.com URLs.
|
||||
|
||||
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage).
|
||||
> This repo has been archived because it relies on a fixed git commit of the vk_api library which we can no longer publish to pypi, see [issue](https://github.com/bellingcat/vk-url-scraper/issues/66). You can still install the latest release. This archived state may change if a solution is found to publish the library to pypi again.
|
||||
|
||||
[](https://badge.fury.io/py/vk-url-scraper)
|
||||
[](https://pypi.python.org/pypi/vk-url-scraper/)
|
||||
[](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest)
|
||||
|
||||
|
||||
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage), check the **[documentation](https://vk-url-scraper.readthedocs.io/en/latest/)**.
|
||||
|
||||
## Installation
|
||||
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
||||
|
||||
Currently you need to manually uninstall and re-install one dependency (as it is installed from github and not pypi):
|
||||
```bash
|
||||
pip uninstall vk-api
|
||||
pip install git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
|
||||
```
|
||||
|
||||
To use the library you will need a valid username/password combination for vk.com.
|
||||
|
||||
## Command line usage
|
||||
@@ -14,12 +27,16 @@ To use the library you will need a valid username/password combination for vk.co
|
||||
vk_url_scraper --help
|
||||
|
||||
# scrape a URL and get the JSON result in the console
|
||||
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
# OR
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
|
||||
# you can also have multiple urls
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
||||
|
||||
# you can pass a token as well to avoid always authenticating
|
||||
# and possibly getting captcha prompts
|
||||
# you can fetch the token from the generated vk_config.v2.json file by searching for "access_token"
|
||||
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||
|
||||
# save the JSON output into a file
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
|
||||
@@ -45,7 +62,7 @@ res = vks.scrape("https://vk.com/wall-1_398461")
|
||||
|
||||
# scrape any "video" URL
|
||||
res = vks.scrape("https://vk.com/video-6596301_145810025")
|
||||
print(res[0]["text]) # eg: -> to get the text from code
|
||||
print(res[0]["text"]) # eg: -> to get the text from code
|
||||
```
|
||||
|
||||
```python
|
||||
@@ -74,12 +91,13 @@ see [docs] for all available functions.
|
||||
## Development
|
||||
(more info in [CONTRIBUTING.md](CONTRIBUTING.md)).
|
||||
|
||||
1. setup dev environment with `pip install -r dev-requirements.txt` or `pipenv install -r dev-requirements.txt`
|
||||
1. setup environment with `pip install -r requirements.txt` or `pipenv install -r requirements.txt`
|
||||
1. setup dev environment with `pipenv install --dev`
|
||||
1. setup environment with `pipenv install -r requirements.txt`
|
||||
1. Activate the environment with `pipenv shell` (or prepend `pipenv run` to all commands)
|
||||
2. To run all checks, use `make run-checks` (fixes style), or run them individually
|
||||
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
|
||||
2. To do type checking: `mypy .`
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
|
||||
3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||
|
||||
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
|
||||
@@ -87,9 +105,12 @@ To test the command line interface available in [__main__.py](__vk_url_scraper/_
|
||||
|
||||
## Releasing new version
|
||||
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
|
||||
2. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
2. make sure to run `pipenv run pip freeze > requirements.txt` if you manage libs with pipenv
|
||||
1. if the hardcoded version of [vk_api](https://github.com/python273/vk_api) is still being used, then you must comment/remove that line from the generated requirements file and instruct users to manually install the version from the source as pypi does not allow repo/commit tags. Additionally, add the latest released version, currently `vk-api==11.9.9`.
|
||||
3. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
1. `git tag vx.y.z` to tag version
|
||||
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||
4. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
|
||||
|
||||
### Fixing a failed release
|
||||
|
||||
@@ -99,4 +120,4 @@ If for some reason the GitHub Actions release workflow failed with an error that
|
||||
git tag -l | xargs git tag -d && git fetch -t
|
||||
```
|
||||
|
||||
Then repeat the steps above.
|
||||
Then repeat the steps above.
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
flake8
|
||||
|
||||
# Static type checking
|
||||
mypy==0.961
|
||||
mypy>=0.961
|
||||
|
||||
# Automatic code formatting
|
||||
black==22.3.0
|
||||
isort==5.10.1
|
||||
black>=22.3.0
|
||||
isort>=5.10.1
|
||||
|
||||
# Running tests
|
||||
pytest
|
||||
@@ -24,19 +24,20 @@ wheel
|
||||
Sphinx>=4.3.0,<5.1.0
|
||||
|
||||
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
||||
furo==2022.6.4.1
|
||||
furo>=2022.6.4.1
|
||||
|
||||
# Lets Sphinx parse markdown files in addition to rst.
|
||||
myst-parser>=0.15.2,<0.19.0
|
||||
|
||||
# Adds a copy button to code examples in the docs.
|
||||
sphinx-copybutton==0.5.0
|
||||
sphinx-copybutton>=0.5.0
|
||||
|
||||
# Live rebuilding and reloading of docs for developing locally.
|
||||
sphinx-autobuild==2021.3.14
|
||||
sphinx-autobuild>=2021.3.14
|
||||
|
||||
# Automatically adds types to docs
|
||||
sphinx-autodoc-typehints
|
||||
|
||||
# For parsing and comparing version numbers.
|
||||
packaging
|
||||
python-dotenv>=0.21.1
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 7.6 KiB |
@@ -1,7 +1,7 @@
|
||||
Installation
|
||||
============
|
||||
|
||||
**vk-url-scraper** supports Python >= 3.7.
|
||||
**vk-url-scraper** supports Python >= 3.10.
|
||||
|
||||
## Installing with `pip`
|
||||
|
||||
|
||||
102
requirements.txt
102
requirements.txt
@@ -1,15 +1,87 @@
|
||||
#
|
||||
# These requirements were autogenerated by pipenv
|
||||
# To regenerate from the project's Pipfile, run:
|
||||
#
|
||||
# pipenv lock --requirements
|
||||
#
|
||||
|
||||
certifi==2022.6.15
|
||||
charset-normalizer==2.0.12
|
||||
idna==3.3
|
||||
requests==2.28.0
|
||||
urllib3==1.26.9
|
||||
vk-api==11.9.8
|
||||
python-dotenv==0.20.0
|
||||
yt-dlp==2022.5.18
|
||||
alabaster==0.7.16
|
||||
anyio==4.9.0
|
||||
babel==2.17.0
|
||||
backports.tarfile==1.2.0
|
||||
beautifulsoup4==4.13.4
|
||||
black==25.1.0
|
||||
certifi==2025.4.26
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.2
|
||||
click==8.1.8
|
||||
colorama==0.4.6
|
||||
coverage==7.8.0
|
||||
cryptography==44.0.3
|
||||
docutils==0.18.1
|
||||
flake8==7.2.0
|
||||
furo==2023.3.27
|
||||
h11==0.16.0
|
||||
id==1.5.0
|
||||
idna==3.10
|
||||
imagesize==1.4.1
|
||||
importlib_metadata==8.7.0
|
||||
iniconfig==2.1.0
|
||||
isort==6.0.1
|
||||
jaraco.classes==3.4.0
|
||||
jaraco.context==6.0.1
|
||||
jaraco.functools==4.1.0
|
||||
jeepney==0.9.0
|
||||
Jinja2==3.1.6
|
||||
keyring==25.6.0
|
||||
livereload==2.7.1
|
||||
markdown-it-py==2.2.0
|
||||
MarkupSafe==3.0.2
|
||||
mccabe==0.7.0
|
||||
mdit-py-plugins==0.3.5
|
||||
mdurl==0.1.2
|
||||
more-itertools==10.7.0
|
||||
mutagen==1.47.0
|
||||
mypy==1.15.0
|
||||
mypy_extensions==1.1.0
|
||||
myst-parser==0.18.1
|
||||
nh3==0.2.21
|
||||
packaging==25.0
|
||||
pathspec==0.12.1
|
||||
pkginfo==1.10.0
|
||||
platformdirs==4.3.7
|
||||
pluggy==1.5.0
|
||||
pycodestyle==2.13.0
|
||||
pycparser==2.22
|
||||
pycryptodomex==3.22.0
|
||||
pyflakes==3.3.2
|
||||
Pygments==2.19.1
|
||||
pytest==8.3.5
|
||||
pytest-cov==6.1.1
|
||||
pytest-sphinx==0.6.3
|
||||
python-dotenv==1.1.0
|
||||
PyYAML==6.0.2
|
||||
readme_renderer==43.0
|
||||
requests==2.32.3
|
||||
requests-toolbelt==1.0.0
|
||||
rfc3986==2.0.0
|
||||
rich==14.0.0
|
||||
SecretStorage==3.3.3
|
||||
sniffio==1.3.1
|
||||
snowballstemmer==2.2.0
|
||||
soupsieve==2.7
|
||||
Sphinx==5.0.2
|
||||
sphinx-autobuild==2024.10.3
|
||||
sphinx-autodoc-typehints==1.19.1
|
||||
sphinx-basic-ng==1.0.0b2
|
||||
sphinx-copybutton==0.5.2
|
||||
sphinxcontrib-applehelp==2.0.0
|
||||
sphinxcontrib-devhelp==2.0.0
|
||||
sphinxcontrib-htmlhelp==2.1.0
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==2.0.0
|
||||
sphinxcontrib-serializinghtml==2.0.0
|
||||
starlette==0.46.2
|
||||
tornado==6.5b1
|
||||
twine==6.1.0
|
||||
typing_extensions==4.13.2
|
||||
urllib3==2.4.0
|
||||
uvicorn==0.34.2
|
||||
vk_api @ git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
|
||||
watchfiles==1.0.5
|
||||
websockets==15.0.1
|
||||
yt-dlp==2025.5.3.232917.dev0
|
||||
zipp==3.21.0
|
||||
|
||||
16
setup.py
16
setup.py
@@ -33,7 +33,7 @@ with open("vk_url_scraper/version.py", "r") as version_file:
|
||||
setup(
|
||||
name="vk-url-scraper",
|
||||
version=VERSION["VERSION"],
|
||||
description="",
|
||||
description="Scrape VK URLs to fetch info and media - python API or command line tool.",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
classifiers=[
|
||||
@@ -43,8 +43,11 @@ setup(
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
],
|
||||
keywords="",
|
||||
url="https://github.com/bellingcat/vk-url-scraper",
|
||||
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
|
||||
project_urls={
|
||||
"Code": "https://github.com/bellingcat/vk-url-scraper",
|
||||
"Documentation": "https://vk-url-scraper.readthedocs.io/en/latest/",
|
||||
},
|
||||
author="Bellingcat",
|
||||
author_email="tech@bellingcat.com",
|
||||
license="MIT",
|
||||
@@ -54,5 +57,10 @@ setup(
|
||||
package_data={"vk_url_scraper": ["py.typed"]},
|
||||
install_requires=read_requirements("requirements.txt"),
|
||||
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
python_requires=">=3.7",
|
||||
python_requires=">=3.10",
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"vk_url_scraper=vk_url_scraper.__main__:main",
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
@@ -14,9 +14,23 @@ def test_login_fail():
|
||||
VkScraper("invalid", "combination")
|
||||
|
||||
|
||||
# disabled due to CI
|
||||
# def test_login_custom_file():
|
||||
# session_filename = "test-session.json"
|
||||
# VkScraper(
|
||||
# os.environ["VK_USERNAME"],
|
||||
# os.environ["VK_PASSWORD"],
|
||||
# session_file=session_filename,
|
||||
# )
|
||||
# assert os.path.isfile(session_filename)
|
||||
# os.unlink(session_filename)
|
||||
|
||||
|
||||
def test_login_success():
|
||||
global vks
|
||||
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"])
|
||||
vks = VkScraper(
|
||||
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
|
||||
)
|
||||
|
||||
|
||||
def test_scrape_empty_urll():
|
||||
@@ -67,7 +81,7 @@ def test_scrape_wall_url_with_photos():
|
||||
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["payload"]) == 19
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 9
|
||||
@@ -79,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
||||
assert res[0]["id"] == "wall-17315087_74182"
|
||||
assert res[0]["text"] == ""
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 3
|
||||
for k in ["photo", "link", "video"]:
|
||||
assert k in list(res[0]["attachments"].keys())
|
||||
@@ -99,7 +113,7 @@ def test_scrape_download_multiple_media():
|
||||
"wall-17315087_74182_2.jpg",
|
||||
"wall-17315087_74182_3.jpg",
|
||||
"wall-17315087_74182_4.jpg",
|
||||
"wall-17315087_74182_0.mkv",
|
||||
"wall-17315087_74182_0.mp4",
|
||||
}
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert len(expect_files) == len(expect_files & found_files)
|
||||
@@ -114,7 +128,7 @@ def test_scrape_photo_only():
|
||||
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 1
|
||||
@@ -125,7 +139,6 @@ def test_scrape_video_only():
|
||||
assert len(res) == 1
|
||||
assert res[0]["id"] == "video38556806_456251917"
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
||||
assert len(res[0]["payload"]) == 31
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["video"]
|
||||
|
||||
@@ -135,8 +148,22 @@ def test_scrape_video_only2():
|
||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||
vks.download_media(res, tempdir)
|
||||
found_files = set(os.listdir(tempdir))
|
||||
# different systems might attribute different extension
|
||||
assert (
|
||||
"video-17546758_456239898_0.webm" in found_files
|
||||
or "video-17546758_456239898_0.mp4" in found_files
|
||||
)
|
||||
assert "video-17546758_456239898_0.mp4" in found_files
|
||||
|
||||
|
||||
def test_scrape_private_video():
|
||||
"""
|
||||
> Some videos are kept private and cannot be accessed without a passkey . In this case, send the ID in {owner_id}_{video_id}_{access_key}.
|
||||
From https://dev.vk.com/ru/method/video.get
|
||||
"""
|
||||
res = vks.scrape("https://vk.com/wall-127774884_178565")
|
||||
|
||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||
vks.download_media(res, tempdir)
|
||||
expect_files = {
|
||||
"wall-127774884_178565_0.mp4",
|
||||
"wall-127774884_178565_1.mp4",
|
||||
"wall-127774884_178565_2.mp4",
|
||||
}
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert len(expect_files) == len(expect_files & found_files)
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
||||
from .utils import DateTimeEncoder, suppress_stdout
|
||||
|
||||
@@ -19,7 +19,7 @@ def get_argument_parser():
|
||||
action="store",
|
||||
dest="username",
|
||||
required=True,
|
||||
help="username for a valid vk.com account",
|
||||
help="username for a valid vk.com account (pass empty if using --token)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
@@ -27,7 +27,15 @@ def get_argument_parser():
|
||||
action="store",
|
||||
dest="password",
|
||||
required=True,
|
||||
help="password for the valid vk.com account",
|
||||
help="password for the valid vk.com account (pass empty if using --token)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--token",
|
||||
action="store",
|
||||
dest="token",
|
||||
required=False,
|
||||
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
@@ -50,7 +58,7 @@ def get_argument_parser():
|
||||
def main():
|
||||
parser = get_argument_parser()
|
||||
args = parser.parse_args()
|
||||
vks = VkScraper(args.username, args.password)
|
||||
vks = VkScraper(args.username, args.password, args.token)
|
||||
text = " ".join(args.urls)
|
||||
res = vks.scrape(text)
|
||||
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
import vk_api # used to get api_token after authentication
|
||||
import yt_dlp # to download videos from url
|
||||
|
||||
from .utils import mkdir_if_not_exists
|
||||
from .utils import captcha_handler, suppress_stdout
|
||||
|
||||
|
||||
class VkScraper:
|
||||
@@ -36,12 +37,20 @@ class VkScraper:
|
||||
|
||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
|
||||
|
||||
def __init__(self, username: str, password: str) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
username: str,
|
||||
password: str,
|
||||
token: Optional[str] = None,
|
||||
session_file="vk_config.v2.json",
|
||||
captcha_handler=captcha_handler,
|
||||
) -> None:
|
||||
"""Initializes the scraper.
|
||||
|
||||
This function receives a username and password and performs authentication on vk.com to then call api endpoints
|
||||
This function receives a username and password (or access token) and performs
|
||||
authentication on vk.com to then call api endpoints. If token is passed, authentication will not be performed again.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -49,9 +58,22 @@ class VkScraper:
|
||||
Username on vk.com, can be a phone number or email
|
||||
password : str
|
||||
Matching password on vk.com
|
||||
token : str
|
||||
Access token received after authenticating, can be found in the vk_config.v2.json file
|
||||
session_file : str
|
||||
File name where the VK session is saved so future logins are easier, this will not be created if token is passed
|
||||
captcha_handler : func
|
||||
Function that can receive a vk_api captcha instance and help the user solve it, default is a complete CLI handler
|
||||
"""
|
||||
self.session = vk_api.VkApi(username, password)
|
||||
self.session.auth(token_only=True)
|
||||
self.session = vk_api.VkApi(
|
||||
username,
|
||||
password,
|
||||
token=token,
|
||||
config_filename=session_file,
|
||||
captcha_handler=captcha_handler,
|
||||
)
|
||||
if token is None or len(token) == 0:
|
||||
self.session.auth(token_only=True)
|
||||
|
||||
def scrape(self, url: str) -> List:
|
||||
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
|
||||
@@ -122,10 +144,11 @@ class VkScraper:
|
||||
first_type = a["type"]
|
||||
attachment = a[first_type]
|
||||
if first_type == "video":
|
||||
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
|
||||
if "access_key" in attachment:
|
||||
video_path += f"_{attachment['access_key']}"
|
||||
attachments["video"].extend(
|
||||
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[
|
||||
0
|
||||
]
|
||||
self.scrape_videos(video_path)[0]
|
||||
.get("attachments", {})
|
||||
.get("video", [""])
|
||||
)
|
||||
@@ -298,7 +321,7 @@ class VkScraper:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
mkdir_if_not_exists(destination)
|
||||
os.makedirs(destination, exist_ok=True)
|
||||
downloaded = []
|
||||
for r in results:
|
||||
for k, attachments in r["attachments"].items():
|
||||
@@ -311,23 +334,31 @@ class VkScraper:
|
||||
f.write(d.content)
|
||||
downloaded.append(filename)
|
||||
elif k == "video":
|
||||
for i, url in enumerate(attachments):
|
||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
{
|
||||
"outtmpl": filename,
|
||||
"quiet": True,
|
||||
"restrictfilenames": True,
|
||||
"forcefilename": True,
|
||||
}
|
||||
)
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl.prepare_filename(info)
|
||||
if "unknown_video" in filename:
|
||||
new_filename = filename.replace("unknown_video", "mkv")
|
||||
with open(filename, "rb") as vin, open(new_filename, "wb") as vout:
|
||||
vout.write(vin.read())
|
||||
os.remove(filename)
|
||||
filename = new_filename
|
||||
downloaded.append(filename)
|
||||
with suppress_stdout(): # ytdlp is not 100% quiet
|
||||
for i, url in enumerate(attachments):
|
||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
{
|
||||
"format": (
|
||||
"bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
|
||||
),
|
||||
"merge_output_format": "mp4",
|
||||
"retries": 5,
|
||||
"noplaylist": True,
|
||||
"outtmpl": filename,
|
||||
"quiet": True,
|
||||
"restrictfilenames": True,
|
||||
"forcefilename": True,
|
||||
"simulate": False,
|
||||
}
|
||||
)
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl.prepare_filename(info)
|
||||
if "unknown_video" in filename:
|
||||
old_filename = filename
|
||||
filename = shutil.copy(
|
||||
filename, filename.replace("unknown_video", "mp4")
|
||||
)
|
||||
os.remove(old_filename)
|
||||
downloaded.append(filename)
|
||||
return downloaded
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@@ -11,6 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
def captcha_handler(captcha):
|
||||
key = input(
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
|
||||
).strip()
|
||||
return captcha.try_again(key.strip())
|
||||
|
||||
|
||||
@contextmanager
|
||||
def suppress_stdout():
|
||||
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
|
||||
# this is used to silence ytdlp which does not fully respect quiet=True and outputs filenames to the console
|
||||
with open(os.devnull, "w") as devnull:
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = devnull
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
_MAJOR = "0"
|
||||
_MINOR = "2"
|
||||
_MINOR = "3"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "3"
|
||||
_PATCH = "34"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user