mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-10 12:28:39 +03:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3e22709430 | ||
|
|
9c7eadc716 | ||
|
|
5d30d18b7b | ||
|
|
b2d462441e | ||
|
|
73f17407c0 | ||
|
|
95d249f5d0 | ||
|
|
ccb8c1f5c7 | ||
|
|
e525ff24b1 | ||
|
|
699b4ebdd8 | ||
|
|
8d1a86a7fa | ||
|
|
b01dbe6299 |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python: ['3.7', '3.10']
|
||||
python: ['3.10']
|
||||
task: # --show-capture=no on purpose, -s for captchas
|
||||
- name: Test
|
||||
run: |
|
||||
|
||||
@@ -7,7 +7,7 @@ sphinx:
|
||||
build:
|
||||
os: "ubuntu-22.04"
|
||||
tools:
|
||||
python: "3.8"
|
||||
python: "3.10"
|
||||
|
||||
python:
|
||||
install:
|
||||
|
||||
2
Makefile
2
Makefile
@@ -13,4 +13,4 @@ run-checks :
|
||||
black .
|
||||
flake8 .
|
||||
mypy .
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes .
|
||||
|
||||
4
Pipfile
4
Pipfile
@@ -4,7 +4,6 @@ verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
vk-api = ">=11.9.9"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
flake8 = "*"
|
||||
mypy = ">=0.961"
|
||||
@@ -30,6 +29,9 @@ pycryptodomex = ">=3.17"
|
||||
requests = ">=2.28.2"
|
||||
urllib3 = ">=1.26.14"
|
||||
websockets = ">=10.4"
|
||||
# vk-api = {ref = "77b5a0d51a6bbf54d59554332f28a488615fbd6c", git = "git+https://github.com/python273/vk_api.git"}
|
||||
# vk-api = "*"
|
||||
vk-api = {ref = "b99dac0ec2f832a6c4b20bde49869e7229ce4742", git = "git+https://github.com/python273/vk_api.git"}
|
||||
|
||||
[dev-packages]
|
||||
sphinx-copybutton = "==0.5.0"
|
||||
|
||||
2797
Pipfile.lock
generated
2797
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
13
README.md
13
README.md
@@ -12,6 +12,9 @@ You can use it via the [command line](#command-line-usage) or as a [python libra
|
||||
## Installation
|
||||
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
||||
|
||||
Currently you need to manually install one dependency (as it is installed from github and not pypi):
|
||||
`pip install git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742`
|
||||
|
||||
To use the library you will need a valid username/password combination for vk.com.
|
||||
|
||||
## Command line usage
|
||||
@@ -20,7 +23,7 @@ To use the library you will need a valid username/password combination for vk.co
|
||||
vk_url_scraper --help
|
||||
|
||||
# scrape a URL and get the JSON result in the console
|
||||
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
# OR
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
|
||||
# you can also have multiple urls
|
||||
@@ -89,7 +92,7 @@ see [docs] for all available functions.
|
||||
2. To run all checks, use `make run-checks` (fixes style), or run them individually:
|
||||
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
|
||||
2. To do type checking: `mypy .`
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
|
||||
3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||
|
||||
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py), you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
|
||||
@@ -97,10 +100,12 @@ To test the command line interface available in [__main__.py](__vk_url_scraper/_
|
||||
|
||||
## Releasing new version
|
||||
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
|
||||
2. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
2. make sure to run `pipenv run pip freeze > requirements.txt` if you manage libs with pipenv
|
||||
1. if the hardcoded git version of [vk_api](https://github.com/python273/vk_api) is still being used, comment out or remove that line from the generated requirements file, because PyPI does not allow dependencies that point at a repository/commit. Instead, add the latest released version (currently `vk-api==11.9.9`) and instruct users to manually install the pinned commit from source.
|
||||
3. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
1. `git tag vx.y.z` to tag version
|
||||
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||
3. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
|
||||
4. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
|
||||
|
||||
### Fixing a failed release
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
Installation
|
||||
============
|
||||
|
||||
**vk-url-scraper** supports Python >= 3.7.
|
||||
**vk-url-scraper** supports Python >= 3.10.
|
||||
|
||||
## Installing with `pip`
|
||||
|
||||
|
||||
123
requirements.txt
123
requirements.txt
@@ -1,19 +1,104 @@
|
||||
#
|
||||
# These requirements were autogenerated by pipenv
|
||||
# To regenerate from the project's Pipfile, run:
|
||||
#
|
||||
# pipenv lock --requirements
|
||||
#
|
||||
|
||||
# -i https://pypi.org/simple
|
||||
brotli>=1.0.9; platform_python_implementation >= 'CPython'
|
||||
certifi>=2022.12.7; python_version >= '3.6'
|
||||
charset-normalizer>=3.0.1; python_version >= '3.6'
|
||||
idna>=3.4; python_version >= '3.5'
|
||||
mutagen>=1.46.0; python_version >= '3.7'
|
||||
pycryptodomex>=3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||
requests>=2.28.2; python_version >= '3.7' and python_version < '4'
|
||||
urllib3>=1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||
vk-api>=11.9.9
|
||||
websockets>=10.4; python_version >= '3.7'
|
||||
yt-dlp>=2023.2.17
|
||||
aiohttp==3.9.1
|
||||
aiosignal==1.3.1
|
||||
alabaster==0.7.16
|
||||
anyio==4.4.0
|
||||
async-timeout==4.0.3
|
||||
attrs==23.2.0
|
||||
Babel==2.15.0
|
||||
backports.tarfile==1.2.0
|
||||
beautifulsoup4==4.13.0b2
|
||||
black==24.4.2
|
||||
bleach==6.0.0
|
||||
Brotli==1.1.0
|
||||
certifi==2024.7.4
|
||||
cffi==1.17.0rc1
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
colorama==0.4.6
|
||||
commonmark==0.9.1
|
||||
coverage==7.6.0
|
||||
cryptography==42.0.8
|
||||
docutils==0.18.1
|
||||
exceptiongroup==1.2.2
|
||||
flake8==7.1.0
|
||||
frozenlist==1.4.1
|
||||
furo==2023.3.27
|
||||
h11==0.14.0
|
||||
idna==3.7
|
||||
imagesize==1.4.1
|
||||
importlib_metadata==8.0.0
|
||||
iniconfig==2.0.0
|
||||
isort==6.0.0b2
|
||||
jaraco.classes==3.4.0
|
||||
jaraco.context==5.3.0
|
||||
jaraco.functools==4.0.1
|
||||
jeepney==0.8.0
|
||||
Jinja2==3.1.4
|
||||
keyring==25.2.1
|
||||
livereload==2.6.3
|
||||
markdown-it-py==2.2.0
|
||||
MarkupSafe==2.1.5
|
||||
mccabe==0.7.0
|
||||
mdit-py-plugins==0.3.5
|
||||
mdurl==0.1.2
|
||||
more-itertools==10.3.0
|
||||
multidict==6.0.4
|
||||
mutagen==1.47.0
|
||||
mypy==1.10.1
|
||||
mypy-extensions==1.0.0
|
||||
myst-parser==0.18.1
|
||||
nh3==0.2.18
|
||||
packaging==24.1
|
||||
pathspec==0.12.1
|
||||
pkginfo==1.10.0
|
||||
platformdirs==4.2.2
|
||||
pluggy==1.5.0
|
||||
py==1.11.0
|
||||
pycodestyle==2.12.0
|
||||
pycparser==2.22
|
||||
pycryptodomex==3.20.0
|
||||
pyflakes==3.2.0
|
||||
Pygments==2.18.0
|
||||
pyparsing==3.0.9
|
||||
pytest==8.2.2
|
||||
pytest-cov==5.0.0
|
||||
pytest-sphinx==0.6.3
|
||||
python-dotenv==1.0.1
|
||||
pytz==2022.1
|
||||
PyYAML==6.0.2rc1
|
||||
readme_renderer==43.0
|
||||
requests==2.32.3
|
||||
requests-toolbelt==1.0.0
|
||||
rfc3986==2.0.0
|
||||
rich==13.7.1
|
||||
SecretStorage==3.3.3
|
||||
six==1.16.0
|
||||
sniffio==1.3.1
|
||||
snowballstemmer==2.2.0
|
||||
soupsieve==2.5
|
||||
Sphinx==5.0.2
|
||||
sphinx-autobuild==2024.4.16
|
||||
sphinx-autodoc-typehints==1.19.1
|
||||
sphinx-basic-ng==1.0.0b2
|
||||
sphinx-copybutton==0.5.2
|
||||
sphinxcontrib-applehelp==1.0.8
|
||||
sphinxcontrib-devhelp==1.0.6
|
||||
sphinxcontrib-htmlhelp==2.0.5
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==1.0.7
|
||||
sphinxcontrib-serializinghtml==1.1.10
|
||||
starlette==0.37.2
|
||||
tomli==2.0.1
|
||||
tornado==6.4
|
||||
twine==5.1.1
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.2
|
||||
uvicorn==0.30.1
|
||||
# vk-api @ git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
|
||||
vk-api==11.9.9
|
||||
watchfiles==0.22.0
|
||||
webencodings==0.5.1
|
||||
websockets==12.0
|
||||
yarl==1.9.4
|
||||
yt-dlp==2024.7.15.232803.dev0
|
||||
zipp==3.19.2
|
||||
|
||||
2
setup.py
2
setup.py
@@ -57,7 +57,7 @@ setup(
|
||||
package_data={"vk_url_scraper": ["py.typed"]},
|
||||
install_requires=read_requirements("requirements.txt"),
|
||||
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
python_requires=">=3.7",
|
||||
python_requires=">=3.10",
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"vk_url_scraper=vk_url_scraper.__main__:main",
|
||||
|
||||
@@ -81,7 +81,7 @@ def test_scrape_wall_url_with_photos():
|
||||
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
||||
assert len(res[0]["payload"]) == 17
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 9
|
||||
@@ -93,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
||||
assert res[0]["id"] == "wall-17315087_74182"
|
||||
assert res[0]["text"] == ""
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||
assert len(res[0]["payload"]) == 17
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 3
|
||||
for k in ["photo", "link", "video"]:
|
||||
assert k in list(res[0]["attachments"].keys())
|
||||
@@ -128,7 +128,7 @@ def test_scrape_photo_only():
|
||||
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 1
|
||||
@@ -139,7 +139,6 @@ def test_scrape_video_only():
|
||||
assert len(res) == 1
|
||||
assert res[0]["id"] == "video38556806_456251917"
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
||||
assert len(res[0]["payload"]) == 34
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["video"]
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ class VkScraper:
|
||||
password : str
|
||||
Matching password on vk.com
|
||||
token : str
|
||||
Access token received after authenticating, can be found in the vl_config.v2.json file
|
||||
Access token received after authenticating, can be found in the vk_config.v2.json file
|
||||
session_file : str
|
||||
File name where the VK session is saved so future logins are easier; this file will not be created if a token is passed
|
||||
captcha_handler : func
|
||||
@@ -339,7 +339,9 @@ class VkScraper:
|
||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
{
|
||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||
"format": (
|
||||
"bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
|
||||
),
|
||||
"merge_output_format": "mp4",
|
||||
"retries": 5,
|
||||
"noplaylist": True,
|
||||
|
||||
@@ -17,7 +17,7 @@ def captcha_handler(captcha):
|
||||
key = input(
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
|
||||
).strip()
|
||||
return captcha.try_again(key)
|
||||
return captcha.try_again(key.strip())
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "3"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "26"
|
||||
_PATCH = "30"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user