Compare commits

..

15 Commits

Author SHA1 Message Date
msramalho
73f17407c0 reverting library dependencies 2024-01-23 18:09:56 +00:00
msramalho
95d249f5d0 min py to 3.10 2024-01-23 13:01:38 +00:00
msramalho
ccb8c1f5c7 min python to 3.8 2024-01-23 12:50:55 +00:00
msramalho
e525ff24b1 lint 2024-01-23 12:45:45 +00:00
msramalho
699b4ebdd8 fix lib dependencies in pypi version 2024-01-23 12:41:25 +00:00
msramalho
8d1a86a7fa fix captcha processing 2024-01-23 12:41:14 +00:00
msramalho
b01dbe6299 fix vk_api dependency changes 2024-01-23 11:56:49 +00:00
msramalho
5b0f034c12 Bump version to v0.3.26 for release 2023-08-18 21:15:54 +01:00
msramalho
a1c098335c fix: private videos 2023-08-18 21:15:34 +01:00
msramalho
12a5d22f64 fix: certifi 2023-08-18 21:12:44 +01:00
Miguel Sozinho Ramalho
ab602e5d31 Update .readthedocs.yaml
https://blog.readthedocs.com/use-build-os-config/

https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
2023-08-16 18:34:36 +01:00
msramalho
67bc8b5569 Bump version to v0.3.24 for release 2023-05-10 17:09:22 +01:00
msramalho
021e7c2304 disables test due to CI 2023-05-10 17:08:39 +01:00
msramalho
91b6dcf291 Bump version to v0.3.23 for release 2023-05-10 16:47:53 +01:00
msramalho
2a1a4e2cae minor CI update 2023-05-10 16:47:39 +01:00
14 changed files with 1943 additions and 1109 deletions

View File

@@ -30,11 +30,11 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
python: ['3.7', '3.10'] python: ['3.10']
task: # --show-capture=no on purpose task: # --show-capture=no on purpose, -s for captchas
- name: Test - name: Test
run: | run: |
pytest --show-capture=no --color=yes tests/ pytest -s --show-capture=no --color=yes tests/
include: include:
- python: '3.10' - python: '3.10'

View File

@@ -4,8 +4,12 @@ sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: false fail_on_warning: false
build:
os: "ubuntu-22.04"
tools:
python: "3.10"
python: python:
version: "3.8"
install: install:
- requirements: requirements.txt - requirements: requirements.txt
- requirements: dev-requirements.txt - requirements: dev-requirements.txt

View File

@@ -13,4 +13,4 @@ run-checks :
black . black .
flake8 . flake8 .
mypy . mypy .
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/ CUDA_VISIBLE_DEVICES='' pytest -v --color=yes .

View File

@@ -4,7 +4,6 @@ verify_ssl = true
name = "pypi" name = "pypi"
[packages] [packages]
vk-api = ">=11.9.9"
yt-dlp = ">=2023.2.17" yt-dlp = ">=2023.2.17"
flake8 = "*" flake8 = "*"
mypy = ">=0.961" mypy = ">=0.961"
@@ -22,7 +21,7 @@ sphinx-autobuild = ">=2021.3.14"
sphinx-autodoc-typehints = "*" sphinx-autodoc-typehints = "*"
python-dotenv = ">=0.21.1" python-dotenv = ">=0.21.1"
brotli = ">=1.0.9" brotli = ">=1.0.9"
certifi = ">=2022.12.7" certifi = ">=2023.7.22"
charset-normalizer = ">=3.0.1" charset-normalizer = ">=3.0.1"
idna = ">=3.4" idna = ">=3.4"
mutagen = ">=1.46.0" mutagen = ">=1.46.0"
@@ -30,6 +29,8 @@ pycryptodomex = ">=3.17"
requests = ">=2.28.2" requests = ">=2.28.2"
urllib3 = ">=1.26.14" urllib3 = ">=1.26.14"
websockets = ">=10.4" websockets = ">=10.4"
# vk-api = {ref = "77b5a0d51a6bbf54d59554332f28a488615fbd6c", git = "git+https://github.com/python273/vk_api.git"}
vk-api = "*"
[dev-packages] [dev-packages]
sphinx-copybutton = "==0.5.0" sphinx-copybutton = "==0.5.0"

2834
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,7 @@ To use the library you will need a valid username/password combination for vk.co
vk_url_scraper --help vk_url_scraper --help
# scrape a URL and get the JSON result in the console # scrape a URL and get the JSON result in the console
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789 vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
# OR # OR
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
# you can also have multiple urls # you can also have multiple urls
@@ -28,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can pass a token as well to avoid always authenticating # you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts # and possibly getting captcha prompts
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token" # you can fetch the token from the vk_config.v2.json file generated under by searching for "access_token"
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789 vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file # save the JSON output into a file
@@ -89,7 +89,7 @@ see [docs] for all available functions.
2. To run all checks, run `make run-checks` (fixes style) or individually 2. To run all checks, run `make run-checks` (fixes style) or individually
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint 1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
2. To do type checking: `mypy .` 2. To do type checking: `mypy .`
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples) 3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed 3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
To test the command line interface available in [__main__.py](__vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...` To test the command line interface available in [__main__.py](__vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
@@ -97,10 +97,11 @@ To test the command line interface available in [__main__.py](__vk_url_scraper/_
## Releasing new version ## Releasing new version
1. edit [version.py](vk_url_scraper/version.py) with proper versioning 1. edit [version.py](vk_url_scraper/version.py) with proper versioning
2. run `./scripts/release.sh` to create a tag and push, alternatively 2. make sure to run `pipenv run pip freeze > requirements.txt` if you manage libs with pipenv
3. run `./scripts/release.sh` to create a tag and push, alternatively
1. `git tag vx.y.z` to tag version 1. `git tag vx.y.z` to tag version
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/) 2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
3. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup) 4. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
### Fixing a failed release ### Fixing a failed release

View File

@@ -1,7 +1,7 @@
Installation Installation
============ ============
**vk-url-scraper** supports Python >= 3.7. **vk-url-scraper** supports Python >= 3.10.
## Installing with `pip` ## Installing with `pip`

View File

@@ -1,19 +1,94 @@
# aiohttp==3.9.1
# These requirements were autogenerated by pipenv aiosignal==1.3.1
# To regenerate from the project's Pipfile, run: alabaster==0.7.16
# async-timeout==4.0.3
# pipenv lock --requirements attrs==23.2.0
# Babel==2.14.0
beautifulsoup4==4.12.3
# -i https://pypi.org/simple black==24.1a1
brotli>=1.0.9; platform_python_implementation >= 'CPython' bleach==6.0.0
certifi>=2022.12.7; python_version >= '3.6' Brotli==1.1.0
charset-normalizer>=3.0.1; python_version >= '3.6' certifi==2023.11.17
idna>=3.4; python_version >= '3.5' cffi==1.16.0
mutagen>=1.46.0; python_version >= '3.7' charset-normalizer==3.3.2
pycryptodomex>=3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' click==8.1.7
requests>=2.28.2; python_version >= '3.7' and python_version < '4' colorama==0.4.6
urllib3>=1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' commonmark==0.9.1
vk-api>=11.9.9 coverage==7.4.0
websockets>=10.4; python_version >= '3.7' cryptography==42.0.0
yt-dlp>=2023.2.17 docutils==0.18.1
exceptiongroup==1.2.0
flake8==7.0.0
frozenlist==1.4.1
furo==2023.3.27
idna==3.6
imagesize==1.4.1
importlib-metadata==7.0.1
iniconfig==2.0.0
isort==6.0.0b2
jaraco.classes==3.3.0
jeepney==0.8.0
Jinja2==3.1.3
keyring==24.3.0
livereload==2.6.3
markdown-it-py==2.2.0
MarkupSafe==2.1.4
mccabe==0.7.0
mdit-py-plugins==0.3.5
mdurl==0.1.2
more-itertools==10.2.0
multidict==6.0.4
mutagen==1.47.0
mypy==1.8.0
mypy-extensions==1.0.0
myst-parser==0.18.1
nh3==0.2.15
packaging==23.2
pathspec==0.12.1
pkginfo==1.9.6
platformdirs==4.1.0
pluggy==1.3.0
py==1.11.0
pycodestyle==2.11.1
pycparser==2.21
pycryptodomex==3.20.0
pyflakes==3.2.0
Pygments==2.17.2
pyparsing==3.0.9
pytest==8.0.0rc2
pytest-cov==4.1.0
pytest-sphinx==0.5.0
python-dotenv==1.0.1
pytz==2022.1
PyYAML==6.0.1
readme-renderer==42.0
requests==2.31.0
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==13.7.0
SecretStorage==3.3.3
six==1.16.0
snowballstemmer==2.2.0
soupsieve==2.5
Sphinx==5.0.2
sphinx-autobuild==2021.3.14
sphinx-autodoc-typehints==1.19.1
sphinx-basic-ng==1.0.0b2
sphinx-copybutton==0.5.2
sphinxcontrib-applehelp==1.0.8
sphinxcontrib-devhelp==1.0.6
sphinxcontrib-htmlhelp==2.0.5
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.7
sphinxcontrib-serializinghtml==1.1.10
tomli==2.0.1
tornado==6.4
twine==4.0.2
typing_extensions==4.9.0
urllib3==2.1.0
vk-api @ git+https://github.com/python273/vk_api.git@77b5a0d51a6bbf54d59554332f28a488615fbd6c
webencodings==0.5.1
websockets==12.0
yarl==1.9.4
yt-dlp==2024.1.22.232713.dev0
zipp==3.17.0

View File

@@ -57,7 +57,7 @@ setup(
package_data={"vk_url_scraper": ["py.typed"]}, package_data={"vk_url_scraper": ["py.typed"]},
install_requires=read_requirements("requirements.txt"), install_requires=read_requirements("requirements.txt"),
extras_require={"dev": read_requirements("dev-requirements.txt")}, extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.7", python_requires=">=3.10",
entry_points={ entry_points={
"console_scripts": [ "console_scripts": [
"vk_url_scraper=vk_url_scraper.__main__:main", "vk_url_scraper=vk_url_scraper.__main__:main",

View File

@@ -14,15 +14,16 @@ def test_login_fail():
VkScraper("invalid", "combination") VkScraper("invalid", "combination")
def test_login_custom_file(): # disabled due to CI
session_filename = "test-session.json" # def test_login_custom_file():
VkScraper( # session_filename = "test-session.json"
os.environ["VK_USERNAME"], # VkScraper(
os.environ["VK_PASSWORD"], # os.environ["VK_USERNAME"],
session_file=session_filename, # os.environ["VK_PASSWORD"],
) # session_file=session_filename,
assert os.path.isfile(session_filename) # )
os.unlink(session_filename) # assert os.path.isfile(session_filename)
# os.unlink(session_filename)
def test_login_success(): def test_login_success():
@@ -80,7 +81,7 @@ def test_scrape_wall_url_with_photos():
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея." == "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
) )
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
assert len(res[0]["payload"]) == 17 assert len(res[0]["payload"]) == 18
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"] assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 9 assert len(res[0]["attachments"]["photo"]) == 9
@@ -92,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert res[0]["id"] == "wall-17315087_74182" assert res[0]["id"] == "wall-17315087_74182"
assert res[0]["text"] == "" assert res[0]["text"] == ""
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 17 assert len(res[0]["payload"]) == 18
assert len(res[0]["attachments"].keys()) == 3 assert len(res[0]["attachments"].keys()) == 3
for k in ["photo", "link", "video"]: for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys()) assert k in list(res[0]["attachments"].keys())
@@ -127,7 +128,7 @@ def test_scrape_photo_only():
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию." == "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
) )
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
assert len(res[0]["payload"]) == 15 assert len(res[0]["payload"]) == 16
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"] assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 1 assert len(res[0]["attachments"]["photo"]) == 1
@@ -149,3 +150,21 @@ def test_scrape_video_only2():
vks.download_media(res, tempdir) vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mp4" in found_files assert "video-17546758_456239898_0.mp4" in found_files
def test_scrape_private_video():
"""
> Some videos are kept private and cannot be accessed without a passkey . In this case, send the ID in {owner_id}_{video_id}_{access_key}.
From https://dev.vk.com/ru/method/video.get
"""
res = vks.scrape("https://vk.com/wall-127774884_178565")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
expect_files = {
"wall-127774884_178565_0.mp4",
"wall-127774884_178565_1.mp4",
"wall-127774884_178565_2.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)

View File

@@ -19,7 +19,7 @@ def get_argument_parser():
action="store", action="store",
dest="username", dest="username",
required=True, required=True,
help="username for a valid vk.com account", help="username for a valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-p", "-p",
@@ -27,7 +27,7 @@ def get_argument_parser():
action="store", action="store",
dest="password", dest="password",
required=True, required=True,
help="password for the valid vk.com account", help="password for the valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-t", "-t",

View File

@@ -37,7 +37,7 @@ class VkScraper:
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)") WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
def __init__( def __init__(
self, self,
@@ -144,10 +144,11 @@ class VkScraper:
first_type = a["type"] first_type = a["type"]
attachment = a[first_type] attachment = a[first_type]
if first_type == "video": if first_type == "video":
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
if "access_key" in attachment:
video_path += f"_{attachment['access_key']}"
attachments["video"].extend( attachments["video"].extend(
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[ self.scrape_videos(video_path)[0]
0
]
.get("attachments", {}) .get("attachments", {})
.get("video", [""]) .get("video", [""])
) )
@@ -338,7 +339,9 @@ class VkScraper:
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s") filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL( ydl = yt_dlp.YoutubeDL(
{ {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", "format": (
"bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
),
"merge_output_format": "mp4", "merge_output_format": "mp4",
"retries": 5, "retries": 5,
"noplaylist": True, "noplaylist": True,
@@ -352,9 +355,10 @@ class VkScraper:
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info) filename = ydl.prepare_filename(info)
if "unknown_video" in filename: if "unknown_video" in filename:
old_filename = filename
filename = shutil.copy( filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv") filename, filename.replace("unknown_video", "mp4")
) )
os.remove(filename) os.remove(old_filename)
downloaded.append(filename) downloaded.append(filename)
return downloaded return downloaded

View File

@@ -15,9 +15,9 @@ class DateTimeEncoder(json.JSONEncoder):
def captcha_handler(captcha): def captcha_handler(captcha):
key = input( key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:" f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
).strip() ).strip()
return captcha.try_again(key) return captcha.try_again(key.strip())
@contextmanager @contextmanager

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3" _MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "22" _PATCH = "28"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""