Compare commits

..

12 Commits

Author SHA1 Message Date
msramalho
b01dbe6299 fix vk_api dependency changes 2024-01-23 11:56:49 +00:00
msramalho
5b0f034c12 Bump version to v0.3.26 for release 2023-08-18 21:15:54 +01:00
msramalho
a1c098335c fix: private videos 2023-08-18 21:15:34 +01:00
msramalho
12a5d22f64 fix: certifi 2023-08-18 21:12:44 +01:00
Miguel Sozinho Ramalho
ab602e5d31 Update .readthedocs.yaml
https://blog.readthedocs.com/use-build-os-config/

https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
2023-08-16 18:34:36 +01:00
msramalho
67bc8b5569 Bump version to v0.3.24 for release 2023-05-10 17:09:22 +01:00
msramalho
021e7c2304 disables test due to CI 2023-05-10 17:08:39 +01:00
msramalho
91b6dcf291 Bump version to v0.3.23 for release 2023-05-10 16:47:53 +01:00
msramalho
2a1a4e2cae minor CI update 2023-05-10 16:47:39 +01:00
msramalho
fc6b914e2d Bump version to v0.3.22 for release 2023-05-10 16:28:30 +01:00
Logan Williams
d155c1364a Bump version number 2023-05-10 14:56:39 +02:00
Logan Williams
8882a87048 Fix import order 2023-05-10 14:33:47 +02:00
10 changed files with 1840 additions and 1089 deletions

View File

@@ -31,10 +31,10 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
python: ['3.7', '3.10'] python: ['3.7', '3.10']
task: # --show-capture=no on purpose task: # --show-capture=no on purpose, -s for captchas
- name: Test - name: Test
run: | run: |
pytest --show-capture=no --color=yes tests/ pytest -s --show-capture=no --color=yes tests/
include: include:
- python: '3.10' - python: '3.10'

View File

@@ -4,8 +4,12 @@ sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: false fail_on_warning: false
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
python: python:
version: "3.8"
install: install:
- requirements: requirements.txt - requirements: requirements.txt
- requirements: dev-requirements.txt - requirements: dev-requirements.txt

View File

@@ -4,7 +4,6 @@ verify_ssl = true
name = "pypi" name = "pypi"
[packages] [packages]
vk-api = ">=11.9.9"
yt-dlp = ">=2023.2.17" yt-dlp = ">=2023.2.17"
flake8 = "*" flake8 = "*"
mypy = ">=0.961" mypy = ">=0.961"
@@ -22,7 +21,7 @@ sphinx-autobuild = ">=2021.3.14"
sphinx-autodoc-typehints = "*" sphinx-autodoc-typehints = "*"
python-dotenv = ">=0.21.1" python-dotenv = ">=0.21.1"
brotli = ">=1.0.9" brotli = ">=1.0.9"
certifi = ">=2022.12.7" certifi = ">=2023.7.22"
charset-normalizer = ">=3.0.1" charset-normalizer = ">=3.0.1"
idna = ">=3.4" idna = ">=3.4"
mutagen = ">=1.46.0" mutagen = ">=1.46.0"
@@ -30,6 +29,7 @@ pycryptodomex = ">=3.17"
requests = ">=2.28.2" requests = ">=2.28.2"
urllib3 = ">=1.26.14" urllib3 = ">=1.26.14"
websockets = ">=10.4" websockets = ">=10.4"
vk-api = {ref = "77b5a0d51a6bbf54d59554332f28a488615fbd6c", git = "git+https://github.com/python273/vk_api.git"}
[dev-packages] [dev-packages]
sphinx-copybutton = "==0.5.0" sphinx-copybutton = "==0.5.0"

2842
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,7 @@ To use the library you will need a valid username/password combination for vk.co
vk_url_scraper --help vk_url_scraper --help
# scrape a URL and get the JSON result in the console # scrape a URL and get the JSON result in the console
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789 vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
# OR # OR
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
# you can also have multiple urls # you can also have multiple urls
@@ -28,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can pass a token as well to avoid always authenticating # you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts # and possibly getting captcha prompts
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token" # you can fetch the token from the vk_config.v2.json file generated under by searching for "access_token"
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789 vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file # save the JSON output into a file
@@ -89,7 +89,7 @@ see [docs] for all available functions.
2. To run all checks do `make run-checks` (fixes style) or individually 2. To run all checks do `make run-checks` (fixes style) or individually
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint 1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
2. To do type checking: `mypy .` 2. To do type checking: `mypy .`
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples) 3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed 3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
To test the command line interface available in [__main__.py](__vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...` To test the command line interface available in [__main__.py](__vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`

View File

@@ -14,15 +14,16 @@ def test_login_fail():
VkScraper("invalid", "combination") VkScraper("invalid", "combination")
def test_login_custom_file(): # disabled due to CI
session_filename = "test-session.json" # def test_login_custom_file():
VkScraper( # session_filename = "test-session.json"
os.environ["VK_USERNAME"], # VkScraper(
os.environ["VK_PASSWORD"], # os.environ["VK_USERNAME"],
session_file=session_filename, # os.environ["VK_PASSWORD"],
) # session_file=session_filename,
assert os.path.isfile(session_filename) # )
os.unlink(session_filename) # assert os.path.isfile(session_filename)
# os.unlink(session_filename)
def test_login_success(): def test_login_success():
@@ -80,7 +81,7 @@ def test_scrape_wall_url_with_photos():
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея." == "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
) )
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
assert len(res[0]["payload"]) == 17 assert len(res[0]["payload"]) == 18
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"] assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 9 assert len(res[0]["attachments"]["photo"]) == 9
@@ -92,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert res[0]["id"] == "wall-17315087_74182" assert res[0]["id"] == "wall-17315087_74182"
assert res[0]["text"] == "" assert res[0]["text"] == ""
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 17 assert len(res[0]["payload"]) == 18
assert len(res[0]["attachments"].keys()) == 3 assert len(res[0]["attachments"].keys()) == 3
for k in ["photo", "link", "video"]: for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys()) assert k in list(res[0]["attachments"].keys())
@@ -127,7 +128,7 @@ def test_scrape_photo_only():
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию." == "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
) )
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
assert len(res[0]["payload"]) == 15 assert len(res[0]["payload"]) == 16
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"] assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 1 assert len(res[0]["attachments"]["photo"]) == 1
@@ -149,3 +150,21 @@ def test_scrape_video_only2():
vks.download_media(res, tempdir) vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mp4" in found_files assert "video-17546758_456239898_0.mp4" in found_files
def test_scrape_private_video():
"""
> Some videos are kept private and cannot be accessed without a passkey . In this case, send the ID in {owner_id}_{video_id}_{access_key}.
From https://dev.vk.com/ru/method/video.get
"""
res = vks.scrape("https://vk.com/wall-127774884_178565")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
expect_files = {
"wall-127774884_178565_0.mp4",
"wall-127774884_178565_1.mp4",
"wall-127774884_178565_2.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)

View File

@@ -19,7 +19,7 @@ def get_argument_parser():
action="store", action="store",
dest="username", dest="username",
required=True, required=True,
help="username for a valid vk.com account", help="username for a valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-p", "-p",
@@ -27,7 +27,7 @@ def get_argument_parser():
action="store", action="store",
dest="password", dest="password",
required=True, required=True,
help="password for the valid vk.com account", help="password for the valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-t", "-t",

View File

@@ -3,7 +3,7 @@ import re
import shutil import shutil
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from typing import Optional, List from typing import List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@@ -37,7 +37,7 @@ class VkScraper:
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)") WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
def __init__( def __init__(
self, self,
@@ -144,10 +144,11 @@ class VkScraper:
first_type = a["type"] first_type = a["type"]
attachment = a[first_type] attachment = a[first_type]
if first_type == "video": if first_type == "video":
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
if "access_key" in attachment:
video_path += f"_{attachment['access_key']}"
attachments["video"].extend( attachments["video"].extend(
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[ self.scrape_videos(video_path)[0]
0
]
.get("attachments", {}) .get("attachments", {})
.get("video", [""]) .get("video", [""])
) )
@@ -352,9 +353,10 @@ class VkScraper:
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info) filename = ydl.prepare_filename(info)
if "unknown_video" in filename: if "unknown_video" in filename:
old_filename = filename
filename = shutil.copy( filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv") filename, filename.replace("unknown_video", "mp4")
) )
os.remove(filename) os.remove(old_filename)
downloaded.append(filename) downloaded.append(filename)
return downloaded return downloaded

View File

@@ -15,7 +15,7 @@ class DateTimeEncoder(json.JSONEncoder):
def captcha_handler(captcha): def captcha_handler(captcha):
key = input( key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:" f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
).strip() ).strip()
return captcha.try_again(key) return captcha.try_again(key)

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3" _MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "15" _PATCH = "27"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""