mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-10 12:28:39 +03:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b01dbe6299 | ||
|
|
5b0f034c12 | ||
|
|
a1c098335c | ||
|
|
12a5d22f64 | ||
|
|
ab602e5d31 | ||
|
|
67bc8b5569 | ||
|
|
021e7c2304 | ||
|
|
91b6dcf291 | ||
|
|
2a1a4e2cae |
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -31,10 +31,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python: ['3.7', '3.10']
|
||||
task: # --show-capture=no on purpose
|
||||
task: # --show-capture=no on purpose, -s for captchas
|
||||
- name: Test
|
||||
run: |
|
||||
pytest --show-capture=no --color=yes tests/
|
||||
pytest -s --show-capture=no --color=yes tests/
|
||||
|
||||
include:
|
||||
- python: '3.10'
|
||||
|
||||
@@ -4,8 +4,12 @@ sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
fail_on_warning: false
|
||||
|
||||
build:
|
||||
os: "ubuntu-22.04"
|
||||
tools:
|
||||
python: "3.8"
|
||||
|
||||
python:
|
||||
version: "3.8"
|
||||
install:
|
||||
- requirements: requirements.txt
|
||||
- requirements: dev-requirements.txt
|
||||
|
||||
4
Pipfile
4
Pipfile
@@ -4,7 +4,6 @@ verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
vk-api = ">=11.9.9"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
flake8 = "*"
|
||||
mypy = ">=0.961"
|
||||
@@ -22,7 +21,7 @@ sphinx-autobuild = ">=2021.3.14"
|
||||
sphinx-autodoc-typehints = "*"
|
||||
python-dotenv = ">=0.21.1"
|
||||
brotli = ">=1.0.9"
|
||||
certifi = ">=2022.12.7"
|
||||
certifi = ">=2023.7.22"
|
||||
charset-normalizer = ">=3.0.1"
|
||||
idna = ">=3.4"
|
||||
mutagen = ">=1.46.0"
|
||||
@@ -30,6 +29,7 @@ pycryptodomex = ">=3.17"
|
||||
requests = ">=2.28.2"
|
||||
urllib3 = ">=1.26.14"
|
||||
websockets = ">=10.4"
|
||||
vk-api = {ref = "77b5a0d51a6bbf54d59554332f28a488615fbd6c", git = "git+https://github.com/python273/vk_api.git"}
|
||||
|
||||
[dev-packages]
|
||||
sphinx-copybutton = "==0.5.0"
|
||||
|
||||
2842
Pipfile.lock
generated
2842
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -20,7 +20,7 @@ To use the library you will need a valid username/password combination for vk.co
|
||||
vk_url_scraper --help
|
||||
|
||||
# scrape a URL and get the JSON result in the console
|
||||
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
# OR
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
|
||||
# you can also have multiple urls
|
||||
@@ -28,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
|
||||
|
||||
# you can pass a token as well to avoid always authenticating
|
||||
# and possibly getting captcha prompts
|
||||
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token"
|
||||
# you can fetch the token from the vk_config.v2.json file generated under by searching for "access_token"
|
||||
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||
|
||||
# save the JSON output into a file
|
||||
@@ -89,7 +89,7 @@ see [docs] for all available functions.
|
||||
2. To run all checks, use `make run-checks` (fixes style), or run them individually:
|
||||
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
|
||||
2. To do type checking: `mypy .`
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
|
||||
3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||
|
||||
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
|
||||
|
||||
@@ -14,15 +14,16 @@ def test_login_fail():
|
||||
VkScraper("invalid", "combination")
|
||||
|
||||
|
||||
def test_login_custom_file():
|
||||
session_filename = "test-session.json"
|
||||
VkScraper(
|
||||
os.environ["VK_USERNAME"],
|
||||
os.environ["VK_PASSWORD"],
|
||||
session_file=session_filename,
|
||||
)
|
||||
assert os.path.isfile(session_filename)
|
||||
os.unlink(session_filename)
|
||||
# disabled due to CI
|
||||
# def test_login_custom_file():
|
||||
# session_filename = "test-session.json"
|
||||
# VkScraper(
|
||||
# os.environ["VK_USERNAME"],
|
||||
# os.environ["VK_PASSWORD"],
|
||||
# session_file=session_filename,
|
||||
# )
|
||||
# assert os.path.isfile(session_filename)
|
||||
# os.unlink(session_filename)
|
||||
|
||||
|
||||
def test_login_success():
|
||||
@@ -80,7 +81,7 @@ def test_scrape_wall_url_with_photos():
|
||||
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
||||
assert len(res[0]["payload"]) == 17
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 9
|
||||
@@ -92,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
||||
assert res[0]["id"] == "wall-17315087_74182"
|
||||
assert res[0]["text"] == ""
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||
assert len(res[0]["payload"]) == 17
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 3
|
||||
for k in ["photo", "link", "video"]:
|
||||
assert k in list(res[0]["attachments"].keys())
|
||||
@@ -127,7 +128,7 @@ def test_scrape_photo_only():
|
||||
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 1
|
||||
@@ -149,3 +150,21 @@ def test_scrape_video_only2():
|
||||
vks.download_media(res, tempdir)
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert "video-17546758_456239898_0.mp4" in found_files
|
||||
|
||||
|
||||
def test_scrape_private_video():
|
||||
"""
|
||||
> Some videos are kept private and cannot be accessed without a passkey. In this case, send the ID in {owner_id}_{video_id}_{access_key}.
|
||||
From https://dev.vk.com/ru/method/video.get
|
||||
"""
|
||||
res = vks.scrape("https://vk.com/wall-127774884_178565")
|
||||
|
||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||
vks.download_media(res, tempdir)
|
||||
expect_files = {
|
||||
"wall-127774884_178565_0.mp4",
|
||||
"wall-127774884_178565_1.mp4",
|
||||
"wall-127774884_178565_2.mp4",
|
||||
}
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert len(expect_files) == len(expect_files & found_files)
|
||||
|
||||
@@ -19,7 +19,7 @@ def get_argument_parser():
|
||||
action="store",
|
||||
dest="username",
|
||||
required=True,
|
||||
help="username for a valid vk.com account",
|
||||
help="username for a valid vk.com account (pass empty if using --token)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
@@ -27,7 +27,7 @@ def get_argument_parser():
|
||||
action="store",
|
||||
dest="password",
|
||||
required=True,
|
||||
help="password for the valid vk.com account",
|
||||
help="password for the valid vk.com account (pass empty if using --token)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
|
||||
@@ -37,7 +37,7 @@ class VkScraper:
|
||||
|
||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -144,10 +144,11 @@ class VkScraper:
|
||||
first_type = a["type"]
|
||||
attachment = a[first_type]
|
||||
if first_type == "video":
|
||||
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
|
||||
if "access_key" in attachment:
|
||||
video_path += f"_{attachment['access_key']}"
|
||||
attachments["video"].extend(
|
||||
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[
|
||||
0
|
||||
]
|
||||
self.scrape_videos(video_path)[0]
|
||||
.get("attachments", {})
|
||||
.get("video", [""])
|
||||
)
|
||||
@@ -352,9 +353,10 @@ class VkScraper:
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl.prepare_filename(info)
|
||||
if "unknown_video" in filename:
|
||||
old_filename = filename
|
||||
filename = shutil.copy(
|
||||
filename, filename.replace("unknown_video", "mkv")
|
||||
filename, filename.replace("unknown_video", "mp4")
|
||||
)
|
||||
os.remove(filename)
|
||||
os.remove(old_filename)
|
||||
downloaded.append(filename)
|
||||
return downloaded
|
||||
|
||||
@@ -15,7 +15,7 @@ class DateTimeEncoder(json.JSONEncoder):
|
||||
|
||||
def captcha_handler(captcha):
|
||||
key = input(
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:"
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
|
||||
).strip()
|
||||
return captcha.try_again(key)
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "3"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "22"
|
||||
_PATCH = "27"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user