mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-12 21:38:36 +03:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5b0f034c12 | ||
|
|
a1c098335c | ||
|
|
12a5d22f64 | ||
|
|
ab602e5d31 | ||
|
|
67bc8b5569 | ||
|
|
021e7c2304 | ||
|
|
91b6dcf291 | ||
|
|
2a1a4e2cae | ||
|
|
fc6b914e2d | ||
|
|
d155c1364a | ||
|
|
8882a87048 | ||
|
|
a95c675e9c | ||
|
|
8864e7c87d | ||
|
|
db9b613ae4 | ||
|
|
37828b4be4 | ||
|
|
1a3a7dc0f3 | ||
|
|
f67707a740 | ||
|
|
798684a334 |
11
.github/dependabot.yml
vendored
11
.github/dependabot.yml
vendored
@@ -1,11 +0,0 @@
|
|||||||
version: 2
|
|
||||||
updates:
|
|
||||||
- package-ecosystem: "pip"
|
|
||||||
directory: "/"
|
|
||||||
schedule:
|
|
||||||
interval: "daily"
|
|
||||||
open-pull-requests-limit: 10
|
|
||||||
- package-ecosystem: "github-actions"
|
|
||||||
directory: "/"
|
|
||||||
schedule:
|
|
||||||
interval: "daily"
|
|
||||||
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -31,10 +31,10 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python: ['3.7', '3.10']
|
python: ['3.7', '3.10']
|
||||||
task: # --show-capture=no on purpose
|
task: # --show-capture=no on purpose, -s for captchas
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
pytest --show-capture=no --color=yes tests/
|
pytest -s --show-capture=no --color=yes tests/
|
||||||
|
|
||||||
include:
|
include:
|
||||||
- python: '3.10'
|
- python: '3.10'
|
||||||
|
|||||||
@@ -4,8 +4,12 @@ sphinx:
|
|||||||
configuration: docs/source/conf.py
|
configuration: docs/source/conf.py
|
||||||
fail_on_warning: false
|
fail_on_warning: false
|
||||||
|
|
||||||
|
build:
|
||||||
|
os: "ubuntu-22.04"
|
||||||
|
tools:
|
||||||
|
python: "3.8"
|
||||||
|
|
||||||
python:
|
python:
|
||||||
version: "3.8"
|
|
||||||
install:
|
install:
|
||||||
- requirements: requirements.txt
|
- requirements: requirements.txt
|
||||||
- requirements: dev-requirements.txt
|
- requirements: dev-requirements.txt
|
||||||
|
|||||||
28
Pipfile
28
Pipfile
@@ -4,8 +4,32 @@ verify_ssl = true
|
|||||||
name = "pypi"
|
name = "pypi"
|
||||||
|
|
||||||
[packages]
|
[packages]
|
||||||
vk-api = "*"
|
vk-api = ">=11.9.9"
|
||||||
yt-dlp = "*"
|
yt-dlp = ">=2023.2.17"
|
||||||
|
flake8 = "*"
|
||||||
|
mypy = ">=0.961"
|
||||||
|
black = ">=22.3.0"
|
||||||
|
isort = ">=5.10.1"
|
||||||
|
pytest = "*"
|
||||||
|
pytest-sphinx = "*"
|
||||||
|
pytest-cov = "*"
|
||||||
|
twine = ">=1.11.0"
|
||||||
|
sphinx = "<5.1.0,>=4.3.0"
|
||||||
|
furo = ">=2022.6.4.1"
|
||||||
|
myst-parser = "<0.19.0,>=0.15.2"
|
||||||
|
sphinx-copybutton = ">=0.5.0"
|
||||||
|
sphinx-autobuild = ">=2021.3.14"
|
||||||
|
sphinx-autodoc-typehints = "*"
|
||||||
|
python-dotenv = ">=0.21.1"
|
||||||
|
brotli = ">=1.0.9"
|
||||||
|
certifi = ">=2023.7.22"
|
||||||
|
charset-normalizer = ">=3.0.1"
|
||||||
|
idna = ">=3.4"
|
||||||
|
mutagen = ">=1.46.0"
|
||||||
|
pycryptodomex = ">=3.17"
|
||||||
|
requests = ">=2.28.2"
|
||||||
|
urllib3 = ">=1.26.14"
|
||||||
|
websockets = ">=10.4"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
sphinx-copybutton = "==0.5.0"
|
sphinx-copybutton = "==0.5.0"
|
||||||
|
|||||||
2336
Pipfile.lock
generated
2336
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -28,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
|
|||||||
|
|
||||||
# you can pass a token as well to avoid always authenticating
|
# you can pass a token as well to avoid always authenticating
|
||||||
# and possibly getting captcha prompts
|
# and possibly getting captcha prompts
|
||||||
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token"
|
# you can fetch the token from the generated vk_config.v2.json file by searching for "access_token"
|
||||||
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||||
|
|
||||||
# save the JSON output into a file
|
# save the JSON output into a file
|
||||||
|
|||||||
@@ -2,11 +2,11 @@
|
|||||||
flake8
|
flake8
|
||||||
|
|
||||||
# Static type checking
|
# Static type checking
|
||||||
mypy==0.961
|
mypy>=0.961
|
||||||
|
|
||||||
# Automatic code formatting
|
# Automatic code formatting
|
||||||
black==22.3.0
|
black>=22.3.0
|
||||||
isort==5.10.1
|
isort>=5.10.1
|
||||||
|
|
||||||
# Running tests
|
# Running tests
|
||||||
pytest
|
pytest
|
||||||
@@ -24,19 +24,20 @@ wheel
|
|||||||
Sphinx>=4.3.0,<5.1.0
|
Sphinx>=4.3.0,<5.1.0
|
||||||
|
|
||||||
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
||||||
furo==2022.6.4.1
|
furo>=2022.6.4.1
|
||||||
|
|
||||||
# Lets Sphinx parse markdown files in addition to rst.
|
# Lets Sphinx parse markdown files in addition to rst.
|
||||||
myst-parser>=0.15.2,<0.19.0
|
myst-parser>=0.15.2,<0.19.0
|
||||||
|
|
||||||
# Adds a copy button to code examples in the docs.
|
# Adds a copy button to code examples in the docs.
|
||||||
sphinx-copybutton==0.5.0
|
sphinx-copybutton>=0.5.0
|
||||||
|
|
||||||
# Live rebuilding and reloading of docs for developing locally.
|
# Live rebuilding and reloading of docs for developing locally.
|
||||||
sphinx-autobuild==2021.3.14
|
sphinx-autobuild>=2021.3.14
|
||||||
|
|
||||||
# Automatically adds types to docs
|
# Automatically adds types to docs
|
||||||
sphinx-autodoc-typehints
|
sphinx-autodoc-typehints
|
||||||
|
|
||||||
# For parsing and comparing version numbers.
|
# For parsing and comparing version numbers.
|
||||||
packaging
|
packaging
|
||||||
|
python-dotenv>=0.21.1
|
||||||
@@ -5,14 +5,15 @@
|
|||||||
# pipenv lock --requirements
|
# pipenv lock --requirements
|
||||||
#
|
#
|
||||||
|
|
||||||
brotli==1.0.9; platform_python_implementation == 'CPython'
|
# -i https://pypi.org/simple
|
||||||
certifi==2022.12.7; python_version >= '3.6'
|
brotli>=1.0.9; platform_python_implementation == 'CPython'
|
||||||
charset-normalizer==3.0.1; python_version >= '3.6'
|
certifi>=2022.12.7; python_version >= '3.6'
|
||||||
idna==3.4; python_version >= '3.5'
|
charset-normalizer>=3.0.1; python_version >= '3.6'
|
||||||
mutagen==1.46.0; python_version >= '3.7'
|
idna>=3.4; python_version >= '3.5'
|
||||||
pycryptodomex==3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
mutagen>=1.46.0; python_version >= '3.7'
|
||||||
requests==2.28.2; python_version >= '3.7' and python_version < '4'
|
pycryptodomex>=3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
urllib3==1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
requests>=2.28.2; python_version >= '3.7' and python_version < '4'
|
||||||
vk-api==11.9.9
|
urllib3>=1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
websockets==10.4; python_version >= '3.7'
|
vk-api>=11.9.9
|
||||||
yt-dlp==2023.2.17
|
websockets>=10.4; python_version >= '3.7'
|
||||||
|
yt-dlp>=2023.2.17
|
||||||
@@ -14,15 +14,16 @@ def test_login_fail():
|
|||||||
VkScraper("invalid", "combination")
|
VkScraper("invalid", "combination")
|
||||||
|
|
||||||
|
|
||||||
def test_login_custom_file():
|
# disabled due to CI
|
||||||
session_filename = "test-session.json"
|
# def test_login_custom_file():
|
||||||
VkScraper(
|
# session_filename = "test-session.json"
|
||||||
os.environ["VK_USERNAME"],
|
# VkScraper(
|
||||||
os.environ["VK_PASSWORD"],
|
# os.environ["VK_USERNAME"],
|
||||||
session_file=session_filename,
|
# os.environ["VK_PASSWORD"],
|
||||||
)
|
# session_file=session_filename,
|
||||||
assert os.path.isfile(session_filename)
|
# )
|
||||||
os.unlink(session_filename)
|
# assert os.path.isfile(session_filename)
|
||||||
|
# os.unlink(session_filename)
|
||||||
|
|
||||||
|
|
||||||
def test_login_success():
|
def test_login_success():
|
||||||
@@ -138,7 +139,7 @@ def test_scrape_video_only():
|
|||||||
assert len(res) == 1
|
assert len(res) == 1
|
||||||
assert res[0]["id"] == "video38556806_456251917"
|
assert res[0]["id"] == "video38556806_456251917"
|
||||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
||||||
assert len(res[0]["payload"]) == 31
|
assert len(res[0]["payload"]) == 34
|
||||||
assert len(res[0]["attachments"].keys()) == 1
|
assert len(res[0]["attachments"].keys()) == 1
|
||||||
assert list(res[0]["attachments"].keys()) == ["video"]
|
assert list(res[0]["attachments"].keys()) == ["video"]
|
||||||
|
|
||||||
@@ -149,3 +150,21 @@ def test_scrape_video_only2():
|
|||||||
vks.download_media(res, tempdir)
|
vks.download_media(res, tempdir)
|
||||||
found_files = set(os.listdir(tempdir))
|
found_files = set(os.listdir(tempdir))
|
||||||
assert "video-17546758_456239898_0.mp4" in found_files
|
assert "video-17546758_456239898_0.mp4" in found_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_private_video():
|
||||||
|
"""
|
||||||
|
> Some videos are kept private and cannot be accessed without a passkey. In this case, send the ID in {owner_id}_{video_id}_{access_key}.
|
||||||
|
From https://dev.vk.com/ru/method/video.get
|
||||||
|
"""
|
||||||
|
res = vks.scrape("https://vk.com/wall-127774884_178565")
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||||
|
vks.download_media(res, tempdir)
|
||||||
|
expect_files = {
|
||||||
|
"wall-127774884_178565_0.mp4",
|
||||||
|
"wall-127774884_178565_1.mp4",
|
||||||
|
"wall-127774884_178565_2.mp4",
|
||||||
|
}
|
||||||
|
found_files = set(os.listdir(tempdir))
|
||||||
|
assert len(expect_files) == len(expect_files & found_files)
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ def get_argument_parser():
|
|||||||
action="store",
|
action="store",
|
||||||
dest="username",
|
dest="username",
|
||||||
required=True,
|
required=True,
|
||||||
help="username for a valid vk.com account",
|
help="username for a valid vk.com account (pass empty if using --token)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-p",
|
"-p",
|
||||||
@@ -27,7 +27,7 @@ def get_argument_parser():
|
|||||||
action="store",
|
action="store",
|
||||||
dest="password",
|
dest="password",
|
||||||
required=True,
|
required=True,
|
||||||
help="password for the valid vk.com account",
|
help="password for the valid vk.com account (pass empty if using --token)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-t",
|
"-t",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@@ -37,13 +37,13 @@ class VkScraper:
|
|||||||
|
|
||||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
username: str,
|
username: str,
|
||||||
password: str,
|
password: str,
|
||||||
token: str = None,
|
token: Optional[str] = None,
|
||||||
session_file="vk_config.v2.json",
|
session_file="vk_config.v2.json",
|
||||||
captcha_handler=captcha_handler,
|
captcha_handler=captcha_handler,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -144,10 +144,11 @@ class VkScraper:
|
|||||||
first_type = a["type"]
|
first_type = a["type"]
|
||||||
attachment = a[first_type]
|
attachment = a[first_type]
|
||||||
if first_type == "video":
|
if first_type == "video":
|
||||||
|
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
|
||||||
|
if "access_key" in attachment:
|
||||||
|
video_path += f"_{attachment['access_key']}"
|
||||||
attachments["video"].extend(
|
attachments["video"].extend(
|
||||||
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[
|
self.scrape_videos(video_path)[0]
|
||||||
0
|
|
||||||
]
|
|
||||||
.get("attachments", {})
|
.get("attachments", {})
|
||||||
.get("video", [""])
|
.get("video", [""])
|
||||||
)
|
)
|
||||||
@@ -352,9 +353,10 @@ class VkScraper:
|
|||||||
info = ydl.extract_info(url, download=True)
|
info = ydl.extract_info(url, download=True)
|
||||||
filename = ydl.prepare_filename(info)
|
filename = ydl.prepare_filename(info)
|
||||||
if "unknown_video" in filename:
|
if "unknown_video" in filename:
|
||||||
|
old_filename = filename
|
||||||
filename = shutil.copy(
|
filename = shutil.copy(
|
||||||
filename, filename.replace("unknown_video", "mkv")
|
filename, filename.replace("unknown_video", "mp4")
|
||||||
)
|
)
|
||||||
os.remove(filename)
|
os.remove(old_filename)
|
||||||
downloaded.append(filename)
|
downloaded.append(filename)
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ class DateTimeEncoder(json.JSONEncoder):
|
|||||||
|
|
||||||
def captcha_handler(captcha):
|
def captcha_handler(captcha):
|
||||||
key = input(
|
key = input(
|
||||||
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:"
|
f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
|
||||||
).strip()
|
).strip()
|
||||||
return captcha.try_again(key)
|
return captcha.try_again(key)
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "3"
|
_MINOR = "3"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "12"
|
_PATCH = "26"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user