Compare commits

...

24 Commits

Author SHA1 Message Date
msramalho
5b0f034c12 Bump version to v0.3.26 for release 2023-08-18 21:15:54 +01:00
msramalho
a1c098335c fix: private videos 2023-08-18 21:15:34 +01:00
msramalho
12a5d22f64 fix: certifi 2023-08-18 21:12:44 +01:00
Miguel Sozinho Ramalho
ab602e5d31 Update .readthedocs.yaml
https://blog.readthedocs.com/use-build-os-config/

https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
2023-08-16 18:34:36 +01:00
msramalho
67bc8b5569 Bump version to v0.3.24 for release 2023-05-10 17:09:22 +01:00
msramalho
021e7c2304 disables test due to CI 2023-05-10 17:08:39 +01:00
msramalho
91b6dcf291 Bump version to v0.3.23 for release 2023-05-10 16:47:53 +01:00
msramalho
2a1a4e2cae minor CI update 2023-05-10 16:47:39 +01:00
msramalho
fc6b914e2d Bump version to v0.3.22 for release 2023-05-10 16:28:30 +01:00
Logan Williams
d155c1364a Bump version number 2023-05-10 14:56:39 +02:00
Logan Williams
8882a87048 Fix import order 2023-05-10 14:33:47 +02:00
Logan Williams
a95c675e9c No implicit optional 2023-05-10 14:28:59 +02:00
Logan Williams
8864e7c87d Fix failing test 2023-05-10 14:25:50 +02:00
Logan Williams
db9b613ae4 Loosen dependency version requirements 2023-05-10 14:15:56 +02:00
Miguel Sozinho Ramalho
37828b4be4 Delete dependabot.yml 2023-02-27 10:21:19 +01:00
msramalho
1a3a7dc0f3 Bump version to v0.3.15 for release 2023-02-23 17:07:13 +01:00
msramalho
f67707a740 Bump version to v0.3.14 for release 2023-02-23 17:05:43 +01:00
msramalho
798684a334 Bump version to v0.3.13 for release 2023-02-23 17:02:14 +01:00
msramalho
a556b237e9 Bump version to v0.3.12 for release 2023-02-23 16:58:11 +01:00
msramalho
283bc35658 Bump version to v0.3.11 for release 2023-02-23 16:52:59 +01:00
msramalho
cef70fb80d update yt-dlp 2023-02-23 16:52:52 +01:00
msramalho
e66ef4f477 fix tests 2023-02-23 16:52:45 +01:00
msramalho
1f6a8368fd updates 2022-11-03 17:07:34 +00:00
msramalho
9a046fd1cb Bump version to v0.3.9 for release 2022-11-03 16:35:59 +00:00
14 changed files with 1756 additions and 656 deletions

View File

@@ -1,11 +0,0 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
open-pull-requests-limit: 10
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"

View File

@@ -31,10 +31,10 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
python: ['3.7', '3.10'] python: ['3.7', '3.10']
task: # --show-capture=no on purpose task: # --show-capture=no on purpose, -s for captchas
- name: Test - name: Test
run: | run: |
pytest --show-capture=no --color=yes tests/ pytest -s --show-capture=no --color=yes tests/
include: include:
- python: '3.10' - python: '3.10'

View File

@@ -4,8 +4,12 @@ sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: false fail_on_warning: false
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
python: python:
version: "3.8"
install: install:
- requirements: requirements.txt - requirements: requirements.txt
- requirements: dev-requirements.txt - requirements: dev-requirements.txt

28
Pipfile
View File

@@ -4,8 +4,32 @@ verify_ssl = true
name = "pypi" name = "pypi"
[packages] [packages]
vk-api = "*" vk-api = ">=11.9.9"
yt-dlp = "*" yt-dlp = ">=2023.2.17"
flake8 = "*"
mypy = ">=0.961"
black = ">=22.3.0"
isort = ">=5.10.1"
pytest = "*"
pytest-sphinx = "*"
pytest-cov = "*"
twine = ">=1.11.0"
sphinx = "<5.1.0,>=4.3.0"
furo = ">=2022.6.4.1"
myst-parser = "<0.19.0,>=0.15.2"
sphinx-copybutton = ">=0.5.0"
sphinx-autobuild = ">=2021.3.14"
sphinx-autodoc-typehints = "*"
python-dotenv = ">=0.21.1"
brotli = ">=1.0.9"
certifi = ">=2023.7.22"
charset-normalizer = ">=3.0.1"
idna = ">=3.4"
mutagen = ">=1.46.0"
pycryptodomex = ">=3.17"
requests = ">=2.28.2"
urllib3 = ">=1.26.14"
websockets = ">=10.4"
[dev-packages] [dev-packages]
sphinx-copybutton = "==0.5.0" sphinx-copybutton = "==0.5.0"

2254
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -28,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can pass a token as well to avoid always authenticating # you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts # and possibly getting captcha prompts
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token" # you can fetch the token from the vk_config.v2.json file generated under by searching for "access_token"
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789 vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file # save the JSON output into a file

View File

@@ -2,11 +2,11 @@
flake8 flake8
# Static type checking # Static type checking
mypy==0.961 mypy>=0.961
# Automatic code formatting # Automatic code formatting
black==22.3.0 black>=22.3.0
isort==5.10.1 isort>=5.10.1
# Running tests # Running tests
pytest pytest
@@ -24,19 +24,20 @@ wheel
Sphinx>=4.3.0,<5.1.0 Sphinx>=4.3.0,<5.1.0
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/ # Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
furo==2022.6.4.1 furo>=2022.6.4.1
# Lets Sphinx parse markdown files in addition to rst. # Lets Sphinx parse markdown files in addition to rst.
myst-parser>=0.15.2,<0.19.0 myst-parser>=0.15.2,<0.19.0
# Adds a copy button to code examples in the docs. # Adds a copy button to code examples in the docs.
sphinx-copybutton==0.5.0 sphinx-copybutton>=0.5.0
# Live rebuilding and reloading of docs for developing locally. # Live rebuilding and reloading of docs for developing locally.
sphinx-autobuild==2021.3.14 sphinx-autobuild>=2021.3.14
# Automatically adds types to docs # Automatically adds types to docs
sphinx-autodoc-typehints sphinx-autodoc-typehints
# For parsing and comparing version numbers. # For parsing and comparing version numbers.
packaging packaging
python-dotenv>=0.21.1

View File

@@ -5,11 +5,15 @@
# pipenv lock --requirements # pipenv lock --requirements
# #
certifi==2022.6.15 # -i https://pypi.org/simple
charset-normalizer==2.0.12 brotli>=1.0.9; platform_python_implementation >= 'CPython'
idna==3.3 certifi>=2022.12.7; python_version >= '3.6'
requests==2.28.0 charset-normalizer>=3.0.1; python_version >= '3.6'
urllib3==1.26.9 idna>=3.4; python_version >= '3.5'
vk-api==11.9.8 mutagen>=1.46.0; python_version >= '3.7'
python-dotenv==0.20.0 pycryptodomex>=3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
yt-dlp==2022.7.18 requests>=2.28.2; python_version >= '3.7' and python_version < '4'
urllib3>=1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
vk-api>=11.9.9
websockets>=10.4; python_version >= '3.7'
yt-dlp>=2023.2.17

View File

@@ -44,7 +44,10 @@ setup(
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
], ],
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"], keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
url="https://github.com/bellingcat/vk-url-scraper", project_urls={
"Code": "https://github.com/bellingcat/vk-url-scraper",
"Documentation": "https://vk-url-scraper.readthedocs.io/en/latest/",
},
author="Bellingcat", author="Bellingcat",
author_email="tech@bellingcat.com", author_email="tech@bellingcat.com",
license="MIT", license="MIT",

View File

@@ -14,15 +14,16 @@ def test_login_fail():
VkScraper("invalid", "combination") VkScraper("invalid", "combination")
def test_login_custom_file(): # disabled due to CI
session_filename = "test-session.json" # def test_login_custom_file():
VkScraper( # session_filename = "test-session.json"
os.environ["VK_USERNAME"], # VkScraper(
os.environ["VK_PASSWORD"], # os.environ["VK_USERNAME"],
session_file=session_filename, # os.environ["VK_PASSWORD"],
) # session_file=session_filename,
assert os.path.isfile(session_filename) # )
os.unlink(session_filename) # assert os.path.isfile(session_filename)
# os.unlink(session_filename)
def test_login_success(): def test_login_success():
@@ -80,7 +81,7 @@ def test_scrape_wall_url_with_photos():
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея." == "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
) )
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
assert len(res[0]["payload"]) == 16 assert len(res[0]["payload"]) == 17
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"] assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 9 assert len(res[0]["attachments"]["photo"]) == 9
@@ -92,7 +93,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert res[0]["id"] == "wall-17315087_74182" assert res[0]["id"] == "wall-17315087_74182"
assert res[0]["text"] == "" assert res[0]["text"] == ""
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 15 assert len(res[0]["payload"]) == 17
assert len(res[0]["attachments"].keys()) == 3 assert len(res[0]["attachments"].keys()) == 3
for k in ["photo", "link", "video"]: for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys()) assert k in list(res[0]["attachments"].keys())
@@ -138,7 +139,7 @@ def test_scrape_video_only():
assert len(res) == 1 assert len(res) == 1
assert res[0]["id"] == "video38556806_456251917" assert res[0]["id"] == "video38556806_456251917"
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38)) assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
assert len(res[0]["payload"]) == 31 assert len(res[0]["payload"]) == 34
assert len(res[0]["attachments"].keys()) == 1 assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["video"] assert list(res[0]["attachments"].keys()) == ["video"]
@@ -149,3 +150,21 @@ def test_scrape_video_only2():
vks.download_media(res, tempdir) vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mp4" in found_files assert "video-17546758_456239898_0.mp4" in found_files
def test_scrape_private_video():
"""
> Some videos are kept private and cannot be accessed without a passkey. In this case, send the ID in the form {owner_id}_{video_id}_{access_key}.
From https://dev.vk.com/ru/method/video.get
"""
res = vks.scrape("https://vk.com/wall-127774884_178565")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
expect_files = {
"wall-127774884_178565_0.mp4",
"wall-127774884_178565_1.mp4",
"wall-127774884_178565_2.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)

View File

@@ -19,7 +19,7 @@ def get_argument_parser():
action="store", action="store",
dest="username", dest="username",
required=True, required=True,
help="username for a valid vk.com account", help="username for a valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-p", "-p",
@@ -27,7 +27,7 @@ def get_argument_parser():
action="store", action="store",
dest="password", dest="password",
required=True, required=True,
help="password for the valid vk.com account", help="password for the valid vk.com account (pass empty if using --token)",
) )
parser.add_argument( parser.add_argument(
"-t", "-t",

View File

@@ -3,7 +3,7 @@ import re
import shutil import shutil
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from typing import List from typing import List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@@ -37,13 +37,13 @@ class VkScraper:
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)") WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
def __init__( def __init__(
self, self,
username: str, username: str,
password: str, password: str,
token: str = None, token: Optional[str] = None,
session_file="vk_config.v2.json", session_file="vk_config.v2.json",
captcha_handler=captcha_handler, captcha_handler=captcha_handler,
) -> None: ) -> None:
@@ -144,10 +144,11 @@ class VkScraper:
first_type = a["type"] first_type = a["type"]
attachment = a[first_type] attachment = a[first_type]
if first_type == "video": if first_type == "video":
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
if "access_key" in attachment:
video_path += f"_{attachment['access_key']}"
attachments["video"].extend( attachments["video"].extend(
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[ self.scrape_videos(video_path)[0]
0
]
.get("attachments", {}) .get("attachments", {})
.get("video", [""]) .get("video", [""])
) )
@@ -352,9 +353,10 @@ class VkScraper:
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info) filename = ydl.prepare_filename(info)
if "unknown_video" in filename: if "unknown_video" in filename:
old_filename = filename
filename = shutil.copy( filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv") filename, filename.replace("unknown_video", "mp4")
) )
os.remove(filename) os.remove(old_filename)
downloaded.append(filename) downloaded.append(filename)
return downloaded return downloaded

View File

@@ -15,7 +15,7 @@ class DateTimeEncoder(json.JSONEncoder):
def captcha_handler(captcha): def captcha_handler(captcha):
key = input( key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:" f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
).strip() ).strip()
return captcha.try_again(key) return captcha.try_again(key)

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3" _MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "8" _PATCH = "26"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""