Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
135886a49c Bump yt-dlp from 2022.7.18 to 2023.2.17
Bumps [yt-dlp](https://github.com/yt-dlp/yt-dlp) from 2022.7.18 to 2023.2.17.
- [Release notes](https://github.com/yt-dlp/yt-dlp/releases)
- [Changelog](https://github.com/yt-dlp/yt-dlp/blob/master/Changelog.md)
- [Commits](https://github.com/yt-dlp/yt-dlp/compare/2022.07.18...2023.02.17)

---
updated-dependencies:
- dependency-name: yt-dlp
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-02-17 18:18:04 +00:00
17 changed files with 795 additions and 2199 deletions

View File

@@ -31,7 +31,7 @@ runs:
# Get the exact Python version to use in the cache key.
echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
- uses: actions/cache@v4
- uses: actions/cache@v2
id: virtualenv-cache
with:
path: .venv

11
.github/dependabot.yml vendored Normal file
View File

@@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
open-pull-requests-limit: 10
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"

View File

@@ -30,11 +30,11 @@ jobs:
strategy:
fail-fast: false
matrix:
python: ['3.10']
task: # --show-capture=no on purpose, -s for captchas
python: ['3.7', '3.10']
task: # --show-capture=no on purpose
- name: Test
run: |
pytest -s --show-capture=no --color=yes tests/
pytest --show-capture=no --color=yes tests/
include:
- python: '3.10'
@@ -79,11 +79,10 @@ jobs:
run: |
. .venv/bin/activate
${{ matrix.task.run }}
continue-on-error: ${{ matrix.task.name != 'Build' }}
- name: Upload package distribution files
if: matrix.task.name == 'Build'
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: package
path: dist
@@ -118,7 +117,7 @@ jobs:
echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
- name: Download package distribution files
uses: actions/download-artifact@v4
uses: actions/download-artifact@v3
with:
name: package
path: dist

View File

@@ -4,12 +4,8 @@ sphinx:
configuration: docs/source/conf.py
fail_on_warning: false
build:
os: "ubuntu-22.04"
tools:
python: "3.10"
python:
version: "3.8"
install:
- requirements: requirements.txt
- requirements: dev-requirements.txt

View File

@@ -13,4 +13,4 @@ run-checks :
black .
flake8 .
mypy .
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes .
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/

30
Pipfile
View File

@@ -4,32 +4,8 @@ verify_ssl = true
name = "pypi"
[packages]
yt-dlp = ">=2023.2.17"
certifi = ">=2023.7.22"
charset-normalizer = ">=3.0.1"
idna = ">=3.4"
mutagen = ">=1.46.0"
pycryptodomex = ">=3.17"
requests = ">=2.28.2"
urllib3 = ">=1.26.14"
websockets = ">=10.4"
vk-api = {ref = "b99dac0ec2f832a6c4b20bde49869e7229ce4742", git = "git+https://github.com/python273/vk_api.git"}
flake8 = "*"
mypy = ">=0.961"
black = ">=22.3.0"
isort = ">=5.10.1"
pytest = "*"
pytest-sphinx = "*"
pytest-cov = "*"
twine = ">=1.11.0"
sphinx = "<5.1.0,>=4.3.0"
furo = ">=2022.6.4.1"
myst-parser = "<0.19.0,>=0.15.2"
sphinx-copybutton = ">=0.5.0"
sphinx-autobuild = ">=2021.3.14"
sphinx-autodoc-typehints = "*"
packaging = "*"
python-dotenv = ">=0.21.1"
vk-api = "*"
yt-dlp = "*"
[dev-packages]
sphinx-copybutton = "==0.5.0"
@@ -49,7 +25,7 @@ sphinx-autodoc-typehints = "*"
python-dotenv = "*"
[requires]
python_version = "3.11"
python_version = "3.9"
[pipenv]
allow_prereleases = true

2711
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,6 @@
# vk-url-scraper
Python library to scrape data, and especially media links like videos and photos, from vk.com URLs.
> This repo has been archived because it relies on a fixed git commit of the vk_api library, which we can no longer publish to pypi; see the [issue](https://github.com/bellingcat/vk-url-scraper/issues/66). You can still install the latest release. This archived state may change if a solution is found to publish the library to pypi again.
[![PyPI version](https://badge.fury.io/py/vk-url-scraper.svg)](https://badge.fury.io/py/vk-url-scraper)
[![PyPI download month](https://img.shields.io/pypi/dm/vk-url-scraper.svg)](https://pypi.python.org/pypi/vk-url-scraper/)
@@ -13,12 +12,6 @@ You can use it via the [command line](#command-line-usage) or as a [python libra
## Installation
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
Currently you need to manually uninstall and re-install one dependency (as it is installed from github and not pypi):
```bash
pip uninstall vk-api
pip install git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
```
To use the library you will need a valid username/password combination for vk.com.
## Command line usage
@@ -27,7 +20,7 @@ To use the library you will need a valid username/password combination for vk.co
vk_url_scraper --help
# scrape a URL and get the JSON result in the console
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
vk_url_scraper -username "username here" --password "password here" --urls https://vk.com/wall12345_6789
# OR
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
# you can also have multiple urls
@@ -35,7 +28,7 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts
# you can fetch the token from the vk_config.v2.json file generated under by searching for "access_token"
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token"
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file
@@ -91,13 +84,12 @@ see [docs] for all available functions.
## Development
(more info in [CONTRIBUTING.md](CONTRIBUTING.md)).
1. setup dev environment with `pipenv install --dev`
1. setup environment with `pipenv install -r requirements.txt`
1. Activate the environment with `pipenv shell` (or prepend `pipenv run` to all commands)
1. setup dev environment with `pip install -r dev-requirements.txt` or `pipenv install -r dev-requirements.txt`
1. setup environment with `pip install -r requirements.txt` or `pipenv install -r requirements.txt`
2. To run all checks use `make run-checks` (fixes style), or run them individually
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
2. To do type checking: `mypy .`
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
3. `make docs` to generate Sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
@@ -105,12 +97,10 @@ To test the command line interface available in [__main__.py](__vk_url_scraper/_
## Releasing new version
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
2. make sure to run `pipenv run pip freeze > requirements.txt` if you manage libs with pipenv
1. if the hardcoded version of [vk_api](https://github.com/python273/vk_api) is still being used, then you must comment/remove that line from the generated requirements file and instruct users to manually install the version from the source as pypi does not allow repo/commit tags. Additionally, add the latest released version, currently `vk-api==11.9.9`.
3. run `./scripts/release.sh` to create a tag and push, alternatively
2. run `./scripts/release.sh` to create a tag and push, alternatively
1. `git tag vx.y.z` to tag version
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
4. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
3. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
### Fixing a failed release

View File

@@ -2,11 +2,11 @@
flake8
# Static type checking
mypy>=0.961
mypy==0.961
# Automatic code formatting
black>=22.3.0
isort>=5.10.1
black==22.3.0
isort==5.10.1
# Running tests
pytest
@@ -24,20 +24,19 @@ wheel
Sphinx>=4.3.0,<5.1.0
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
furo>=2022.6.4.1
furo==2022.6.4.1
# Lets Sphinx parse markdown files in addition to rst.
myst-parser>=0.15.2,<0.19.0
# Adds a copy button to code examples in the docs.
sphinx-copybutton>=0.5.0
sphinx-copybutton==0.5.0
# Live rebuilding and reloading of docs for developing locally.
sphinx-autobuild>=2021.3.14
sphinx-autobuild==2021.3.14
# Automatically adds types to docs
sphinx-autodoc-typehints
# For parsing and comparing version numbers.
packaging
python-dotenv>=0.21.1

View File

@@ -1,7 +1,7 @@
Installation
============
**vk-url-scraper** supports Python >= 3.10.
**vk-url-scraper** supports Python >= 3.7.
## Installing with `pip`

View File

@@ -1,87 +1,15 @@
alabaster==0.7.16
anyio==4.9.0
babel==2.17.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
black==25.1.0
certifi==2025.4.26
cffi==1.17.1
charset-normalizer==3.4.2
click==8.1.8
colorama==0.4.6
coverage==7.8.0
cryptography==44.0.3
docutils==0.18.1
flake8==7.2.0
furo==2023.3.27
h11==0.16.0
id==1.5.0
idna==3.10
imagesize==1.4.1
importlib_metadata==8.7.0
iniconfig==2.1.0
isort==6.0.1
jaraco.classes==3.4.0
jaraco.context==6.0.1
jaraco.functools==4.1.0
jeepney==0.9.0
Jinja2==3.1.6
keyring==25.6.0
livereload==2.7.1
markdown-it-py==2.2.0
MarkupSafe==3.0.2
mccabe==0.7.0
mdit-py-plugins==0.3.5
mdurl==0.1.2
more-itertools==10.7.0
mutagen==1.47.0
mypy==1.15.0
mypy_extensions==1.1.0
myst-parser==0.18.1
nh3==0.2.21
packaging==25.0
pathspec==0.12.1
pkginfo==1.10.0
platformdirs==4.3.7
pluggy==1.5.0
pycodestyle==2.13.0
pycparser==2.22
pycryptodomex==3.22.0
pyflakes==3.3.2
Pygments==2.19.1
pytest==8.3.5
pytest-cov==6.1.1
pytest-sphinx==0.6.3
python-dotenv==1.1.0
PyYAML==6.0.2
readme_renderer==43.0
requests==2.32.3
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==14.0.0
SecretStorage==3.3.3
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.7
Sphinx==5.0.2
sphinx-autobuild==2024.10.3
sphinx-autodoc-typehints==1.19.1
sphinx-basic-ng==1.0.0b2
sphinx-copybutton==0.5.2
sphinxcontrib-applehelp==2.0.0
sphinxcontrib-devhelp==2.0.0
sphinxcontrib-htmlhelp==2.1.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==2.0.0
sphinxcontrib-serializinghtml==2.0.0
starlette==0.46.2
tornado==6.5b1
twine==6.1.0
typing_extensions==4.13.2
urllib3==2.4.0
uvicorn==0.34.2
vk_api @ git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
watchfiles==1.0.5
websockets==15.0.1
yt-dlp==2025.5.3.232917.dev0
zipp==3.21.0
#
# These requirements were autogenerated by pipenv
# To regenerate from the project's Pipfile, run:
#
# pipenv lock --requirements
#
certifi==2022.6.15
charset-normalizer==2.0.12
idna==3.3
requests==2.28.0
urllib3==1.26.9
vk-api==11.9.8
python-dotenv==0.20.0
yt-dlp==2023.2.17

View File

@@ -57,7 +57,7 @@ setup(
package_data={"vk_url_scraper": ["py.typed"]},
install_requires=read_requirements("requirements.txt"),
extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.10",
python_requires=">=3.7",
entry_points={
"console_scripts": [
"vk_url_scraper=vk_url_scraper.__main__:main",

View File

@@ -14,16 +14,15 @@ def test_login_fail():
VkScraper("invalid", "combination")
# disabled due to CI
# def test_login_custom_file():
# session_filename = "test-session.json"
# VkScraper(
# os.environ["VK_USERNAME"],
# os.environ["VK_PASSWORD"],
# session_file=session_filename,
# )
# assert os.path.isfile(session_filename)
# os.unlink(session_filename)
def test_login_custom_file():
session_filename = "test-session.json"
VkScraper(
os.environ["VK_USERNAME"],
os.environ["VK_PASSWORD"],
session_file=session_filename,
)
assert os.path.isfile(session_filename)
os.unlink(session_filename)
def test_login_success():
@@ -81,7 +80,7 @@ def test_scrape_wall_url_with_photos():
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
)
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
assert len(res[0]["payload"]) == 19
assert len(res[0]["payload"]) == 16
assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 9
@@ -93,7 +92,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
assert res[0]["id"] == "wall-17315087_74182"
assert res[0]["text"] == ""
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
assert len(res[0]["payload"]) == 18
assert len(res[0]["payload"]) == 15
assert len(res[0]["attachments"].keys()) == 3
for k in ["photo", "link", "video"]:
assert k in list(res[0]["attachments"].keys())
@@ -128,7 +127,7 @@ def test_scrape_photo_only():
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
)
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
assert len(res[0]["payload"]) == 16
assert len(res[0]["payload"]) == 15
assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["photo"]
assert len(res[0]["attachments"]["photo"]) == 1
@@ -139,6 +138,7 @@ def test_scrape_video_only():
assert len(res) == 1
assert res[0]["id"] == "video38556806_456251917"
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
assert len(res[0]["payload"]) == 31
assert len(res[0]["attachments"].keys()) == 1
assert list(res[0]["attachments"].keys()) == ["video"]
@@ -149,21 +149,3 @@ def test_scrape_video_only2():
vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mp4" in found_files
def test_scrape_private_video():
"""
> Some videos are kept private and cannot be accessed without a passkey . In this case, send the ID in {owner_id}_{video_id}_{access_key}.
From https://dev.vk.com/ru/method/video.get
"""
res = vks.scrape("https://vk.com/wall-127774884_178565")
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
expect_files = {
"wall-127774884_178565_0.mp4",
"wall-127774884_178565_1.mp4",
"wall-127774884_178565_2.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)

View File

@@ -19,7 +19,7 @@ def get_argument_parser():
action="store",
dest="username",
required=True,
help="username for a valid vk.com account (pass empty if using --token)",
help="username for a valid vk.com account",
)
parser.add_argument(
"-p",
@@ -27,7 +27,7 @@ def get_argument_parser():
action="store",
dest="password",
required=True,
help="password for the valid vk.com account (pass empty if using --token)",
help="password for the valid vk.com account",
)
parser.add_argument(
"-t",

View File

@@ -3,7 +3,7 @@ import re
import shutil
from collections import defaultdict
from datetime import datetime
from typing import List, Optional
from typing import List
from urllib.parse import urlparse
import requests
@@ -37,13 +37,13 @@ class VkScraper:
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(
self,
username: str,
password: str,
token: Optional[str] = None,
token: str = None,
session_file="vk_config.v2.json",
captcha_handler=captcha_handler,
) -> None:
@@ -59,7 +59,7 @@ class VkScraper:
password : str
Matching password on vk.com
token : str
Access token received after authenticating, can be found in the vk_config.v2.json file
Access token received after authenticating, can be found in the vl_config.v2.json file
session_file : str
File name where the VK session is saved so future logins are easier, this will not be created if token is passed
captcha_handler : func
@@ -144,11 +144,10 @@ class VkScraper:
first_type = a["type"]
attachment = a[first_type]
if first_type == "video":
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
if "access_key" in attachment:
video_path += f"_{attachment['access_key']}"
attachments["video"].extend(
self.scrape_videos(video_path)[0]
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[
0
]
.get("attachments", {})
.get("video", [""])
)
@@ -339,9 +338,7 @@ class VkScraper:
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL(
{
"format": (
"bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
),
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"retries": 5,
"noplaylist": True,
@@ -355,10 +352,9 @@ class VkScraper:
info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info)
if "unknown_video" in filename:
old_filename = filename
filename = shutil.copy(
filename, filename.replace("unknown_video", "mp4")
filename, filename.replace("unknown_video", "mkv")
)
os.remove(old_filename)
os.remove(filename)
downloaded.append(filename)
return downloaded

View File

@@ -15,9 +15,9 @@ class DateTimeEncoder(json.JSONEncoder):
def captcha_handler(captcha):
key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:"
).strip()
return captcha.try_again(key.strip())
return captcha.try_again(key)
@contextmanager

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "34"
_PATCH = "10"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""