mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-08 11:28:38 +03:00
Compare commits
99 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e5fba712c | ||
|
|
f522f891cb | ||
|
|
743ca9c165 | ||
|
|
2130a33829 | ||
|
|
3d5b6de557 | ||
|
|
b9a6b2b747 | ||
|
|
d948044ae9 | ||
|
|
49c254a413 | ||
|
|
d840b280d7 | ||
|
|
e6c98c73ea | ||
|
|
e6fdd54518 | ||
|
|
f61204c4b1 | ||
|
|
ea834c37e2 | ||
|
|
3e22709430 | ||
|
|
9c7eadc716 | ||
|
|
5d30d18b7b | ||
|
|
b2d462441e | ||
|
|
73f17407c0 | ||
|
|
95d249f5d0 | ||
|
|
ccb8c1f5c7 | ||
|
|
e525ff24b1 | ||
|
|
699b4ebdd8 | ||
|
|
8d1a86a7fa | ||
|
|
b01dbe6299 | ||
|
|
5b0f034c12 | ||
|
|
a1c098335c | ||
|
|
12a5d22f64 | ||
|
|
ab602e5d31 | ||
|
|
67bc8b5569 | ||
|
|
021e7c2304 | ||
|
|
91b6dcf291 | ||
|
|
2a1a4e2cae | ||
|
|
fc6b914e2d | ||
|
|
d155c1364a | ||
|
|
8882a87048 | ||
|
|
a95c675e9c | ||
|
|
8864e7c87d | ||
|
|
db9b613ae4 | ||
|
|
37828b4be4 | ||
|
|
1a3a7dc0f3 | ||
|
|
f67707a740 | ||
|
|
798684a334 | ||
|
|
a556b237e9 | ||
|
|
283bc35658 | ||
|
|
cef70fb80d | ||
|
|
e66ef4f477 | ||
|
|
1f6a8368fd | ||
|
|
9a046fd1cb | ||
|
|
aae2bb5999 | ||
|
|
9e30b81d16 | ||
|
|
72bc355606 | ||
|
|
7f59eefb73 | ||
|
|
30003c524e | ||
|
|
d1b27bef1d | ||
|
|
e5e9e08ee6 | ||
|
|
3a8a3f54c0 | ||
|
|
4d73864dbb | ||
|
|
ceaa8e45f3 | ||
|
|
007c8e07a8 | ||
|
|
a515b2c3de | ||
|
|
54540cd132 | ||
|
|
cfb13e5d82 | ||
|
|
926c3cb8a4 | ||
|
|
15ebe2e66c | ||
|
|
eaff88b2d9 | ||
|
|
a6d066a192 | ||
|
|
9078a17400 | ||
|
|
17b516bd7f | ||
|
|
8bd182b041 | ||
|
|
0b8abfb5cb | ||
|
|
cf5fb91c84 | ||
|
|
5c965102a4 | ||
|
|
df10e6f55f | ||
|
|
863dd44463 | ||
|
|
578ec81443 | ||
|
|
c32caec442 | ||
|
|
80b43f7c95 | ||
|
|
90b72b6d22 | ||
|
|
d96e0c0a3a | ||
|
|
db03a4c0f6 | ||
|
|
cf100ee69e | ||
|
|
a09cf32b3e | ||
|
|
e1eb3ed620 | ||
|
|
72bd951d9c | ||
|
|
59d53be68b | ||
|
|
24a1313a65 | ||
|
|
64df4eec28 | ||
|
|
42bdc1441c | ||
|
|
c25880ee6d | ||
|
|
e1e3648852 | ||
|
|
c74dc280d8 | ||
|
|
ab15b35008 | ||
|
|
62c4536d0b | ||
|
|
eac0fc4904 | ||
|
|
1341cd866f | ||
|
|
7824c2922d | ||
|
|
c9a3ece9af | ||
|
|
50b78d618a | ||
|
|
c4a1333428 |
3
.env.example
Normal file
3
.env.example
Normal file
@@ -0,0 +1,3 @@
|
||||
VK_USERNAME="your username"
|
||||
VK_PASSWORD="your password"
|
||||
VK_TOKEN="optional token"
|
||||
7
.github/actions/setup-venv/action.yml
vendored
7
.github/actions/setup-venv/action.yml
vendored
@@ -16,6 +16,11 @@ runs:
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# install ffmpeg
|
||||
sudo apt install ffmpeg
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# Install prerequisites.
|
||||
@@ -26,7 +31,7 @@ runs:
|
||||
# Get the exact Python version to use in the cache key.
|
||||
echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
|
||||
|
||||
- uses: actions/cache@v2
|
||||
- uses: actions/cache@v4
|
||||
id: virtualenv-cache
|
||||
with:
|
||||
path: .venv
|
||||
|
||||
11
.github/dependabot.yml
vendored
11
.github/dependabot.yml
vendored
@@ -1,11 +0,0 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
open-pull-requests-limit: 10
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
36
.github/pull_request_template.md
vendored
36
.github/pull_request_template.md
vendored
@@ -1,18 +1,18 @@
|
||||
<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
|
||||
|
||||
<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
|
||||
Fixes #
|
||||
|
||||
Changes proposed in this pull request:
|
||||
<!-- Please list all changes/additions here. -->
|
||||
-
|
||||
|
||||
## Before submitting
|
||||
|
||||
<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
|
||||
- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#making-a-pull-request)
|
||||
section of the `CONTRIBUTING` docs.
|
||||
- [ ] I've updated or added any relevant docstrings following the syntax described in the
|
||||
[Writing docstrings](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
|
||||
- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
|
||||
- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.
|
||||
<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
|
||||
|
||||
<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
|
||||
Fixes #
|
||||
|
||||
Changes proposed in this pull request:
|
||||
<!-- Please list all changes/additions here. -->
|
||||
-
|
||||
|
||||
## Before submitting
|
||||
|
||||
<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
|
||||
- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#making-a-pull-request)
|
||||
section of the `CONTRIBUTING` docs.
|
||||
- [ ] I've updated or added any relevant docstrings following the syntax described in the
|
||||
[Writing docstrings](https://github.com/bellingcat/vk-url-scraper/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
|
||||
- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
|
||||
- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.
|
||||
|
||||
16
.github/workflows/main.yml
vendored
16
.github/workflows/main.yml
vendored
@@ -20,6 +20,7 @@ env:
|
||||
PYTHONPATH: ./
|
||||
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
||||
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
||||
VK_TOKEN: ${{ secrets.VK_TOKEN }}
|
||||
|
||||
jobs:
|
||||
checks:
|
||||
@@ -29,11 +30,11 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python: ['3.7', '3.10']
|
||||
task: # --show-capture=no on purpose
|
||||
python: ['3.10']
|
||||
task: # --show-capture=no on purpose, -s for captchas
|
||||
- name: Test
|
||||
run: |
|
||||
pytest --show-capture=no --color=yes tests/
|
||||
pytest -s --show-capture=no --color=yes tests/
|
||||
|
||||
include:
|
||||
- python: '3.10'
|
||||
@@ -78,10 +79,11 @@ jobs:
|
||||
run: |
|
||||
. .venv/bin/activate
|
||||
${{ matrix.task.run }}
|
||||
continue-on-error: ${{ matrix.task.name != 'Build' }}
|
||||
|
||||
- name: Upload package distribution files
|
||||
if: matrix.task.name == 'Build'
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: package
|
||||
path: dist
|
||||
@@ -116,15 +118,11 @@ jobs:
|
||||
echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||
|
||||
- name: Download package distribution files
|
||||
uses: actions/download-artifact@v3
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: package
|
||||
path: dist
|
||||
|
||||
# - name: Generate release notes
|
||||
# run: |
|
||||
# python scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md
|
||||
|
||||
- name: Publish package to PyPI
|
||||
run: |
|
||||
twine upload -u '${{ secrets.PYPI_USERNAME }}' -p '${{ secrets.PYPI_PASSWORD }}' dist/*
|
||||
|
||||
27
.github/workflows/pr_checks.yml
vendored
27
.github/workflows/pr_checks.yml
vendored
@@ -1,27 +0,0 @@
|
||||
name: PR Checks
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'vk_url_scraper/**'
|
||||
|
||||
jobs:
|
||||
changelog:
|
||||
name: CHANGELOG
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'pull_request'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
|
||||
- name: Check that CHANGELOG has been updated
|
||||
run: |
|
||||
# If this step fails, this means you haven't updated the CHANGELOG.md
|
||||
# file with notes on your contribution.
|
||||
git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!"
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,7 @@
|
||||
.env
|
||||
vk_config.v2.json
|
||||
output/
|
||||
tmp*/
|
||||
# build artifacts
|
||||
|
||||
.eggs/
|
||||
|
||||
@@ -4,8 +4,12 @@ sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
fail_on_warning: false
|
||||
|
||||
build:
|
||||
os: "ubuntu-22.04"
|
||||
tools:
|
||||
python: "3.10"
|
||||
|
||||
python:
|
||||
version: "3.8"
|
||||
install:
|
||||
- requirements: requirements.txt
|
||||
- requirements: dev-requirements.txt
|
||||
|
||||
13
CHANGELOG.md
13
CHANGELOG.md
@@ -1,13 +0,0 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Unreleased
|
||||
|
||||
## [0.1.2]
|
||||
* Added wall scraper with tests
|
||||
* Added photo scraper with tests
|
||||
* Added scraper with tests
|
||||
@@ -156,8 +156,6 @@ When you're ready to contribute code to address an open issue, please follow the
|
||||
|
||||
If the build fails, it's most likely due to small formatting issues. If the error message isn't clear, feel free to comment on this in your pull request.
|
||||
|
||||
And finally, please update the [CHANGELOG](https://github.com/bellingcat/vk-url-scraper/blob/main/CHANGELOG.md) with notes on your contribution in the "Unreleased" section at the top.
|
||||
|
||||
After all of the above checks have passed, you can now open [a new GitHub pull request](https://github.com/bellingcat/vk-url-scraper/pulls).
|
||||
Make sure you have a clear description of the problem and the solution, and include a link to relevant issues.
|
||||
|
||||
|
||||
2
Makefile
2
Makefile
@@ -13,4 +13,4 @@ run-checks :
|
||||
black .
|
||||
flake8 .
|
||||
mypy .
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/
|
||||
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes .
|
||||
|
||||
34
Pipfile
34
Pipfile
@@ -4,7 +4,32 @@ verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
vk-api = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
certifi = ">=2023.7.22"
|
||||
charset-normalizer = ">=3.0.1"
|
||||
idna = ">=3.4"
|
||||
mutagen = ">=1.46.0"
|
||||
pycryptodomex = ">=3.17"
|
||||
requests = ">=2.28.2"
|
||||
urllib3 = ">=1.26.14"
|
||||
websockets = ">=10.4"
|
||||
vk-api = {ref = "b99dac0ec2f832a6c4b20bde49869e7229ce4742", git = "git+https://github.com/python273/vk_api.git"}
|
||||
flake8 = "*"
|
||||
mypy = ">=0.961"
|
||||
black = ">=22.3.0"
|
||||
isort = ">=5.10.1"
|
||||
pytest = "*"
|
||||
pytest-sphinx = "*"
|
||||
pytest-cov = "*"
|
||||
twine = ">=1.11.0"
|
||||
sphinx = "<5.1.0,>=4.3.0"
|
||||
furo = ">=2022.6.4.1"
|
||||
myst-parser = "<0.19.0,>=0.15.2"
|
||||
sphinx-copybutton = ">=0.5.0"
|
||||
sphinx-autobuild = ">=2021.3.14"
|
||||
sphinx-autodoc-typehints = "*"
|
||||
packaging = "*"
|
||||
python-dotenv = ">=0.21.1"
|
||||
|
||||
[dev-packages]
|
||||
sphinx-copybutton = "==0.5.0"
|
||||
@@ -17,11 +42,14 @@ pytest-sphinx = "*"
|
||||
pytest-cov = "*"
|
||||
twine = ">=1.11.0"
|
||||
sphinx = ">=4.3.0,<5.1.0"
|
||||
furo = "==2022.6.4.1"
|
||||
furo = "==2022.6.21"
|
||||
myst-parser = ">=0.15.2,<0.19.0"
|
||||
sphinx-autobuild = "==2021.3.14"
|
||||
sphinx-autodoc-typehints = "*"
|
||||
python-dotenv = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.11"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
2643
Pipfile.lock
generated
2643
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
90
README.md
90
README.md
@@ -1,14 +1,54 @@
|
||||
# vk-url-scraper
|
||||
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
|
||||
Python library to scrape data, and especially media links like videos and photos, from vk.com URLs.
|
||||
|
||||
> This repo has been archived because it relies on a fixed git commit of the vk_api library, which means we can no longer publish it to pypi; see this [issue](https://github.com/bellingcat/vk-url-scraper/issues/66). You can still install the latest released version. This archived state may change if a solution is found to publish the library to pypi again.
|
||||
|
||||
[PyPI version](https://badge.fury.io/py/vk-url-scraper)
|
||||
[PyPI package](https://pypi.python.org/pypi/vk-url-scraper/)
|
||||
[Documentation status](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest)
|
||||
|
||||
|
||||
# TODO
|
||||
* docs online from sphinx
|
||||
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage), check the **[documentation](https://vk-url-scraper.readthedocs.io/en/latest/)**.
|
||||
|
||||
## Quick usage
|
||||
`pip install vk-url-scraper` to install.
|
||||
## Installation
|
||||
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
||||
|
||||
Currently you need to manually uninstall and re-install one dependency (as it is installed from GitHub and not pypi):
|
||||
```bash
|
||||
pip uninstall vk-api
|
||||
pip install git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
|
||||
```
|
||||
|
||||
To use the library you will need a valid username/password combination for vk.com.
|
||||
|
||||
## Command line usage
|
||||
```bash
|
||||
# run this to learn more about the parameters
|
||||
vk_url_scraper --help
|
||||
|
||||
# scrape a URL and get the JSON result in the console
|
||||
vk_url_scraper --username "username here" --password "password here" --urls https://vk.com/wall12345_6789
|
||||
# OR
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789
|
||||
# you can also have multiple urls
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
||||
|
||||
# you can pass a token as well to avoid always authenticating
|
||||
# and possibly getting captcha prompts
|
||||
# you can fetch the token from the generated vk_config.v2.json file by searching for "access_token"
|
||||
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||
|
||||
# save the JSON output into a file
|
||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
|
||||
|
||||
# download any photos or videos found in these URLS
|
||||
# this will use or create an output/ folder and dump the files there
|
||||
vk_url_scraper -u "username here" -p "password here" --download --urls https://vk.com/wall12345_6789
|
||||
# or
|
||||
vk_url_scraper -u "username here" -p "password here" -d --urls https://vk.com/wall12345_6789
|
||||
```
|
||||
|
||||
## Python library usage
|
||||
```python
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
@@ -22,7 +62,7 @@ res = vks.scrape("https://vk.com/wall-1_398461")
|
||||
|
||||
# scrape any "video" URL
|
||||
res = vks.scrape("https://vk.com/video-6596301_145810025")
|
||||
print(res[0]["text]) # eg: -> to get the text from code
|
||||
print(res[0]["text"]) # eg: -> to get the text from code
|
||||
```
|
||||
|
||||
```python
|
||||
@@ -43,15 +83,41 @@ print(res[0]["text]) # eg: -> to get the text from code
|
||||
|
||||
see the [docs](https://vk-url-scraper.readthedocs.io/en/latest/) for all available functions.
|
||||
|
||||
### Development
|
||||
1. setup environment with `pip install -r requirements` or `pipenv install -r requirements`
|
||||
### TODO
|
||||
* scrape album links
|
||||
* scrape profile links
|
||||
* docs online from sphinx
|
||||
|
||||
## Development
|
||||
(more info in [CONTRIBUTING.md](CONTRIBUTING.md)).
|
||||
|
||||
1. setup dev environment with `pipenv install --dev`
|
||||
1. setup environment with `pipenv install -r requirements.txt`
|
||||
1. Activate the environment with `pipenv shell` (or prepend `pipenv run` to all commands)
|
||||
2. To run all checks, use `make run-checks` (fixes style), or run them individually:
|
||||
1. To fix style: `black .` and `isort .` -> `flake8 .` to validate lint
|
||||
2. To do type checking: `mypy .`
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to user verbose, colors, and test docstring examples)
|
||||
3. To test: `pytest .` (`pytest -v --color=yes --doctest-modules tests/ vk_url_scraper/` to use verbose, colors, and test docstring examples)
|
||||
3. `make docs` to generate sphinx docs -> edit [conf.py](docs/source/conf.py) if needed
|
||||
|
||||
### Releasing new version
|
||||
To test the command line interface available in [__main__.py](vk_url_scraper/__main__.py) you need to pass the `-m` option to python like so: `python -m vk_url_scraper -u "" -p "" --urls ...`
|
||||
|
||||
|
||||
## Releasing new version
|
||||
1. edit [version.py](vk_url_scraper/version.py) with proper versioning
|
||||
2. `git tag vx.y.z` to tag version
|
||||
3. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||
2. make sure to run `pipenv run pip freeze > requirements.txt` if you manage libs with pipenv
|
||||
1. if the hardcoded version of [vk_api](https://github.com/python273/vk_api) is still being used, then you must comment/remove that line from the generated requirements file and instruct users to manually install the version from the source as pypi does not allow repo/commit tags. Additionally, add the latest released version, currently `vk-api==11.9.9`.
|
||||
3. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||
1. `git tag vx.y.z` to tag version
|
||||
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||
4. go to https://readthedocs.org/ to deploy the new docs version (if the webhook is not set up)
|
||||
|
||||
### Fixing a failed release
|
||||
|
||||
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
|
||||
|
||||
```bash
|
||||
git tag -l | xargs git tag -d && git fetch -t
|
||||
```
|
||||
|
||||
Then repeat the steps above.
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
# GitHub Release Process
|
||||
|
||||
## Steps
|
||||
|
||||
1. Update the version in `vk_url_scraper/version.py`.
|
||||
|
||||
3. Run the release script:
|
||||
|
||||
```bash
|
||||
./scripts/release.sh
|
||||
```
|
||||
|
||||
This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
|
||||
which will trigger a workflow on GitHub Actions that handles the rest.
|
||||
|
||||
## Fixing a failed release
|
||||
|
||||
If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
|
||||
|
||||
```bash
|
||||
git tag -l | xargs git tag -d && git fetch -t
|
||||
```
|
||||
|
||||
Then repeat the steps above.
|
||||
@@ -2,11 +2,11 @@
|
||||
flake8
|
||||
|
||||
# Static type checking
|
||||
mypy==0.961
|
||||
mypy>=0.961
|
||||
|
||||
# Automatic code formatting
|
||||
black==22.3.0
|
||||
isort==5.10.1
|
||||
black>=22.3.0
|
||||
isort>=5.10.1
|
||||
|
||||
# Running tests
|
||||
pytest
|
||||
@@ -24,19 +24,20 @@ wheel
|
||||
Sphinx>=4.3.0,<5.1.0
|
||||
|
||||
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
||||
furo==2022.6.4.1
|
||||
furo>=2022.6.4.1
|
||||
|
||||
# Lets Sphinx parse markdown files in addition to rst.
|
||||
myst-parser>=0.15.2,<0.19.0
|
||||
|
||||
# Adds a copy button to code examples in the docs.
|
||||
sphinx-copybutton==0.5.0
|
||||
sphinx-copybutton>=0.5.0
|
||||
|
||||
# Live rebuilding and reloading of docs for developing locally.
|
||||
sphinx-autobuild==2021.3.14
|
||||
sphinx-autobuild>=2021.3.14
|
||||
|
||||
# Automatically adds types to docs
|
||||
sphinx-autodoc-typehints
|
||||
|
||||
# For parsing and comparing version numbers.
|
||||
packaging
|
||||
python-dotenv>=0.21.1
|
||||
@@ -1 +0,0 @@
|
||||
../../CHANGELOG.md
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 7.6 KiB |
@@ -23,7 +23,6 @@ Contents
|
||||
|
||||
installation
|
||||
overview
|
||||
CHANGELOG
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
Installation
|
||||
============
|
||||
|
||||
**vk-url-scraper** supports Python >= 3.7.
|
||||
**vk-url-scraper** supports Python >= 3.10.
|
||||
|
||||
## Installing with `pip`
|
||||
|
||||
|
||||
101
requirements.txt
101
requirements.txt
@@ -1,14 +1,87 @@
|
||||
#
|
||||
# These requirements were autogenerated by pipenv
|
||||
# To regenerate from the project's Pipfile, run:
|
||||
#
|
||||
# pipenv lock --requirements
|
||||
#
|
||||
|
||||
certifi==2022.6.15
|
||||
charset-normalizer==2.0.12
|
||||
idna==3.3
|
||||
requests==2.28.0
|
||||
urllib3==1.26.9
|
||||
vk-api==11.9.8
|
||||
python-dotenv==0.20.0
|
||||
alabaster==0.7.16
|
||||
anyio==4.9.0
|
||||
babel==2.17.0
|
||||
backports.tarfile==1.2.0
|
||||
beautifulsoup4==4.13.4
|
||||
black==25.1.0
|
||||
certifi==2025.4.26
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.2
|
||||
click==8.1.8
|
||||
colorama==0.4.6
|
||||
coverage==7.8.0
|
||||
cryptography==44.0.3
|
||||
docutils==0.18.1
|
||||
flake8==7.2.0
|
||||
furo==2023.3.27
|
||||
h11==0.16.0
|
||||
id==1.5.0
|
||||
idna==3.10
|
||||
imagesize==1.4.1
|
||||
importlib_metadata==8.7.0
|
||||
iniconfig==2.1.0
|
||||
isort==6.0.1
|
||||
jaraco.classes==3.4.0
|
||||
jaraco.context==6.0.1
|
||||
jaraco.functools==4.1.0
|
||||
jeepney==0.9.0
|
||||
Jinja2==3.1.6
|
||||
keyring==25.6.0
|
||||
livereload==2.7.1
|
||||
markdown-it-py==2.2.0
|
||||
MarkupSafe==3.0.2
|
||||
mccabe==0.7.0
|
||||
mdit-py-plugins==0.3.5
|
||||
mdurl==0.1.2
|
||||
more-itertools==10.7.0
|
||||
mutagen==1.47.0
|
||||
mypy==1.15.0
|
||||
mypy_extensions==1.1.0
|
||||
myst-parser==0.18.1
|
||||
nh3==0.2.21
|
||||
packaging==25.0
|
||||
pathspec==0.12.1
|
||||
pkginfo==1.10.0
|
||||
platformdirs==4.3.7
|
||||
pluggy==1.5.0
|
||||
pycodestyle==2.13.0
|
||||
pycparser==2.22
|
||||
pycryptodomex==3.22.0
|
||||
pyflakes==3.3.2
|
||||
Pygments==2.19.1
|
||||
pytest==8.3.5
|
||||
pytest-cov==6.1.1
|
||||
pytest-sphinx==0.6.3
|
||||
python-dotenv==1.1.0
|
||||
PyYAML==6.0.2
|
||||
readme_renderer==43.0
|
||||
requests==2.32.3
|
||||
requests-toolbelt==1.0.0
|
||||
rfc3986==2.0.0
|
||||
rich==14.0.0
|
||||
SecretStorage==3.3.3
|
||||
sniffio==1.3.1
|
||||
snowballstemmer==2.2.0
|
||||
soupsieve==2.7
|
||||
Sphinx==5.0.2
|
||||
sphinx-autobuild==2024.10.3
|
||||
sphinx-autodoc-typehints==1.19.1
|
||||
sphinx-basic-ng==1.0.0b2
|
||||
sphinx-copybutton==0.5.2
|
||||
sphinxcontrib-applehelp==2.0.0
|
||||
sphinxcontrib-devhelp==2.0.0
|
||||
sphinxcontrib-htmlhelp==2.1.0
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==2.0.0
|
||||
sphinxcontrib-serializinghtml==2.0.0
|
||||
starlette==0.46.2
|
||||
tornado==6.5b1
|
||||
twine==6.1.0
|
||||
typing_extensions==4.13.2
|
||||
urllib3==2.4.0
|
||||
uvicorn==0.34.2
|
||||
vk_api @ git+https://github.com/python273/vk_api.git@b99dac0ec2f832a6c4b20bde49869e7229ce4742
|
||||
watchfiles==1.0.5
|
||||
websockets==15.0.1
|
||||
yt-dlp==2025.5.3.232917.dev0
|
||||
zipp==3.21.0
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from vk_url_scraper.version import VERSION
|
||||
|
||||
|
||||
def main():
|
||||
changelog = Path("CHANGELOG.md")
|
||||
|
||||
with changelog.open() as f:
|
||||
lines = f.readlines()
|
||||
|
||||
insert_index: int = -1
|
||||
for i in range(len(lines)):
|
||||
line = lines[i]
|
||||
if line.startswith("## Unreleased"):
|
||||
insert_index = i + 1
|
||||
elif line.startswith(f"## [v{VERSION}]"):
|
||||
print("CHANGELOG already up-to-date")
|
||||
return
|
||||
elif line.startswith("## [v"):
|
||||
break
|
||||
|
||||
if insert_index < 0:
|
||||
raise RuntimeError("Couldn't find 'Unreleased' section")
|
||||
|
||||
lines.insert(insert_index, "\n")
|
||||
lines.insert(
|
||||
insert_index + 1,
|
||||
f"## [v{VERSION}](https://github.com/bellingcat/vk-url-scraper/releases/tag/v{VERSION}) - "
|
||||
f"{datetime.now().strftime('%Y-%m-%d')}\n",
|
||||
)
|
||||
|
||||
with changelog.open("w") as f:
|
||||
f.writelines(lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -7,7 +7,6 @@ TAG=$(python -c 'from vk_url_scraper.version import VERSION; print("v" + VERSION
|
||||
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
|
||||
|
||||
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
|
||||
python scripts/prepare_changelog.py
|
||||
git add -A
|
||||
git commit -m "Bump version to $TAG for release" || true && git push
|
||||
echo "Creating new git tag $TAG"
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
Prepares markdown release notes for GitHub releases.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import packaging.version
|
||||
|
||||
TAG = os.environ["TAG"]
|
||||
|
||||
ADDED_HEADER = "### Added 🎉"
|
||||
CHANGED_HEADER = "### Changed ⚠️"
|
||||
FIXED_HEADER = "### Fixed ✅"
|
||||
REMOVED_HEADER = "### Removed 👋"
|
||||
|
||||
|
||||
def get_change_log_notes() -> str:
|
||||
in_current_section = False
|
||||
current_section_notes: List[str] = []
|
||||
with open("CHANGELOG.md") as changelog:
|
||||
for line in changelog:
|
||||
if line.startswith("## "):
|
||||
if line.startswith("## Unreleased"):
|
||||
continue
|
||||
if line.startswith(f"## [{TAG}]"):
|
||||
in_current_section = True
|
||||
continue
|
||||
break
|
||||
if in_current_section:
|
||||
if line.startswith("### Added"):
|
||||
line = ADDED_HEADER + "\n"
|
||||
elif line.startswith("### Changed"):
|
||||
line = CHANGED_HEADER + "\n"
|
||||
elif line.startswith("### Fixed"):
|
||||
line = FIXED_HEADER + "\n"
|
||||
elif line.startswith("### Removed"):
|
||||
line = REMOVED_HEADER + "\n"
|
||||
current_section_notes.append(line)
|
||||
assert current_section_notes
|
||||
return "## What's new\n\n" + "".join(current_section_notes).strip() + "\n"
|
||||
|
||||
|
||||
def get_commit_history() -> str:
|
||||
new_version = packaging.version.parse(TAG)
|
||||
|
||||
# Get all tags sorted by version, latest first.
|
||||
all_tags = os.popen("git tag -l --sort=-version:refname 'v*'").read().split("\n")
|
||||
|
||||
# Out of `all_tags`, find the latest previous version so that we can collect all
|
||||
# commits between that version and the new version we're about to publish.
|
||||
# Note that we ignore pre-releases unless the new version is also a pre-release.
|
||||
last_tag: Optional[str] = None
|
||||
for tag in all_tags:
|
||||
if not tag.strip(): # could be blank line
|
||||
continue
|
||||
version = packaging.version.parse(tag)
|
||||
if new_version.pre is None and version.pre is not None:
|
||||
continue
|
||||
if version < new_version:
|
||||
last_tag = tag
|
||||
break
|
||||
if last_tag is not None:
|
||||
commits = os.popen(f"git log {last_tag}..{TAG}^ --oneline --first-parent").read()
|
||||
else:
|
||||
commits = os.popen("git log --oneline --first-parent").read()
|
||||
return "## Commits\n\n" + commits
|
||||
|
||||
|
||||
def main():
|
||||
print(get_change_log_notes())
|
||||
print(get_commit_history())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
18
setup.py
18
setup.py
@@ -33,7 +33,7 @@ with open("vk_url_scraper/version.py", "r") as version_file:
|
||||
setup(
|
||||
name="vk-url-scraper",
|
||||
version=VERSION["VERSION"],
|
||||
description="",
|
||||
description="Scrape VK URLs to fetch info and media - python API or command line tool.",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
classifiers=[
|
||||
@@ -43,16 +43,24 @@ setup(
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
],
|
||||
keywords="",
|
||||
url="https://github.com/bellingcat/vk-url-scraper",
|
||||
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
|
||||
project_urls={
|
||||
"Code": "https://github.com/bellingcat/vk-url-scraper",
|
||||
"Documentation": "https://vk-url-scraper.readthedocs.io/en/latest/",
|
||||
},
|
||||
author="Bellingcat",
|
||||
author_email="tech@bellingcat.com",
|
||||
license="Apache",
|
||||
license="MIT",
|
||||
packages=find_packages(
|
||||
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
|
||||
),
|
||||
package_data={"vk_url_scraper": ["py.typed"]},
|
||||
install_requires=read_requirements("requirements.txt"),
|
||||
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
python_requires=">=3.7",
|
||||
python_requires=">=3.10",
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"vk_url_scraper=vk_url_scraper.__main__:main",
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
import datetime
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from .util import assert_equal_lists
|
||||
|
||||
vks = None
|
||||
|
||||
|
||||
@@ -15,9 +14,23 @@ def test_login_fail():
|
||||
VkScraper("invalid", "combination")
|
||||
|
||||
|
||||
# disabled due to CI
|
||||
# def test_login_custom_file():
|
||||
# session_filename = "test-session.json"
|
||||
# VkScraper(
|
||||
# os.environ["VK_USERNAME"],
|
||||
# os.environ["VK_PASSWORD"],
|
||||
# session_file=session_filename,
|
||||
# )
|
||||
# assert os.path.isfile(session_filename)
|
||||
# os.unlink(session_filename)
|
||||
|
||||
|
||||
def test_login_success():
|
||||
global vks
|
||||
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"])
|
||||
vks = VkScraper(
|
||||
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
|
||||
)
|
||||
|
||||
|
||||
def test_scrape_empty_urll():
|
||||
@@ -68,7 +81,7 @@ def test_scrape_wall_url_with_photos():
|
||||
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["payload"]) == 19
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 9
|
||||
@@ -80,14 +93,32 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
||||
assert res[0]["id"] == "wall-17315087_74182"
|
||||
assert res[0]["text"] == ""
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 18
|
||||
assert len(res[0]["attachments"].keys()) == 3
|
||||
assert_equal_lists(list(res[0]["attachments"].keys()), ["photo", "link", "video"])
|
||||
for k in ["photo", "link", "video"]:
|
||||
assert k in list(res[0]["attachments"].keys())
|
||||
assert len(res[0]["attachments"]["photo"]) == 5
|
||||
assert len(res[0]["attachments"]["link"]) == 1
|
||||
assert len(res[0]["attachments"]["video"]) == 1
|
||||
|
||||
|
||||
def test_scrape_download_multiple_media():
    """Scrape a wall post, download its media and check the expected files exist."""
    scraped = vks.scrape("https://vk.com/w=wall-17315087_74182")

    # five .jpg files and one .mp4, named <post id>_<index>.<extension>
    wanted = {f"wall-17315087_74182_{idx}.jpg" for idx in range(5)}
    wanted.add("wall-17315087_74182_0.mp4")

    with tempfile.TemporaryDirectory(dir="./") as workdir:
        vks.download_media(scraped, workdir)
        present = set(os.listdir(workdir))
        # subset check only: any additional file in the directory is tolerated
        assert wanted <= present
|
||||
|
||||
|
||||
def test_scrape_photo_only():
|
||||
res = vks.scrape("https://vk.com/apiclub?z=photo-1_457242435%2Falbum-1_00%2Frev")
|
||||
assert len(res) == 1
|
||||
@@ -97,7 +128,7 @@ def test_scrape_photo_only():
|
||||
== "Делимся расписанием конкурса [https://vk.com/wall-1_399468|«Код Петербурга»]. Все важные этапы — на одной схеме \n\nЕсли участвуете, обязательно сохраните себе. Так будет удобнее планировать работу над проектом, и вы точно не упустите лучший момент для отправки сервиса на модерацию."
|
||||
)
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 7, 9, 43))
|
||||
assert len(res[0]["payload"]) == 15
|
||||
assert len(res[0]["payload"]) == 16
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||
assert len(res[0]["attachments"]["photo"]) == 1
|
||||
@@ -108,11 +139,31 @@ def test_scrape_video_only():
|
||||
assert len(res) == 1
|
||||
assert res[0]["id"] == "video38556806_456251917"
|
||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
||||
assert len(res[0]["payload"]) == 31
|
||||
assert len(res[0]["attachments"].keys()) == 1
|
||||
assert list(res[0]["attachments"].keys()) == ["video"]
|
||||
|
||||
|
||||
def test_scrape_video_only2():
|
||||
res = vks.scrape("https://vk.com/video-1_456239018")
|
||||
print(res[0]["attachments"]["video"][0])
|
||||
res = vks.scrape("https://vk.com/video-17546758_456239898")
|
||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||
vks.download_media(res, tempdir)
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert "video-17546758_456239898_0.mp4" in found_files
|
||||
|
||||
|
||||
def test_scrape_private_video():
    """
    Download the videos attached to a wall post whose videos need an access key.

    > Some videos are kept private and cannot be accessed without a passkey.
    > In this case, send the ID in {owner_id}_{video_id}_{access_key}.
    From https://dev.vk.com/ru/method/video.get
    """
    scraped = vks.scrape("https://vk.com/wall-127774884_178565")

    # three videos are expected, named <post id>_<index>.mp4
    wanted = {f"wall-127774884_178565_{idx}.mp4" for idx in range(3)}

    with tempfile.TemporaryDirectory(dir="./") as workdir:
        vks.download_media(scraped, workdir)
        present = set(os.listdir(workdir))
        # subset check only: any additional file in the directory is tolerated
        assert wanted <= present
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
def assert_equal_lists(l1, l2):
|
||||
assert len(l1) == len(l2)
|
||||
assert str(sorted(l1)) == str(sorted(l2))
|
||||
@@ -1 +1,2 @@
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder, suppress_stdout
|
||||
|
||||
71
vk_url_scraper/__main__.py
Normal file
71
vk_url_scraper/__main__.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder
|
||||
|
||||
|
||||
def get_argument_parser():
    """
    Build the command line parser; see 'python -m vk_url_scraper --help'.

    Returns
    -------
    an argparse.ArgumentParser with the options -u/--username, -p/--password,
    -t/--token, -d/--download (also --no-download) and --urls
    """
    cli = argparse.ArgumentParser(
        description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
    )

    # (option strings, keyword arguments) in the order they are listed by --help
    option_specs = [
        (
            ("-u", "--username"),
            {
                "action": "store",
                "dest": "username",
                "required": True,
                "help": "username for a valid vk.com account (pass empty if using --token)",
            },
        ),
        (
            ("-p", "--password"),
            {
                "action": "store",
                "dest": "password",
                "required": True,
                "help": "password for the valid vk.com account (pass empty if using --token)",
            },
        ),
        (
            ("-t", "--token"),
            {
                "action": "store",
                "dest": "token",
                "required": False,
                "help": "optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
            },
        ),
        (
            ("-d", "--download"),
            {
                "action": argparse.BooleanOptionalAction,
                "dest": "download",
                "help": "if set then all photos and videos will be downloaded to folder output/",
            },
        ),
        (
            ("--urls",),
            {
                "action": "store",
                "dest": "urls",
                # REMAINDER swallows everything after --urls, hence "must be the last argument"
                "nargs": argparse.REMAINDER,
                "required": True,
                "help": "must be the last argument: any text with one or more urls to scrape",
            },
        ),
    ]
    for option_strings, settings in option_specs:
        cli.add_argument(*option_strings, **settings)
    return cli
|
||||
|
||||
|
||||
def main():
    """
    Command line entry point: authenticate, scrape the given text, print the
    results as JSON and, when --download is set, download the media.
    """
    options = get_argument_parser().parse_args()
    scraper = VkScraper(options.username, options.password, options.token)

    # --urls yields a list of tokens; scrape() takes a single text, so join them back
    results = scraper.scrape(" ".join(options.urls))
    print(json.dumps(results, ensure_ascii=False, indent=4, cls=DateTimeEncoder))

    if options.download:
        scraper.download_media(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,10 +1,16 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
import vk_api # used to get api_token after authentication
|
||||
import yt_dlp # to download videos from url
|
||||
|
||||
from .utils import captcha_handler, suppress_stdout
|
||||
|
||||
|
||||
class VkScraper:
|
||||
@@ -31,12 +37,20 @@ class VkScraper:
|
||||
|
||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
|
||||
|
||||
def __init__(self, username: str, password: str) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
username: str,
|
||||
password: str,
|
||||
token: Optional[str] = None,
|
||||
session_file="vk_config.v2.json",
|
||||
captcha_handler=captcha_handler,
|
||||
) -> None:
|
||||
"""Initializes the scraper.
|
||||
|
||||
This function receives a username and password and performs authentication on vk.com to then call api endpoints
|
||||
This function receives a username and password (or access token) and performs
|
||||
authentication on vk.com to then call api endpoints. If token is passed, authentication will not be performed again.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -44,9 +58,22 @@ class VkScraper:
|
||||
Username on vk.com, can be a phone number or email
|
||||
password : str
|
||||
Matching password on vk.com
|
||||
token : str
|
||||
Access token received after authenticating, can be found in the vk_config.v2.json file
|
||||
session_file : str
|
||||
File name where the VK session is saved so future logins are easier, this will not be created if token is passed
|
||||
captcha_handler : func
|
||||
Function that can receive a vk_api captcha instance and help the user solve it, default is a complete CLI handler
|
||||
"""
|
||||
self.session = vk_api.VkApi(username, password)
|
||||
self.session.auth(token_only=True)
|
||||
self.session = vk_api.VkApi(
|
||||
username,
|
||||
password,
|
||||
token=token,
|
||||
config_filename=session_file,
|
||||
captcha_handler=captcha_handler,
|
||||
)
|
||||
if token is None or len(token) == 0:
|
||||
self.session.auth(token_only=True)
|
||||
|
||||
def scrape(self, url: str) -> List:
|
||||
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
|
||||
@@ -117,10 +144,11 @@ class VkScraper:
|
||||
first_type = a["type"]
|
||||
attachment = a[first_type]
|
||||
if first_type == "video":
|
||||
video_path = f'video{attachment["owner_id"]}_{attachment["id"]}'
|
||||
if "access_key" in attachment:
|
||||
video_path += f"_{attachment['access_key']}"
|
||||
attachments["video"].extend(
|
||||
self.scrape_videos(f'video{attachment["owner_id"]}_{attachment["id"]}')[
|
||||
0
|
||||
]
|
||||
self.scrape_videos(video_path)[0]
|
||||
.get("attachments", {})
|
||||
.get("video", [""])
|
||||
)
|
||||
@@ -273,3 +301,64 @@ class VkScraper:
|
||||
}
|
||||
)
|
||||
return res
|
||||
|
||||
def download_media(self, results: List[dict], destination: str = "./output/") -> List[str]:
    """
    Receives a list of dicts as returned by any of the scrape* methods and downloads the URLS present
    if they are of type photo or video into the destination folder

    Parameters
    ----------
    results : List[dict]
        list with valid dictionary results (see class definition)
    destination : str
        the directory to save the downloaded files to. defaults to output/

    Returns
    -------
    a list of filenames for the downloaded files

    Raises
    ------
    requests.HTTPError
        if a photo URL answers with an HTTP error status
    requests.Timeout
        if a photo URL does not answer in time
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    os.makedirs(destination, exist_ok=True)
    downloaded = []
    for r in results:
        for k, attachments in r["attachments"].items():
            if k == "photo":
                for i, url in enumerate(attachments):
                    ext = os.path.splitext(urlparse(url).path)[1]
                    filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
                    # without a timeout requests may wait forever on a stalled connection
                    d = requests.get(url, headers=headers, timeout=60)
                    # do not store an HTTP error page under an image file name
                    d.raise_for_status()
                    with open(filename, "wb") as f:
                        f.write(d.content)
                    downloaded.append(filename)
            elif k == "video":
                with suppress_stdout():  # ytdlp is not 100% quiet
                    for i, url in enumerate(attachments):
                        # yt-dlp fills in %(ext)s once the container is known
                        filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
                        ydl_options = {
                            "format": (
                                "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
                            ),
                            "merge_output_format": "mp4",
                            "retries": 5,
                            "noplaylist": True,
                            "outtmpl": filename,
                            "quiet": True,
                            "restrictfilenames": True,
                            "forcefilename": True,
                            "simulate": False,
                        }
                        # the context manager closes the downloader once this video is done
                        with yt_dlp.YoutubeDL(ydl_options) as ydl:
                            info = ydl.extract_info(url, download=True)
                            filename = ydl.prepare_filename(info)
                        if "unknown_video" in filename:
                            # presumably the placeholder extension yt-dlp uses when it cannot
                            # determine the real one - TODO confirm; renamed to .mp4
                            filename = shutil.move(
                                filename, filename.replace("unknown_video", "mp4")
                            )
                        downloaded.append(filename)
    return downloaded
|
||||
|
||||
33
vk_url_scraper/utils.py
Normal file
33
vk_url_scraper/utils.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that writes datetime values as str(value).

    Usage: json.dumps(obj, cls=DateTimeEncoder)
    """

    def default(self, o):
        # everything that is not a datetime is left to the base encoder
        if not isinstance(o, datetime):
            return super().default(o)
        return str(o)  # with timezone
|
||||
|
||||
|
||||
def captcha_handler(captcha):
    """
    Ask the user on the console to solve a captcha and retry with the answer.

    Parameters
    ----------
    captcha
        object offering get_url() and try_again(key); a vk_api captcha instance
        according to the VkScraper documentation

    Returns
    -------
    whatever captcha.try_again returns for the whitespace-stripped answer
    """
    prompt = f"CAPTCHA DETECTED, please solve it and input the solution. url= {captcha.get_url()} :"
    solution = input(prompt).strip()
    return captcha.try_again(solution)
|
||||
|
||||
|
||||
@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to sys.stdout inside its body."""
    # https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
    # this is used to silence ytdlp, which does not fully respect quiet=True and outputs filenames to the console
    with open(os.devnull, "w") as sink:
        saved_stream = sys.stdout
        sys.stdout = sink
        try:
            yield
        finally:
            # put the previous stream back even if the body raised
            sys.stdout = saved_stream
|
||||
@@ -1,8 +1,8 @@
|
||||
_MAJOR = "0"
|
||||
_MINOR = "1"
|
||||
_MINOR = "3"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "5"
|
||||
_PATCH = "34"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user