mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-12 13:28:37 +03:00
Compare commits
37 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8864e7c87d | ||
|
|
db9b613ae4 | ||
|
|
37828b4be4 | ||
|
|
1a3a7dc0f3 | ||
|
|
f67707a740 | ||
|
|
798684a334 | ||
|
|
a556b237e9 | ||
|
|
283bc35658 | ||
|
|
cef70fb80d | ||
|
|
e66ef4f477 | ||
|
|
1f6a8368fd | ||
|
|
9a046fd1cb | ||
|
|
aae2bb5999 | ||
|
|
9e30b81d16 | ||
|
|
72bc355606 | ||
|
|
7f59eefb73 | ||
|
|
30003c524e | ||
|
|
d1b27bef1d | ||
|
|
e5e9e08ee6 | ||
|
|
3a8a3f54c0 | ||
|
|
4d73864dbb | ||
|
|
ceaa8e45f3 | ||
|
|
007c8e07a8 | ||
|
|
a515b2c3de | ||
|
|
54540cd132 | ||
|
|
cfb13e5d82 | ||
|
|
926c3cb8a4 | ||
|
|
15ebe2e66c | ||
|
|
eaff88b2d9 | ||
|
|
a6d066a192 | ||
|
|
9078a17400 | ||
|
|
17b516bd7f | ||
|
|
8bd182b041 | ||
|
|
0b8abfb5cb | ||
|
|
cf5fb91c84 | ||
|
|
5c965102a4 | ||
|
|
df10e6f55f |
5
.github/actions/setup-venv/action.yml
vendored
5
.github/actions/setup-venv/action.yml
vendored
@@ -16,6 +16,11 @@ runs:
|
|||||||
with:
|
with:
|
||||||
python-version: ${{ inputs.python-version }}
|
python-version: ${{ inputs.python-version }}
|
||||||
|
|
||||||
|
- shell: bash
|
||||||
|
run: |
|
||||||
|
# install ffmpeg
|
||||||
|
sudo apt install ffmpeg
|
||||||
|
|
||||||
- shell: bash
|
- shell: bash
|
||||||
run: |
|
run: |
|
||||||
# Install prerequisites.
|
# Install prerequisites.
|
||||||
|
|||||||
11
.github/dependabot.yml
vendored
11
.github/dependabot.yml
vendored
@@ -1,11 +0,0 @@
|
|||||||
version: 2
|
|
||||||
updates:
|
|
||||||
- package-ecosystem: "pip"
|
|
||||||
directory: "/"
|
|
||||||
schedule:
|
|
||||||
interval: "daily"
|
|
||||||
open-pull-requests-limit: 10
|
|
||||||
- package-ecosystem: "github-actions"
|
|
||||||
directory: "/"
|
|
||||||
schedule:
|
|
||||||
interval: "daily"
|
|
||||||
3
.github/workflows/main.yml
vendored
3
.github/workflows/main.yml
vendored
@@ -30,8 +30,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
# python: ['3.7', '3.10']
|
python: ['3.7', '3.10']
|
||||||
python: ['3.10']
|
|
||||||
task: # --show-capture=no on purpose
|
task: # --show-capture=no on purpose
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,7 @@
|
|||||||
.env
|
.env
|
||||||
vk_config.v2.json
|
vk_config.v2.json
|
||||||
output/
|
output/
|
||||||
|
tmp*/
|
||||||
# build artifacts
|
# build artifacts
|
||||||
|
|
||||||
.eggs/
|
.eggs/
|
||||||
|
|||||||
33
Pipfile
33
Pipfile
@@ -4,8 +4,32 @@ verify_ssl = true
|
|||||||
name = "pypi"
|
name = "pypi"
|
||||||
|
|
||||||
[packages]
|
[packages]
|
||||||
vk-api = "*"
|
vk-api = ">=11.9.9"
|
||||||
yt-dlp = "*"
|
yt-dlp = ">=2023.2.17"
|
||||||
|
flake8 = "*"
|
||||||
|
mypy = ">=0.961"
|
||||||
|
black = ">=22.3.0"
|
||||||
|
isort = ">=5.10.1"
|
||||||
|
pytest = "*"
|
||||||
|
pytest-sphinx = "*"
|
||||||
|
pytest-cov = "*"
|
||||||
|
twine = ">=1.11.0"
|
||||||
|
sphinx = "<5.1.0,>=4.3.0"
|
||||||
|
furo = ">=2022.6.4.1"
|
||||||
|
myst-parser = "<0.19.0,>=0.15.2"
|
||||||
|
sphinx-copybutton = ">=0.5.0"
|
||||||
|
sphinx-autobuild = ">=2021.3.14"
|
||||||
|
sphinx-autodoc-typehints = "*"
|
||||||
|
python-dotenv = ">=0.21.1"
|
||||||
|
brotli = ">=1.0.9"
|
||||||
|
certifi = ">=2022.12.7"
|
||||||
|
charset-normalizer = ">=3.0.1"
|
||||||
|
idna = ">=3.4"
|
||||||
|
mutagen = ">=1.46.0"
|
||||||
|
pycryptodomex = ">=3.17"
|
||||||
|
requests = ">=2.28.2"
|
||||||
|
urllib3 = ">=1.26.14"
|
||||||
|
websockets = ">=10.4"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
sphinx-copybutton = "==0.5.0"
|
sphinx-copybutton = "==0.5.0"
|
||||||
@@ -18,7 +42,7 @@ pytest-sphinx = "*"
|
|||||||
pytest-cov = "*"
|
pytest-cov = "*"
|
||||||
twine = ">=1.11.0"
|
twine = ">=1.11.0"
|
||||||
sphinx = ">=4.3.0,<5.1.0"
|
sphinx = ">=4.3.0,<5.1.0"
|
||||||
furo = "==2022.6.4.1"
|
furo = "==2022.6.21"
|
||||||
myst-parser = ">=0.15.2,<0.19.0"
|
myst-parser = ">=0.15.2,<0.19.0"
|
||||||
sphinx-autobuild = "==2021.3.14"
|
sphinx-autobuild = "==2021.3.14"
|
||||||
sphinx-autodoc-typehints = "*"
|
sphinx-autodoc-typehints = "*"
|
||||||
@@ -26,3 +50,6 @@ python-dotenv = "*"
|
|||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.9"
|
||||||
|
|
||||||
|
[pipenv]
|
||||||
|
allow_prereleases = true
|
||||||
|
|||||||
2092
Pipfile.lock
generated
2092
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
16
README.md
16
README.md
@@ -1,7 +1,13 @@
|
|||||||
# vk-url-scraper
|
# vk-url-scraper
|
||||||
Library to scrape data and especially media links (videos and photos) from vk.com URLs.
|
Python library to scrape data, and especially media links like videos and photos, from vk.com URLs.
|
||||||
|
|
||||||
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage).
|
|
||||||
|
[](https://badge.fury.io/py/vk-url-scraper)
|
||||||
|
[](https://pypi.python.org/pypi/vk-url-scraper/)
|
||||||
|
[](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest)
|
||||||
|
|
||||||
|
|
||||||
|
You can use it via the [command line](#command-line-usage) or as a [python library](#python-library-usage), check the **[documentation](https://vk-url-scraper.readthedocs.io/en/latest/)**.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
You can install the most recent release from [pypi](https://pypi.org/project/vk-url-scraper/) via `pip install vk-url-scraper`.
|
||||||
@@ -21,7 +27,8 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
|
|||||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
||||||
|
|
||||||
# you can pass a token as well to avoid always authenticating
|
# you can pass a token as well to avoid always authenticating
|
||||||
# and possibly getting captch prompts
|
# and possibly getting captcha prompts
|
||||||
|
# you can fetch the token from the bk_config.v2.json file generated under by searching for "access_token"
|
||||||
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||||
|
|
||||||
# save the JSON output into a file
|
# save the JSON output into a file
|
||||||
@@ -48,7 +55,7 @@ res = vks.scrape("https://vk.com/wall-1_398461")
|
|||||||
|
|
||||||
# scrape any "video" URL
|
# scrape any "video" URL
|
||||||
res = vks.scrape("https://vk.com/video-6596301_145810025")
|
res = vks.scrape("https://vk.com/video-6596301_145810025")
|
||||||
print(res[0]["text]) # eg: -> to get the text from code
|
print(res[0]["text"]) # eg: -> to get the text from code
|
||||||
```
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -93,6 +100,7 @@ To test the command line interface available in [__main__.py](__vk_url_scraper/_
|
|||||||
2. run `./scripts/release.sh` to create a tag and push, alternatively
|
2. run `./scripts/release.sh` to create a tag and push, alternatively
|
||||||
1. `git tag vx.y.z` to tag version
|
1. `git tag vx.y.z` to tag version
|
||||||
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
2. `git push origin vx.y.z` -> this will trigger workflow and put project on [pypi](https://pypi.org/project/vk-url-scraper/)
|
||||||
|
3. go to https://readthedocs.org/ to deploy new docs version (if webhook is not setup)
|
||||||
|
|
||||||
### Fixing a failed release
|
### Fixing a failed release
|
||||||
|
|
||||||
|
|||||||
@@ -2,11 +2,11 @@
|
|||||||
flake8
|
flake8
|
||||||
|
|
||||||
# Static type checking
|
# Static type checking
|
||||||
mypy==0.961
|
mypy>=0.961
|
||||||
|
|
||||||
# Automatic code formatting
|
# Automatic code formatting
|
||||||
black==22.3.0
|
black>=22.3.0
|
||||||
isort==5.10.1
|
isort>=5.10.1
|
||||||
|
|
||||||
# Running tests
|
# Running tests
|
||||||
pytest
|
pytest
|
||||||
@@ -24,19 +24,20 @@ wheel
|
|||||||
Sphinx>=4.3.0,<5.1.0
|
Sphinx>=4.3.0,<5.1.0
|
||||||
|
|
||||||
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
# Sphinx theme: https://sphinx-themes.org/sample-sites/furo/
|
||||||
furo==2022.6.4.1
|
furo>=2022.6.4.1
|
||||||
|
|
||||||
# Lets Sphinx parse markdown files in addition to rst.
|
# Lets Sphinx parse markdown files in addition to rst.
|
||||||
myst-parser>=0.15.2,<0.19.0
|
myst-parser>=0.15.2,<0.19.0
|
||||||
|
|
||||||
# Adds a copy button to code examples in the docs.
|
# Adds a copy button to code examples in the docs.
|
||||||
sphinx-copybutton==0.5.0
|
sphinx-copybutton>=0.5.0
|
||||||
|
|
||||||
# Live rebuilding and reloading of docs for developing locally.
|
# Live rebuilding and reloading of docs for developing locally.
|
||||||
sphinx-autobuild==2021.3.14
|
sphinx-autobuild>=2021.3.14
|
||||||
|
|
||||||
# Automatically adds types to docs
|
# Automatically adds types to docs
|
||||||
sphinx-autodoc-typehints
|
sphinx-autodoc-typehints
|
||||||
|
|
||||||
# For parsing and comparing version numbers.
|
# For parsing and comparing version numbers.
|
||||||
packaging
|
packaging
|
||||||
|
python-dotenv>=0.21.1
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 7.6 KiB |
@@ -5,11 +5,15 @@
|
|||||||
# pipenv lock --requirements
|
# pipenv lock --requirements
|
||||||
#
|
#
|
||||||
|
|
||||||
certifi==2022.6.15
|
# -i https://pypi.org/simple
|
||||||
charset-normalizer==2.0.12
|
brotli>=1.0.9; platform_python_implementation >= 'CPython'
|
||||||
idna==3.3
|
certifi>=2022.12.7; python_version >= '3.6'
|
||||||
requests==2.28.0
|
charset-normalizer>=3.0.1; python_version >= '3.6'
|
||||||
urllib3==1.26.9
|
idna>=3.4; python_version >= '3.5'
|
||||||
vk-api==11.9.8
|
mutagen>=1.46.0; python_version >= '3.7'
|
||||||
python-dotenv==0.20.0
|
pycryptodomex>=3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
yt-dlp==2022.5.18
|
requests>=2.28.2; python_version >= '3.7' and python_version < '4'
|
||||||
|
urllib3>=1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
|
vk-api>=11.9.9
|
||||||
|
websockets>=10.4; python_version >= '3.7'
|
||||||
|
yt-dlp>=2023.2.17
|
||||||
5
setup.py
5
setup.py
@@ -44,7 +44,10 @@ setup(
|
|||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
],
|
],
|
||||||
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
|
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
|
||||||
url="https://github.com/bellingcat/vk-url-scraper",
|
project_urls={
|
||||||
|
"Code": "https://github.com/bellingcat/vk-url-scraper",
|
||||||
|
"Documentation": "https://vk-url-scraper.readthedocs.io/en/latest/",
|
||||||
|
},
|
||||||
author="Bellingcat",
|
author="Bellingcat",
|
||||||
author_email="tech@bellingcat.com",
|
author_email="tech@bellingcat.com",
|
||||||
license="MIT",
|
license="MIT",
|
||||||
|
|||||||
@@ -2,17 +2,27 @@ import datetime
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from vk_url_scraper import VkScraper
|
from vk_url_scraper import VkScraper
|
||||||
|
|
||||||
# import pytest
|
|
||||||
|
|
||||||
|
|
||||||
vks = None
|
vks = None
|
||||||
|
|
||||||
|
|
||||||
# def test_login_fail():
|
def test_login_fail():
|
||||||
# with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
# VkScraper("invalid", "combination")
|
VkScraper("invalid", "combination")
|
||||||
|
|
||||||
|
|
||||||
|
def test_login_custom_file():
|
||||||
|
session_filename = "test-session.json"
|
||||||
|
VkScraper(
|
||||||
|
os.environ["VK_USERNAME"],
|
||||||
|
os.environ["VK_PASSWORD"],
|
||||||
|
session_file=session_filename,
|
||||||
|
)
|
||||||
|
assert os.path.isfile(session_filename)
|
||||||
|
os.unlink(session_filename)
|
||||||
|
|
||||||
|
|
||||||
def test_login_success():
|
def test_login_success():
|
||||||
@@ -70,7 +80,7 @@ def test_scrape_wall_url_with_photos():
|
|||||||
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
== "Хабаровск\nАллея героев\nПомолимся об укокоении воинов:\nАлександра, Игоря, Эдуарда, \nДионисия, Евгения, Александра, Артемия, Иннокентия, Андрея."
|
||||||
)
|
)
|
||||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 6, 15, 10, 37, 24))
|
||||||
assert len(res[0]["payload"]) == 16
|
assert len(res[0]["payload"]) == 17
|
||||||
assert len(res[0]["attachments"].keys()) == 1
|
assert len(res[0]["attachments"].keys()) == 1
|
||||||
assert list(res[0]["attachments"].keys()) == ["photo"]
|
assert list(res[0]["attachments"].keys()) == ["photo"]
|
||||||
assert len(res[0]["attachments"]["photo"]) == 9
|
assert len(res[0]["attachments"]["photo"]) == 9
|
||||||
@@ -82,7 +92,7 @@ def test_scrape_wall_url_with_photos_inner_videos_and_links_with_inner_photos():
|
|||||||
assert res[0]["id"] == "wall-17315087_74182"
|
assert res[0]["id"] == "wall-17315087_74182"
|
||||||
assert res[0]["text"] == ""
|
assert res[0]["text"] == ""
|
||||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 11, 1, 9))
|
||||||
assert len(res[0]["payload"]) == 15
|
assert len(res[0]["payload"]) == 17
|
||||||
assert len(res[0]["attachments"].keys()) == 3
|
assert len(res[0]["attachments"].keys()) == 3
|
||||||
for k in ["photo", "link", "video"]:
|
for k in ["photo", "link", "video"]:
|
||||||
assert k in list(res[0]["attachments"].keys())
|
assert k in list(res[0]["attachments"].keys())
|
||||||
@@ -102,7 +112,7 @@ def test_scrape_download_multiple_media():
|
|||||||
"wall-17315087_74182_2.jpg",
|
"wall-17315087_74182_2.jpg",
|
||||||
"wall-17315087_74182_3.jpg",
|
"wall-17315087_74182_3.jpg",
|
||||||
"wall-17315087_74182_4.jpg",
|
"wall-17315087_74182_4.jpg",
|
||||||
"wall-17315087_74182_0.mkv",
|
"wall-17315087_74182_0.mp4",
|
||||||
}
|
}
|
||||||
found_files = set(os.listdir(tempdir))
|
found_files = set(os.listdir(tempdir))
|
||||||
assert len(expect_files) == len(expect_files & found_files)
|
assert len(expect_files) == len(expect_files & found_files)
|
||||||
@@ -128,7 +138,7 @@ def test_scrape_video_only():
|
|||||||
assert len(res) == 1
|
assert len(res) == 1
|
||||||
assert res[0]["id"] == "video38556806_456251917"
|
assert res[0]["id"] == "video38556806_456251917"
|
||||||
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
assert str(res[0]["datetime"]) == str(datetime.datetime(2022, 3, 24, 5, 42, 38))
|
||||||
assert len(res[0]["payload"]) == 31
|
assert len(res[0]["payload"]) == 34
|
||||||
assert len(res[0]["attachments"].keys()) == 1
|
assert len(res[0]["attachments"].keys()) == 1
|
||||||
assert list(res[0]["attachments"].keys()) == ["video"]
|
assert list(res[0]["attachments"].keys()) == ["video"]
|
||||||
|
|
||||||
@@ -138,8 +148,4 @@ def test_scrape_video_only2():
|
|||||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||||
vks.download_media(res, tempdir)
|
vks.download_media(res, tempdir)
|
||||||
found_files = set(os.listdir(tempdir))
|
found_files = set(os.listdir(tempdir))
|
||||||
# different systems might attribute different extension
|
assert "video-17546758_456239898_0.mp4" in found_files
|
||||||
assert (
|
|
||||||
"video-17546758_456239898_0.webm" in found_files
|
|
||||||
or "video-17546758_456239898_0.mp4" in found_files
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
from .scraper import VkScraper
|
from .scraper import VkScraper
|
||||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
from .utils import DateTimeEncoder, suppress_stdout
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def get_argument_parser():
|
|||||||
action="store",
|
action="store",
|
||||||
dest="token",
|
dest="token",
|
||||||
required=False,
|
required=False,
|
||||||
help="optional token, when passed authentication will not be performed - good to avoid captcha issues",
|
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List
|
from typing import List
|
||||||
@@ -9,7 +10,7 @@ import requests
|
|||||||
import vk_api # used to get api_token after authentication
|
import vk_api # used to get api_token after authentication
|
||||||
import yt_dlp # to download videos from url
|
import yt_dlp # to download videos from url
|
||||||
|
|
||||||
from .utils import captcha_handler, mkdir_if_not_exists
|
from .utils import captcha_handler, suppress_stdout
|
||||||
|
|
||||||
|
|
||||||
class VkScraper:
|
class VkScraper:
|
||||||
@@ -39,7 +40,12 @@ class VkScraper:
|
|||||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, username: str, password: str, token: str = None, captcha_handler=captcha_handler
|
self,
|
||||||
|
username: str,
|
||||||
|
password: str,
|
||||||
|
token: str = None,
|
||||||
|
session_file="vk_config.v2.json",
|
||||||
|
captcha_handler=captcha_handler,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initializes the scraper.
|
"""Initializes the scraper.
|
||||||
|
|
||||||
@@ -54,9 +60,17 @@ class VkScraper:
|
|||||||
Matching password on vk.com
|
Matching password on vk.com
|
||||||
token : str
|
token : str
|
||||||
Access token received after authenticating, can be found in the vl_config.v2.json file
|
Access token received after authenticating, can be found in the vl_config.v2.json file
|
||||||
|
session_file : str
|
||||||
|
File name where the VK session is saved so future logins are easier, this will not be created if token is passed
|
||||||
|
captcha_handler : func
|
||||||
|
Function that can receive a vk_api captcha instance and help the user solve it, default is a complete CLI handler
|
||||||
"""
|
"""
|
||||||
self.session = vk_api.VkApi(
|
self.session = vk_api.VkApi(
|
||||||
username, password, token=token, captcha_handler=captcha_handler
|
username,
|
||||||
|
password,
|
||||||
|
token=token,
|
||||||
|
config_filename=session_file,
|
||||||
|
captcha_handler=captcha_handler,
|
||||||
)
|
)
|
||||||
if token is None or len(token) == 0:
|
if token is None or len(token) == 0:
|
||||||
self.session.auth(token_only=True)
|
self.session.auth(token_only=True)
|
||||||
@@ -306,7 +320,7 @@ class VkScraper:
|
|||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
mkdir_if_not_exists(destination)
|
os.makedirs(destination, exist_ok=True)
|
||||||
downloaded = []
|
downloaded = []
|
||||||
for r in results:
|
for r in results:
|
||||||
for k, attachments in r["attachments"].items():
|
for k, attachments in r["attachments"].items():
|
||||||
@@ -319,23 +333,28 @@ class VkScraper:
|
|||||||
f.write(d.content)
|
f.write(d.content)
|
||||||
downloaded.append(filename)
|
downloaded.append(filename)
|
||||||
elif k == "video":
|
elif k == "video":
|
||||||
for i, url in enumerate(attachments):
|
with suppress_stdout(): # ytdlp is not 100% quiet
|
||||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
for i, url in enumerate(attachments):
|
||||||
ydl = yt_dlp.YoutubeDL(
|
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||||
{
|
ydl = yt_dlp.YoutubeDL(
|
||||||
"outtmpl": filename,
|
{
|
||||||
"quiet": True,
|
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||||
"restrictfilenames": True,
|
"merge_output_format": "mp4",
|
||||||
"forcefilename": True,
|
"retries": 5,
|
||||||
}
|
"noplaylist": True,
|
||||||
)
|
"outtmpl": filename,
|
||||||
info = ydl.extract_info(url, download=True)
|
"quiet": True,
|
||||||
filename = ydl.prepare_filename(info)
|
"restrictfilenames": True,
|
||||||
if "unknown_video" in filename:
|
"forcefilename": True,
|
||||||
new_filename = filename.replace("unknown_video", "mkv")
|
"simulate": False,
|
||||||
with open(filename, "rb") as vin, open(new_filename, "wb") as vout:
|
}
|
||||||
vout.write(vin.read())
|
)
|
||||||
os.remove(filename)
|
info = ydl.extract_info(url, download=True)
|
||||||
filename = new_filename
|
filename = ydl.prepare_filename(info)
|
||||||
downloaded.append(filename)
|
if "unknown_video" in filename:
|
||||||
|
filename = shutil.copy(
|
||||||
|
filename, filename.replace("unknown_video", "mkv")
|
||||||
|
)
|
||||||
|
os.remove(filename)
|
||||||
|
downloaded.append(filename)
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
from contextlib import contextmanager
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
@@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
|
|||||||
return json.JSONEncoder.default(self, o)
|
return json.JSONEncoder.default(self, o)
|
||||||
|
|
||||||
|
|
||||||
def mkdir_if_not_exists(folder):
|
|
||||||
if not os.path.exists(folder):
|
|
||||||
os.makedirs(folder)
|
|
||||||
|
|
||||||
|
|
||||||
def captcha_handler(captcha):
|
def captcha_handler(captcha):
|
||||||
print(
|
key = input(
|
||||||
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}",
|
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:"
|
||||||
flush=True,
|
).strip()
|
||||||
)
|
|
||||||
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
|
|
||||||
return captcha.try_again(key)
|
return captcha.try_again(key)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def suppress_stdout():
|
||||||
|
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
|
||||||
|
# this is used to silence ytdlp which does not fully respects quite=True and outputs filenames to the console
|
||||||
|
with open(os.devnull, "w") as devnull:
|
||||||
|
old_stdout = sys.stdout
|
||||||
|
sys.stdout = devnull
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
sys.stdout = old_stdout
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "3"
|
_MINOR = "3"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "2"
|
_PATCH = "15"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user