mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-12 05:18:35 +03:00
Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
17b516bd7f | ||
|
|
8bd182b041 | ||
|
|
0b8abfb5cb | ||
|
|
cf5fb91c84 | ||
|
|
5c965102a4 | ||
|
|
df10e6f55f | ||
|
|
863dd44463 | ||
|
|
578ec81443 | ||
|
|
c32caec442 | ||
|
|
80b43f7c95 | ||
|
|
90b72b6d22 | ||
|
|
d96e0c0a3a | ||
|
|
db03a4c0f6 | ||
|
|
cf100ee69e | ||
|
|
a09cf32b3e | ||
|
|
e1eb3ed620 | ||
|
|
72bd951d9c | ||
|
|
59d53be68b | ||
|
|
24a1313a65 | ||
|
|
64df4eec28 | ||
|
|
42bdc1441c | ||
|
|
c25880ee6d | ||
|
|
e1e3648852 | ||
|
|
c74dc280d8 | ||
|
|
ab15b35008 | ||
|
|
62c4536d0b | ||
|
|
eac0fc4904 |
3
.env.example
Normal file
3
.env.example
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
VK_USERNAME="your username"
|
||||||
|
VK_PASSWORD="your password"
|
||||||
|
VK_TOKEN="optional token"
|
||||||
5
.github/actions/setup-venv/action.yml
vendored
5
.github/actions/setup-venv/action.yml
vendored
@@ -16,6 +16,11 @@ runs:
|
|||||||
with:
|
with:
|
||||||
python-version: ${{ inputs.python-version }}
|
python-version: ${{ inputs.python-version }}
|
||||||
|
|
||||||
|
- shell: bash
|
||||||
|
run: |
|
||||||
|
# install ffmpeg
|
||||||
|
sudo apt install ffmpeg
|
||||||
|
|
||||||
- shell: bash
|
- shell: bash
|
||||||
run: |
|
run: |
|
||||||
# Install prerequisites.
|
# Install prerequisites.
|
||||||
|
|||||||
5
.github/workflows/main.yml
vendored
5
.github/workflows/main.yml
vendored
@@ -20,6 +20,7 @@ env:
|
|||||||
PYTHONPATH: ./
|
PYTHONPATH: ./
|
||||||
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
VK_USERNAME: ${{ secrets.VK_USERNAME }}
|
||||||
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
|
||||||
|
VK_TOKEN: ${{ secrets.VK_TOKEN }}
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
checks:
|
checks:
|
||||||
@@ -30,10 +31,10 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python: ['3.7', '3.10']
|
python: ['3.7', '3.10']
|
||||||
task: # --show-capture=no on purpose
|
task: # --show-capture=no on purpose
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
pytest --show-capture=no --color=yes tests/
|
pytest --show-capture=no --color=yes tests/
|
||||||
|
|
||||||
include:
|
include:
|
||||||
- python: '3.10'
|
- python: '3.10'
|
||||||
|
|||||||
@@ -20,6 +20,9 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
|
|||||||
# you can also have multiple urls
|
# you can also have multiple urls
|
||||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
|
||||||
|
|
||||||
|
# you can pass a token as well to avoid always authenticating
|
||||||
|
# and possibly getting captcha prompts
|
||||||
|
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
|
||||||
|
|
||||||
# save the JSON output into a file
|
# save the JSON output into a file
|
||||||
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
|
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 7.6 KiB |
9
setup.py
9
setup.py
@@ -33,7 +33,7 @@ with open("vk_url_scraper/version.py", "r") as version_file:
|
|||||||
setup(
|
setup(
|
||||||
name="vk-url-scraper",
|
name="vk-url-scraper",
|
||||||
version=VERSION["VERSION"],
|
version=VERSION["VERSION"],
|
||||||
description="",
|
description="Scrape VK URLs to fetch info and media - python API or command line tool.",
|
||||||
long_description=open("README.md").read(),
|
long_description=open("README.md").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
@@ -43,7 +43,7 @@ setup(
|
|||||||
"License :: OSI Approved :: MIT License",
|
"License :: OSI Approved :: MIT License",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
],
|
],
|
||||||
keywords="",
|
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
|
||||||
url="https://github.com/bellingcat/vk-url-scraper",
|
url="https://github.com/bellingcat/vk-url-scraper",
|
||||||
author="Bellingcat",
|
author="Bellingcat",
|
||||||
author_email="tech@bellingcat.com",
|
author_email="tech@bellingcat.com",
|
||||||
@@ -55,4 +55,9 @@ setup(
|
|||||||
install_requires=read_requirements("requirements.txt"),
|
install_requires=read_requirements("requirements.txt"),
|
||||||
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||||
python_requires=">=3.7",
|
python_requires=">=3.7",
|
||||||
|
entry_points={
|
||||||
|
"console_scripts": [
|
||||||
|
"vk_url_scraper=vk_url_scraper.__main__:main",
|
||||||
|
],
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -16,7 +16,9 @@ def test_login_fail():
|
|||||||
|
|
||||||
def test_login_success():
|
def test_login_success():
|
||||||
global vks
|
global vks
|
||||||
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"])
|
vks = VkScraper(
|
||||||
|
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_empty_urll():
|
def test_scrape_empty_urll():
|
||||||
@@ -99,7 +101,7 @@ def test_scrape_download_multiple_media():
|
|||||||
"wall-17315087_74182_2.jpg",
|
"wall-17315087_74182_2.jpg",
|
||||||
"wall-17315087_74182_3.jpg",
|
"wall-17315087_74182_3.jpg",
|
||||||
"wall-17315087_74182_4.jpg",
|
"wall-17315087_74182_4.jpg",
|
||||||
"wall-17315087_74182_0.mkv",
|
"wall-17315087_74182_0.mp4",
|
||||||
}
|
}
|
||||||
found_files = set(os.listdir(tempdir))
|
found_files = set(os.listdir(tempdir))
|
||||||
assert len(expect_files) == len(expect_files & found_files)
|
assert len(expect_files) == len(expect_files & found_files)
|
||||||
@@ -131,5 +133,8 @@ def test_scrape_video_only():
|
|||||||
|
|
||||||
|
|
||||||
def test_scrape_video_only2():
|
def test_scrape_video_only2():
|
||||||
res = vks.scrape("https://vk.com/video-1_456239018")
|
res = vks.scrape("https://vk.com/video-17546758_456239898")
|
||||||
print(res[0]["attachments"]["video"][0])
|
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||||
|
vks.download_media(res, tempdir)
|
||||||
|
found_files = set(os.listdir(tempdir))
|
||||||
|
assert "video-17546758_456239898_0.mp4" in found_files
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
from .scraper import VkScraper
|
from .scraper import VkScraper
|
||||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
from .utils import DateTimeEncoder, suppress_stdout
|
||||||
|
|||||||
@@ -29,6 +29,14 @@ def get_argument_parser():
|
|||||||
required=True,
|
required=True,
|
||||||
help="password for the valid vk.com account",
|
help="password for the valid vk.com account",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-t",
|
||||||
|
"--token",
|
||||||
|
action="store",
|
||||||
|
dest="token",
|
||||||
|
required=False,
|
||||||
|
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
"--download",
|
"--download",
|
||||||
@@ -50,7 +58,7 @@ def get_argument_parser():
|
|||||||
def main():
|
def main():
|
||||||
parser = get_argument_parser()
|
parser = get_argument_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
vks = VkScraper(args.username, args.password)
|
vks = VkScraper(args.username, args.password, args.token)
|
||||||
text = " ".join(args.urls)
|
text = " ".join(args.urls)
|
||||||
res = vks.scrape(text)
|
res = vks.scrape(text)
|
||||||
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List
|
from typing import List
|
||||||
@@ -9,7 +10,7 @@ import requests
|
|||||||
import vk_api # used to get api_token after authentication
|
import vk_api # used to get api_token after authentication
|
||||||
import yt_dlp # to download videos from url
|
import yt_dlp # to download videos from url
|
||||||
|
|
||||||
from .utils import mkdir_if_not_exists
|
from .utils import captcha_handler, suppress_stdout
|
||||||
|
|
||||||
|
|
||||||
class VkScraper:
|
class VkScraper:
|
||||||
@@ -38,10 +39,13 @@ class VkScraper:
|
|||||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
|
||||||
|
|
||||||
def __init__(self, username: str, password: str) -> None:
|
def __init__(
|
||||||
|
self, username: str, password: str, token: str = None, captcha_handler=captcha_handler
|
||||||
|
) -> None:
|
||||||
"""Initializes the scraper.
|
"""Initializes the scraper.
|
||||||
|
|
||||||
This function receives a username and password and performs authentication on vk.com to then call api endpoints
|
This function receives a username and password (or access token) and performs
|
||||||
|
authentication on vk.com to then call api endpoints. If token is passed, authentication will not be performed again.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -49,9 +53,14 @@ class VkScraper:
|
|||||||
Username on vk.com, can be a phone number or email
|
Username on vk.com, can be a phone number or email
|
||||||
password : str
|
password : str
|
||||||
Matching password on vk.com
|
Matching password on vk.com
|
||||||
|
token : str
|
||||||
|
Access token received after authenticating, can be found in the vk_config.v2.json file
|
||||||
"""
|
"""
|
||||||
self.session = vk_api.VkApi(username, password)
|
self.session = vk_api.VkApi(
|
||||||
self.session.auth(token_only=True)
|
username, password, token=token, captcha_handler=captcha_handler
|
||||||
|
)
|
||||||
|
if token is None or len(token) == 0:
|
||||||
|
self.session.auth(token_only=True)
|
||||||
|
|
||||||
def scrape(self, url: str) -> List:
|
def scrape(self, url: str) -> List:
|
||||||
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
|
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
|
||||||
@@ -298,7 +307,7 @@ class VkScraper:
|
|||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||||
}
|
}
|
||||||
mkdir_if_not_exists(destination)
|
os.makedirs(destination, exist_ok=True)
|
||||||
downloaded = []
|
downloaded = []
|
||||||
for r in results:
|
for r in results:
|
||||||
for k, attachments in r["attachments"].items():
|
for k, attachments in r["attachments"].items():
|
||||||
@@ -311,9 +320,28 @@ class VkScraper:
|
|||||||
f.write(d.content)
|
f.write(d.content)
|
||||||
downloaded.append(filename)
|
downloaded.append(filename)
|
||||||
elif k == "video":
|
elif k == "video":
|
||||||
for i, url in enumerate(attachments):
|
with suppress_stdout(): # ytdlp is not 100% quiet
|
||||||
filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
|
for i, url in enumerate(attachments):
|
||||||
ydl = yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True})
|
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||||
ydl.extract_info(url, download=True)
|
ydl = yt_dlp.YoutubeDL(
|
||||||
downloaded.append(filename)
|
{
|
||||||
|
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||||
|
"merge_output_format": "mp4",
|
||||||
|
"retries": 5,
|
||||||
|
"noplaylist": True,
|
||||||
|
"outtmpl": filename,
|
||||||
|
"quiet": True,
|
||||||
|
"restrictfilenames": True,
|
||||||
|
"forcefilename": True,
|
||||||
|
"simulate": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
info = ydl.extract_info(url, download=True)
|
||||||
|
filename = ydl.prepare_filename(info)
|
||||||
|
if "unknown_video" in filename:
|
||||||
|
filename = shutil.copy(
|
||||||
|
filename, filename.replace("unknown_video", "mkv")
|
||||||
|
)
|
||||||
|
os.remove(filename)
|
||||||
|
downloaded.append(filename)
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
from contextlib import contextmanager
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
@@ -11,6 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
|
|||||||
return json.JSONEncoder.default(self, o)
|
return json.JSONEncoder.default(self, o)
|
||||||
|
|
||||||
|
|
||||||
def mkdir_if_not_exists(folder):
|
def captcha_handler(captcha):
|
||||||
if not os.path.exists(folder):
|
key = input(
|
||||||
os.makedirs(folder)
|
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}:"
|
||||||
|
).strip()
|
||||||
|
return captcha.try_again(key)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def suppress_stdout():
|
||||||
|
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
|
||||||
|
# this is used to silence yt-dlp, which does not fully respect quiet=True and outputs filenames to the console
|
||||||
|
with open(os.devnull, "w") as devnull:
|
||||||
|
old_stdout = sys.stdout
|
||||||
|
sys.stdout = devnull
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
sys.stdout = old_stdout
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
_MAJOR = "0"
|
_MAJOR = "0"
|
||||||
_MINOR = "2"
|
_MINOR = "3"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "0"
|
_PATCH = "4"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user