mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-10 12:28:39 +03:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a6d066a192 | ||
|
|
9078a17400 | ||
|
|
17b516bd7f | ||
|
|
8bd182b041 | ||
|
|
0b8abfb5cb | ||
|
|
cf5fb91c84 | ||
|
|
5c965102a4 | ||
|
|
df10e6f55f | ||
|
|
863dd44463 |
5
.github/actions/setup-venv/action.yml
vendored
5
.github/actions/setup-venv/action.yml
vendored
@@ -16,6 +16,11 @@ runs:
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# install ffmpeg
|
||||
sudo apt install ffmpeg
|
||||
|
||||
- shell: bash
|
||||
run: |
|
||||
# Install prerequisites.
|
||||
|
||||
3
.github/workflows/main.yml
vendored
3
.github/workflows/main.yml
vendored
@@ -30,8 +30,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# python: ['3.7', '3.10']
|
||||
python: ['3.10']
|
||||
python: ['3.7', '3.10']
|
||||
task: # --show-capture=no on purpose
|
||||
- name: Test
|
||||
run: |
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 7.6 KiB |
4
setup.py
4
setup.py
@@ -56,8 +56,8 @@ setup(
|
||||
extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
python_requires=">=3.7",
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'vk_url_scraper=vk_url_scraper.__main__:main',
|
||||
"console_scripts": [
|
||||
"vk_url_scraper=vk_url_scraper.__main__:main",
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
@@ -2,17 +2,16 @@ import datetime
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
# import pytest
|
||||
|
||||
|
||||
vks = None
|
||||
|
||||
|
||||
# def test_login_fail():
|
||||
# with pytest.raises(Exception):
|
||||
# VkScraper("invalid", "combination")
|
||||
def test_login_fail():
|
||||
with pytest.raises(Exception):
|
||||
VkScraper("invalid", "combination")
|
||||
|
||||
|
||||
def test_login_success():
|
||||
@@ -102,7 +101,7 @@ def test_scrape_download_multiple_media():
|
||||
"wall-17315087_74182_2.jpg",
|
||||
"wall-17315087_74182_3.jpg",
|
||||
"wall-17315087_74182_4.jpg",
|
||||
"wall-17315087_74182_0.mkv",
|
||||
"wall-17315087_74182_0.mp4",
|
||||
}
|
||||
found_files = set(os.listdir(tempdir))
|
||||
assert len(expect_files) == len(expect_files & found_files)
|
||||
@@ -138,8 +137,4 @@ def test_scrape_video_only2():
|
||||
with tempfile.TemporaryDirectory(dir="./") as tempdir:
|
||||
vks.download_media(res, tempdir)
|
||||
found_files = set(os.listdir(tempdir))
|
||||
# different systems might attribute different extension
|
||||
assert (
|
||||
"video-17546758_456239898_0.webm" in found_files
|
||||
or "video-17546758_456239898_0.mp4" in found_files
|
||||
)
|
||||
assert "video-17546758_456239898_0.mp4" in found_files
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
||||
from .utils import DateTimeEncoder, suppress_stdout
|
||||
|
||||
@@ -35,7 +35,7 @@ def get_argument_parser():
|
||||
action="store",
|
||||
dest="token",
|
||||
required=False,
|
||||
help="optional token, when passed authentication will not be performed - good to avoid captcha issues",
|
||||
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
@@ -9,7 +10,7 @@ import requests
|
||||
import vk_api # used to get api_token after authentication
|
||||
import yt_dlp # to download videos from url
|
||||
|
||||
from .utils import captcha_handler, mkdir_if_not_exists
|
||||
from .utils import captcha_handler, suppress_stdout
|
||||
|
||||
|
||||
class VkScraper:
|
||||
@@ -306,7 +307,7 @@ class VkScraper:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
mkdir_if_not_exists(destination)
|
||||
os.makedirs(destination, exist_ok=True)
|
||||
downloaded = []
|
||||
for r in results:
|
||||
for k, attachments in r["attachments"].items():
|
||||
@@ -319,23 +320,28 @@ class VkScraper:
|
||||
f.write(d.content)
|
||||
downloaded.append(filename)
|
||||
elif k == "video":
|
||||
for i, url in enumerate(attachments):
|
||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
{
|
||||
"outtmpl": filename,
|
||||
"quiet": True,
|
||||
"restrictfilenames": True,
|
||||
"forcefilename": True,
|
||||
}
|
||||
)
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl.prepare_filename(info)
|
||||
if "unknown_video" in filename:
|
||||
new_filename = filename.replace("unknown_video", "mkv")
|
||||
with open(filename, "rb") as vin, open(new_filename, "wb") as vout:
|
||||
vout.write(vin.read())
|
||||
os.remove(filename)
|
||||
filename = new_filename
|
||||
downloaded.append(filename)
|
||||
with suppress_stdout(): # ytdlp is not 100% quiet
|
||||
for i, url in enumerate(attachments):
|
||||
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
{
|
||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||
"merge_output_format": "mp4",
|
||||
"retries": 5,
|
||||
"noplaylist": True,
|
||||
"outtmpl": filename,
|
||||
"quiet": True,
|
||||
"restrictfilenames": True,
|
||||
"forcefilename": True,
|
||||
"simulate": False,
|
||||
}
|
||||
)
|
||||
info = ydl.extract_info(url, download=True)
|
||||
filename = ydl.prepare_filename(info)
|
||||
if "unknown_video" in filename:
|
||||
filename = shutil.copy(
|
||||
filename, filename.replace("unknown_video", "mkv")
|
||||
)
|
||||
os.remove(filename)
|
||||
downloaded.append(filename)
|
||||
return downloaded
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
|
||||
|
||||
def captcha_handler(captcha):
|
||||
print(
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}",
|
||||
flush=True,
|
||||
)
|
||||
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
|
||||
key = input(
|
||||
f"CAPTCHA DETECTED, please solve it and input the solution. url={captcha.get_url()}:"
|
||||
).strip()
|
||||
return captcha.try_again(key)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def suppress_stdout():
|
||||
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
|
||||
# this is used to silence yt-dlp, which does not fully respect quiet=True and still outputs filenames to the console
|
||||
with open(os.devnull, "w") as devnull:
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = devnull
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
|
||||
@@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "3"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "1"
|
||||
_PATCH = "5"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user