diff --git a/docs/source/_static/favicon.ico b/docs/source/_static/favicon.ico
index 1aaa613..bf73c0f 100644
Binary files a/docs/source/_static/favicon.ico and b/docs/source/_static/favicon.ico differ
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index 5178f45..2f31b7b 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -2,17 +2,16 @@ import datetime
 import os
 import tempfile
 
+import pytest
+
 from vk_url_scraper import VkScraper
 
-# import pytest
-
-
 vks = None
 
 
-# def test_login_fail():
-#     with pytest.raises(Exception):
-#         VkScraper("invalid", "combination")
+def test_login_fail():
+    with pytest.raises(Exception):
+        VkScraper("invalid", "combination")
 
 
 def test_login_success():
@@ -102,7 +101,7 @@ def test_scrape_download_multiple_media():
             "wall-17315087_74182_2.jpg",
             "wall-17315087_74182_3.jpg",
             "wall-17315087_74182_4.jpg",
-            "wall-17315087_74182_0.mkv",
+            "wall-17315087_74182_0.mp4",
         }
         found_files = set(os.listdir(tempdir))
         assert len(expect_files) == len(expect_files & found_files)
@@ -138,8 +137,4 @@ def test_scrape_video_only2():
     with tempfile.TemporaryDirectory(dir="./") as tempdir:
         vks.download_media(res, tempdir)
         found_files = set(os.listdir(tempdir))
-        # different systems might attribute different extension
-        assert (
-            "video-17546758_456239898_0.webm" in found_files
-            or "video-17546758_456239898_0.mp4" in found_files
-        )
+        assert "video-17546758_456239898_0.mp4" in found_files
diff --git a/vk_url_scraper/__init__.py b/vk_url_scraper/__init__.py
index 9f50225..499c193 100644
--- a/vk_url_scraper/__init__.py
+++ b/vk_url_scraper/__init__.py
@@ -1,2 +1,2 @@
 from .scraper import VkScraper
-from .utils import DateTimeEncoder, mkdir_if_not_exists
+from .utils import DateTimeEncoder, suppress_stdout
diff --git a/vk_url_scraper/__main__.py b/vk_url_scraper/__main__.py
index 3ebf819..a2a7738 100644
--- a/vk_url_scraper/__main__.py
+++ b/vk_url_scraper/__main__.py
@@ -35,7 +35,7 @@ def get_argument_parser():
         action="store",
         dest="token",
         required=False,
-        help="optional token, when passed authentication will not be performed - good to avoid captcha issues",
+        help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
     )
     parser.add_argument(
         "-d",
diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py
index 2d73b92..54fd3f3 100644
--- a/vk_url_scraper/scraper.py
+++ b/vk_url_scraper/scraper.py
@@ -1,5 +1,6 @@
 import os
 import re
+import shutil
 from collections import defaultdict
 from datetime import datetime
 from typing import List
@@ -9,7 +10,7 @@ import requests
 import vk_api  # used to get api_token after authentication
 import yt_dlp  # to download videos from url
 
-from .utils import captcha_handler, mkdir_if_not_exists
+from .utils import captcha_handler, suppress_stdout
 
 
 class VkScraper:
@@ -306,7 +307,7 @@ class VkScraper:
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
         }
-        mkdir_if_not_exists(destination)
+        os.makedirs(destination, exist_ok=True)
         downloaded = []
         for r in results:
             for k, attachments in r["attachments"].items():
@@ -319,23 +320,30 @@ class VkScraper:
                             f.write(d.content)
                         downloaded.append(filename)
                 elif k == "video":
-                    for i, url in enumerate(attachments):
-                        filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
-                        ydl = yt_dlp.YoutubeDL(
-                            {
-                                "outtmpl": filename,
-                                "quiet": True,
-                                "restrictfilenames": True,
-                                "forcefilename": True,
-                            }
-                        )
-                        info = ydl.extract_info(url, download=True)
-                        filename = ydl.prepare_filename(info)
-                        if "unknown_video" in filename:
-                            new_filename = filename.replace("unknown_video", "mkv")
-                            with open(filename, "rb") as vin, open(new_filename, "wb") as vout:
-                                vout.write(vin.read())
-                            os.remove(filename)
-                            filename = new_filename
-                        downloaded.append(filename)
+                    with suppress_stdout():  # ytdlp is not 100% quiet
+                        for i, url in enumerate(attachments):
+                            filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
+                            ydl = yt_dlp.YoutubeDL(
+                                {
+                                    "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                                    "merge_output_format": "mp4",
+                                    "retries": 5,
+                                    "noplaylist": True,
+                                    "outtmpl": filename,
+                                    "quiet": True,
+                                    "restrictfilenames": True,
+                                    "forcefilename": True,
+                                    "simulate": False,
+                                }
+                            )
+                            info = ydl.extract_info(url, download=True)
+                            filename = ydl.prepare_filename(info)
+                            if "unknown_video" in filename:
+                                # yt-dlp could not determine an extension: rename to .mkv.
+                                # shutil.move returns the destination path, so `filename`
+                                # keeps pointing at the file that actually exists on disk.
+                                filename = shutil.move(
+                                    filename, filename.replace("unknown_video", "mkv")
+                                )
+                            downloaded.append(filename)
         return downloaded
diff --git a/vk_url_scraper/utils.py b/vk_url_scraper/utils.py
index 77aa5af..b51de5a 100644
--- a/vk_url_scraper/utils.py
+++ b/vk_url_scraper/utils.py
@@ -1,5 +1,7 @@
 import json
 import os
+import sys
+from contextlib import contextmanager
 from datetime import datetime
 
 
@@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, o)
 
 
-def mkdir_if_not_exists(folder):
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-
-
 def captcha_handler(captcha):
-    print(
-        f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}",
-        flush=True,
-    )
-    key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
+    key = input(
+        f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}:"
+    ).strip()
     return captcha.try_again(key)
+
+
+@contextmanager
+def suppress_stdout():
+    # https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
+    # this is used to silence ytdlp which does not fully respect quiet=True and outputs filenames to the console
+    with open(os.devnull, "w") as devnull:
+        old_stdout = sys.stdout
+        sys.stdout = devnull
+        try:
+            yield
+        finally:
+            sys.stdout = old_stdout