Compare commits

..

3 Commits

Author SHA1 Message Date
Miguel Ramalho
5c965102a4 Bump version to v0.3.3 for release 2022-06-21 18:24:24 +02:00
msramalho
df10e6f55f applying feedback 2022-06-21 18:24:04 +02:00
Miguel Ramalho
863dd44463 Bump version to v0.3.2 for release 2022-06-21 14:58:27 +02:00
8 changed files with 59 additions and 48 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -56,8 +56,8 @@ setup(
extras_require={"dev": read_requirements("dev-requirements.txt")}, extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.7", python_requires=">=3.7",
entry_points={ entry_points={
'console_scripts': [ "console_scripts": [
'vk_url_scraper=vk_url_scraper.__main__:main', "vk_url_scraper=vk_url_scraper.__main__:main",
], ],
}, },
) )

View File

@@ -2,17 +2,16 @@ import datetime
import os import os
import tempfile import tempfile
import pytest
from vk_url_scraper import VkScraper from vk_url_scraper import VkScraper
# import pytest
vks = None vks = None
# def test_login_fail(): def test_login_fail():
# with pytest.raises(Exception): with pytest.raises(Exception):
# VkScraper("invalid", "combination") VkScraper("invalid", "combination")
def test_login_success(): def test_login_success():
@@ -102,7 +101,7 @@ def test_scrape_download_multiple_media():
"wall-17315087_74182_2.jpg", "wall-17315087_74182_2.jpg",
"wall-17315087_74182_3.jpg", "wall-17315087_74182_3.jpg",
"wall-17315087_74182_4.jpg", "wall-17315087_74182_4.jpg",
"wall-17315087_74182_0.mkv", "wall-17315087_74182_0.mp4",
} }
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files) assert len(expect_files) == len(expect_files & found_files)
@@ -138,8 +137,4 @@ def test_scrape_video_only2():
with tempfile.TemporaryDirectory(dir="./") as tempdir: with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir) vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
# different systems might attribute different extension assert "video-17546758_456239898_0.mp4" in found_files
assert (
"video-17546758_456239898_0.webm" in found_files
or "video-17546758_456239898_0.mp4" in found_files
)

View File

@@ -1,2 +1,2 @@
from .scraper import VkScraper from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists from .utils import DateTimeEncoder, suppress_stdout

View File

@@ -35,7 +35,7 @@ def get_argument_parser():
action="store", action="store",
dest="token", dest="token",
required=False, required=False,
help="optional token, when passed authentication will not be performed - good to avoid captcha issues", help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
) )
parser.add_argument( parser.add_argument(
"-d", "-d",

View File

@@ -1,5 +1,6 @@
import os import os
import re import re
import shutil
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from typing import List from typing import List
@@ -9,7 +10,7 @@ import requests
import vk_api # used to get api_token after authentication import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url import yt_dlp # to download videos from url
from .utils import captcha_handler, mkdir_if_not_exists from .utils import captcha_handler, suppress_stdout
class VkScraper: class VkScraper:
@@ -306,7 +307,7 @@ class VkScraper:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
} }
mkdir_if_not_exists(destination) os.makedirs(destination, exist_ok=True)
downloaded = [] downloaded = []
for r in results: for r in results:
for k, attachments in r["attachments"].items(): for k, attachments in r["attachments"].items():
@@ -319,23 +320,30 @@ class VkScraper:
f.write(d.content) f.write(d.content)
downloaded.append(filename) downloaded.append(filename)
elif k == "video": elif k == "video":
for i, url in enumerate(attachments): with suppress_stdout(): # ytdlp is not 100% quiet
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s") for i, url in enumerate(attachments):
ydl = yt_dlp.YoutubeDL( filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
{ ydl = yt_dlp.YoutubeDL(
"outtmpl": filename, {
"quiet": True, "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"restrictfilenames": True, "merge_output_format": "mp4",
"forcefilename": True, "retries": 5,
} "noplaylist": True,
) "outtmpl": filename,
info = ydl.extract_info(url, download=True) "quiet": True,
filename = ydl.prepare_filename(info) "restrictfilenames": True,
if "unknown_video" in filename: "forcefilename": True,
new_filename = filename.replace("unknown_video", "mkv") "simulate": False,
with open(filename, "rb") as vin, open(new_filename, "wb") as vout: }
vout.write(vin.read()) )
os.remove(filename) info = ydl.extract_info(url, download=True)
filename = new_filename filename = ydl.prepare_filename(info)
downloaded.append(filename) if "unknown_video" in filename:
print(f"before {filename=}")
filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv")
)
print(f"after {filename=}")
os.remove(filename)
downloaded.append(filename)
return downloaded return downloaded

View File

@@ -1,5 +1,7 @@
import json import json
import os import os
import sys
from contextlib import contextmanager
from datetime import datetime from datetime import datetime
@@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, o) return json.JSONEncoder.default(self, o)
def mkdir_if_not_exists(folder):
if not os.path.exists(folder):
os.makedirs(folder)
def captcha_handler(captcha): def captcha_handler(captcha):
print( key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}", f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}:"
flush=True, ).strip()
)
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
return captcha.try_again(key) return captcha.try_again(key)
@contextmanager
def suppress_stdout():
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
# this is used to silence ytdlp which does not fully respect quiet=True and outputs filenames to the console
with open(os.devnull, "w") as devnull:
old_stdout = sys.stdout
sys.stdout = devnull
try:
yield
finally:
sys.stdout = old_stdout

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3" _MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "1" _PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""