Compare commits

..

3 Commits

Author SHA1 Message Date
Miguel Ramalho
5c965102a4 Bump version to v0.3.3 for release 2022-06-21 18:24:24 +02:00
msramalho
df10e6f55f applying feedback 2022-06-21 18:24:04 +02:00
Miguel Ramalho
863dd44463 Bump version to v0.3.2 for release 2022-06-21 14:58:27 +02:00
8 changed files with 59 additions and 48 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -56,8 +56,8 @@ setup(
extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.7",
entry_points={
'console_scripts': [
'vk_url_scraper=vk_url_scraper.__main__:main',
"console_scripts": [
"vk_url_scraper=vk_url_scraper.__main__:main",
],
},
)

View File

@@ -2,17 +2,16 @@ import datetime
import os
import tempfile
import pytest
from vk_url_scraper import VkScraper
# import pytest
vks = None
# def test_login_fail():
# with pytest.raises(Exception):
# VkScraper("invalid", "combination")
def test_login_fail():
with pytest.raises(Exception):
VkScraper("invalid", "combination")
def test_login_success():
@@ -102,7 +101,7 @@ def test_scrape_download_multiple_media():
"wall-17315087_74182_2.jpg",
"wall-17315087_74182_3.jpg",
"wall-17315087_74182_4.jpg",
"wall-17315087_74182_0.mkv",
"wall-17315087_74182_0.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)
@@ -138,8 +137,4 @@ def test_scrape_video_only2():
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir))
# different systems might attribute different extension
assert (
"video-17546758_456239898_0.webm" in found_files
or "video-17546758_456239898_0.mp4" in found_files
)
assert "video-17546758_456239898_0.mp4" in found_files

View File

@@ -1,2 +1,2 @@
from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists
from .utils import DateTimeEncoder, suppress_stdout

View File

@@ -35,7 +35,7 @@ def get_argument_parser():
action="store",
dest="token",
required=False,
help="optional token, when passed authentication will not be performed - good to avoid captcha issues",
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
)
parser.add_argument(
"-d",

View File

@@ -1,5 +1,6 @@
import os
import re
import shutil
from collections import defaultdict
from datetime import datetime
from typing import List
@@ -9,7 +10,7 @@ import requests
import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url
from .utils import captcha_handler, mkdir_if_not_exists
from .utils import captcha_handler, suppress_stdout
class VkScraper:
@@ -306,7 +307,7 @@ class VkScraper:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
mkdir_if_not_exists(destination)
os.makedirs(destination, exist_ok=True)
downloaded = []
for r in results:
for k, attachments in r["attachments"].items():
@@ -319,23 +320,30 @@ class VkScraper:
f.write(d.content)
downloaded.append(filename)
elif k == "video":
for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL(
{
"outtmpl": filename,
"quiet": True,
"restrictfilenames": True,
"forcefilename": True,
}
)
info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info)
if "unknown_video" in filename:
new_filename = filename.replace("unknown_video", "mkv")
with open(filename, "rb") as vin, open(new_filename, "wb") as vout:
vout.write(vin.read())
os.remove(filename)
filename = new_filename
downloaded.append(filename)
with suppress_stdout(): # ytdlp is not 100% quiet
for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL(
{
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"retries": 5,
"noplaylist": True,
"outtmpl": filename,
"quiet": True,
"restrictfilenames": True,
"forcefilename": True,
"simulate": False,
}
)
info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info)
if "unknown_video" in filename:
print(f"before {filename=}")
filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv")
)
print(f"after {filename=}")
os.remove(filename)
downloaded.append(filename)
return downloaded

View File

@@ -1,5 +1,7 @@
import json
import os
import sys
from contextlib import contextmanager
from datetime import datetime
@@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, o)
def mkdir_if_not_exists(folder):
    """Create ``folder`` (and any missing parent directories) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        folder: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the check and the makedirs() call.
    os.makedirs(folder, exist_ok=True)
def captcha_handler(captcha):
print(
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}",
flush=True,
)
key = input(f"Enter captcha code for {captcha.get_url()}:").strip()
key = input(
f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}:"
).strip()
return captcha.try_again(key)
@contextmanager
def suppress_stdout():
    """Temporarily discard everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at ``os.devnull``.
    The previous ``sys.stdout`` is restored on exit, including when the body
    raises.

    Yields:
        None.
    """
    # https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
    # Used to silence yt-dlp, which does not fully respect quiet=True and
    # still outputs filenames to the console.
    from contextlib import redirect_stdout

    with open(os.devnull, "w") as devnull, redirect_stdout(devnull):
        yield

View File

@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
_PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""