From df10e6f55f7b7c1617927968b525755efcd15055 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 21 Jun 2022 18:24:04 +0200 Subject: [PATCH] applying feedback --- docs/source/_static/favicon.ico | Bin 15086 -> 7774 bytes tests/scraper_test.py | 19 +++++------- vk_url_scraper/__init__.py | 2 +- vk_url_scraper/__main__.py | 2 +- vk_url_scraper/scraper.py | 50 ++++++++++++++++++-------------- vk_url_scraper/utils.py | 28 +++++++++++------- 6 files changed, 56 insertions(+), 45 deletions(-) diff --git a/docs/source/_static/favicon.ico b/docs/source/_static/favicon.ico index 1aaa6139b623ca31250621b7c0f08b7d26795f8f..bf73c0f6ebb1cc233fb5c83010151fcb9db92c14 100644 GIT binary patch literal 7774 zcmeHLTTB#J82;u0yA&wbA{2{lky~4!$ek8zfm%zUE!ct-Xem&jEiIICQ!JNuEyzW7 z=U^ya;0cYAphg}bJP_0bH9ToT&=8`A2MIz#AVIm=Lx{%U9Kyi#ZD%vHv)_0AZ~pWB z=ggd0fTN$77|^2@`&FEhCFE4m|d&9@a2fn_(@bmM7zrQ~M0s;^i7>J;tAOr^oBP1jQp`oD& z3kySdcsL>=A`lrFiKwV3L`O%{{>C6SHWqPlafpwPM?yjZ5)%`Vl$3-R23sF>5gyQ02l$4a9 zw6qjuWo0NYFGod11u82mQB_q1olb}9>T1-~)S$Mu7WMV@kR%CBO-(p{{5YDMo6*wJ zf)giB;N;1ZICbh2PMu3W*@t5$q{_ z25#QGiNV1^+`4rOw{PFZ(9jTuhlg?J&K=yndlw@kBN!bW#l3s?aR2^&Jb3T`V`F1@ z`0ydd$Hy@-F@Z;q9$|8F5>r!Cn4X@-%*+gCXJ;`tH;2cMALGfBCzzj~$HKw_78e(> zw6uifgCvoaO_B^pzS+)iE<29jS)ivn^g$!79*9)8-)2){V))UObpHAA(_LP!Zr*#o z*U_K)LvL-OpTb8CgcuHgVD&uK^20x=TD8}6V529(dvy{ymSFK>d+yb5X{UADU&JVY!Cwspjx}+Iq`2IMaxt2s0-DKm8Zm_ z^706IN*>kzMM&YA8T70rePd8G@c9FD+`sWGdW+vX8zm`4(dF3}6f+D`{W~g&IaU*L z=ru)?g_#*2+(4w?tW*)>#4q$jpKHe`CNl?Ff;0V^&X!_Fbu!cJHxPY=yE+BX#x)Y@ zh_5cUhJObqXK^(#($TkS(=RkP~;)XHvG(Fh+aR zG`bqt#vm2K+}pI6!3u*|%{5hoFmAG}5GKy$6Oznil@WXfAwMV!VYcSJBP5f_DkM?~ z`AK02v$={|e0Wk)gevo!uDR%@%Is!#+(@t-;HdX1m65aEo1MMaS`{^?g9f@9D|=mq z`?Sfgt1pmwTOVTF4w}`?_5Q|%=>OfClGVgYO_hW( t+Ir|##B`o2)Ddt@DlI+Wtcn!vr_yCOqlZS==z!4yqXR|UUV literal 15086 zcmche30ze7wZ|`;?1Ba{BFcy$3`0`ezE8Tn_oNs#Br!3{z|*vbG_5ghp0P=q^)*hK z*EFUt(JXD6nAB{(r$$9ZKon&V7gU&GbBV@43UR3tKx9YG`<*-UpSj%59j+*y59jm0 
z+y9*J|J?uG&N)bukK`v!pDv*_Nb3_N={8A{3d8ZZ^GZ&r03sO{y|e-e@lhUe~&5G_`FBr_0;Is68)~+6zJm{M1n# zM4fwssLL8i-TOvSPuZxx?*0C=MuJ)BtTLOHfc+|jh7wz*>6Od-1C!czgizboFv2;Y z_WV$~vNMD_b_Y}E9T4%%;iU?;48RxqqN4J9!%0 zZ`%`e*u`E5_FZoFE@o%n?;pkN)LZUPeMkIv6G^@|lAU$g%%*2_vh%q)Anac*4tl7q zD2T2wJLXNNdftG&N3b9Er~adUbhXm=Inn<{wP32!%U_k1`ZHno;x;L~r7*a?wGeZj z=Z%>2%6W4rfcn7R2loC-@Y{X+`|LOFTBulXOR`>e9M^vGo}_TW&u#OLkoTMOLa2S0 zhdB>+=e(i*3b5OJnV+s!`JSZ;X@XMEaF=nPvFPT{GObbDY0eA&!Nsj1bTJp#FXoSj zIWM#KyV+@=+K0tXC#6BYQ0vyS4)&9J`75)H-&gCz_2_e9OX$f9n?mWr<}l*xR-W_D zdBbzwId9nS4)z-Gv()(%(`hMCtyAxMb+++4df9P*I-#Za!e^mRoZApa=hlbOh0nsM z8`tfiInTZa`yl>WAEG*m=$Q0Z_JNjeZ`va(OfSTVihNrygSMRfDrv6YTJ7Ml@QukM{gE`}GA4bsM)<)1*e~qAq z4oM^t3lS@aIgO>{OL$`kgmj%6+HbjhG7N z0cQTC_uz<@Juz0Poy`$MKfdeF^uSH9GkRDza0jDjA1bKK16RURsDr%vU z`Q-mZ(TP7oUX7yTE635XSH{WrSiTfL(S`50T4X`$$+ zw{3hqHW~r_M1{QneJz40zx9R40GSLGurS z^>QnZJGOG%TgTyNHFNDt(Ny!h@l>vDG z;bMlbt6p$3+n$?1mCr(+nLrg!$CR-VeisG*pCOv(;>#B%4f$=$?6__#h7IoQSp7ou zBQmqo*UW5lF@yW57&`hJ$dfTwk31eTN!Y_PVh+aex%yo2w?p+Jqb`1%IrR~tZ@6vE zGn2>Ip1-+M_OhTE1sJUP$pXJ(FbF_$bh(B7X;p}jwuLVFg*uTtBvXPTDl@qN`IV-c8_K@LOI z&yZql`<`02-t~PC8RiLQEBri&`F?OOil^fLh^O7z@l6(dn<({yy<88!J}_JMV+cOZY(OvuCC}tAm|5jw1RD)#BOcOJ}5rp^j7U-<=(2#QcD-;bZt(#Y~0w zCQ!jWiBvE*v85!<5XAk#Uag0}GCS3%wl@;%m>(s~?ew#fney*Wr2M<4(T+Q(QC>z; z<~77+<@djl;8p7^oR^rxelB$Qc@XoC3~=8ujkeEDqV2PiKE^xbi+X&IV{+7XMryri z_N}4tb6fshiL##sbKV>q#$|-(^3$m}fD! 
z%*@=f6wKE5o{VS1d!B5J=FWzwzG^#(}lR(-`*6P0{n6(ql7!dqi@v zTFmP0+tQ62{*m8E6$^CC9pu;5`u^GLx#`KX)%-i>=(}$5jWy?=&8xPz(Jc9azfn1c zvU_yR9o_M8wapv7j&ooU{M+#iz%y&exsRCUZ*MfmaP@6YH^zbg>UKN_4$T{y9na&o zBWBah!?iaO{R(bR`rzQbDUId#nVPkvVnIqJ-jP45T#$O}4P%HP>SkquM6KNV5jCK( zAS~X1Y^bY6TFZ)ctF(fwe%#J*wsB-Z=C?tLaJ8HXZiPaAJ8y+TN9!}dEo=>NQ~~Y3 zbK5T=9z8>D!#3gbX)3ACbZ^|3?%0)w|iupWTLd1`UY4BjqU0IuM5b<5j43VV|GiPj2FJL3@KHst@ zDDHAm@Br6#$@N8|#szgzTvvoV{{U(fh7}`0ocE{NwoO-U^Nn%L`%Pg`(dC`a+9s=_ zCW~vDxF*Qv+e(#Ef^uwnmhldkuVo+SWT(*4oFANZDKFSvU&8fZT%Upbx~R>N>q!nu zy7M(0BZloBE&Dk4W?}zyffV@nEuoh#=7vz$zCed$@Ivdr%-~qn?vO|nOuwItm}~T^X}aCYIiT~CD`7p zX+O*K4{@-jb)nOnK6TVLaE(bXk6lp@Bj=_?F7deZine7*;%5+SYsLg^*`{xK>W}(_ zqjGHm^5JLyHyr*)J&smhAMsC5I(QrA-D8?#H=7P?^{anx%+h+1pGR)K@x#&7XT$AG zD<`j!hp#fH9@4VU^Fq`9s>n#>);s0gx|~@gJTc2kHy$EYYpN1e;vv``w}+K$oWyu=eqKuj{G97 z&m6=Vt35HGKG9pv$me)0y3Ug~j@p;TKg&5}ksm~!&5_T1dIBANYT_z6=6OuAfT2st zaqjJCt@{ljmN6&?=gq#&5*35oZ=Wk4$+^fUV;niXUr%h}`WWsH+mj(%1;g>BiCXsi zI8Wfsrg|Upa(SHpl=FzlAh7n9ys>%hEwRqL9+xOQy*wbnv%yc>D2^9PVqa_458xugfK z!FIhT`*jZ{IqIU0NoR4M9@P8%V%^ANy(;H}5H}>{~tQ$2stW{pmb! 
zJ?&R++nqgiu^i7w?gTbD|KN%*@3<@BAzPO5b`9Ia?|({N!&%;jcUahLm-7d6IR@{D z4a+gsS;;$aPkmp*erv_t?}iM=rn*o5-3e>txW9<&a(oou#1O9O|HhMjZ*>lyC) z_6+0qT`^f_44LB!yYA4A=hzmc{>yOfdFlsyCqyhmwEH#UF}NOTJn6WeZSeKu!Y;Sl zAe9>N6kAs6vg@(ssqdEAQ=?(m71P|Eq3QdciiOjnJn6XJZCht1ZxTK@aIfcB7<|9} zdhKZS&rP3}!S?fK8F6iU{-V$Fz>#s>3%&gyY0%gW!EM$Vmf+9?hb4qG;9XXh2;zrBB*Xv_af-A-6`j~|gML@6 z0rj}=JKm$~q}5t#?dFsm#XYS>99O#ZW=ltzzxzGBPFgPZDO>a{5&bJOQ!HX%?i~ih z-o+E${h6gs{&zZYh-9BPJyP_QTdccwVj*|tl#JVhlzn-5 zbGji1I*2(u=}@xQWTxD!?$7J!U(Y&<%#+CF(>kf8`IAVCg1v>u;;H|ZnNvRG{tk&> zi}(4*Fou?Me{#P+S1w4=JU{G5SO4@W^=yM{!D<#8TZH|lKaTf!ZUhhdm$-DK8PZVi zpCD{FH_Uv7mT#Tb-+ugo;cw&nV?S~&!#>?$wkrWqo0)2Fn^{V@jE z7VMgSQ!5dEESxjd_1^EzZ|eTJw;R&fe(BsKSATnEa+>-$Z|ixp{Xa6LG2Z|H diff --git a/tests/scraper_test.py b/tests/scraper_test.py index 5178f45..2f31b7b 100644 --- a/tests/scraper_test.py +++ b/tests/scraper_test.py @@ -2,17 +2,16 @@ import datetime import os import tempfile +import pytest + from vk_url_scraper import VkScraper -# import pytest - - vks = None -# def test_login_fail(): -# with pytest.raises(Exception): -# VkScraper("invalid", "combination") +def test_login_fail(): + with pytest.raises(Exception): + VkScraper("invalid", "combination") def test_login_success(): @@ -102,7 +101,7 @@ def test_scrape_download_multiple_media(): "wall-17315087_74182_2.jpg", "wall-17315087_74182_3.jpg", "wall-17315087_74182_4.jpg", - "wall-17315087_74182_0.mkv", + "wall-17315087_74182_0.mp4", } found_files = set(os.listdir(tempdir)) assert len(expect_files) == len(expect_files & found_files) @@ -138,8 +137,4 @@ def test_scrape_video_only2(): with tempfile.TemporaryDirectory(dir="./") as tempdir: vks.download_media(res, tempdir) found_files = set(os.listdir(tempdir)) - # different systems might attribute different extension - assert ( - "video-17546758_456239898_0.webm" in found_files - or "video-17546758_456239898_0.mp4" in found_files - ) + assert "video-17546758_456239898_0.mp4" in found_files diff --git a/vk_url_scraper/__init__.py 
b/vk_url_scraper/__init__.py index 9f50225..499c193 100644 --- a/vk_url_scraper/__init__.py +++ b/vk_url_scraper/__init__.py @@ -1,2 +1,2 @@ from .scraper import VkScraper -from .utils import DateTimeEncoder, mkdir_if_not_exists +from .utils import DateTimeEncoder, suppress_stdout diff --git a/vk_url_scraper/__main__.py b/vk_url_scraper/__main__.py index 3ebf819..a2a7738 100644 --- a/vk_url_scraper/__main__.py +++ b/vk_url_scraper/__main__.py @@ -35,7 +35,7 @@ def get_argument_parser(): action="store", dest="token", required=False, - help="optional token, when passed authentication will not be performed - good to avoid captcha issues", + help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues", ) parser.add_argument( "-d", diff --git a/vk_url_scraper/scraper.py b/vk_url_scraper/scraper.py index 2d73b92..54fd3f3 100644 --- a/vk_url_scraper/scraper.py +++ b/vk_url_scraper/scraper.py @@ -1,5 +1,6 @@ import os import re +import shutil from collections import defaultdict from datetime import datetime from typing import List @@ -9,7 +10,7 @@ import requests import vk_api # used to get api_token after authentication import yt_dlp # to download videos from url -from .utils import captcha_handler, mkdir_if_not_exists +from .utils import captcha_handler, suppress_stdout class VkScraper: @@ -306,7 +307,7 @@ class VkScraper: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" } - mkdir_if_not_exists(destination) + os.makedirs(destination, exist_ok=True) downloaded = [] for r in results: for k, attachments in r["attachments"].items(): @@ -319,23 +320,30 @@ class VkScraper: f.write(d.content) downloaded.append(filename) elif k == "video": - for i, url in enumerate(attachments): - filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s") - ydl = yt_dlp.YoutubeDL( - { - "outtmpl": filename, - "quiet": True, - 
"restrictfilenames": True, - "forcefilename": True, - } - ) - info = ydl.extract_info(url, download=True) - filename = ydl.prepare_filename(info) - if "unknown_video" in filename: - new_filename = filename.replace("unknown_video", "mkv") - with open(filename, "rb") as vin, open(new_filename, "wb") as vout: - vout.write(vin.read()) - os.remove(filename) - filename = new_filename - downloaded.append(filename) + with suppress_stdout(): # ytdlp is not 100% quiet + for i, url in enumerate(attachments): + filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s") + ydl = yt_dlp.YoutubeDL( + { + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "merge_output_format": "mp4", + "retries": 5, + "noplaylist": True, + "outtmpl": filename, + "quiet": True, + "restrictfilenames": True, + "forcefilename": True, + "simulate": False, + } + ) + info = ydl.extract_info(url, download=True) + filename = ydl.prepare_filename(info) + if "unknown_video" in filename: + print(f"before {filename=}") + filename = shutil.copy( + filename, filename.replace("unknown_video", "mkv") + ) + print(f"after {filename=}") + os.remove(filename) + downloaded.append(filename) return downloaded diff --git a/vk_url_scraper/utils.py b/vk_url_scraper/utils.py index 77aa5af..b51de5a 100644 --- a/vk_url_scraper/utils.py +++ b/vk_url_scraper/utils.py @@ -1,5 +1,7 @@ import json import os +import sys +from contextlib import contextmanager from datetime import datetime @@ -11,15 +13,21 @@ class DateTimeEncoder(json.JSONEncoder): return json.JSONEncoder.default(self, o) -def mkdir_if_not_exists(folder): - if not os.path.exists(folder): - os.makedirs(folder) - - def captcha_handler(captcha): - print( - f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}", - flush=True, - ) - key = input(f"Enter captcha code for {captcha.get_url()}:").strip() + key = input( + f"CAPTCHA DETECTED, please solve it and input the solution. 
{captcha.sid=} {captcha.get_url()=}:" + ).strip() return captcha.try_again(key) + + +@contextmanager +def suppress_stdout(): + # https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/ + # this is used to silence ytdlp which does not fully respect quiet=True and outputs filenames to the console + with open(os.devnull, "w") as devnull: + old_stdout = sys.stdout + sys.stdout = devnull + try: + yield + finally: + sys.stdout = old_stdout