Compare commits

...

20 Commits

Author SHA1 Message Date
Miguel Ramalho
5c965102a4 Bump version to v0.3.3 for release 2022-06-21 18:24:24 +02:00
msramalho
df10e6f55f applying feedback 2022-06-21 18:24:04 +02:00
Miguel Ramalho
863dd44463 Bump version to v0.3.2 for release 2022-06-21 14:58:27 +02:00
Miguel Ramalho
578ec81443 Bump version to v0.3.1 for release 2022-06-21 14:46:56 +02:00
Miguel Ramalho
c32caec442 Bump version to v0.3.0 for release 2022-06-21 14:25:48 +02:00
msramalho
80b43f7c95 token functionality 2022-06-21 14:23:54 +02:00
msramalho
90b72b6d22 trying with token 2022-06-21 14:16:54 +02:00
msramalho
d96e0c0a3a captcha fix 2022-06-21 14:05:33 +02:00
msramalho
db03a4c0f6 captch regex fix 2022-06-21 13:57:13 +02:00
msramalho
cf100ee69e updated captcha logic 2022-06-21 12:59:45 +02:00
msramalho
a09cf32b3e captch fix 2 2022-06-21 12:38:55 +02:00
msramalho
e1eb3ed620 -s 2022-06-21 12:23:23 +02:00
msramalho
72bd951d9c show capture 2022-06-21 12:21:47 +02:00
msramalho
59d53be68b attempts at captch fix in workflow 2022-06-21 12:16:58 +02:00
Miguel Ramalho
24a1313a65 Bump version to v0.2.4 for release 2022-06-21 01:33:38 +02:00
msramalho
64df4eec28 3.10 only due to test issues 2022-06-21 01:33:16 +02:00
Miguel Ramalho
42bdc1441c Bump version to v0.2.3 for release 2022-06-21 01:23:29 +02:00
msramalho
c25880ee6d fix tests 2022-06-21 01:21:53 +02:00
msramalho
e1e3648852 remove print 2022-06-21 01:17:47 +02:00
msramalho
c74dc280d8 fix ytdlp naming 2022-06-21 01:17:26 +02:00
11 changed files with 96 additions and 34 deletions

3
.env.example Normal file
View File

@@ -0,0 +1,3 @@
VK_USERNAME="your username"
VK_PASSWORD="your password"
VK_TOKEN="optional token"

View File

@@ -20,6 +20,7 @@ env:
PYTHONPATH: ./ PYTHONPATH: ./
VK_USERNAME: ${{ secrets.VK_USERNAME }} VK_USERNAME: ${{ secrets.VK_USERNAME }}
VK_PASSWORD: ${{ secrets.VK_PASSWORD }} VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
VK_TOKEN: ${{ secrets.VK_TOKEN }}
jobs: jobs:
checks: checks:
@@ -29,7 +30,8 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
python: ['3.7', '3.10'] # python: ['3.7', '3.10']
python: ['3.10']
task: # --show-capture=no on purpose task: # --show-capture=no on purpose
- name: Test - name: Test
run: | run: |

View File

@@ -20,6 +20,9 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can also have multiple urls # you can also have multiple urls
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789 vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
# you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file # save the JSON output into a file
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -33,7 +33,7 @@ with open("vk_url_scraper/version.py", "r") as version_file:
setup( setup(
name="vk-url-scraper", name="vk-url-scraper",
version=VERSION["VERSION"], version=VERSION["VERSION"],
description="", description="Scrape VK URLs to fetch info and media - python API or command line tool.",
long_description=open("README.md").read(), long_description=open("README.md").read(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
classifiers=[ classifiers=[
@@ -43,7 +43,7 @@ setup(
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
], ],
keywords="", keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
url="https://github.com/bellingcat/vk-url-scraper", url="https://github.com/bellingcat/vk-url-scraper",
author="Bellingcat", author="Bellingcat",
author_email="tech@bellingcat.com", author_email="tech@bellingcat.com",
@@ -55,4 +55,9 @@ setup(
install_requires=read_requirements("requirements.txt"), install_requires=read_requirements("requirements.txt"),
extras_require={"dev": read_requirements("dev-requirements.txt")}, extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.7", python_requires=">=3.7",
entry_points={
"console_scripts": [
"vk_url_scraper=vk_url_scraper.__main__:main",
],
},
) )

View File

@@ -16,7 +16,9 @@ def test_login_fail():
def test_login_success(): def test_login_success():
global vks global vks
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"]) vks = VkScraper(
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
)
def test_scrape_empty_urll(): def test_scrape_empty_urll():
@@ -99,7 +101,7 @@ def test_scrape_download_multiple_media():
"wall-17315087_74182_2.jpg", "wall-17315087_74182_2.jpg",
"wall-17315087_74182_3.jpg", "wall-17315087_74182_3.jpg",
"wall-17315087_74182_4.jpg", "wall-17315087_74182_4.jpg",
"wall-17315087_74182_0.mkv", "wall-17315087_74182_0.mp4",
} }
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files) assert len(expect_files) == len(expect_files & found_files)
@@ -135,4 +137,4 @@ def test_scrape_video_only2():
with tempfile.TemporaryDirectory(dir="./") as tempdir: with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir) vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir)) found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mkv.webm" in found_files assert "video-17546758_456239898_0.mp4" in found_files

View File

@@ -1,2 +1,2 @@
from .scraper import VkScraper from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists from .utils import DateTimeEncoder, suppress_stdout

View File

@@ -29,6 +29,14 @@ def get_argument_parser():
required=True, required=True,
help="password for the valid vk.com account", help="password for the valid vk.com account",
) )
parser.add_argument(
"-t",
"--token",
action="store",
dest="token",
required=False,
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
)
parser.add_argument( parser.add_argument(
"-d", "-d",
"--download", "--download",
@@ -50,7 +58,7 @@ def get_argument_parser():
def main(): def main():
parser = get_argument_parser() parser = get_argument_parser()
args = parser.parse_args() args = parser.parse_args()
vks = VkScraper(args.username, args.password) vks = VkScraper(args.username, args.password, args.token)
text = " ".join(args.urls) text = " ".join(args.urls)
res = vks.scrape(text) res = vks.scrape(text)
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder) res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)

View File

@@ -1,5 +1,6 @@
import os import os
import re import re
import shutil
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from typing import List from typing import List
@@ -9,7 +10,7 @@ import requests
import vk_api # used to get api_token after authentication import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url import yt_dlp # to download videos from url
from .utils import mkdir_if_not_exists from .utils import captcha_handler, suppress_stdout
class VkScraper: class VkScraper:
@@ -38,10 +39,13 @@ class VkScraper:
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)") VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(self, username: str, password: str) -> None: def __init__(
self, username: str, password: str, token: str = None, captcha_handler=captcha_handler
) -> None:
"""Initializes the scraper. """Initializes the scraper.
This function receives a username and password and performs authentication on vk.com to then call api endpoints This function receives a username and password (or access token) and performs
authentication on vk.com to then call api endpoints. If token is passed, authentication will not be performed again.
Parameters Parameters
---------- ----------
@@ -49,8 +53,13 @@ class VkScraper:
Username on vk.com, can be a phone number or email Username on vk.com, can be a phone number or email
password : str password : str
Matching password on vk.com Matching password on vk.com
token : str
Access token received after authenticating, can be found in the vk_config.v2.json file
""" """
self.session = vk_api.VkApi(username, password) self.session = vk_api.VkApi(
username, password, token=token, captcha_handler=captcha_handler
)
if token is None or len(token) == 0:
self.session.auth(token_only=True) self.session.auth(token_only=True)
def scrape(self, url: str) -> List: def scrape(self, url: str) -> List:
@@ -298,7 +307,7 @@ class VkScraper:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
} }
mkdir_if_not_exists(destination) os.makedirs(destination, exist_ok=True)
downloaded = [] downloaded = []
for r in results: for r in results:
for k, attachments in r["attachments"].items(): for k, attachments in r["attachments"].items():
@@ -311,17 +320,30 @@ class VkScraper:
f.write(d.content) f.write(d.content)
downloaded.append(filename) downloaded.append(filename)
elif k == "video": elif k == "video":
with suppress_stdout(): # ytdlp is not 100% quiet
for i, url in enumerate(attachments): for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.mkv") filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL( ydl = yt_dlp.YoutubeDL(
{ {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"retries": 5,
"noplaylist": True,
"outtmpl": filename, "outtmpl": filename,
"quiet": True, "quiet": True,
"restrictfilenames": True, "restrictfilenames": True,
"forcefilename": True, "forcefilename": True,
"simulate": False,
} }
) )
info = ydl.extract_info(url, download=True) info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info) filename = ydl.prepare_filename(info)
if "unknown_video" in filename:
print(f"before {filename=}")
filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv")
)
print(f"after {filename=}")
os.remove(filename)
downloaded.append(filename) downloaded.append(filename)
return downloaded return downloaded

View File

@@ -1,5 +1,7 @@
import json import json
import os import os
import sys
from contextlib import contextmanager
from datetime import datetime from datetime import datetime
@@ -11,6 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, o) return json.JSONEncoder.default(self, o)
def mkdir_if_not_exists(folder): def captcha_handler(captcha):
if not os.path.exists(folder): key = input(
os.makedirs(folder) f"CAPTCHA DETECTED, please solve it and input the solution. {captcha.sid=} {captcha.get_url()=}:"
).strip()
return captcha.try_again(key)
@contextmanager
def suppress_stdout():
# https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
# this is used to silence yt-dlp, which does not fully respect quiet=True and outputs filenames to the console
with open(os.devnull, "w") as devnull:
old_stdout = sys.stdout
sys.stdout = devnull
try:
yield
finally:
sys.stdout = old_stdout

View File

@@ -1,8 +1,8 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "2" _MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "2" _PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""