Compare commits

...

20 Commits

Author SHA1 Message Date
Miguel Ramalho
5c965102a4 Bump version to v0.3.3 for release 2022-06-21 18:24:24 +02:00
msramalho
df10e6f55f applying feedback 2022-06-21 18:24:04 +02:00
Miguel Ramalho
863dd44463 Bump version to v0.3.2 for release 2022-06-21 14:58:27 +02:00
Miguel Ramalho
578ec81443 Bump version to v0.3.1 for release 2022-06-21 14:46:56 +02:00
Miguel Ramalho
c32caec442 Bump version to v0.3.0 for release 2022-06-21 14:25:48 +02:00
msramalho
80b43f7c95 token functionality 2022-06-21 14:23:54 +02:00
msramalho
90b72b6d22 trying with token 2022-06-21 14:16:54 +02:00
msramalho
d96e0c0a3a captcha fix 2022-06-21 14:05:33 +02:00
msramalho
db03a4c0f6 captch regex fix 2022-06-21 13:57:13 +02:00
msramalho
cf100ee69e updated captcha logic 2022-06-21 12:59:45 +02:00
msramalho
a09cf32b3e captch fix 2 2022-06-21 12:38:55 +02:00
msramalho
e1eb3ed620 -s 2022-06-21 12:23:23 +02:00
msramalho
72bd951d9c show capture 2022-06-21 12:21:47 +02:00
msramalho
59d53be68b attempts at captch fix in workflow 2022-06-21 12:16:58 +02:00
Miguel Ramalho
24a1313a65 Bump version to v0.2.4 for release 2022-06-21 01:33:38 +02:00
msramalho
64df4eec28 3.10 only due to test issues 2022-06-21 01:33:16 +02:00
Miguel Ramalho
42bdc1441c Bump version to v0.2.3 for release 2022-06-21 01:23:29 +02:00
msramalho
c25880ee6d fix tests 2022-06-21 01:21:53 +02:00
msramalho
e1e3648852 remove print 2022-06-21 01:17:47 +02:00
msramalho
c74dc280d8 fix ytdlp naming 2022-06-21 01:17:26 +02:00
11 changed files with 96 additions and 34 deletions

3
.env.example Normal file
View File

@@ -0,0 +1,3 @@
VK_USERNAME="your username"
VK_PASSWORD="your password"
VK_TOKEN="optional token"

View File

@@ -20,6 +20,7 @@ env:
PYTHONPATH: ./
VK_USERNAME: ${{ secrets.VK_USERNAME }}
VK_PASSWORD: ${{ secrets.VK_PASSWORD }}
VK_TOKEN: ${{ secrets.VK_TOKEN }}
jobs:
checks:
@@ -29,11 +30,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python: ['3.7', '3.10']
task: # --show-capture=no on purpose
# python: ['3.7', '3.10']
python: ['3.10']
task: # --show-capture=no on purpose
- name: Test
run: |
pytest --show-capture=no --color=yes tests/
pytest --show-capture=no --color=yes tests/
include:
- python: '3.10'

View File

@@ -20,6 +20,9 @@ vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall1
# you can also have multiple urls
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 https://vk.com/photo-12345_6789 https://vk.com/video12345_6789
# you can pass a token as well to avoid always authenticating
# and possibly getting captcha prompts
vk_url_scraper -u "username" -p "password" -t "vktoken goes here" --urls https://vk.com/wall12345_6789
# save the JSON output into a file
vk_url_scraper -u "username here" -p "password here" --urls https://vk.com/wall12345_6789 > output.json

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -33,7 +33,7 @@ with open("vk_url_scraper/version.py", "r") as version_file:
setup(
name="vk-url-scraper",
version=VERSION["VERSION"],
description="",
description="Scrape VK URLs to fetch info and media - python API or command line tool.",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
classifiers=[
@@ -43,7 +43,7 @@ setup(
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
],
keywords="",
keywords=["scraper", "vk", "vkontakte", "vk-api", "media-downloader"],
url="https://github.com/bellingcat/vk-url-scraper",
author="Bellingcat",
author_email="tech@bellingcat.com",
@@ -55,4 +55,9 @@ setup(
install_requires=read_requirements("requirements.txt"),
extras_require={"dev": read_requirements("dev-requirements.txt")},
python_requires=">=3.7",
entry_points={
"console_scripts": [
"vk_url_scraper=vk_url_scraper.__main__:main",
],
},
)

View File

@@ -16,7 +16,9 @@ def test_login_fail():
def test_login_success():
global vks
vks = VkScraper(os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"])
vks = VkScraper(
os.environ["VK_USERNAME"], os.environ["VK_PASSWORD"], os.environ.get("VK_TOKEN")
)
def test_scrape_empty_urll():
@@ -99,7 +101,7 @@ def test_scrape_download_multiple_media():
"wall-17315087_74182_2.jpg",
"wall-17315087_74182_3.jpg",
"wall-17315087_74182_4.jpg",
"wall-17315087_74182_0.mkv",
"wall-17315087_74182_0.mp4",
}
found_files = set(os.listdir(tempdir))
assert len(expect_files) == len(expect_files & found_files)
@@ -135,4 +137,4 @@ def test_scrape_video_only2():
with tempfile.TemporaryDirectory(dir="./") as tempdir:
vks.download_media(res, tempdir)
found_files = set(os.listdir(tempdir))
assert "video-17546758_456239898_0.mkv.webm" in found_files
assert "video-17546758_456239898_0.mp4" in found_files

View File

@@ -1,2 +1,2 @@
from .scraper import VkScraper
from .utils import DateTimeEncoder, mkdir_if_not_exists
from .utils import DateTimeEncoder, suppress_stdout

View File

@@ -29,6 +29,14 @@ def get_argument_parser():
required=True,
help="password for the valid vk.com account",
)
parser.add_argument(
"-t",
"--token",
action="store",
dest="token",
required=False,
help="optional token, when passed username/password authentication will not be done - good to avoid captcha issues",
)
parser.add_argument(
"-d",
"--download",
@@ -50,7 +58,7 @@ def get_argument_parser():
def main():
parser = get_argument_parser()
args = parser.parse_args()
vks = VkScraper(args.username, args.password)
vks = VkScraper(args.username, args.password, args.token)
text = " ".join(args.urls)
res = vks.scrape(text)
res_json = json.dumps(res, ensure_ascii=False, indent=4, cls=DateTimeEncoder)

View File

@@ -1,5 +1,6 @@
import os
import re
import shutil
from collections import defaultdict
from datetime import datetime
from typing import List
@@ -9,7 +10,7 @@ import requests
import vk_api # used to get api_token after authentication
import yt_dlp # to download videos from url
from .utils import mkdir_if_not_exists
from .utils import captcha_handler, suppress_stdout
class VkScraper:
@@ -38,10 +39,13 @@ class VkScraper:
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+)")
def __init__(self, username: str, password: str) -> None:
def __init__(
self, username: str, password: str, token: str = None, captcha_handler=captcha_handler
) -> None:
"""Initializes the scraper.
This function receives a username and password and performs authentication on vk.com to then call api endpoints
This function receives a username and password (or access token) and performs
authentication on vk.com to then call api endpoints. If token is passed, authentication will not be performed again.
Parameters
----------
@@ -49,9 +53,14 @@ class VkScraper:
Username on vk.com, can be a phone number or email
password : str
Matching password on vk.com
token : str
Access token received after authenticating, can be found in the vk_config.v2.json file
"""
self.session = vk_api.VkApi(username, password)
self.session.auth(token_only=True)
self.session = vk_api.VkApi(
username, password, token=token, captcha_handler=captcha_handler
)
if token is None or len(token) == 0:
self.session.auth(token_only=True)
def scrape(self, url: str) -> List:
"""Scrapes a URL for multiple possibilities of inner links such as wall, video, photo, ...
@@ -298,7 +307,7 @@ class VkScraper:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
mkdir_if_not_exists(destination)
os.makedirs(destination, exist_ok=True)
downloaded = []
for r in results:
for k, attachments in r["attachments"].items():
@@ -311,17 +320,30 @@ class VkScraper:
f.write(d.content)
downloaded.append(filename)
elif k == "video":
for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
ydl = yt_dlp.YoutubeDL(
{
"outtmpl": filename,
"quiet": True,
"restrictfilenames": True,
"forcefilename": True,
}
)
info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info)
downloaded.append(filename)
with suppress_stdout(): # ytdlp is not 100% quiet
for i, url in enumerate(attachments):
filename = os.path.join(destination, f"{r['id']}_{i}.%(ext)s")
ydl = yt_dlp.YoutubeDL(
{
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"retries": 5,
"noplaylist": True,
"outtmpl": filename,
"quiet": True,
"restrictfilenames": True,
"forcefilename": True,
"simulate": False,
}
)
info = ydl.extract_info(url, download=True)
filename = ydl.prepare_filename(info)
if "unknown_video" in filename:
print(f"before {filename=}")
filename = shutil.copy(
filename, filename.replace("unknown_video", "mkv")
)
print(f"after {filename=}")
os.remove(filename)
downloaded.append(filename)
return downloaded

View File

@@ -1,5 +1,7 @@
import json
import os
import sys
from contextlib import contextmanager
from datetime import datetime
@@ -11,6 +13,21 @@ class DateTimeEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, o)
def mkdir_if_not_exists(folder):
if not os.path.exists(folder):
os.makedirs(folder)
def captcha_handler(captcha):
    """Ask the user to solve a captcha on the console and retry with the answer.

    Parameters
    ----------
    captcha :
        Object exposing ``sid``, ``get_url()`` and ``try_again(key)``; it is
        passed in by whoever invokes this handler.

    Returns
    -------
    Whatever ``captcha.try_again`` returns for the typed solution.
    """
    # The prompt is spelled out with ``!r`` rather than the ``{expr=}``
    # shorthand: the shorthand is Python 3.8+ syntax, while the package
    # declares support for 3.7. The rendered text is identical.
    prompt = (
        "CAPTCHA DETECTED, please solve it and input the solution. "
        f"captcha.sid={captcha.sid!r} captcha.get_url()={captcha.get_url()!r}:"
    )
    # Strip surrounding whitespace so a stray space/newline does not spoil the answer.
    key = input(prompt).strip()
    return captcha.try_again(key)
@contextmanager
def suppress_stdout():
    """Temporarily point ``sys.stdout`` at the null device.

    Used to silence yt-dlp, which does not fully respect ``quiet=True`` and
    still prints filenames to the console. The previous ``sys.stdout`` is
    restored on exit, including when the wrapped code raises.

    Technique from:
    https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
    """
    sink = open(os.devnull, "w")
    saved_stream = sys.stdout
    sys.stdout = sink
    try:
        yield
    finally:
        # Restore first, then release the null-device handle.
        sys.stdout = saved_stream
        sink.close()

View File

@@ -1,8 +1,8 @@
_MAJOR = "0"
_MINOR = "2"
_MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "2"
_PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""