mirror of
https://github.com/bellingcat/vk-url-scraper.git
synced 2026-06-13 05:48:37 +03:00
adds command line interface
This commit is contained in:
@@ -1 +1,2 @@
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder, mkdir_if_not_exists
|
||||
|
||||
63
vk_url_scraper/__main__.py
Normal file
63
vk_url_scraper/__main__.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from .scraper import VkScraper
|
||||
from .utils import DateTimeEncoder
|
||||
|
||||
|
||||
def get_argument_parser():
    """
    Build the argparse parser used by the command line interface.

    Options, in registration order: -u/--username and -p/--password (both
    required strings), -d/--download (optional boolean switch) and --urls
    (required; it captures every remaining command line token, so it has to
    be given last).
    """
    cli = argparse.ArgumentParser(
        description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
    )

    # The two credential options have the same shape, so register them from a table.
    credential_options = (
        ("-u", "--username", "username", "username for a valid vk.com account"),
        ("-p", "--password", "password", "password for the valid vk.com account"),
    )
    for short_flag, long_flag, target, description in credential_options:
        cli.add_argument(
            short_flag,
            long_flag,
            action="store",
            dest=target,
            required=True,
            help=description,
        )

    # Boolean switch: left unset it stays None, which callers treat as "do not download".
    cli.add_argument(
        "-d",
        "--download",
        dest="download",
        action=argparse.BooleanOptionalAction,
        help="if set then all photos and videos will be downloaded to folder output/",
    )

    # REMAINDER collects everything after --urls into a list of tokens.
    cli.add_argument(
        "--urls",
        dest="urls",
        action="store",
        nargs=argparse.REMAINDER,
        required=True,
        help="must be the last argument: any text with one or more urls to scrape",
    )
    return cli
|
||||
|
||||
|
||||
def main():
    """
    Command line entry point: authenticate, scrape, print and optionally download.

    Joins every token given to --urls into one text, scrapes it with a
    VkScraper built from the supplied credentials, prints the results as
    indented JSON and, when the download switch is set, downloads the media.
    """
    options = get_argument_parser().parse_args()
    scraper = VkScraper(options.username, options.password)

    # --urls yields a list of tokens; scrape() is handed one space separated string.
    results = scraper.scrape(" ".join(options.urls))

    # DateTimeEncoder serialises datetime values; ensure_ascii=False keeps non-ASCII text as is.
    print(json.dumps(results, ensure_ascii=False, indent=4, cls=DateTimeEncoder))

    if options.download:
        scraper.download_media(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,10 +1,15 @@
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
import vk_api # used to get api_token after authentication
|
||||
import yt_dlp # to download videos from url
|
||||
|
||||
from .utils import mkdir_if_not_exists
|
||||
|
||||
|
||||
class VkScraper:
|
||||
@@ -273,3 +278,42 @@ class VkScraper:
|
||||
}
|
||||
)
|
||||
return res
|
||||
|
||||
def download_media(
    self, results: List[dict], destination: str = "./output/", timeout: float = 30.0
) -> List[str]:
    """
    Receives a list of dicts as returned by any of the scrape* methods and downloads the URLS present
    if they are of type photo or video into the destination folder

    Parameters
    ----------
    results : List[dict]
        list with valid dictionary results (see class definition); each entry must provide
        the keys "id" and "attachments" (a dict mapping an attachment type to a list of urls)
    destination : str
        the directory to save the downloaded files to. defaults to output/
    timeout : float
        seconds to wait for the server when fetching a photo before giving up. defaults to 30

    Returns
    -------
    a list of filenames for the downloaded files

    Raises
    ------
    requests.exceptions.HTTPError
        if a photo url answers with an error status (4xx/5xx)
    requests.exceptions.Timeout
        if a photo url does not answer within `timeout` seconds
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    mkdir_if_not_exists(destination)
    downloaded = []
    for r in results:
        for k, attachments in r["attachments"].items():
            if k == "photo":
                for i, url in enumerate(attachments):
                    # keep the extension of the remote file, ignoring any query string
                    ext = os.path.splitext(urlparse(url).path)[1]
                    filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
                    # without a timeout a stalled server would block the whole run forever
                    d = requests.get(url, headers=headers, timeout=timeout)
                    # never store an HTTP error page as if it were the photo
                    d.raise_for_status()
                    with open(filename, "wb") as f:
                        f.write(d.content)
                    downloaded.append(filename)
            elif k == "video":
                for i, url in enumerate(attachments):
                    filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
                    # the context manager closes the downloader once the video is fetched
                    with yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True}) as ydl:
                        ydl.extract_info(url, download=True)
                    downloaded.append(filename)
    return downloaded
|
||||
|
||||
16
vk_url_scraper/utils.py
Normal file
16
vk_url_scraper/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class DateTimeEncoder(json.JSONEncoder):
    """
    JSON encoder that also accepts datetime objects.

    Usage: json.dumps(obj, cls=DateTimeEncoder)
    """

    def default(self, o):
        """Return str(o) for datetime values; defer everything else to JSONEncoder."""
        if not isinstance(o, datetime):
            # Unsupported type: the base implementation raises TypeError.
            return super().default(o)
        # str() keeps the UTC offset when the datetime is timezone aware.
        return str(o)
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
    """
    Create the directory `folder`, including missing parents, unless it already exists.

    Calling it again for an existing directory is a no-op.

    Raises
    ------
    FileExistsError
        if `folder` exists but is not a directory
    """
    # exist_ok=True avoids the check-then-create race: a folder created by
    # someone else between the two steps no longer makes this call fail.
    os.makedirs(folder, exist_ok=True)
|
||||
Reference in New Issue
Block a user