adds command line interface

2026-06-13 05:48:37 +03:00 · 2022-06-20 23:52:14 +02:00
parent 50b78d618a
commit c9a3ece9af
11 changed files with 354 additions and 12 deletions
--- a/vk_url_scraper/__init__.py
+++ b/vk_url_scraper/__init__.py
@@ -1 +1,2 @@
 from .scraper import VkScraper
+from .utils import DateTimeEncoder, mkdir_if_not_exists
--- a/vk_url_scraper/__main__.py
+++ b/vk_url_scraper/__main__.py
@@ -0,0 +1,63 @@
+import argparse
+import json
+
+from .scraper import VkScraper
+from .utils import DateTimeEncoder
+
+
+def get_argument_parser():
+    """
+    Builds the argparse parser for the command line interface (run the CLI with --help to see it rendered).
+    Returns an ArgumentParser exposing -u/--username, -p/--password, -d/--download and --urls.
+    """
+    cli = argparse.ArgumentParser(
+        description="Authenticate and scrape information from vk.com based on a URL or set of URLs."
+    )
+    # (flags, options) pairs; registered below in declaration order, which is also the --help order
+    option_specs = [
+        (
+            ("-u", "--username"),
+            dict(action="store", dest="username", required=True, help="username for a valid vk.com account"),
+        ),
+        (
+            ("-p", "--password"),
+            dict(action="store", dest="password", required=True, help="password for the valid vk.com account"),
+        ),
+        (
+            ("-d", "--download"),
+            dict(
+                action=argparse.BooleanOptionalAction,
+                dest="download",
+                help="if set then all photos and videos will be downloaded to folder output/",
+            ),
+        ),
+        (
+            ("--urls",),
+            dict(
+                action="store",
+                dest="urls",
+                nargs=argparse.REMAINDER,
+                required=True,
+                help="must be the last argument: any text with one or more urls to scrape",
+            ),
+        ),
+    ]
+    for flags, options in option_specs:
+        cli.add_argument(*flags, **options)
+    return cli
+
+
+def main():
+    """CLI entry point: scrape the given URLs, print the results as JSON, optionally download the media."""
+    cli_args = get_argument_parser().parse_args()
+    scraper = VkScraper(cli_args.username, cli_args.password)
+    # --urls captures every remaining token (argparse.REMAINDER), so join them back into a single text
+    results = scraper.scrape(" ".join(cli_args.urls))
+    print(json.dumps(results, ensure_ascii=False, indent=4, cls=DateTimeEncoder))
+    if not cli_args.download:
+        return
+    scraper.download_media(results)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, never on import
+    main()
--- a/vk_url_scraper/scraper.py
+++ b/vk_url_scraper/scraper.py
@@ -1,10 +1,15 @@
+import os
 import re
 from collections import defaultdict
 from datetime import datetime
 from typing import List
+from urllib.parse import urlparse

 import requests
 import vk_api  # used to get api_token after authentication
+import yt_dlp  # to download videos from url
+
+from .utils import mkdir_if_not_exists


 class VkScraper:
@@ -273,3 +278,42 @@ class VkScraper:
                }
            )
        return res
+
+    def download_media(self, results: List[dict], destination: str = "./output/") -> List[str]:
+        """
+        Downloads the photo and video URLs present in the given scrape* results into the destination folder.
+
+        Parameters
+        ----------
+        results : List[dict]
+            list with valid dictionary results (see class definition)
+        destination : str
+            the directory to save the downloaded files to. defaults to output/
+
+        Returns
+        -------
+        a list of filenames for the downloaded files (photos answered with an HTTP error are skipped)
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
+        }
+        mkdir_if_not_exists(destination)
+        downloaded = []
+        for r in results:
+            for k, attachments in r["attachments"].items():
+                if k == "photo":
+                    for i, url in enumerate(attachments):
+                        ext = os.path.splitext(urlparse(url).path)[1]
+                        filename = os.path.join(destination, f"{r['id']}_{i}{ext}")
+                        response = requests.get(url, headers=headers, timeout=30)  # never hang on a stalled server
+                        if response.ok:  # a 4xx/5xx body is an error page, not a photo: do not save or report it
+                            with open(filename, "wb") as f:
+                                f.write(response.content)
+                            downloaded.append(filename)
+                elif k == "video":
+                    for i, url in enumerate(attachments):
+                        filename = os.path.join(destination, f"{r['id']}_{i}.mkv")
+                        with yt_dlp.YoutubeDL({"outtmpl": filename, "quiet": True}) as ydl:  # closes the downloader
+                            ydl.extract_info(url, download=True)
+                        downloaded.append(filename)
+        return downloaded
--- a/vk_url_scraper/utils.py
+++ b/vk_url_scraper/utils.py
@@ -0,0 +1,16 @@
+import json
+import os
+from datetime import datetime
+
+
+class DateTimeEncoder(json.JSONEncoder):
+    """JSON encoder that renders datetime objects with str(); use json.dumps(obj, cls=DateTimeEncoder)."""
+    def default(self, o):
+        if not isinstance(o, datetime):
+            return super().default(o)  # not a datetime: defer to the base encoder
+        return str(o)  # with timezone
+
+
+def mkdir_if_not_exists(folder):
+    """Create ``folder`` and any missing parents; a directory that already exists is left untouched."""
+    os.makedirs(folder, exist_ok=True)  # exist_ok avoids the race between checking and creating