Merge remote-tracking branch 'origin/main' into feat/seleniumbase

2026-06-11 12:48:28 +03:00 · 2025-06-03 11:05:16 +01:00
parent 5cf640af8a 6735fa890b
commit ee68f3efee
15 changed files with 1188 additions and 1333 deletions
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,3 +1,4 @@
+import mimetypes
 import shutil
 import sys
 import datetime
@@ -11,6 +12,7 @@ from urllib.request import urlretrieve

 import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+from yt_dlp.utils import MaxDownloadsReached
 import pysubs2

 from loguru import logger
@@ -156,7 +158,7 @@ class GenericExtractor(Extractor):
                logger.error("generate_once.js not found after transpilation.")
                return

-            self.extractor_args.setdefault("youtube", {})["getpot_bgutil_script"] = script_path
+            self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
            logger.info(f"PO Token script configured at: {script_path}")

        except Exception as e:
@@ -301,9 +303,9 @@ class GenericExtractor(Extractor):
            result.set_url(url)

        if "description" in video_data and not result.get("content"):
-            result.set_content(video_data["description"])
+            result.set_content(video_data.pop("description"))
        # extract comments if enabled
-        if self.comments:
+        if self.comments and video_data.get("comments", []) is not None:
            result.set(
                "comments",
                [
@@ -362,7 +364,12 @@ class GenericExtractor(Extractor):
        # this time download
        ydl.params["getcomments"] = self.comments
        # TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        try:
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        except MaxDownloadsReached:
+            # yt-dlp raises once its download limit is hit; that is not a failure for our
+            # purposes, so carry on with whatever `data` held before this call
+            pass
+        # short debug line instead of dumping the whole yt-dlp info dict at SUCCESS level
+        logger.debug(f"yt-dlp extract_info finished for {url}")
+
        if "entries" in data:
            entries = data.get("entries", [])
            if not len(entries):
@@ -370,14 +377,33 @@ class GenericExtractor(Extractor):
                return False
        else:
            entries = [data]
-
        result = Metadata()

+        def _helper_get_filename(entry: dict) -> "str | bool":
+            """Locate the file that was written to disk for `entry`.
+
+            Returns the path of the matching file, or False when nothing in the
+            download directory matches.
+            """
+            entry_url = entry.get("url")
+
+            filename = ydl.prepare_filename(entry)
+            # the exact predicted name is the best candidate: prefer it over a prefix scan,
+            # which could return any other file that merely shares the same basename
+            if os.path.exists(filename):
+                return filename
+
+            base_filename, _ = os.path.splitext(filename)  # '/get/path/to/file' ignore '.ext'
+            directory = os.path.dirname(base_filename)  # '/get/path/to'
+            basename = os.path.basename(base_filename)  # 'file'
+            # os.listdir order is arbitrary; sorting keeps the selected file stable across runs.
+            # `directory or "."` covers a filename that has no directory component.
+            for f in sorted(os.listdir(directory or ".")):
+                # explicit grouping: `and` binds tighter than `or`, so the video mimetype
+                # check only guards the URL-based match, not the basename prefix match
+                # NOTE(review): confirm the prefix match was not meant to be video-only too
+                if f.startswith(basename) or (
+                    entry_url
+                    and os.path.splitext(f)[0] in entry_url
+                    and "video/" in (mimetypes.guess_type(f)[0] or "")
+                ):
+                    return os.path.join(directory, f)
+            return False
+
        for entry in entries:
            try:
-                filename = ydl.prepare_filename(entry)
-                if not os.path.exists(filename):
-                    filename = filename.split(".")[0] + ".mkv"
+                filename = _helper_get_filename(entry)
+
+                if not filename or not os.path.exists(filename):
+                    # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
+                    continue
+
+                logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")

                new_media = Media(filename)
                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -396,6 +422,9 @@ class GenericExtractor(Extractor):
                result.add_media(new_media)
            except Exception as e:
                logger.error(f"Error processing entry {entry}: {e}")
+        # nothing usable was downloaded for any entry of this URL
+        if not result.media:
+            # report the URL: `entry` here would only be the last loop item, dumped as a full dict
+            logger.warning(f"No media found for {url}, skipping.")
+            return False

        return self.add_metadata(data, info_extractor, url, result)

@@ -454,6 +483,13 @@ class GenericExtractor(Extractor):

        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())

+        def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
+            """Return False for a livestream when `self.livestreams` is off, else delegate to `get_metadata_for_video`."""
+            is_live = data.get("is_live", False)
+            if is_live and not self.livestreams:
+                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+                return False
+
+            # not a skipped livestream: hand the info dict over for download and metadata extraction
+            metadata = self.get_metadata_for_video(data, info_extractor, url, ydl)
+            return metadata
+
        try:
            if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
@@ -461,11 +497,12 @@ class GenericExtractor(Extractor):

            # don't download since it can be a live stream
            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-            if data.get("is_live", False) and not self.livestreams:
-                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
-                return False
-            # it's a valid video, that the youtubdedl can download out of the box
-            result = self.get_metadata_for_video(data, info_extractor, url, ydl)
+
+            result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
+
+        except MaxDownloadsReached:
+            # yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
+            result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)

        except Exception as e:
            if info_extractor.IE_NAME == "generic":
@@ -519,6 +556,8 @@ class GenericExtractor(Extractor):
            "--write-subs" if self.subtitles else "--no-write-subs",
            "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
            "--live-from-start" if self.live_from_start else "--no-live-from-start",
+            "--postprocessor-args",
+            "ffmpeg:-bitexact",  # ensure bitexact output to avoid mismatching hashes for same video
        ]

        # proxy handling
--- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
@@ -98,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
        return missing

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, "status", "Archive in progress")

--- a/src/auto_archiver/modules/vk_extractor/init.py
+++ b/src/auto_archiver/modules/vk_extractor/init.py
@@ -1 +0,0 @@
-from .vk_extractor import VkExtractor
--- a/src/auto_archiver/modules/vk_extractor/manifest.py
+++ b/src/auto_archiver/modules/vk_extractor/manifest.py
@@ -1,37 +0,0 @@
-{
-    "name": "VKontakte Extractor",
-    "type": ["extractor"],
-    "requires_setup": True,
-    "depends": ["core", "utils"],
-    "dependencies": {
-        "python": ["loguru", "vk_url_scraper"],
-    },
-    "configs": {
-        "username": {"required": True, "help": "valid VKontakte username"},
-        "password": {"required": True, "help": "valid VKontakte password"},
-        "session_file": {
-            "default": "secrets/vk_config.v2.json",
-            "help": "valid VKontakte password",
-        },
-    },
-    "description": """
-The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. 
-This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract 
-and download content. Note that VK videos are handled separately by the `YTDownloader`.
-
-### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
-
-### Setup
-To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
-
-Credentials can be set in the configuration file or directly via environment variables. Ensure you 
-have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
-""",
-}
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -1,43 +0,0 @@
-from loguru import logger
-from vk_url_scraper import VkScraper
-
-from auto_archiver.utils.misc import dump_payload
-from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media
-
-
-class VkExtractor(Extractor):
-    """ "
-    VK videos are handled by YTDownloader, this archiver gets posts text and images.
-    Currently only works for /wall posts
-    """
-
-    def setup(self) -> None:
-        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-
-        if "vk.com" not in item.netloc:
-            return False
-
-        # some urls can contain multiple wall/photo/... parts and all will be fetched
-        vk_scrapes = self.vks.scrape(url)
-        if not len(vk_scrapes):
-            return False
-        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
-
-        result = Metadata()
-        for scrape in vk_scrapes:
-            if not result.get_title():
-                result.set_title(scrape["text"])
-            if not result.get_timestamp():
-                result.set_timestamp(scrape["datetime"])
-
-        result.set_content(dump_payload(vk_scrapes))
-
-        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
-        for filename in filenames:
-            result.add_media(Media(filename))
-
-        return result.success("vk")
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -194,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                        shutil.copyfileobj(infile, outfile)

        # get media out of .warc
-        counter = 0
+        counter_warc_files = 0
+        counter_screenshots = 0
        seen_urls = set()

        with open(warc_filename, "rb") as warc_stream:
@@ -203,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
                if (
                    record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
                ):  # screenshots
-                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
+                    fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
                    with open(fn, "wb") as outf:
                        outf.write(record.raw_stream.read())
                    m = Media(filename=fn)
-                    to_enrich.add_media(m, "browsertrix-screenshot")
-                    counter += 1
+                    to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
+                    counter_screenshots += 1
                if not self.extract_media:
                    continue

@@ -231,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):

                # create local file and add media
                ext = mimetypes.guess_extension(content_type)
-                warc_fn = f"warc-file-{counter}{ext}"
+                # media files are numbered by their own counter; `counter_screenshots` does not
+                # advance for media records, so using it here would reuse the same filename/key
+                warc_fn = f"warc-file-{counter_warc_files}{ext}"
                fn = os.path.join(tmp_dir, warc_fn)

                record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
@@ -256,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                    continue

                to_enrich.add_media(m, warc_fn)
-                counter += 1
+                counter_warc_files += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
+        logger.info(
+            f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
+        )