Merge main

2026-06-12 05:08:28 +03:00 · 2025-02-20 10:29:57 +00:00
parent 6ea943b680 5211c5de18
commit 7dde8d609d
50 changed files with 2381 additions and 669 deletions
--- a/src/auto_archiver/modules/atlos_db/init.py
+++ b/src/auto_archiver/modules/atlos_db/init.py
@@ -1 +1 @@
-from atlos_db import AtlosDb
+from .atlos_db import AtlosDb
--- a/src/auto_archiver/modules/atlos_storage/init.py
+++ b/src/auto_archiver/modules/atlos_storage/init.py
@@ -0,0 +1 @@
+from .atlos_storage import AtlosStorage
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -281,7 +281,7 @@ class GenericExtractor(Extractor):
        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)

-        # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
+        # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
        if auth:
            if 'username' in auth and 'password' in auth:
                logger.debug(f'Using provided auth username and password for {url}')
@@ -290,7 +290,7 @@ class GenericExtractor(Extractor):
            elif 'cookie' in auth:
                logger.debug(f'Using provided auth cookie for {url}')
                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
-            elif 'cookie_from_browser' in auth:
+            elif 'cookies_from_browser' in auth:
                logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
            elif 'cookies_file' in auth:
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -10,7 +10,6 @@ from auto_archiver.version import __version__
 from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
 from auto_archiver.utils.misc import random_str
-from auto_archiver.core.module import get_module

 class HtmlFormatter(Formatter):
    environment: Environment = None
@@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
        final_media = Media(filename=html_path, _mimetype="text/html")

        # get the already instantiated hash_enricher module
-        he = get_module('hash_enricher', self.config)
+        he = self.module_factory.get_module('hash_enricher', self.config)
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")

--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
            chat, since_id = self._send_url_to_bot(url)
            message = self._process_messages(chat, since_id, tmp_dir, result)

+            # This check may be outdated (possibly superseded by the message check below); kept until confirmed
            if "You must enter a URL to a post" in message:
                logger.debug(f"invalid link {url=} for {self.name}: {message}")
                return False
-            # # TODO: It currently returns this as a success - is that intentional?
-            # if "Media not found or unavailable" in message:
-            #     logger.debug(f"invalid link {url=} for {self.name}: {message}")
-            #     return False
+
+            if "Media not found or unavailable" in message:
+                logger.debug(f"No media found for link {url=} for {self.name}: {message}")
+                return False

            if message:
                result.set_content(message).set_title(message[:128])
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata

 class ScreenshotEnricher(Enricher):

+    def __init__(self, webdriver_factory=None):
+        super().__init__()
+        self.webdriver_factory = webdriver_factory or Webdriver
+
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):

        logger.debug(f"Enriching screenshot for {url=}")
        auth = self.auth_for_site(url)
-        with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
+        with self.webdriver_factory(
+                self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
                       http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
            try:
                driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+
--- a/src/auto_archiver/modules/thumbnail_enricher/manifest.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/manifest.py
@@ -7,8 +7,12 @@
        "bin": ["ffmpeg"]
    },
    "configs": {
-            "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
-            "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
+            "thumbnails_per_minute": {"default": 60,
+                                      "type": "int",
+                                      "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
+            "max_thumbnails": {"default": 16,
+                               "type": "int",
+                               "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
        },
    "description": """
    Generates thumbnails for video files to provide visual previews.
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
                        logger.error(f"error getting duration of video {m.filename}: {e}")
                        return

-                num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
+                num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
                timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]

                thumbnails_media = []
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -4,7 +4,6 @@ from loguru import logger

 from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, Media
-from auto_archiver.core.module import get_module

 class WhisperEnricher(Enricher):
    """
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):

    def setup(self) -> None:
        self.stores = self.config['steps']['storages']
-        self.s3 = get_module("s3_storage", self.config)
+        self.s3 = self.module_factory.get_module("s3_storage", self.config)
        if not "s3_storage" in self.stores:
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                # TODO: this used to pass all storage items to store now
-                # Now only passing S3, the rest will get added later in the usual order (?)
+                # Only storing S3, the rest will get added later in the usual order (?)
                m.store(url=url, metadata=to_enrich, storages=[self.s3])
                try:
                    job_id = self.submit_job(m)