Merge branch 'main' into timestamping_rewrite

2026-06-12 13:18:28 +03:00 · 2025-02-24 12:03:14 +00:00
parent d0c379a3ba 5211c5de18
commit 01bf88a695
125 changed files with 4277 additions and 2162 deletions
--- a/src/auto_archiver/modules/atlos_db/init.py
+++ b/src/auto_archiver/modules/atlos_db/init.py
@@ -1 +1 @@
-from atlos_db import AtlosDb
+from .atlos_db import AtlosDb
--- a/src/auto_archiver/modules/atlos_db/manifest.py
+++ b/src/auto_archiver/modules/atlos_db/manifest.py
@@ -11,6 +11,8 @@
        "api_token": {
            "default": None,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+            "required": True,
+            "type": "str",
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
--- a/src/auto_archiver/modules/atlos_db/base_configs.py
+++ b/src/auto_archiver/modules/atlos_db/base_configs.py
@@ -1,13 +0,0 @@
-def get_atlos_config_options():
-    return {
-        "api_token": {
-            "default": None,
-            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-            "type": str
-        },
-        "atlos_url": {
-            "default": "https://platform.atlos.org",
-            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-            "type": str
-        },
-    }
--- a/src/auto_archiver/modules/atlos_storage/init.py
+++ b/src/auto_archiver/modules/atlos_storage/init.py
@@ -0,0 +1 @@
+from .atlos_storage import AtlosStorage
--- a/src/auto_archiver/modules/atlos_storage/manifest.py
+++ b/src/auto_archiver/modules/atlos_storage/manifest.py
@@ -0,0 +1,32 @@
+{
+    "name": "Atlos Storage",
+    "type": ["storage"],
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru", "boto3"],
+        "bin": []
+    },
+    "description": """
+    Stores media files in [Atlos](https://www.atlos.org/).
+
+    ### Features
+    - Saves media files to Atlos, organizing them into folders based on the provided path structure.
+
+    ### Notes
+    - Requires setup with Atlos credentials.
+    - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
+    """,
+    "configs": {
+        "api_token": {
+            "default": None,
+            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+            "required": True,
+            "type": "str"
+        },
+        "atlos_url": {
+            "default": "https://platform.atlos.org",
+            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+            "type": "str"
+        },
+    }
+}
--- a/src/auto_archiver/modules/gdrive_storage/manifest.py
+++ b/src/auto_archiver/modules/gdrive_storage/manifest.py
@@ -32,7 +32,6 @@
    
    GDriveStorage: A storage module for saving archived content to Google Drive.

-    Author: Dave Mateer, (And maintained by: )
    Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python

    ### Features
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -39,11 +39,11 @@ class Bluesky(GenericDropin):
        for image_media in image_medias:
            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
            image_media = archiver.download_from_url(url)
-            media.append(image_media)
+            media.append(Media(image_media))
        for video_media in video_medias:
            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
            video_media = archiver.download_from_url(url)
-            media.append(video_media)
+            media.append(Media(video_media))
        return media


--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -8,7 +8,8 @@ class Facebook(GenericDropin):
            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
        webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))

-        post_data = ie_instance._extract_from_url.extract_metadata(webpage)
+        # TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
+        post_data = ie_instance._extract_metadata(webpage)
        return post_data
    
    def create_metadata(self, post: dict, ie_instance, archiver, url):
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,6 +1,6 @@
 import datetime, os, yt_dlp, pysubs2
 import importlib
-from typing import Type
+from typing import Generator, Type
 from yt_dlp.extractor.common import InfoExtractor

 from loguru import logger
@@ -11,7 +11,7 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

-    def suitable_extractors(self, url: str) -> list[str]:
+    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
@@ -116,7 +116,7 @@ class GenericExtractor(Extractor):

    def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
        """
-        Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
+        Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
        """

        ie_instance = info_extractor(downloader=ydl)
@@ -266,6 +266,11 @@ class GenericExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

+        # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
+        if url.startswith("https://ya.ru"):
+            url = url.replace("https://ya.ru", "https://yandex.ru")
+            item.set("replaced_url", url)
+

        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
                       'quiet': False, 'noplaylist': not self.allow_playlist ,
@@ -275,7 +280,7 @@ class GenericExtractor(Extractor):
        
        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)
-        # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
+        # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
        if auth:
            if 'username' in auth and 'password' in auth:
                logger.debug(f'Using provided auth username and password for {url}')
@@ -284,7 +289,7 @@ class GenericExtractor(Extractor):
            elif 'cookie' in auth:
                logger.debug(f'Using provided auth cookie for {url}')
                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
-            elif 'cookie_from_browser' in auth:
+            elif 'cookies_from_browser' in auth:
                logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
            elif 'cookies_file' in auth:
--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@@ -10,7 +10,7 @@
        "sheet": {"default": None, "help": "name of the sheet to archive"},
        "sheet_id": {
            "default": None,
-            "help": "(alternative to sheet name) the id of the sheet to archive",
+            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -9,9 +9,7 @@ import base64
 from auto_archiver.version import __version__
 from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
-from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
-from auto_archiver.core.module import get_module

 class HtmlFormatter(Formatter):
    environment: Environment = None
@@ -51,7 +49,7 @@ class HtmlFormatter(Formatter):
        final_media = Media(filename=html_path, _mimetype="text/html")

        # get the already instantiated hash_enricher module
-        he = get_module('hash_enricher', self.config)
+        he = self.module_factory.get_module('hash_enricher', self.config)
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")

--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
@@ -200,7 +200,7 @@
                el.innerHTML = decodeCertificate(certificate);

                let cyberChefUrl =
-                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
+                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
                // create a new anchor with this url and append after the code
                let a = document.createElement("a");
                a.href = cyberChefUrl;
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
            chat, since_id = self._send_url_to_bot(url)
            message = self._process_messages(chat, since_id, tmp_dir, result)

+            # This may be outdated and replaced by the below message, but keeping until confirmed
            if "You must enter a URL to a post" in message:
                logger.debug(f"invalid link {url=} for {self.name}: {message}")
                return False
-            # # TODO: It currently returns this as a success - is that intentional?
-            # if "Media not found or unavailable" in message:
-            #     logger.debug(f"invalid link {url=} for {self.name}: {message}")
-            #     return False
+
+            if "Media not found or unavailable" in message:
+                logger.debug(f"No media found for link {url=} for {self.name}: {message}")
+                return False

            if message:
                result.set_content(message).set_title(message[:128])
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -4,7 +4,6 @@
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "selenium"],
-        "bin": ["chromedriver"]
    },
    "configs": {
            "width": {"default": 1280, "help": "width of the screenshots"},
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata

 class ScreenshotEnricher(Enricher):

+    def __init__(self, webdriver_factory=None):
+        super().__init__()
+        self.webdriver_factory = webdriver_factory or Webdriver
+
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):

        logger.debug(f"Enriching screenshot for {url=}")
        auth = self.auth_for_site(url)
-        with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
+        with self.webdriver_factory(
+                self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
                       http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
            try:
                driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+
--- a/src/auto_archiver/modules/telegram_extractor/manifest.py
+++ b/src/auto_archiver/modules/telegram_extractor/manifest.py
@@ -20,5 +20,6 @@
 - Processes HTML content of messages to retrieve embedded media.
 - Sets structured metadata, including timestamps, content, and media details.
 - Does not require user authentication for Telegram.
+
    """,
 }
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "telethon_extractor",
+    "name": "Telethon Extractor",
    "type": ["extractor"],
    "requires_setup": True,
    "dependencies": {
@@ -40,5 +40,9 @@ To use the `TelethonExtractor`, you must configure the following:
 - **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
 - **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.

+### First Time Login
+On the first run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` file in the root.
+
+
 """
 }
--- a/src/auto_archiver/modules/thumbnail_enricher/manifest.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/manifest.py
@@ -7,8 +7,12 @@
        "bin": ["ffmpeg"]
    },
    "configs": {
-            "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
-            "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
+            "thumbnails_per_minute": {"default": 60,
+                                      "type": "int",
+                                      "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
+            "max_thumbnails": {"default": 16,
+                               "type": "int",
+                               "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
        },
    "description": """
    Generates thumbnails for video files to provide visual previews.
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
                        logger.error(f"error getting duration of video {m.filename}: {e}")
                        return

-                num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
+                num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
                timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]

                thumbnails_media = []
--- a/src/auto_archiver/modules/wacz_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_enricher/manifest.py
@@ -1,6 +1,6 @@
 {
    "name": "WACZ Enricher",
-    "type": ["enricher", "archiver"],
+    "type": ["enricher", "extractor"],
    "entry_point": "wacz_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor):
                to_enrich.add_media(m, warc_fn)
                counter += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
+        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -1,6 +1,6 @@
 {
    "name": "Wayback Machine Enricher",
-    "type": ["enricher", "archiver"],
+    "type": ["enricher", "extractor"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -4,7 +4,6 @@ from loguru import logger

 from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, Media
-from auto_archiver.core.module import get_module

 class WhisperEnricher(Enricher):
    """
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):

    def setup(self) -> None:
        self.stores = self.config['steps']['storages']
-        self.s3 = get_module("s3_storage", self.config)
+        self.s3 = self.module_factory.get_module("s3_storage", self.config)
        if not "s3_storage" in self.stores:
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                # TODO: this used to pass all storage items to store now
-                # Now only passing S3, the rest will get added later in the usual order (?)
+                # Only storing to S3 here; the rest will get added later in the usual order (?)
                m.store(url=url, metadata=to_enrich, storages=[self.s3])
                try:
                    job_id = self.submit_job(m)