Merge branch 'main' into merge_modules

2026-06-12 05:08:28 +03:00 · 2025-03-07 16:19:51 +00:00
parent 8fcec692b7 3fac353407
commit be513e95aa
47 changed files with 55825 additions and 102 deletions
--- a/src/auto_archiver/modules/cli_feeder/manifest.py
+++ b/src/auto_archiver/modules/cli_feeder/manifest.py
@@ -0,0 +1,23 @@
+{
+    'name': 'Command Line Feeder',
+    'type': ['feeder'],
+    'entry_point': 'cli_feeder::CLIFeeder',
+    'requires_setup': False,
+    'description': 'Feeds URLs to orchestrator from the command line',
+    'configs': {
+        'urls': {
+            'default': None,
+            'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
+        },
+    },
+    'description': """
+The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line 
+without the need to specify any additional configuration or command line arguments:
+
+`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
+
+You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
+
+`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
+""",
+}
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -0,0 +1,21 @@
+from loguru import logger
+
+from auto_archiver.core.feeder import Feeder
+from auto_archiver.core.metadata import Metadata
+
+class CLIFeeder(Feeder):
+
+    def setup(self) -> None:
+        self.urls = self.config['urls']
+        if not self.urls:
+            raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
+
+    def __iter__(self) -> Metadata:
+        urls = self.config['urls']
+        for url in urls:
+            logger.debug(f"Processing {url}")
+            m = Metadata().set_url(url)
+            m.set_context("folder", "cli")
+            yield m
+
+        logger.success(f"Processed {len(urls)} URL(s)")
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
    """

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")

    def failed(self, item: Metadata, reason:str) -> None:
        logger.error(f"FAILED {item}: {reason}")
--- a/src/auto_archiver/modules/csv_db/manifest.py
+++ b/src/auto_archiver/modules/csv_db/manifest.py
@@ -6,7 +6,7 @@
                              },
    'entry_point': 'csv_db::CSVDb',
    "configs": {
-            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+            "csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
        },
    "description": """
 Handles exporting archival results to a CSV file.
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -28,6 +28,13 @@ the broader archiving framework.
 metadata objects. Some dropins are included in this generic_archiver by default, but
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
+
+### Auto-Updates
+
+The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
+This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
+If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
+
 """,
    "configs": {
        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "ytdlp_update_interval": {
+            "default": 5,
+            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
+            "type": "int",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,7 +1,11 @@
-import datetime, os, yt_dlp, pysubs2
+import datetime, os
 import importlib
+import subprocess
 from typing import Generator, Type
+
+import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+import pysubs2

 from loguru import logger

@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

+    def setup(self):
+        # check for file .ytdlp-update in the secrets folder
+        if self.ytdlp_update_interval < 0:
+            return
+        
+        use_secrets = os.path.exists('secrets')
+        path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
+        next_update_check = None
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                next_update_check = datetime.datetime.fromisoformat(f.read())
+        
+        if not next_update_check or next_update_check < datetime.datetime.now():
+            self.update_ytdlp()
+
+            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            with open(path, "w") as f:
+                f.write(next_update_check.isoformat())
+
+    def update_ytdlp(self):
+        logger.info("Checking and updating yt-dlp...")
+        logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
+        from importlib.metadata import version as get_version
+        old_version = get_version("yt-dlp")
+        try:
+            # try and update with pip (this works inside poetry environment and in a normal virtualenv)
+            result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
+
+            if "Successfully installed yt-dlp" in result.stdout.decode():
+                new_version = importlib.metadata.version("yt-dlp")
+                logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
+                importlib.reload(yt_dlp)
+            else:
+                logger.info("yt-dlp already up to date")
+
+        except Exception as e:
+            logger.error(f"Error updating yt-dlp: {e}")
+
    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
--- a/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
@@ -12,7 +12,9 @@
            "default": None,
            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
-        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
+        "header": {"default": 1,
+                   "type": "int",
+                   "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
--- a/src/auto_archiver/modules/html_formatter/manifest.py
+++ b/src/auto_archiver/modules/html_formatter/manifest.py
@@ -7,7 +7,9 @@
                          "bin": [""]
    },
    "configs": {
-            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
+            "detect_thumbnails": {"default": True,
+                                  "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
+                                  "type": "bool"},
        },
    "description": """ """,
 }
--- a/src/auto_archiver/modules/instagram_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_extractor/manifest.py
@@ -10,25 +10,30 @@
    "requires_setup": True,
    "configs": {
        "username": {"required": True,
-                     "help": "a valid Instagram username"},
+                     "help": "A valid Instagram username."},
        "password": {
            "required": True,
-            "help": "the corresponding Instagram account password",
+            "help": "The corresponding Instagram account password.",
        },
        "download_folder": {
            "default": "instaloader",
-            "help": "name of a folder to temporarily download content to",
+            "help": "Name of a folder to temporarily download content to.",
        },
        "session_file": {
            "default": "secrets/instaloader.session",
-            "help": "path to the instagram session which saves session credentials",
+            "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
        },
        # TODO: fine-grain
        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
    },
    "description": """
-    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
-    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. 
+    
+      > ⚠️ **Warning**  
+      > This module is not actively maintained due to known issues with blocking.  
+      > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
+  
+    This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
    highlights, and tagged posts. 
    Authentication is required via username/password or a session file.
                    
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -3,7 +3,7 @@
    highlights, and tagged posts. Authentication is required via username/password or a session file.

 """
-import re, os, shutil, traceback
+import re, os, shutil
 import instaloader
 from loguru import logger

@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
+
    # NB: post regex should be tested before profile
-
    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
-
    # https://regex101.com/r/MGPquX/1
    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
    # https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
    def setup(self) -> None:

        self.insta = instaloader.Instaloader(
-            download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
+            download_geotags=True,
+            download_comments=True,
+            compress_json=False,
+            dirname_pattern=self.download_folder,
+            filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
        try:
            self.insta.load_session_from_file(self.username, self.session_file)
        except Exception as e:
-            logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
            try:
-                self.insta.login(self.username, config.instagram_self.password)
-                # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+                logger.debug(f"Session file failed", exc_info=True)
+                logger.info("No valid session file found - Attempting login with use and password.")
+                self.insta.login(self.username, self.password)
                self.insta.save_session_to_file(self.session_file)
-            except Exception as e2:
-                logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
-
+            except Exception as e:
+                logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")


    def download(self, item: Metadata) -> Metadata:
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -104,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
        message = ""
        time.sleep(3)
        # media is added before text by the bot so it can be used as a stop-logic mechanism
-        while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
+        while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
            attempts += 1
            time.sleep(1)
            for post in self.client.iter_messages(chat, min_id=since_id):
--- a/src/auto_archiver/modules/local_storage/manifest.py
+++ b/src/auto_archiver/modules/local_storage/manifest.py
@@ -17,7 +17,9 @@
            "choices": ["random", "static"],
        },
        "save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
-        "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+        "save_absolute": {"default": False, 
+                          "type": "bool",
+                          "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
    },
    "description": """
    LocalStorage: A storage module for saving archived content locally on the filesystem.
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -6,13 +6,25 @@
        "python": ["loguru", "selenium"],
    },
    "configs": {
-            "width": {"default": 1280, "help": "width of the screenshots"},
-            "height": {"default": 720, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
+            "width": {"default": 1280,
+                      "type": "int",
+                      "help": "width of the screenshots"},
+            "height": {"default": 1024,
+                        "type": "int",
+                       "help": "height of the screenshots"},
+            "timeout": {"default": 60,
+                        "type": "int",
+                        "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4,
+                                        "type": "int",
+                                        "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-            "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
+            "save_to_pdf": {"default": False,
+                            "type": "bool",
+                            "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
+            "print_options": {"default": {},
+                              "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
+                              "type": "json_loader"},
        },
    "description": """
    Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
--- a/src/auto_archiver/modules/ssl_enricher/manifest.py
+++ b/src/auto_archiver/modules/ssl_enricher/manifest.py
@@ -7,7 +7,9 @@
    },
    'entry_point': 'ssl_enricher::SSLEnricher',
    "configs": {
-        "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
+        "skip_when_nothing_archived": {"default": True,
+                                       "type": 'bool',
+                                       "help": "if true, will skip enriching when no media is archived"},
    },
    "description": """
    Retrieves SSL certificate information for a domain and stores it as a file.
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -14,7 +14,9 @@
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
            "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
+            "join_channels": {"default": True,
+                              "type": "bool",
+                              "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -17,11 +17,19 @@
    "configs": {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
-            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
-            "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
-            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
+            "timeout": {"default": 120,
+                        "type": "int",
+                        "help": "timeout for WACZ generation in seconds", "type": "int"},
+            "extract_media": {"default": False, 
+                              "type": 'bool',
+                              "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                              },
+            "extract_screenshot": {"default": True,
+                                    "type": 'bool',
+                                   "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                                   },
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
-            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
+            "socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        },
    "description": """
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -9,6 +9,7 @@
    "configs": {
        "timeout": {
            "default": 15,
+            "type": "int",
            "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
        },
        "if_not_archived_within": {
--- a/src/auto_archiver/modules/whisper_enricher/manifest.py
+++ b/src/auto_archiver/modules/whisper_enricher/manifest.py
@@ -10,8 +10,12 @@
                         "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
        "api_key": {"required": True,
                    "help": "WhisperApi api key for authentication"},
-        "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
-        "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
+        "include_srt": {"default": False,
+                        "type": "bool",
+                        "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
+        "timeout": {"default": 90,
+                    "type": "int",
+                    "help": "How many seconds to wait at most for a successful job completion."},
        "action": {"default": "translate",
                   "help": "which Whisper operation to execute",
                   "choices": ["transcribe", "translate", "language_detection"]},