Merge branch 'main' into merge_modules

2026-06-11 20:58:29 +03:00 · 2025-03-07 16:19:51 +00:00
parent 8fcec692b7 3fac353407
commit be513e95aa
47 changed files with 55825 additions and 102 deletions
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -105,8 +105,8 @@ class BaseModule(ABC):
            for key in self.authentication.keys():
                if key in site or site in key:
                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
-                                    did find information for '{key}' which is close, is this what you meant? \
-                                    If so, edit your authentication settings to make sure it exactly matches.")
+did find information for '{key}' which is close, is this what you meant? \
+If so, edit your authentication settings to make sure it exactly matches.")

        def get_ytdlp_cookiejar(args):
            import yt_dlp
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -80,7 +80,10 @@ class ModuleFactory:

        available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
        if not available:
-            raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
+            message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
+            if 'archiver' in module_name:
+                message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
+            raise IndexError(message)
        return available[0]

    def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -15,6 +15,7 @@ from copy import copy

 from rich_argparse import RichHelpFormatter
 from loguru import logger
+import requests

 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
@@ -72,10 +73,20 @@ class ArchivingOrchestrator:

        self.basic_parser = parser
        return parser
+    
+    def check_steps(self, config):
+        """
+        Check that every step type has at least one module configured.
+
+        For each entry of MODULE_TYPES the plural key (e.g. 'feeders') is looked up in
+        config['steps']. If it is missing or empty a SetupError is raised. When the
+        configuration still uses a pre-0.13.0 name (singular 'feeder'/'formatter', or
+        'archivers' instead of 'extractors') the error explains how to migrate.
+
+        :param config: the loaded configuration, containing a 'steps' mapping
+        :raises SetupError: if any step type has no modules configured
+        """
+        for module_type in MODULE_TYPES:
+            if not config['steps'].get(f"{module_type}s", []):
+                # the singular-key hint must only fire when that key is really present;
+                # 'and' binds tighter than 'or', so the membership test keeps both names under the same guard
+                if module_type in ('feeder', 'formatter') and config['steps'].get(module_type):
+                    raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
+Here's how that would look: \n\nsteps:\n  {module_type}s:\n  - [your_{module_type}_name_here]\n  {'extractors:...' if module_type == 'feeder' else '...'}\n")
+                if module_type == 'extractor' and config['steps'].get('archivers'):
+                    raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
+Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_here]\n  enrichers:...\n")
+                raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")

    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:

-
        # modules parser to get the overridden 'steps' values
        modules_parser = argparse.ArgumentParser(
            add_help=False,
@@ -100,6 +111,7 @@ class ArchivingOrchestrator:
        # but should we add them? Or should we just add them to the 'complete' parser?

        if is_valid_config(yaml_config):
+            self.check_steps(yaml_config)
            # only load the modules enabled in config
            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
@@ -115,10 +127,6 @@ class ArchivingOrchestrator:
            simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
            self.add_individual_module_args(simple_modules, parser)

-            # for simple mode, we use the cli_feeder and any modules that don't require setup
-            if not yaml_config['steps']['feeders']:
-                yaml_config['steps']['feeders'] = ['cli_feeder']
-
            # add them to the config
            for module in simple_modules:
                for module_type in module.type:
@@ -171,9 +179,6 @@ class ArchivingOrchestrator:
        if not parser:
            parser = self.parser

-        # allow passing URLs directly on the command line
-        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
-
        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                                                                            (token, username etc.) that extractors can use to log into \
                                                                            a website. If passing this on the command line, use a JSON string. \
@@ -193,7 +198,11 @@ class ArchivingOrchestrator:
            modules = self.module_factory.available_modules()
        
        for module in modules:
-
+            if module.name == 'cli_feeder':
+                # special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
+                parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
+                continue
+                
            if not module.configs:
                # this module has no configs, don't show anything in the help
                # (TODO: do we want to show something about this module though, like a description?)
@@ -277,36 +286,16 @@ class ArchivingOrchestrator:
                    raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")

            for module in modules_to_load:
-                if module == 'cli_feeder':
-                    # cli_feeder is a pseudo module, it just takes the command line args for [URLS]
-                    urls = self.config['urls']
-                    if not urls:
-                        raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
-
-                    def feed(self) -> Generator[Metadata]:
-                        for url in urls:
-                            logger.debug(f"Processing URL: '{url}'")
-                            yield Metadata().set_url(url)
-
-                    pseudo_module = type('CLIFeeder', (Feeder,), {
-                        'name': 'cli_feeder',
-                        'display_name': 'CLI Feeder',
-                        '__iter__': feed
-
-                    })()
-
-                    pseudo_module.__iter__ = feed
-                    step_items.append(pseudo_module)
-                    continue

                if module in invalid_modules:
                    continue

+                loaded_module = None
                try:
                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
                except (KeyboardInterrupt, Exception) as e:
                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
-                    if module_type == 'extractor' and loaded_module.name == module:
+                    if loaded_module and module_type == 'extractor':
                        loaded_module.cleanup()
                    raise e

@@ -348,7 +337,23 @@ class ArchivingOrchestrator:
        yaml_config = self.load_config(basic_config.config_file)

        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+    
+    def check_for_updates(self):
+        """
+        Warn the user when the version published on PyPI differs from the running one.
+
+        This is a best-effort check: if PyPI cannot be reached, or the response does not
+        have the expected shape, the problem is logged at debug level and nothing is raised,
+        so archiving still works offline.
+        """
+        try:
+            # requests has no default timeout; without one an unreachable PyPI would hang setup
+            response = requests.get("https://pypi.org/pypi/auto-archiver/json", timeout=5)
+            response.raise_for_status()
+            latest_version = response.json()['info']['version']
+        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
+            logger.debug(f"Could not check for auto-archiver updates: {e}")
+            return
+
+        # plain inequality: any mismatch with the PyPI release triggers the notice
+        if latest_version != __version__:
+            if os.environ.get('RUNNING_IN_DOCKER'):
+                update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
+            else:
+                update_cmd = "`pip install --upgrade auto-archiver`"
+            logger.warning("")
+            logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
+            logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
+            logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
+            logger.warning("")

+        
    def setup(self, args: list):
        """
        Function to configure all setup of the orchestrator: setup configs and load modules.
@@ -356,6 +361,8 @@ class ArchivingOrchestrator:
        This method should only ever be called once
        """

+        self.check_for_updates()
+
        if self.setup_finished:
            logger.warning("The `setup_config()` function should only ever be run once. \
                           If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
--- a/src/auto_archiver/modules/cli_feeder/manifest.py
+++ b/src/auto_archiver/modules/cli_feeder/manifest.py
@@ -0,0 +1,23 @@
+{
+    'name': 'Command Line Feeder',
+    'type': ['feeder'],
+    'entry_point': 'cli_feeder::CLIFeeder',
+    'requires_setup': False,
+    # 'description' is defined once, as the last key: a dict literal silently keeps only the last duplicate key
+    'configs': {
+        'urls': {
+            'default': None,
+            'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
+        },
+    },
+    'description': """
+The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line 
+without the need to specify any additional configuration or command line arguments:
+
+`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
+
+You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
+
+`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
+""",
+}
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -0,0 +1,21 @@
+from typing import Generator
+
+from loguru import logger
+
+from auto_archiver.core.feeder import Feeder
+from auto_archiver.core.metadata import Metadata
+
+class CLIFeeder(Feeder):
+    """Feeder that yields the URLs passed directly on the command line."""
+
+    def setup(self) -> None:
+        """
+        Read the configured URLs and fail early when none were given.
+
+        :raises ValueError: if the 'urls' config value is empty
+        """
+        self.urls = self.config['urls']
+        if not self.urls:
+            raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
+
+    def __iter__(self) -> Generator[Metadata, None, None]:
+        """Yield one Metadata per configured URL, with its 'folder' context set to 'cli'."""
+        urls = self.config['urls']
+        for url in urls:
+            logger.debug(f"Processing {url}")
+            m = Metadata().set_url(url)
+            m.set_context("folder", "cli")
+            yield m
+
+        # only reached once the consumer has exhausted the generator
+        logger.success(f"Processed {len(urls)} URL(s)")
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
    """

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")

    def failed(self, item: Metadata, reason:str) -> None:
        logger.error(f"FAILED {item}: {reason}")
--- a/src/auto_archiver/modules/csv_db/manifest.py
+++ b/src/auto_archiver/modules/csv_db/manifest.py
@@ -6,7 +6,7 @@
                              },
    'entry_point': 'csv_db::CSVDb',
    "configs": {
-            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+            "csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
        },
    "description": """
 Handles exporting archival results to a CSV file.
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -28,6 +28,13 @@ the broader archiving framework.
 metadata objects. Some dropins are included in this generic_archiver by default, but
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
+
+### Auto-Updates
+
+The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
+This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
+If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
+
 """,
    "configs": {
        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "ytdlp_update_interval": {
+            "default": 5,
+            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
+            "type": "int",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,7 +1,11 @@
-import datetime, os, yt_dlp, pysubs2
+import datetime, os
 import importlib
+import subprocess
 from typing import Generator, Type
+
+import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+import pysubs2

 from loguru import logger

@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

+    def setup(self):
+        """
+        Update yt-dlp if the configured update interval has elapsed.
+
+        The time of the next check is stored as an ISO timestamp in a '.ytdlp-update'
+        file, inside the 'secrets' folder if that folder exists, otherwise in the current
+        working directory. A negative 'ytdlp_update_interval' disables the check entirely.
+        """
+        if self.ytdlp_update_interval < 0:
+            return
+
+        use_secrets = os.path.exists('secrets')
+        path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
+        next_update_check = None
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                try:
+                    next_update_check = datetime.datetime.fromisoformat(f.read().strip())
+                except ValueError:
+                    # an empty or corrupt marker file must not break setup: treat it as 'never checked'
+                    logger.warning(f"Could not read the next yt-dlp update time from '{path}', checking for updates now.")
+
+        if not next_update_check or next_update_check < datetime.datetime.now():
+            self.update_ytdlp()
+
+            # the marker is rewritten after every attempt, so a failed update is retried only after the interval
+            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            with open(path, "w") as f:
+                f.write(next_update_check.isoformat())
+
+    def update_ytdlp(self):
+        """
+        Upgrade yt-dlp with pip and reload the imported module if a new version was installed.
+
+        Any failure (pip missing, no network, non-zero exit code, ...) is logged and swallowed,
+        so a failed update never prevents the extractor from being set up.
+        """
+        import sys
+        from importlib.metadata import version as get_version
+
+        logger.info("Checking and updating yt-dlp...")
+        logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
+        try:
+            old_version = get_version("yt-dlp")
+            # run pip through the current interpreter so the upgrade lands in the environment
+            # that is actually running, not in whichever 'pip' happens to be first on PATH
+            result = subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
+
+            if "Successfully installed yt-dlp" in result.stdout.decode():
+                new_version = get_version("yt-dlp")
+                logger.info(f"yt-dlp successfully updated (from {old_version} to {new_version})")
+                # NOTE(review): reload() re-executes only the top-level yt_dlp module - confirm that
+                # already-imported submodules do not need reloading as well
+                importlib.reload(yt_dlp)
+            else:
+                logger.info("yt-dlp already up to date")
+
+        except Exception as e:
+            logger.error(f"Error updating yt-dlp: {e}")
+
    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
--- a/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
@@ -12,7 +12,9 @@
            "default": None,
            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
-        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
+        "header": {"default": 1,
+                   "type": "int",
+                   "help": "index of the header row (starts at 1)"},
        "service_account": {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
--- a/src/auto_archiver/modules/html_formatter/manifest.py
+++ b/src/auto_archiver/modules/html_formatter/manifest.py
@@ -7,7 +7,9 @@
                          "bin": [""]
    },
    "configs": {
-            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
+            "detect_thumbnails": {"default": True,
+                                  "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
+                                  "type": "bool"},
        },
    "description": """ """,
 }
--- a/src/auto_archiver/modules/instagram_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_extractor/manifest.py
@@ -10,25 +10,30 @@
    "requires_setup": True,
    "configs": {
        "username": {"required": True,
-                     "help": "a valid Instagram username"},
+                     "help": "A valid Instagram username."},
        "password": {
            "required": True,
-            "help": "the corresponding Instagram account password",
+            "help": "The corresponding Instagram account password.",
        },
        "download_folder": {
            "default": "instaloader",
-            "help": "name of a folder to temporarily download content to",
+            "help": "Name of a folder to temporarily download content to.",
        },
        "session_file": {
            "default": "secrets/instaloader.session",
-            "help": "path to the instagram session which saves session credentials",
+            "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
        },
        # TODO: fine-grain
        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
    },
    "description": """
-    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
-    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. 
+    
+      > ⚠️ **Warning**  
+      > This module is not actively maintained due to known issues with blocking.  
+      > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
+  
+    This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
    highlights, and tagged posts. 
    Authentication is required via username/password or a session file.
                    
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -3,7 +3,7 @@
    highlights, and tagged posts. Authentication is required via username/password or a session file.

 """
-import re, os, shutil, traceback
+import re, os, shutil
 import instaloader
 from loguru import logger

@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
+
    # NB: post regex should be tested before profile
-
    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
-
    # https://regex101.com/r/MGPquX/1
    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
    # https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
    def setup(self) -> None:

        self.insta = instaloader.Instaloader(
-            download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
+            download_geotags=True,
+            download_comments=True,
+            compress_json=False,
+            dirname_pattern=self.download_folder,
+            filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
        try:
            self.insta.load_session_from_file(self.username, self.session_file)
        except Exception as e:
-            logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
            try:
-                self.insta.login(self.username, config.instagram_self.password)
-                # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+                # loguru does not honour the stdlib `exc_info` kwarg; opt(exception=True) attaches the active traceback
+                logger.opt(exception=True).debug("Session file failed")
+                logger.info("No valid session file found - Attempting login with username and password.")
+                self.insta.login(self.username, self.password)
                self.insta.save_session_to_file(self.session_file)
-            except Exception as e2:
-                logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
-
+            except Exception as e:
+                # this module logs in through Instaloader (see the `instaloader.Instaloader(...)` call above)
+                logger.error(f"Failed to setup Instagram Extractor with Instaloader. {e}")


    def download(self, item: Metadata) -> Metadata:
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -104,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
        message = ""
        time.sleep(3)
        # media is added before text by the bot so it can be used as a stop-logic mechanism
-        while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
+        while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
            attempts += 1
            time.sleep(1)
            for post in self.client.iter_messages(chat, min_id=since_id):
--- a/src/auto_archiver/modules/local_storage/manifest.py
+++ b/src/auto_archiver/modules/local_storage/manifest.py
@@ -17,7 +17,9 @@
            "choices": ["random", "static"],
        },
        "save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
-        "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+        "save_absolute": {"default": False, 
+                          "type": "bool",
+                          "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
    },
    "description": """
    LocalStorage: A storage module for saving archived content locally on the filesystem.
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -6,13 +6,25 @@
        "python": ["loguru", "selenium"],
    },
    "configs": {
-            "width": {"default": 1280, "help": "width of the screenshots"},
-            "height": {"default": 720, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
+            "width": {"default": 1280,
+                      "type": "int",
+                      "help": "width of the screenshots"},
+            "height": {"default": 1024,
+                        "type": "int",
+                       "help": "height of the screenshots"},
+            "timeout": {"default": 60,
+                        "type": "int",
+                        "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4,
+                                        "type": "int",
+                                        "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-            "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
+            "save_to_pdf": {"default": False,
+                            "type": "bool",
+                            "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
+            "print_options": {"default": {},
+                              "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
+                              "type": "json_loader"},
        },
    "description": """
    Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
--- a/src/auto_archiver/modules/ssl_enricher/manifest.py
+++ b/src/auto_archiver/modules/ssl_enricher/manifest.py
@@ -7,7 +7,9 @@
    },
    'entry_point': 'ssl_enricher::SSLEnricher',
    "configs": {
-        "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
+        "skip_when_nothing_archived": {"default": True,
+                                       "type": 'bool',
+                                       "help": "if true, will skip enriching when no media is archived"},
    },
    "description": """
    Retrieves SSL certificate information for a domain and stores it as a file.
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -14,7 +14,9 @@
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
            "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
+            "join_channels": {"default": True,
+                              "type": "bool",
+                              "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -17,11 +17,19 @@
    "configs": {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
-            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
-            "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
-            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
+            "timeout": {"default": 120,
+                        "type": "int",
+                        "help": "timeout for WACZ generation in seconds"},
+            "extract_media": {"default": False, 
+                              "type": 'bool',
+                              "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                              },
+            "extract_screenshot": {"default": True,
+                                    "type": 'bool',
+                                   "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                                   },
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
-            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
+            "socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        },
    "description": """
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -9,6 +9,7 @@
    "configs": {
        "timeout": {
            "default": 15,
+            "type": "int",
            "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
        },
        "if_not_archived_within": {
--- a/src/auto_archiver/modules/whisper_enricher/manifest.py
+++ b/src/auto_archiver/modules/whisper_enricher/manifest.py
@@ -10,8 +10,12 @@
                         "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
        "api_key": {"required": True,
                    "help": "WhisperApi api key for authentication"},
-        "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
-        "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
+        "include_srt": {"default": False,
+                        "type": "bool",
+                        "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
+        "timeout": {"default": 90,
+                    "type": "int",
+                    "help": "How many seconds to wait at most for a successful job completion."},
        "action": {"default": "translate",
                   "help": "which Whisper operation to execute",
                   "choices": ["transcribe", "translate", "language_detection"]},
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -1,18 +1,23 @@
 """ This Webdriver class acts as a context manager for the selenium webdriver. """
 from __future__ import annotations
-from selenium import webdriver
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.common.proxy import Proxy, ProxyType
-from selenium.webdriver.common.print_page_options import PrintOptions

-from loguru import logger
-from selenium.webdriver.common.by import By
+import os
 import time

 #import domain_for_url
 from urllib.parse import urlparse, urlunparse
 from http.cookiejar import MozillaCookieJar

+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common import exceptions as selenium_exceptions
+from selenium.webdriver.common.print_page_options import PrintOptions
+from selenium.webdriver.common.by import By
+
+from loguru import logger
+
+
 class CookieSettingDriver(webdriver.Firefox):

    facebook_accept_cookies: bool
@@ -20,6 +25,10 @@ class CookieSettingDriver(webdriver.Firefox):
    cookiejar: MozillaCookieJar

    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+        if os.environ.get('RUNNING_IN_DOCKER'):
+            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
+            kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
+        
        super(CookieSettingDriver, self).__init__(*args, **kwargs)
        self.cookies = cookies
        self.cookiejar = cookiejar
@@ -64,14 +73,29 @@ class CookieSettingDriver(webdriver.Firefox):
                time.sleep(2)
            except Exception as e:
                logger.warning(f'Failed on fb accept cookies.', e)
+        
+
        # now get the actual URL
        super(CookieSettingDriver, self).get(url)
        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
-            close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
-            if close_button:
-                close_button.click()
+            try:
+                xpath = "//div[@role='dialog']//div[@aria-label='Close']"
+                WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
+            except selenium_exceptions.WebDriverException:
+                # best-effort: until() raises TimeoutException (it ignores NoSuchElementException while polling) and click() may fail too
+                logger.warning("Unable to find the 'close' button on the facebook login window")

+        else:
+            # best-effort: phrases are tried in order, each waited on for up to 5 seconds; the first clickable match is clicked
+            # for all other sites, try and use some common button text to reject/accept cookies
+            for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]:
+                try:
+                    xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
+                    WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
+                    break
+                except selenium_exceptions.WebDriverException:
+                    pass

    
 class Webdriver:
@@ -90,7 +114,6 @@ class Webdriver:
            setattr(self.print_options, k, v)

    def __enter__(self) -> webdriver:
-
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument(f'--proxy-server={self.http_proxy}')
@@ -105,7 +128,7 @@ class Webdriver:
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
            self.driver.print_options = self.print_options
-        except TimeoutException as e:
+        except selenium_exceptions.TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

        return self.driver