Merge pull request #217 from bellingcat/settings_page

Settings page user interface
2026-06-12 05:08:28 +03:00 · 2025-03-07 16:10:50 +00:00
parent 928c6f88a9 333201acec
commit 3fac353407
36 changed files with 55642 additions and 51 deletions
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -80,7 +80,10 @@ class ModuleFactory:

        available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
        if not available:
-            raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
+            message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
+            if 'archiver' in module_name:
+                message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
+            raise IndexError(message)
        return available[0]

    def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -73,10 +73,20 @@ class ArchivingOrchestrator:

        self.basic_parser = parser
        return parser
+    
+    def check_steps(self, config):
+        """Raise SetupError if any step type in config['steps'] has no modules, with migration hints for pre-0.13.0 step names."""
+        for module_type in MODULE_TYPES:
+            if not config['steps'].get(f"{module_type}s", []):
+                # Only hint at the singular key when it is actually present; otherwise fall through to the generic error below.
+                if module_type in ('feeder', 'formatter') and config['steps'].get(module_type):
+                    raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. Here's how that would look: \n\nsteps:\n  {module_type}s:\n  - [your_{module_type}_name_here]\n  {'extractors:...' if module_type == 'feeder' else '...'}\n")
+                if module_type == 'extractor' and config['steps'].get('archivers'):
+                    raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_here]\n  enrichers:...\n")
+                raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")

    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:

-
        # modules parser to get the overridden 'steps' values
        modules_parser = argparse.ArgumentParser(
            add_help=False,
@@ -101,6 +111,7 @@ class ArchivingOrchestrator:
        # but should we add them? Or should we just add them to the 'complete' parser?

        if is_valid_config(yaml_config):
+            self.check_steps(yaml_config)
            # only load the modules enabled in config
            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
@@ -116,10 +127,6 @@ class ArchivingOrchestrator:
            simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
            self.add_individual_module_args(simple_modules, parser)

-            # for simple mode, we use the cli_feeder and any modules that don't require setup
-            if not yaml_config['steps']['feeders']:
-                yaml_config['steps']['feeders'] = ['cli_feeder']
-
            # add them to the config
            for module in simple_modules:
                for module_type in module.type:
@@ -172,9 +179,6 @@ class ArchivingOrchestrator:
        if not parser:
            parser = self.parser

-        # allow passing URLs directly on the command line
-        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
-
        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                                                                            (token, username etc.) that extractors can use to log into \
                                                                            a website. If passing this on the command line, use a JSON string. \
@@ -194,7 +198,11 @@ class ArchivingOrchestrator:
            modules = self.module_factory.available_modules()
        
        for module in modules:
-
+            if module.name == 'cli_feeder':
+                # special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
+                parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
+                continue
+                
            if not module.configs:
                # this module has no configs, don't show anything in the help
                # (TODO: do we want to show something about this module though, like a description?)
@@ -278,27 +286,6 @@ class ArchivingOrchestrator:
                    raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")

            for module in modules_to_load:
-                if module == 'cli_feeder':
-                    # cli_feeder is a pseudo module, it just takes the command line args for [URLS]
-                    urls = self.config['urls']
-                    if not urls:
-                        raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
-
-                    def feed(self) -> Generator[Metadata]:
-                        for url in urls:
-                            logger.debug(f"Processing URL: '{url}'")
-                            yield Metadata().set_url(url)
-
-                    pseudo_module = type('CLIFeeder', (Feeder,), {
-                        'name': 'cli_feeder',
-                        'display_name': 'CLI Feeder',
-                        '__iter__': feed
-
-                    })()
-
-                    pseudo_module.__iter__ = feed
-                    step_items.append(pseudo_module)
-                    continue

                if module in invalid_modules:
                    continue
--- a/src/auto_archiver/modules/cli_feeder/manifest.py
+++ b/src/auto_archiver/modules/cli_feeder/manifest.py
@@ -0,0 +1,23 @@
+{
+    'name': 'Command Line Feeder',
+    'type': ['feeder'],
+    'entry_point': 'cli_feeder::CLIFeeder',
+    'requires_setup': False,
+    # The orchestrator registers 'urls' as a bare positional argument, so no --cli_feeder.urls= flag is needed.
+    'configs': {
+        'urls': {
+            'default': None,
+            'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
+        },
+    },
+    'description': """
+The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line 
+without the need to specify any additional configuration or command line arguments:
+
+`auto-archiver --feeders cli_feeder -- "https://example.com/1/,https://example.com/2/"`
+
+You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
+
+`auto-archiver --feeders cli_feeder -- https://example.com/1/ https://example.com/2/`
+""",
+}
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -0,0 +1,31 @@
+from collections.abc import Iterator
+
+from loguru import logger
+
+from auto_archiver.core.feeder import Feeder
+from auto_archiver.core.metadata import Metadata
+
+class CLIFeeder(Feeder):
+    """Feeder that turns the URLs in its 'urls' config into Metadata items."""
+
+    def setup(self) -> None:
+        """Store the configured URLs, failing fast when there are none.
+
+        Raises:
+            ValueError: if the 'urls' config is empty or None.
+        """
+        self.urls = self.config['urls']
+        if not self.urls:
+            raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
+
+    def __iter__(self) -> Iterator[Metadata]:
+        """Yield one Metadata per configured URL, each with the 'folder' context set to 'cli'."""
+        urls = self.config['urls']
+        for url in urls:
+            logger.debug(f"Processing {url}")
+            m = Metadata().set_url(url)
+            m.set_context("folder", "cli")
+            yield m
+
+        # Reached only after the consumer has exhausted the generator.
+        logger.success(f"Processed {len(urls)} URL(s)")
--- a/src/auto_archiver/modules/csv_db/manifest.py
+++ b/src/auto_archiver/modules/csv_db/manifest.py
@@ -6,7 +6,7 @@
                              },
    'entry_point': 'csv_db::CSVDb',
    "configs": {
-            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+            "csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
        },
    "description": """
 Handles exporting archival results to a CSV file.
--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@@ -12,7 +12,9 @@
            "default": None,
            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
-        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
+        "header": {"default": 1,
+                   "type": "int",
+                   "help": "index of the header row (starts at 1)"},
        "service_account": {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
--- a/src/auto_archiver/modules/html_formatter/manifest.py
+++ b/src/auto_archiver/modules/html_formatter/manifest.py
@@ -7,7 +7,9 @@
                          "bin": [""]
    },
    "configs": {
-            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
+            "detect_thumbnails": {"default": True,
+                                  "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
+                                  "type": "bool"},
        },
    "description": """ """,
 }
--- a/src/auto_archiver/modules/local_storage/manifest.py
+++ b/src/auto_archiver/modules/local_storage/manifest.py
@@ -17,7 +17,9 @@
            "choices": ["random", "static"],
        },
        "save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
-        "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+        "save_absolute": {"default": False, 
+                          "type": "bool",
+                          "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
    },
    "description": """
    LocalStorage: A storage module for saving archived content locally on the filesystem.
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -6,13 +6,25 @@
        "python": ["loguru", "selenium"],
    },
    "configs": {
-            "width": {"default": 1280, "help": "width of the screenshots"},
-            "height": {"default": 1024, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
+            "width": {"default": 1280,
+                      "type": "int",
+                      "help": "width of the screenshots"},
+            "height": {"default": 1024,
+                        "type": "int",
+                       "help": "height of the screenshots"},
+            "timeout": {"default": 60,
+                        "type": "int",
+                        "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4,
+                                        "type": "int",
+                                        "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-            "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
+            "save_to_pdf": {"default": False,
+                            "type": "bool",
+                            "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
+            "print_options": {"default": {},
+                              "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
+                              "type": "json_loader"},
        },
    "description": """
    Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
--- a/src/auto_archiver/modules/ssl_enricher/manifest.py
+++ b/src/auto_archiver/modules/ssl_enricher/manifest.py
@@ -7,7 +7,9 @@
    },
    'entry_point': 'ssl_enricher::SSLEnricher',
    "configs": {
-        "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
+        "skip_when_nothing_archived": {"default": True,
+                                       "type": 'bool',
+                                       "help": "if true, will skip enriching when no media is archived"},
    },
    "description": """
    Retrieves SSL certificate information for a domain and stores it as a file.
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -14,7 +14,9 @@
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
            "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
+            "join_channels": {"default": True,
+                              "type": "bool",
+                              "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -17,11 +17,19 @@
    "configs": {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
-            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
-            "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
-            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
+            "timeout": {"default": 120,
+                        "type": "int",
+                        "help": "timeout for WACZ generation in seconds"},
+            "extract_media": {"default": False, 
+                              "type": 'bool',
+                              "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                              },
+            "extract_screenshot": {"default": True,
+                                    "type": 'bool',
+                                   "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
+                                   },
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
-            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
+            "socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        },
    "description": """
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -9,6 +9,7 @@
    "configs": {
        "timeout": {
            "default": 15,
+            "type": "int",
            "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
        },
        "if_not_archived_within": {
--- a/src/auto_archiver/modules/whisper_enricher/manifest.py
+++ b/src/auto_archiver/modules/whisper_enricher/manifest.py
@@ -10,8 +10,12 @@
                         "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
        "api_key": {"required": True,
                    "help": "WhisperApi api key for authentication"},
-        "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
-        "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
+        "include_srt": {"default": False,
+                        "type": "bool",
+                        "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
+        "timeout": {"default": 90,
+                    "type": "int",
+                    "help": "How many seconds to wait at most for a successful job completion."},
        "action": {"default": "translate",
                   "help": "which Whisper operation to execute",
                   "choices": ["transcribe", "translate", "language_detection"]},