Merge branch 'main' into small_issues

# Conflicts: # src/auto_archiver/core/base_module.py # src/auto_archiver/utils/misc.py
2026-06-11 20:58:29 +03:00 · 2025-02-26 13:19:49 +00:00
parent b2e654aef9 1ad158c016
commit 8124bb831d
48 changed files with 890 additions and 299 deletions
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -84,11 +84,13 @@ class BaseModule(ABC):
        * api_key: str - the API key to use for login\n
        * api_secret: str - the API secret to use for login\n
        * cookie: str - a cookie string to use for login (specific to this site)\n
+        * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

-        site = UrlUtil.domain_for_url(site)
+        # drop only a leading "www." prefix; str.lstrip("www.") would strip ANY leading 'w' or '.'
+        # characters and mangle domains such as "web.archive.org"
+        site = UrlUtil.domain_for_url(site).removeprefix("www.")
        # add the 'www' version of the site to the list of sites to check
        authdict = {}

@@ -115,16 +117,29 @@ class BaseModule(ABC):
            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar

-        # get the cookies jar, prefer the browser cookies than the file
-        if 'cookies_from_browser' in self.authentication:
+        get_cookiejar_options = None
+
+        # order of priority:
+        # 1. cookies_from_browser setting in site config
+        # 2. cookies_file setting in site config
+        # 3. cookies_from_browser setting in global config
+        # 4. cookies_file setting in global config
+
+        if 'cookies_from_browser' in authdict:
+            get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
+        elif 'cookies_file' in authdict:
+            get_cookiejar_options = ['--cookies', authdict['cookies_file']]
+        elif 'cookies_from_browser' in self.authentication:
            authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
-            if extract_cookies:
-                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+            get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
        elif 'cookies_file' in self.authentication:
            authdict['cookies_file'] = self.authentication['cookies_file']
-            if extract_cookies:
-                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+            get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
+
        
+        if get_cookiejar_options:
+            authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
+
        return authdict
    
    def repr(self):
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -7,6 +7,7 @@ flexible setup in various environments.

 import argparse
 from ruamel.yaml import YAML, CommentedMap, add_representer
+import json

 from loguru import logger

@@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple

 _yaml: YAML = YAML()

+DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
+
 EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
-# Steps are the modules that will be run in the order they are defined

+# Steps are the modules that will be run in the order they are defined
 steps:""" + "".join([f"\n   {module}s: []" for module in MODULE_TYPES]) + \
 """

@@ -52,6 +55,57 @@ logging:
 """)
 # note: 'logging' is explicitly added above in order to better format the config file

+
+# Arg Parse Actions/Classes
+class AuthenticationJsonParseAction(argparse.Action):
+    """
+    Argparse action that turns the authentication option into a validated dict.
+
+    The raw value must be valid JSON and may be:
+      * an object mapping site names to a dict of authentication settings,
+      * an object with a 'from_file' key holding the path of a JSON or YAML file, or
+      * a JSON string holding the path of such a file.
+
+    A loaded file may nest its settings under a top-level 'authentication' key; the
+    path it was read from is stored under 'load_from_file'.
+
+    Raises argparse.ArgumentTypeError for invalid JSON or for a structure that is not a
+    dict of site name -> dict (the global option keys excepted).
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+
+        try:
+            auth_dict = json.loads(values)
+        except json.JSONDecodeError as e:
+            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") from e
+
+        def load_from_file(path):
+            """Return the authentication dict stored in the JSON/YAML file at `path`, or None on any failure."""
+            try:
+                with open(path, 'r', encoding='utf-8') as f:
+                    try:
+                        loaded = json.load(f)
+                    except json.JSONDecodeError:
+                        f.seek(0)
+                        # maybe it's yaml, try that
+                        loaded = _yaml.load(f)
+                    if loaded.get('authentication'):
+                        loaded = loaded['authentication']
+                    loaded['load_from_file'] = path
+                    return loaded
+            except Exception:
+                # best effort: a missing file, a parse error or non-dict content all end up in the
+                # "must be a dictionary" error below. Unlike a bare `except:`, this does not
+                # swallow KeyboardInterrupt or SystemExit.
+                return None
+
+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+
+        if not isinstance(auth_dict, dict):
+            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        # top-level keys whose values are settings themselves rather than per-site dicts
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
+
+        # only store the value once it has been fully resolved and validated
+        setattr(namespace, self.dest, auth_dict)
+
+
+class UniqueAppendAction(argparse.Action):
+    """Argparse action that appends each supplied value to the destination list unless it is already there."""
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        for candidate in values:
+            target = getattr(namespace, self.dest)
+            if candidate in target:
+                # already present: keep the list free of duplicates
+                continue
+            target.append(candidate)
+
 class DefaultValidatingParser(argparse.ArgumentParser):

    def error(self, message):
@@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):

        return super().parse_known_args(args, namespace)

+# Config Utils

 def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
    dotdict = {}
@@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
        pass

    if not config:
-        config = EMPTY_CONFIG
-    
+        config = deepcopy(EMPTY_CONFIG)
+
    return config

 # TODO: make this tidier/find a way to notify of which keys should not be stored
@@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:

    config_to_save.pop('urls', None)
    with open(yaml_filename, "w", encoding="utf-8") as outf:
-        _yaml.dump(config_to_save, outf)
+        _yaml.dump(config_to_save, outf)
+
+def is_valid_config(config: CommentedMap) -> bool:
+    """
+    Return True when `config` is non-empty and differs from EMPTY_CONFIG.
+
+    The result is coerced to a real bool: a bare `config and ...` would hand back the
+    falsy config object itself (e.g. None or an empty mapping) despite the annotation.
+    """
+    return bool(config) and config != EMPTY_CONFIG
--- a/src/auto_archiver/core/enricher.py
+++ b/src/auto_archiver/core/enricher.py
@@ -13,7 +13,7 @@ from abc import abstractmethod
 from auto_archiver.core import Metadata, BaseModule

 class Enricher(BaseModule):
-    """Base classes and utilities for enrichers in the Auto-Archiver system.
+    """Base classes and utilities for enrichers in the Auto Archiver system.
    
    Enricher modules must implement the `enrich` method to define their behavior.
    """
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -134,7 +134,6 @@ class LazyBaseModule:

    """
    name: str
-    type: list
    description: str
    path: str
    module_factory: ModuleFactory
@@ -148,6 +147,10 @@ class LazyBaseModule:
        self.path = path
        self.module_factory = factory

+    @property
+    def type(self):
+        # looked up in the manifest's 'type' entry on every access instead of being
+        # stored as a separate attribute
+        return self.manifest['type']
+
    @property
    def entry_point(self):
        if not self._entry_point and not self.manifest['entry_point']:
@@ -183,10 +186,9 @@ class LazyBaseModule:
            try:
                manifest.update(ast.literal_eval(f.read()))
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
-                logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
+                raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
            
        self._manifest = manifest
-        self.type = manifest['type']
        self._entry_point = manifest['entry_point']
        self.description = manifest['description']
        self.version = manifest['version']
@@ -254,7 +256,7 @@ class LazyBaseModule:
        instance.module_factory = self.module_factory
        
        # merge the default config with the user config
-        default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
+        default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)

        config[self.name] = default_config  | config.get(self.name, {})
        instance.config_setup(config)
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -6,95 +6,31 @@

 from __future__ import annotations
 from typing import Generator, Union, List, Type, TYPE_CHECKING
-from urllib.parse import urlparse
-from ipaddress import ip_address
-from copy import copy
 import argparse
 import os
 import sys
-import json
 from tempfile import TemporaryDirectory
 import traceback
+from copy import copy

 from rich_argparse import RichHelpFormatter
-
+from loguru import logger

 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
+    DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
 from .module import ModuleFactory, LazyBaseModule
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .consts import MODULE_TYPES
-from loguru import logger
+from auto_archiver.utils.url import check_url_or_raise

 if TYPE_CHECKING:
    from .base_module import BaseModule
    from .module import LazyBaseModule

-DEFAULT_CONFIG_FILE = "orchestration.yaml"
-
-
-class JsonParseAction(argparse.Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        try:
-            setattr(namespace, self.dest, json.loads(values))
-        except json.JSONDecodeError as e:
-            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
-
-
-class AuthenticationJsonParseAction(JsonParseAction):
-    def __call__(self, parser, namespace, values, option_string=None):
-        super().__call__(parser, namespace, values, option_string)
-        auth_dict = getattr(namespace, self.dest)
-
-        def load_from_file(path):
-            try:
-                with open(path, 'r') as f:
-                    try:
-                        auth_dict = json.load(f)
-                    except json.JSONDecodeError:
-                        f.seek(0)
-                        # maybe it's yaml, try that
-                        auth_dict = _yaml.load(f)
-                    if auth_dict.get('authentication'):
-                        auth_dict = auth_dict['authentication']
-                    auth_dict['load_from_file']  = path
-                    return auth_dict
-            except:
-                return None
-
-        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
-            auth_dict = load_from_file(auth_dict['from_file'])
-        elif isinstance(auth_dict, str):
-            # if it's a string
-            auth_dict = load_from_file(auth_dict)
-        
-        if not isinstance(auth_dict, dict):
-            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
-        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
-        for key, auth in auth_dict.items():
-            if key in global_options:
-                continue
-            if not isinstance(key, str) or not isinstance(auth, dict):
-                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
-        
-        # extract out concatenated sites
-        for key, val in copy(auth_dict).items():
-            if "," in key:
-                for site in key.split(","):
-                    auth_dict[site] = val
-                del auth_dict[key]
-
-        setattr(namespace, self.dest, auth_dict)
-
-
-class UniqueAppendAction(argparse.Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        for value in values:
-            if value not in getattr(namespace, self.dest):
-                getattr(namespace, self.dest).append(value)
-
-
+class SetupError(ValueError):
+    # raised by ArchivingOrchestrator when the configured steps/modules or the provided URLs
+    # make it impossible to start (no modules of a type, too many feeders/formatters, no URLs)
+    pass
 class ArchivingOrchestrator:

    # instance variables
@@ -163,7 +99,7 @@ class ArchivingOrchestrator:
        # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
        # but should we add them? Or should we just add them to the 'complete' parser?

-        if yaml_config != EMPTY_CONFIG:
+        if is_valid_config(yaml_config):
            # only load the modules enabled in config
            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
@@ -189,7 +125,13 @@ class ArchivingOrchestrator:
                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
        else:
            # load all modules, they're not using the 'simple' mode
-            self.add_individual_module_args(self.module_factory.available_modules(), parser)
+            all_modules = self.module_factory.available_modules()
+            # add all the modules to the steps
+            for module in all_modules:
+                for module_type in module.type:
+                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
+
+            self.add_individual_module_args(all_modules, parser)
        
        parser.set_defaults(**to_dot_notation(yaml_config))

@@ -198,6 +140,9 @@ class ArchivingOrchestrator:
        # merge the new config with the old one
        config = merge_dicts(vars(parsed), yaml_config)

+        # set up the authentication dict as needed
+        config = self.setup_authentication(config)
+
        # clean out args from the base_parser that we don't want in the config
        for key in vars(basic_config):
            config.pop(key, None)
@@ -286,14 +231,20 @@ class ArchivingOrchestrator:
        self.basic_parser.exit()

    def setup_logging(self, config):
+
+        logging_config = config['logging']
+
+        if logging_config.get('enabled', True) is False:
+            # disabled logging settings, they're set on a higher level
+            logger.disable('auto_archiver')
+            return
+
        # setup loguru logging
        try:
            logger.remove(0)  # remove the default logger
        except ValueError:
            pass

-        logging_config = config['logging']
-
        # add other logging info
        if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
            self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
@@ -312,27 +263,25 @@ class ArchivingOrchestrator:

            step_items = []
            modules_to_load = modules_by_type[f"{module_type}s"]
-            assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
+            if not modules_to_load:
+                raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")

            def check_steps_ok():
                if not len(step_items):
-                    logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
                    if len(modules_to_load):
-                        logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
-                    exit()
+                        logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
+                    raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
+                

                if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
-                    logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
-                    exit()
+                    raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")

            for module in modules_to_load:
                if module == 'cli_feeder':
-                    # pseudo module, don't load it
+                    # cli_feeder is a pseudo module, it just takes the command line args for [URLS]
                    urls = self.config['urls']
                    if not urls:
-                        logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
-                        exit()
-                    # cli_feeder is a pseudo module, it just takes the command line args
+                        raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")

                    def feed(self) -> Generator[Metadata]:
                        for url in urls:
@@ -352,13 +301,14 @@ class ArchivingOrchestrator:

                if module in invalid_modules:
                    continue
+
                try:
                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
                except (KeyboardInterrupt, Exception) as e:
                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
                    if module_type == 'extractor' and loaded_module.name == module:
                        loaded_module.cleanup()
-                    exit()
+                    raise e

                if not loaded_module:
                    invalid_modules.append(module)
@@ -372,7 +322,7 @@ class ArchivingOrchestrator:
    def load_config(self, config_file: str) -> dict:
        if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
            logger.error(f"The configuration file {config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
-            exit()
+            raise FileNotFoundError(f"Configuration file {config_file} not found")

        return read_yaml(config_file)
    
@@ -437,8 +387,12 @@ class ArchivingOrchestrator:
        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
        To test configurations, without loading any modules you can also first call 'setup_configs'
        """
-        self.setup(args)
-        return self.feed()
+        try:
+            self.setup(args)
+            return self.feed()
+        except Exception as e:
+            logger.error(e)
+            exit(1)

    def cleanup(self) -> None:
        logger.info("Cleaning up")
@@ -503,8 +457,8 @@ class ArchivingOrchestrator:

        original_url = result.get_url().strip()
        try:
-            self.assert_valid_url(original_url)
-        except AssertionError as e:
+            check_url_or_raise(original_url)
+        except ValueError as e:
            logger.error(f"Error archiving URL {original_url}: {e}")
            raise e

@@ -564,26 +518,27 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")

        return result
+    

-    def assert_valid_url(self, url: str) -> bool:
+    def setup_authentication(self, config: dict) -> dict:
        """
-        Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
+        Setup authentication for all modules that require it
+
+        Split up strings into multiple sites if they are comma separated, e.g. the key
+        "x.com,twitter.com" becomes two entries that share the same settings. Empty site
+        names (such as the one produced by a trailing comma) are ignored.
+
+        The `config` dict is updated in place and also returned; a missing or None
+        'authentication' entry is replaced by an empty dict.
        """
-        assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"

-        parsed = urlparse(url)
-        assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
-        assert parsed.hostname, f"Invalid URL hostname"
-        assert parsed.hostname != "localhost", f"Invalid URL"
+        authentication = config.get('authentication')
+        if authentication is None:
+            # the key may be absent, or present but left empty in the YAML file
+            authentication = {}

-        try:  # special rules for IP addresses
-            ip = ip_address(parsed.hostname)
-        except ValueError: pass
-        else:
-            assert ip.is_global, f"Invalid IP used"
-            assert not ip.is_reserved, f"Invalid IP used"
-            assert not ip.is_link_local, f"Invalid IP used"
-            assert not ip.is_private, f"Invalid IP used"
+        # extract out concatenated sites (iterate over a snapshot, the dict is mutated below)
+        for key, val in list(authentication.items()):
+            if not isinstance(key, str) or "," not in key:
+                continue
+            del authentication[key]
+            for site in key.split(","):
+                site = site.strip()
+                if site:  # skip empty names, e.g. from "x.com,"
+                    authentication[site] = val
+
+        config['authentication'] = authentication
+        return config

    # Helper Properties

--- a/src/auto_archiver/core/validators.py
+++ b/src/auto_archiver/core/validators.py
@@ -1,6 +1,7 @@
 # used as validators for config values. Should raise an exception if the value is invalid.
 from pathlib import Path
 import argparse
+import json

 def example_validator(value):
    if "example" not in value:
@@ -16,4 +17,7 @@ def positive_number(value):
 def valid_file(value):
    if not Path(value).is_file():
        raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
-    return value
+    return value
+
+def json_loader(cli_val):
+    """Decode the JSON text in `cli_val` and return the resulting Python object."""
+    decoded = json.loads(cli_val)
+    return decoded
--- a/src/auto_archiver/modules/api_db/manifest.py
+++ b/src/auto_archiver/modules/api_db/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "Auto-Archiver API Database",
+    "name": "Auto Archiver API Database",
    "type": ["database"],
    "entry_point": "api_db::AAApiDb",
    "requires_setup": True,
@@ -39,7 +39,7 @@
        },
    },
    "description": """
-     Provides integration with the Auto-Archiver API for querying and storing archival data.
+     Provides integration with the Auto Archiver API for querying and storing archival data.

 ### Features
 - **API Integration**: Supports querying for existing archives and submitting results.
@@ -49,6 +49,6 @@
 - **Optional Storage**: Archives results conditionally based on configuration.

 ### Setup
-Requires access to an Auto-Archiver API instance and a valid API token.
+Requires access to an Auto Archiver API instance and a valid API token.
     """,
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
        
        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)
+
        # order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
        if auth:
            if 'username' in auth and 'password' in auth:
@@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
                logger.debug(f'Using provided auth cookie for {url}')
                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
            elif 'cookies_from_browser' in auth:
-                logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
+                logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
            elif 'cookies_file' in auth:
-                logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
-                ydl_options['cookiesfile'] = auth['cookies_file']
+                logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
+                ydl_options['cookiefile'] = auth['cookies_file']

        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@@ -15,7 +15,8 @@
        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
            "default": "secrets/service_account.json",
-            "help": "service account JSON file path",
+            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
+            "required": True,
        },
        "columns": {
            "default": {
@@ -34,16 +35,16 @@
                "wacz": "wacz",
                "replaywebpage": "replaywebpage",
            },
-            "help": "names of columns in the google sheet (stringified JSON object)",
-            "type": "auto_archiver.utils.json_loader",
+            "help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
+            "type": "json_loader",
        },
        "allow_worksheets": {
            "default": set(),
-            "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+            "help": "A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed",
        },
        "block_worksheets": {
            "default": set(),
-            "help": "(CSV) explicitly block some worksheets from being processed",
+            "help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
        },
        "use_sheet_names_in_stored_paths": {
            "default": True,
@@ -64,8 +65,10 @@
    - Ensures only rows with valid URLs and unprocessed statuses are included for archival.
    - Supports organizing stored files into folder paths based on sheet and worksheet names.

-    ### Notes
-    - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
-    - Create the sheet using the template provided in the docs.
+    ### Setup
+    - Requires a Google Service Account JSON file for authentication, which by default is read from `secrets/service_account.json` (change it with the `service_account` setting).
+    To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
+    - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
+    - Customize the column names in your Google sheet using the `columns` configuration.
    """,
 }
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
    def setup(self) -> None:
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
-        assert self.sheet or self.sheet_id, (
-            "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
-        )
+        if not self.sheet and not self.sheet_id:
+            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")

    def open_sheet(self):
        if self.sheet:
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -18,7 +18,7 @@
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
-                "type": "auto_archiver.utils.json_loader",
+                "type": "json_loader",
            }
        },
    "description": """
--- a/src/auto_archiver/modules/wacz_enricher/init.py
+++ b/src/auto_archiver/modules/wacz_enricher/init.py
@@ -1 +0,0 @@
-from .wacz_enricher import WaczExtractorEnricher
--- a/src/auto_archiver/modules/wacz_extractor_enricher/init.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/init.py
@@ -0,0 +1 @@
+from .wacz_extractor_enricher import WaczExtractorEnricher
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -1,7 +1,7 @@
 {
-    "name": "WACZ Enricher",
+    "name": "WACZ Enricher (and Extractor)",
    "type": ["enricher", "extractor"],
-    "entry_point": "wacz_enricher::WaczExtractorEnricher",
+    "entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
        "python": [
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "Wayback Machine Enricher",
+    "name": "Wayback Machine Enricher (and Extractor)",
    "type": ["enricher", "extractor"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -61,11 +61,7 @@ def random_str(length: int = 32) -> str:
    return str(uuid.uuid4()).replace("-", "")[:length]


-def json_loader(cli_val):
-    return json.loads(cli_val)
-
-
-def calculate_file_hash(filename: str, hash_algo=hashlib.sha256, chunksize: int = 16000000) -> str:
+def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
    hash = hash_algo()
    with open(filename, "rb") as f:
        while True:
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -1,5 +1,6 @@
 import re
 from urllib.parse import urlparse, urlunparse
+from ipaddress import ip_address


 AUTHWALL_URLS = [
@@ -7,6 +8,43 @@ AUTHWALL_URLS = [
    re.compile(r"https:\/\/www\.instagram\.com"), # instagram
 ]

+
+def check_url_or_raise(url: str) -> bool:
+    """
+    Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
+
+    Returns True when the URL is acceptable and raises ValueError (with a message naming
+    the problem) otherwise; it never returns False.
+    """
+    # this prefix test also guarantees that the parsed scheme is http or https, so no
+    # second scheme check is needed after parsing
+    if not url.startswith(("http://", "https://")):
+        raise ValueError(f"Invalid URL scheme for url {url}")
+
+    parsed = urlparse(url)
+    # a trailing dot ("localhost.", "127.0.0.1.") names the same host, so drop it before
+    # comparing; otherwise it would slip past both the localhost and the IP checks
+    hostname = (parsed.hostname or "").rstrip(".")
+    if not hostname:
+        raise ValueError(f"Invalid URL hostname for url {url}")
+
+    if hostname == "localhost":
+        raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})")
+
+    try:  # special rules for IP addresses
+        ip = ip_address(hostname)
+    except ValueError:
+        # not an IP literal, i.e. a regular host name: nothing more to check
+        return True
+
+    if not ip.is_global:
+        raise ValueError(f"IP address {ip} is not globally reachable")
+    if ip.is_reserved:
+        raise ValueError(f"Reserved IP address {ip} used")
+    if ip.is_link_local:
+        raise ValueError(f"Link-local IP address {ip} used")
+    if ip.is_private:
+        raise ValueError(f"Private IP address {ip} used")
+
+    return True
+
 def domain_for_url(url: str) -> str:
    """
    SECURITY: parse the domain using urllib to avoid any potential security issues
				`@@ -1 +0,0 @@`
				`from .wacz_enricher import WaczExtractorEnricher`
				`@@ -0,0 +1 @@`
				`from .wacz_extractor_enricher import WaczExtractorEnricher`