Merge branch 'main' into timestamping_rewrite

2026-06-11 12:48:28 +03:00 · 2025-02-24 12:03:14 +00:00
parent d0c379a3ba 5211c5de18
commit 01bf88a695
125 changed files with 4277 additions and 2162 deletions
--- a/src/auto_archiver/main.py
+++ b/src/auto_archiver/main.py
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys

 def main():
-    ArchivingOrchestrator().run(sys.argv[1:])
+    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass

 if __name__ == "__main__":
    main()
--- a/src/auto_archiver/core/init.py
+++ b/src/auto_archiver/core/init.py
@@ -3,7 +3,7 @@
 """
 from .metadata import Metadata
 from .media import Media
-from .module import BaseModule
+from .base_module import BaseModule

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -1,13 +1,18 @@

-from urllib.parse import urlparse
-from typing import  Mapping, Any
+from __future__ import annotations
+
+from typing import  Mapping, Any, Type, TYPE_CHECKING
 from abc import ABC
 from copy import deepcopy, copy
 from tempfile import TemporaryDirectory
 from auto_archiver.utils import url as UrlUtil
+from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES

 from loguru import logger

+if TYPE_CHECKING:
+    from .module import ModuleFactory
+
 class BaseModule(ABC):

    """
@@ -17,41 +22,24 @@ class BaseModule(ABC):
    however modules can have a .setup() method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

-    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
+    See consts.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
-    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
+    like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
    default manifest structure.

    """

-    MODULE_TYPES = [
-        'feeder',
-        'extractor',
-        'enricher',
-        'database',
-        'storage',
-        'formatter'
-    ]
-
-    _DEFAULT_MANIFEST = {
-    'name': '', # the display name of the module
-    'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
-    'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
-    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
-    'description': '', # a description of the module
-    'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
-    'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
-    'version': '1.0', # the version of the module
-    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
-}
+    MODULE_TYPES = CONF_MODULE_TYPES

+    # NOTE: these are declared here as class variables, but they are overridden by the instance variables in the __init__ method
    config: Mapping[str, Any]
    authentication: Mapping[str, Mapping[str, str]]
    name: str
+    module_factory: ModuleFactory

    # this is set by the orchestrator prior to archiving
    tmp_dir: TemporaryDirectory = None
@@ -63,12 +51,6 @@ class BaseModule(ABC):
    def config_setup(self, config: dict):

        authentication = config.get('authentication', {})
-        # extract out concatenated sites
-        for key, val in copy(authentication).items():
-            if "," in key:
-                for site in key.split(","):
-                    authentication[site] = val
-                del authentication[key]

        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules
@@ -89,16 +71,21 @@ class BaseModule(ABC):
        Returns the authentication information for a given site. This is used to authenticate
        with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
        
-        extract_cookies: bool - whether or not to extract cookies from the given browser and return the 
-        cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
+        :param site: the domain of the site to get authentication information for
+        :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).

-        Currently, the dict can have keys of the following types:
-        - username: str - the username to use for login
-        - password: str - the password to use for login
-        - api_key: str - the API key to use for login
-        - api_secret: str - the API secret to use for login
-        - cookie: str - a cookie string to use for login (specific to this site)
-        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
+        :returns: authdict dict of login information for the given site
+
+        **Global options:**\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox') - uses yt-dlp under the hood to extract them\n
+        * cookies_file: str - the path to a cookies file to use for login\n
+
+        **Currently, the sites dict can have keys of the following types:**\n
+        * username: str - the username to use for login\n
+        * password: str - the password to use for login\n
+        * api_key: str - the API key to use for login\n
+        * api_secret: str - the API secret to use for login\n
+        * cookie: str - a cookie string to use for login (specific to this site)\n
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
 from loguru import logger

 from copy import deepcopy
-from .module import BaseModule
+from auto_archiver.core.consts import MODULE_TYPES

 from typing import Any, List, Type, Tuple

@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined

-steps:""" + "".join([f"\n   {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
+steps:""" + "".join([f"\n   {module}s: []" for module in MODULE_TYPES]) + \
 """

 # Global configuration
@@ -48,6 +48,7 @@ authentication: {}

 logging:
  level: INFO
+
 """)
 # note: 'logging' is explicitly added above in order to better format the config file

@@ -128,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                yaml_subdict[key] = value
                continue

+            if key == 'steps':
+                for module_type, modules in value.items():
+                    # overwrite the 'steps' from the config file with the ones from the CLI
+                    yaml_subdict[key][module_type] = modules
+
            if is_dict_type(value):
                update_dict(value, yaml_subdict[key])
            elif is_list_type(value):
@@ -136,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                yaml_subdict[key] = value

    update_dict(from_dot_notation(dotdict), yaml_dict)
-
    return yaml_dict

 def read_yaml(yaml_filename: str) -> CommentedMap:
@@ -158,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
 def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
    config_to_save = deepcopy(config)

+    auth_dict = config_to_save.get("authentication", {})
+    if auth_dict and auth_dict.get('load_from_file'):
+        # remove all other values from the config, don't want to store it in the config file
+        auth_dict = {"load_from_file": auth_dict["load_from_file"]}
+
    config_to_save.pop('urls', None)
    with open(yaml_filename, "w", encoding="utf-8") as outf:
        _yaml.dump(config_to_save, outf)
--- a/src/auto_archiver/core/consts.py
+++ b/src/auto_archiver/core/consts.py
@@ -0,0 +1,23 @@
+
+MODULE_TYPES = [
+    'feeder',
+    'extractor',
+    'enricher',
+    'database',
+    'storage',
+    'formatter'
+]
+
+MANIFEST_FILE = "__manifest__.py"
+
+DEFAULT_MANIFEST = {
+    'name': '', # the display name of the module
+    'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
+    'type': [], # the type of the module, can be one or more of MODULE_TYPES
+    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
+    'description': '', # a description of the module
+    'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
+    'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
+    'version': '1.0', # the version of the module
+    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
+}
--- a/src/auto_archiver/core/database.py
+++ b/src/auto_archiver/core/database.py
@@ -1,3 +1,8 @@
+"""
+Database module for the auto-archiver that defines the interface for implementing database modules
+in the media archiving framework. 
+"""
+
 from __future__ import annotations
 from abc import abstractmethod
 from typing import Union
@@ -5,6 +10,11 @@ from typing import Union
 from auto_archiver.core import Metadata, BaseModule

 class Database(BaseModule):
+    """
+    Base class for implementing database modules in the media archiving framework.
+
+    Subclasses must implement the `fetch` and `done` methods to define platform-specific behavior.
+    """

    def started(self, item: Metadata) -> None:
        """signals the DB that the given item archival has started"""
--- a/src/auto_archiver/core/enricher.py
+++ b/src/auto_archiver/core/enricher.py
@@ -1,5 +1,5 @@
 """
-Enrichers are modular components that enhance archived content by adding
+Base module for Enrichers – modular components that enhance archived content by adding
 context, metadata, or additional processing.

 These add additional information to the context, such as screenshots, hashes, and metadata.
@@ -13,7 +13,16 @@ from abc import abstractmethod
 from auto_archiver.core import Metadata, BaseModule

 class Enricher(BaseModule):
-    """Base classes and utilities for enrichers in the Auto-Archiver system."""
+    """Base classes and utilities for enrichers in the Auto-Archiver system.
+    
+    Enricher modules must implement the `enrich` method to define their behavior.
+    """

    @abstractmethod
-    def enrich(self, to_enrich: Metadata) -> None: pass
+    def enrich(self, to_enrich: Metadata) -> None:
+        """
+        Enriches a Metadata object with additional information or context.
+
+        Takes the metadata object to enrich as an argument and modifies it in place, returning None.
+        """
+        pass
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -29,14 +29,24 @@ class Extractor(BaseModule):
    valid_url: re.Pattern = None

    def cleanup(self) -> None:
-        # called when extractors are done, or upon errors, cleanup any resources
+        """
+        Called when extractors are done, or upon errors, cleanup any resources
+        """
        pass

    def sanitize_url(self, url: str) -> str:
-        # used to clean unnecessary URL parameters OR unfurl redirect links
+        """
+        Used to clean unnecessary URL parameters OR unfurl redirect links
+        """
        return url
    
    def match_link(self, url: str) -> re.Match:
+        """
+        Returns a match object if the given URL matches the valid_url pattern or False/None if not.
+
+        Normally used in the `suitable` method to check if the URL is supported by this extractor.
+
+        """
        return self.valid_url.match(url)

    def suitable(self, url: str) -> bool:
@@ -80,8 +90,8 @@ class Extractor(BaseModule):
            d.raise_for_status()

            # get mimetype from the response headers
-            if not Path(to_filename).suffix:
-                content_type = d.headers.get('Content-Type')
+            if not mimetypes.guess_type(to_filename)[0]:
+                content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
                extension = mimetypes.guess_extension(content_type)
                if extension:
                    to_filename += extension
--- a/src/auto_archiver/core/feeder.py
+++ b/src/auto_archiver/core/feeder.py
@@ -1,3 +1,7 @@
+"""
+The feeder base module defines the interface for implementing feeders in the media archiving framework. 
+"""
+
 from __future__ import annotations
 from abc import abstractmethod
 from auto_archiver.core import Metadata
@@ -5,5 +9,17 @@ from auto_archiver.core import BaseModule

 class Feeder(BaseModule):

+    """
+    Base class for implementing feeders in the media archiving framework.
+
+    Subclasses must implement the `__iter__` method to define platform-specific behavior.
+    """
+
    @abstractmethod
-    def __iter__(self) -> Metadata: return None
+    def __iter__(self) -> Metadata:
+        """
+        Returns an iterator (use `yield`) over the items to be archived.
+        
+        These should be instances of Metadata, typically created with Metadata().set_url(url).
+        """
+        return None
--- a/src/auto_archiver/core/formatter.py
+++ b/src/auto_archiver/core/formatter.py
@@ -1,9 +1,24 @@
+"""
+Base module for formatters – modular components that format metadata into media objects for storage.
+
+The most commonly used formatter is the HTML formatter, which takes metadata and formats it into an HTML file for storage.
+"""
+
 from __future__ import annotations
 from abc import abstractmethod
 from auto_archiver.core import Metadata, Media, BaseModule


 class Formatter(BaseModule):
+    """
+    Base class for implementing formatters in the media archiving framework.
+    
+    Subclasses must implement the `format` method to define their behavior.
+    """

    @abstractmethod
-    def format(self, item: Metadata) -> Media: return None
+    def format(self, item: Metadata) -> Media:
+        """
+        Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
+        """
+        return None
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import List
+from typing import List, TYPE_CHECKING
 import shutil
 import ast
 import copy
@@ -16,99 +16,113 @@ import os
 from os.path import join
 from loguru import logger
 import auto_archiver
-from .base_module import BaseModule
+from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE

-_LAZY_LOADED_MODULES = {}
-
-MANIFEST_FILE = "__manifest__.py"
+if TYPE_CHECKING:
+    from .base_module import BaseModule


-def setup_paths(paths: list[str]) -> None:
-    """
-    Sets up the paths for the modules to be loaded from
-    
-    This is necessary for the modules to be imported correctly
-    
-    """
-    for path in paths:
-        # check path exists, if it doesn't, log a warning
-        if not os.path.exists(path):
-            logger.warning(f"Path '{path}' does not exist. Skipping...")
-            continue
+HAS_SETUP_PATHS = False

-        # see odoo/module/module.py -> initialize_sys_path
-        if path not in auto_archiver.modules.__path__:
-                auto_archiver.modules.__path__.append(path)
+class ModuleFactory:

-    # sort based on the length of the path, so that the longest path is last in the list
-    auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
+    def __init__(self):
+        self._lazy_modules = {}

-def get_module(module_name: str, config: dict) -> BaseModule:
-    """
-    Gets and sets up a module using the provided config
-    
-    This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
-    
-    """
-    return get_module_lazy(module_name).load(config)
+    def setup_paths(self, paths: list[str]) -> None:
+        """
+        Sets up the paths for the modules to be loaded from
+        
+        This is necessary for the modules to be imported correctly
+        
+        """
+        global HAS_SETUP_PATHS

-def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
-    """
-    Lazily loads a module, returning a LazyBaseModule
-    
-    This has all the information about the module, but does not load the module itself or its dependencies
-    
-    To load an actual module, call .setup() on a lazy module
-    
-    """
-    if module_name in _LAZY_LOADED_MODULES:
-        return _LAZY_LOADED_MODULES[module_name]
-
-    available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
-    if not available:
-        raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
-    return available[0]
-
-def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
-    
-    # search through all valid 'modules' paths. Default is 'modules' in the current directory
-
-    # see odoo/modules/module.py -> get_modules
-    def is_really_module(module_path):
-        if os.path.isfile(join(module_path, MANIFEST_FILE)):
-            return True
-
-    all_modules = []
-
-    for module_folder in auto_archiver.modules.__path__:
-        # walk through each module in module_folder and check if it has a valid manifest
-        try:
-            possible_modules = os.listdir(module_folder)
-        except FileNotFoundError:
-            logger.warning(f"Module folder {module_folder} does not exist")
-            continue
-
-        for possible_module in possible_modules:
-            if limit_to_modules and possible_module not in limit_to_modules:
+        for path in paths:
+            # check path exists, if it doesn't, log a warning
+            if not os.path.exists(path):
+                logger.warning(f"Path '{path}' does not exist. Skipping...")
                continue

-            possible_module_path = join(module_folder, possible_module)
-            if not is_really_module(possible_module_path):
+            # see odoo/module/module.py -> initialize_sys_path
+            if path not in auto_archiver.modules.__path__:
+                    if HAS_SETUP_PATHS == True:
+                        logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
+                                       This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
+                                       If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
+                    auto_archiver.modules.__path__.append(path)
+
+        # sort by path length in descending order, so that the longest path comes first in the list
+        auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
+
+        HAS_SETUP_PATHS = True
+
+    def get_module(self, module_name: str, config: dict) -> BaseModule:
+        """
+        Gets and sets up a module using the provided config
+        
+        This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
+        
+        """
+        return self.get_module_lazy(module_name).load(config)
+
+    def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
+        """
+        Lazily loads a module, returning a LazyBaseModule
+        
+        This has all the information about the module, but does not load the module itself or its dependencies
+        
+        To load an actual module, call .load(config) on a lazy module
+        
+        """
+        if module_name in self._lazy_modules:
+            return self._lazy_modules[module_name]
+
+        available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
+        if not available:
+            raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
+        return available[0]
+
+    def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
+        
+        # search through all valid 'modules' paths. Default is 'modules' in the current directory
+
+        # see odoo/modules/module.py -> get_modules
+        def is_really_module(module_path):
+            if os.path.isfile(join(module_path, MANIFEST_FILE)):
+                return True
+
+        all_modules = []
+
+        for module_folder in auto_archiver.modules.__path__:
+            # walk through each module in module_folder and check if it has a valid manifest
+            try:
+                possible_modules = os.listdir(module_folder)
+            except FileNotFoundError:
+                logger.warning(f"Module folder {module_folder} does not exist")
                continue
-            if _LAZY_LOADED_MODULES.get(possible_module):
-                continue
-            lazy_module = LazyBaseModule(possible_module, possible_module_path)

-            _LAZY_LOADED_MODULES[possible_module] = lazy_module
+            for possible_module in possible_modules:
+                if limit_to_modules and possible_module not in limit_to_modules:
+                    continue

-            all_modules.append(lazy_module)
-    
-    if not suppress_warnings:
-        for module in limit_to_modules:
-            if not any(module == m.name for m in all_modules):
-                logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
+                possible_module_path = join(module_folder, possible_module)
+                if not is_really_module(possible_module_path):
+                    continue
+                if self._lazy_modules.get(possible_module):
+                    continue
+                lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)

-    return all_modules
+                self._lazy_modules[possible_module] = lazy_module
+
+                all_modules.append(lazy_module)
+        
+        if not suppress_warnings:
+            for module in limit_to_modules:
+                if not any(module == m.name for m in all_modules):
+                    logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
+
+        return all_modules

@dataclass
 class LazyBaseModule:
@@ -123,14 +137,16 @@ class LazyBaseModule:
    type: list
    description: str
    path: str
+    module_factory: ModuleFactory

    _manifest: dict = None
    _instance: BaseModule = None
    _entry_point: str = None

-    def __init__(self, module_name, path):
+    def __init__(self, module_name, path, factory: ModuleFactory):
        self.name = module_name
        self.path = path
+        self.module_factory = factory

    @property
    def entry_point(self):
@@ -161,7 +177,7 @@ class LazyBaseModule:
            return self._manifest
        # print(f"Loading manifest for module {module_path}")
        # load the manifest file
-        manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
+        manifest = copy.deepcopy(DEFAULT_MANIFEST)

        with open(join(self.path, MANIFEST_FILE)) as f:
            try:
@@ -189,13 +205,14 @@ class LazyBaseModule:
                    # clear out any empty strings that a user may have erroneously added
                    continue
                if not check(dep):
-                    logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
+                    logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
+                                 Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
                    exit(1)

        def check_python_dep(dep):
            # first check if it's a module:
            try:
-                m = get_module_lazy(dep, suppress_warnings=True)
+                m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
                try:
                # we must now load this module and set it up with the config
                    m.load(config)
@@ -230,19 +247,21 @@ class LazyBaseModule:
        __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
        # finally, get the class instance
        instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
-        if not getattr(instance, 'name', None):
-            instance.name = self.name
-
-        if not getattr(instance, 'display_name', None):
-            instance.display_name = self.display_name
-
-        self._instance = instance

+        # set the name, display name and module factory
+        instance.name = self.name
+        instance.display_name = self.display_name
+        instance.module_factory = self.module_factory
+        
        # merge the default config with the user config
        default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
+
        config[self.name] = default_config  | config.get(self.name, {})
        instance.config_setup(config)
        instance.setup()
+
+        # save the instance for future easy loading
+        self._instance = instance
        return instance

    def __repr__(self):
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -5,9 +5,10 @@
 """

 from __future__ import annotations
-from typing import Generator, Union, List, Type
+from typing import Generator, Union, List, Type, TYPE_CHECKING
 from urllib.parse import urlparse
 from ipaddress import ip_address
+from copy import copy
 import argparse
 import os
 import sys
@@ -21,15 +22,18 @@ from rich_argparse import RichHelpFormatter
 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
 from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
-from .module import available_modules, LazyBaseModule, get_module, setup_paths
+from .module import ModuleFactory, LazyBaseModule
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
-from .module import BaseModule
-
+from .consts import MODULE_TYPES
 from loguru import logger

+if TYPE_CHECKING:
+    from .base_module import BaseModule
+    from .module import LazyBaseModule

 DEFAULT_CONFIG_FILE = "orchestration.yaml"

+
 class JsonParseAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        try:
@@ -42,51 +46,85 @@ class AuthenticationJsonParseAction(JsonParseAction):
    def __call__(self, parser, namespace, values, option_string=None):
        super().__call__(parser, namespace, values, option_string)
        auth_dict = getattr(namespace, self.dest)
-        if isinstance(auth_dict, str):
-            # if it's a string
+
+        def load_from_file(path):
            try:
-                with open(auth_dict, 'r') as f:
+                with open(path, 'r') as f:
                    try:
                        auth_dict = json.load(f)
                    except json.JSONDecodeError:
+                        f.seek(0)
                        # maybe it's yaml, try that
                        auth_dict = _yaml.load(f)
+                    if auth_dict.get('authentication'):
+                        auth_dict = auth_dict['authentication']
+                    auth_dict['load_from_file']  = path
+                    return auth_dict
            except:
-                pass
+                return None

+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+        
        if not isinstance(auth_dict, dict):
            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
-        for site, auth in auth_dict.items():
-            if not isinstance(site, str) or not isinstance(auth, dict):
-                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
+        
+        # extract out concatenated sites
+        for key, val in copy(auth_dict).items():
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site] = val
+                del auth_dict[key]
+
        setattr(namespace, self.dest, auth_dict)
+
+
 class UniqueAppendAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
-        if not hasattr(namespace, self.dest):
-            setattr(namespace, self.dest, [])
        for value in values:
            if value not in getattr(namespace, self.dest):
                getattr(namespace, self.dest).append(value)

+
 class ArchivingOrchestrator:

+    # instance variables
+    module_factory: ModuleFactory
+    setup_finished: bool
+    logger_id: int
+
+    # instance variables, used for convenience to access modules by step
    feeders: List[Type[Feeder]]
    extractors: List[Type[Extractor]]
    enrichers: List[Type[Enricher]]
    databases: List[Type[Database]]
    storages: List[Type[Storage]]
    formatters: List[Type[Formatter]]
-    
+
+    def __init__(self):
+        self.module_factory = ModuleFactory()
+        self.setup_finished = False
+        self.logger_id = None
+
    def setup_basic_parser(self):
        parser = argparse.ArgumentParser(
-                prog="auto-archiver",
-                add_help=False,
-                description="""
+            prog="auto-archiver",
+            add_help=False,
+            description="""
                Auto Archiver is a CLI tool to archive media/metadata from online URLs;
                it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
                """,
-                epilog="Check the code at https://github.com/bellingcat/auto-archiver",
-                formatter_class=RichHelpFormatter,
+            epilog="Check the code at https://github.com/bellingcat/auto-archiver",
+            formatter_class=RichHelpFormatter,
        )
        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
        parser.add_argument('--version', action='version', version=__version__)
@@ -100,101 +138,115 @@ class ArchivingOrchestrator:
        return parser

    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
+
+
+        # modules parser to get the overridden 'steps' values
+        modules_parser = argparse.ArgumentParser(
+            add_help=False,
+        )
+        self.add_modules_args(modules_parser)
+        cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
+        for module_type in MODULE_TYPES:
+            yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
+
        parser = DefaultValidatingParser(
            add_help=False,
        )
        self.add_additional_args(parser)

+        # merge command line module args (--feeders, --enrichers etc.) and add them to the config
+
        # check what mode we're in
        # if we have a config file, use that to decide which modules to load
        # if simple, we'll load just the modules that has requires_setup = False
        # if full, we'll load all modules
        # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
        # but should we add them? Or should we just add them to the 'complete' parser?
+
        if yaml_config != EMPTY_CONFIG:
            # only load the modules enabled in config
            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
            # first loads the modules from the config file, then from the command line
-            for config in [yaml_config['steps'], basic_config.__dict__]:
-                for module_type in BaseModule.MODULE_TYPES:
-                    enabled_modules.extend(config.get(f"{module_type}s", []))
+            for module_type in MODULE_TYPES:
+                enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))

            # clear out duplicates, but keep the order
            enabled_modules = list(dict.fromkeys(enabled_modules))
-            avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
-            self.add_module_args(avail_modules, parser)
+            avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
+            self.add_individual_module_args(avail_modules, parser)
        elif basic_config.mode == 'simple':
-            simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
-            self.add_module_args(simple_modules, parser)
+            simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
+            self.add_individual_module_args(simple_modules, parser)

            # for simple mode, we use the cli_feeder and any modules that don't require setup
-            yaml_config['steps']['feeders'] = ['cli_feeder']
-            
+            if not yaml_config['steps']['feeders']:
+                yaml_config['steps']['feeders'] = ['cli_feeder']
+
            # add them to the config
            for module in simple_modules:
                for module_type in module.type:
                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
        else:
            # load all modules, they're not using the 'simple' mode
-            self.add_module_args(available_modules(with_manifest=True), parser)
-
+            self.add_individual_module_args(self.module_factory.available_modules(), parser)
+        
        parser.set_defaults(**to_dot_notation(yaml_config))

        # reload the parser with the new arguments, now that we have them
        parsed, unknown = parser.parse_known_args(unused_args)
-
        # merge the new config with the old one
-        self.config = merge_dicts(vars(parsed), yaml_config)
+        config = merge_dicts(vars(parsed), yaml_config)
+
        # clean out args from the base_parser that we don't want in the config
        for key in vars(basic_config):
-            self.config.pop(key, None)
+            config.pop(key, None)

        # setup the logging
-        self.setup_logging()
+        self.setup_logging(config)

        if unknown:
            logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
-        
-        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
+
+        if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
            logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file)
-        
-        return self.config
+            store_yaml(config, basic_config.config_file)
+
+        return config
    
+    def add_modules_args(self, parser: argparse.ArgumentParser = None):
+        if not parser:
+            parser = self.parser
+
+        # Module loading from the command line
+        for module_type in MODULE_TYPES:
+            parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
+
    def add_additional_args(self, parser: argparse.ArgumentParser = None):
        if not parser:
            parser = self.parser

-
        # allow passing URLs directly on the command line
        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')

-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
-        parser.add_argument('--enrichers', dest='steps.enrichers',  nargs='+', help='the enrichers to use', action=UniqueAppendAction)
-        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
-        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
-        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
-        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
-
        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                                                                            (token, username etc.) that extractors can use to log into \
                                                                            a website. If passing this on the command line, use a JSON string. \
-                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.',\
+                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.',
                                                                            default={},
+                                                                            nargs="?",
                                                                            action=AuthenticationJsonParseAction)
+
        # logging arguments
        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
        parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
        parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)

-
-    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+    def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:

        if not modules:
-            modules = available_modules(with_manifest=True)
-
-        module: LazyBaseModule
+            modules = self.module_factory.available_modules()
+        
        for module in modules:

            if not module.configs:
@@ -224,21 +276,29 @@ class ArchivingOrchestrator:
                arg.should_store = should_store

    def show_help(self, basic_config: dict):
-        # for the help message, we want to load *all* possible modules and show the help
-            # add configs as arg parser arguments
-        
+        # for the help message, we want to load manifests from *all* possible modules and show their help/settings
+        # add configs as arg parser arguments
+
+        self.add_modules_args(self.basic_parser)
        self.add_additional_args(self.basic_parser)
-        self.add_module_args(parser=self.basic_parser)
+        self.add_individual_module_args(parser=self.basic_parser)
        self.basic_parser.print_help()
        self.basic_parser.exit()
-    
-    def setup_logging(self):
+
+    def setup_logging(self, config):
        # setup loguru logging
-        logger.remove(0) # remove the default logger
-        logging_config = self.config['logging']
-        logger.add(sys.stderr, level=logging_config['level'])
-        if log_file := logging_config['file']:
-            logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
+        try:
+            logger.remove(0)  # remove the default logger
+        except ValueError:
+            pass
+
+        logging_config = config['logging']
+
+        # add other logging info
+        if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
+            self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
+            if log_file := logging_config['file']:
+                logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])

    def install_modules(self, modules_by_type):
        """
@@ -246,9 +306,9 @@ class ArchivingOrchestrator:
        orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
        are loaded, the program will exit with an error message.
        """
-        
+
        invalid_modules = []
-        for module_type in BaseModule.MODULE_TYPES:
+        for module_type in MODULE_TYPES:

            step_items = []
            modules_to_load = modules_by_type[f"{module_type}s"]
@@ -273,6 +333,7 @@ class ArchivingOrchestrator:
                        logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
                        exit()
                    # cli_feeder is a pseudo module, it just takes the command line args
+
                    def feed(self) -> Generator[Metadata]:
                        for url in urls:
                            logger.debug(f"Processing URL: '{url}'")
@@ -284,7 +345,6 @@ class ArchivingOrchestrator:
                        '__iter__': feed

                    })()
-  

                    pseudo_module.__iter__ = feed
                    step_items.append(pseudo_module)
@@ -293,7 +353,7 @@ class ArchivingOrchestrator:
                if module in invalid_modules:
                    continue
                try:
-                    loaded_module: BaseModule = get_module(module, self.config)
+                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
                except (KeyboardInterrupt, Exception) as e:
                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
                    if module_type == 'extractor' and loaded_module.name == module:
@@ -308,48 +368,85 @@ class ArchivingOrchestrator:

            check_steps_ok()
            setattr(self, f"{module_type}s", step_items)
-    
+
    def load_config(self, config_file: str) -> dict:
        if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
            logger.error(f"The configuration file {config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
            exit()

        return read_yaml(config_file)
+    
+    def setup_config(self, args: list) -> dict:
+        """
+        Sets up the configuration file, merging the default config with the user's config
+
+        This function should only ever be run once.
+        """

-    def run(self, args: list) -> None:
-        
        self.setup_basic_parser()

        # parse the known arguments for now (basically, we want the config file)
        basic_config, unused_args = self.basic_parser.parse_known_args(args)

        # setup any custom module paths, so they'll show in the help and for arg parsing
-        setup_paths(basic_config.module_paths)
+        self.module_factory.setup_paths(basic_config.module_paths)

        # if help flag was called, then show the help
        if basic_config.help:
            self.show_help(basic_config)
-
+        # merge command line --feeder etc. args with what's in the yaml config
        yaml_config = self.load_config(basic_config.config_file)
-        self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+    def setup(self, args: list):
+        """
+        Function to configure all setup of the orchestrator: setup configs and load modules.
+        
+        This method should only ever be called once
+        """
+
+        if self.setup_finished:
+            logger.warning("The `setup_config()` function should only ever be run once. \
+                           If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
+                           For code implementatations, you should call .setup_config() once then you may call .feed() \
+                           multiple times to archive multiple URLs.")
+            return
+
+        self.setup_basic_parser()
+        self.config = self.setup_config(args)

        logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
        self.install_modules(self.config['steps'])

        # log out the modules that were loaded
-        for module_type in BaseModule.MODULE_TYPES:
+        for module_type in MODULE_TYPES:
            logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
+        
+        self.setup_finished = True

-        for _ in self.feed():
-            pass
+    def _command_line_run(self, args: list) -> Generator[Metadata]:
+        """
+        This is the main entry point for the orchestrator, when run from the command line.

-    def cleanup(self)->None:
+        :param args: list of arguments to pass to the orchestrator - these are the command line args
+        
+        You should not call this method from code implementations.
+          
+        This method sets up the configuration, loads the modules, and runs the feed.
+        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
+        To test configurations, without loading any modules you can also first call 'setup_config'
+        """
+        self.setup(args)
+        return self.feed()
+
+    def cleanup(self) -> None:
        logger.info("Cleaning up")
        for e in self.extractors:
            e.cleanup()

    def feed(self) -> Generator[Metadata]:
-
+        
        url_count = 0
        for feeder in self.feeders:
            for item in feeder:
@@ -393,7 +490,6 @@ class ArchivingOrchestrator:
                    m.tmp_dir = None
                tmp_dir.cleanup()

-
    def archive(self, result: Metadata) -> Union[Metadata, None]:
        """
            Runs the archiving process for a single URL
@@ -440,13 +536,13 @@ class ArchivingOrchestrator:
            try:
                result.merge(a.download(result))
                if result.is_success(): break
-            except Exception as e: 
+            except Exception as e:
                logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")

        # 4 - call enrichers to work with archived content
        for e in self.enrichers:
            try: e.enrich(result)
-            except Exception as exc: 
+            except Exception as exc:
                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

        # 5 - store all downloaded/generated media
@@ -474,13 +570,13 @@ class ArchivingOrchestrator:
        Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
        """
        assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
-        
+
        parsed = urlparse(url)
        assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
        assert parsed.hostname, f"Invalid URL hostname"
        assert parsed.hostname != "localhost", f"Invalid URL"

-        try: # special rules for IP addresses
+        try:  # special rules for IP addresses
            ip = ip_address(parsed.hostname)
        except ValueError: pass
        else:
@@ -489,9 +585,8 @@ class ArchivingOrchestrator:
            assert not ip.is_link_local, f"Invalid IP used"
            assert not ip.is_private, f"Invalid IP used"

-
    # Helper Properties
-    
+
    @property
    def all_modules(self) -> List[Type[BaseModule]]:
-        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
+        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -1,3 +1,7 @@
+"""
+Base module for Storage modules – modular components that store media objects in various locations.
+"""
+
 from __future__ import annotations
 from abc import abstractmethod
 from typing import IO
@@ -10,8 +14,14 @@ from auto_archiver.utils.misc import random_str

 from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
-from auto_archiver.core.module import get_module
+
 class Storage(BaseModule):
+    
+    """
+    Base class for implementing storage modules in the media archiving framework.
+
+    Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
+    """

    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
        if media.is_stored(in_storage=self): 
@@ -22,10 +32,18 @@ class Storage(BaseModule):
        media.add_url(self.get_cdn_url(media))

    @abstractmethod
-    def get_cdn_url(self, media: Media) -> str: pass
+    def get_cdn_url(self, media: Media) -> str:
+        """
+        Returns the URL of the media object stored in the CDN.
+        """
+        pass

    @abstractmethod
-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
+        """
+        Uploads (or saves) a file to the storage service/location.
+        """
+        pass

    def upload(self, media: Media, **kwargs) -> bool:
        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
@@ -56,7 +74,7 @@ class Storage(BaseModule):
            filename = random_str(24)
        elif filename_generator == "static":
            # load the hash_enricher module
-            he = get_module(HashEnricher, self.config)
+            he = self.module_factory.get_module(HashEnricher, self.config)
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
        else:
--- a/src/auto_archiver/modules/atlos_db/init.py
+++ b/src/auto_archiver/modules/atlos_db/init.py
@@ -1 +1 @@
-from atlos_db import AtlosDb
+from .atlos_db import AtlosDb
--- a/src/auto_archiver/modules/atlos_db/manifest.py
+++ b/src/auto_archiver/modules/atlos_db/manifest.py
@@ -11,6 +11,8 @@
        "api_token": {
            "default": None,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+            "required": True,
+            "type": "str",
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
--- a/src/auto_archiver/modules/atlos_db/base_configs.py
+++ b/src/auto_archiver/modules/atlos_db/base_configs.py
@@ -1,13 +0,0 @@
-def get_atlos_config_options():
-    return {
-        "api_token": {
-            "default": None,
-            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-            "type": str
-        },
-        "atlos_url": {
-            "default": "https://platform.atlos.org",
-            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-            "type": str
-        },
-    }
--- a/src/auto_archiver/modules/atlos_storage/init.py
+++ b/src/auto_archiver/modules/atlos_storage/init.py
@@ -0,0 +1 @@
+from .atlos_storage import AtlosStorage
--- a/src/auto_archiver/modules/atlos_storage/manifest.py
+++ b/src/auto_archiver/modules/atlos_storage/manifest.py
@@ -0,0 +1,32 @@
+{
+    "name": "Atlos Storage",
+    "type": ["storage"],
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru", "boto3"],
+        "bin": []
+    },
+    "description": """
+    Stores media files in a [Atlos](https://www.atlos.org/).
+
+    ### Features
+    - Saves media files to Atlos, organizing them into folders based on the provided path structure.
+
+    ### Notes
+    - Requires setup with Atlos credentials.
+    - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
+    """,
+    "configs": {
+        "api_token": {
+            "default": None,
+            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+            "required": True,
+            "type": "str"
+        },
+        "atlos_url": {
+            "default": "https://platform.atlos.org",
+            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+            "type": "str"
+        },
+    }
+}
--- a/src/auto_archiver/modules/gdrive_storage/manifest.py
+++ b/src/auto_archiver/modules/gdrive_storage/manifest.py
@@ -32,7 +32,6 @@
    
    GDriveStorage: A storage module for saving archived content to Google Drive.

-    Author: Dave Mateer, (And maintained by: )
    Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python

    ### Features
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -39,11 +39,11 @@ class Bluesky(GenericDropin):
        for image_media in image_medias:
            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
            image_media = archiver.download_from_url(url)
-            media.append(image_media)
+            media.append(Media(image_media))
        for video_media in video_medias:
            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
            video_media = archiver.download_from_url(url)
-            media.append(video_media)
+            media.append(Media(video_media))
        return media


--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -8,7 +8,8 @@ class Facebook(GenericDropin):
            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
        webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))

-        post_data = ie_instance._extract_from_url.extract_metadata(webpage)
+        # TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
+        post_data = ie_instance._extract_metadata(webpage)
        return post_data
    
    def create_metadata(self, post: dict, ie_instance, archiver, url):
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,6 +1,6 @@
 import datetime, os, yt_dlp, pysubs2
 import importlib
-from typing import Type
+from typing import Generator, Type
 from yt_dlp.extractor.common import InfoExtractor

 from loguru import logger
@@ -11,7 +11,7 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

-    def suitable_extractors(self, url: str) -> list[str]:
+    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
@@ -116,7 +116,7 @@ class GenericExtractor(Extractor):

    def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
        """
-        Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
+        Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
        """

        ie_instance = info_extractor(downloader=ydl)
@@ -266,6 +266,11 @@ class GenericExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

+        #TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
+        if url.startswith("https://ya.ru"):
+            url = url.replace("https://ya.ru", "https://yandex.ru")
+            item.set("replaced_url", url)
+

        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
                       'quiet': False, 'noplaylist': not self.allow_playlist ,
@@ -275,7 +280,7 @@ class GenericExtractor(Extractor):
        
        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)
-        # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
+        # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
        if auth:
            if 'username' in auth and 'password' in auth:
                logger.debug(f'Using provided auth username and password for {url}')
@@ -284,7 +289,7 @@ class GenericExtractor(Extractor):
            elif 'cookie' in auth:
                logger.debug(f'Using provided auth cookie for {url}')
                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
-            elif 'cookie_from_browser' in auth:
+            elif 'cookies_from_browser' in auth:
                logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
            elif 'cookies_file' in auth:
--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@@ -10,7 +10,7 @@
        "sheet": {"default": None, "help": "name of the sheet to archive"},
        "sheet_id": {
            "default": None,
-            "help": "(alternative to sheet name) the id of the sheet to archive",
+            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -9,9 +9,7 @@ import base64
 from auto_archiver.version import __version__
 from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
-from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
-from auto_archiver.core.module import get_module

 class HtmlFormatter(Formatter):
    environment: Environment = None
@@ -51,7 +49,7 @@ class HtmlFormatter(Formatter):
        final_media = Media(filename=html_path, _mimetype="text/html")

        # get the already instantiated hash_enricher module
-        he = get_module('hash_enricher', self.config)
+        he = self.module_factory.get_module('hash_enricher', self.config)
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")

--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
@@ -200,7 +200,7 @@
                el.innerHTML = decodeCertificate(certificate);

                let cyberChefUrl =
-                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
+                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
                // create a new anchor with this url and append after the code
                let a = document.createElement("a");
                a.href = cyberChefUrl;
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
            chat, since_id = self._send_url_to_bot(url)
            message = self._process_messages(chat, since_id, tmp_dir, result)

+            # This may be outdated and replaced by the below message, but keeping until confirmed
            if "You must enter a URL to a post" in message:
                logger.debug(f"invalid link {url=} for {self.name}: {message}")
                return False
-            # # TODO: It currently returns this as a success - is that intentional?
-            # if "Media not found or unavailable" in message:
-            #     logger.debug(f"invalid link {url=} for {self.name}: {message}")
-            #     return False
+
+            if "Media not found or unavailable" in message:
+                logger.debug(f"No media found for link {url=} for {self.name}: {message}")
+                return False

            if message:
                result.set_content(message).set_title(message[:128])
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -4,7 +4,6 @@
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "selenium"],
-        "bin": ["chromedriver"]
    },
    "configs": {
            "width": {"default": 1280, "help": "width of the screenshots"},
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata

 class ScreenshotEnricher(Enricher):

+    def __init__(self, webdriver_factory=None):
+        super().__init__()
+        self.webdriver_factory = webdriver_factory or Webdriver
+
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):

        logger.debug(f"Enriching screenshot for {url=}")
        auth = self.auth_for_site(url)
-        with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
+        with self.webdriver_factory(
+                self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
                       http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
            try:
                driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+
--- a/src/auto_archiver/modules/telegram_extractor/manifest.py
+++ b/src/auto_archiver/modules/telegram_extractor/manifest.py
@@ -20,5 +20,6 @@
 - Processes HTML content of messages to retrieve embedded media.
 - Sets structured metadata, including timestamps, content, and media details.
 - Does not require user authentication for Telegram.
+
    """,
 }
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -1,5 +1,5 @@
 {
-    "name": "telethon_extractor",
+    "name": "Telethon Extractor",
    "type": ["extractor"],
    "requires_setup": True,
    "dependencies": {
@@ -40,5 +40,9 @@ To use the `TelethonExtractor`, you must configure the following:
 - **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
 - **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.

+### First Time Login
+The first time you run this module, you will be prompted to authenticate with the associated phone number. Alternatively, you can put your `anon.session` file in the root.
+
+
 """
 }
--- a/src/auto_archiver/modules/thumbnail_enricher/manifest.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/manifest.py
@@ -7,8 +7,12 @@
        "bin": ["ffmpeg"]
    },
    "configs": {
-            "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
-            "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
+            "thumbnails_per_minute": {"default": 60,
+                                      "type": "int",
+                                      "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
+            "max_thumbnails": {"default": 16,
+                               "type": "int",
+                               "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
        },
    "description": """
    Generates thumbnails for video files to provide visual previews.
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
                        logger.error(f"error getting duration of video {m.filename}: {e}")
                        return

-                num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
+                num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
                timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]

                thumbnails_media = []
--- a/src/auto_archiver/modules/wacz_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_enricher/manifest.py
@@ -1,6 +1,6 @@
 {
    "name": "WACZ Enricher",
-    "type": ["enricher", "archiver"],
+    "type": ["enricher", "extractor"],
    "entry_point": "wacz_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor):
                to_enrich.add_media(m, warc_fn)
                counter += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
+        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
--- a/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/manifest.py
@@ -1,6 +1,6 @@
 {
    "name": "Wayback Machine Enricher",
-    "type": ["enricher", "archiver"],
+    "type": ["enricher", "extractor"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -4,7 +4,6 @@ from loguru import logger

 from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, Media
-from auto_archiver.core.module import get_module

 class WhisperEnricher(Enricher):
    """
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):

    def setup(self) -> None:
        self.stores = self.config['steps']['storages']
-        self.s3 = get_module("s3_storage", self.config)
+        self.s3 = self.module_factory.get_module("s3_storage", self.config)
        if not "s3_storage" in self.stores:
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                # TODO: this used to pass all storage items to store now
-                # Now only passing S3, the rest will get added later in the usual order (?)
+                # Only storing to S3 here; the remaining storages should be added later in the usual order (unconfirmed)
                m.store(url=url, metadata=to_enrich, storages=[self.s3])
                try:
                    job_id = self.submit_job(m)
--- a/src/auto_archiver/utils/init.py
+++ b/src/auto_archiver/utils/init.py
@@ -2,7 +2,6 @@
 # we need to explicitly expose the available imports here
 from .misc import *
 from .webdriver import Webdriver
-from .atlos import get_atlos_config_options

 # handy utils from ytdlp
 from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)
--- a/src/auto_archiver/utils/atlos.py
+++ b/src/auto_archiver/utils/atlos.py
@@ -1,13 +0,0 @@
-def get_atlos_config_options():
-    return {
-        "api_token": {
-            "default": None,
-            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-            "cli_set": lambda cli_val, _: cli_val
-        },
-        "atlos_url": {
-            "default": "https://platform.atlos.org",
-            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-            "cli_set": lambda cli_val, _: cli_val
-        },
-    }
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -46,7 +46,7 @@ def dump_payload(p):


 def update_nested_dict(dictionary, update_dict):
-    # takes 2 dicts and overwrites the first with the second only on the changed balues
+    # takes 2 dicts and overwrites the first with the second only on the changed values
    for key, value in update_dict.items():
        if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
            update_nested_dict(dictionary[key], value)