diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index 9712439..a5f2998 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -1,6 +1,6 @@ # iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table from pathlib import Path -from auto_archiver.core.module import available_modules +from auto_archiver.core.module import ModuleFactory from auto_archiver.core.base_module import BaseModule from ruamel.yaml import YAML import io @@ -41,7 +41,7 @@ def generate_module_docs(): configs_cheatsheet = "\n## Configuration Options\n" configs_cheatsheet += header_row - for module in sorted(available_modules(with_manifest=True), key=lambda x: (x.requires_setup, x.name)): + for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)): # generate the markdown file from the __manifest__.py file. manifest = module.manifest diff --git a/pyproject.toml b/pyproject.toml index 29de7e4..3c64eae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "0.13.3" +version = "0.13.4" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index ae4c41c..78d9a3d 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -3,7 +3,7 @@ """ from .metadata import Metadata from .media import Media -from .module import BaseModule +from .base_module import BaseModule # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index dfdd5ad..50ea3ff 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -1,13 +1,18 @@ -from urllib.parse import urlparse -from typing import Mapping, Any +from __future__ import annotations + +from typing import Mapping, Any, Type, TYPE_CHECKING from abc import ABC from copy import deepcopy, copy from tempfile import TemporaryDirectory from auto_archiver.utils import url as UrlUtil +from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES from loguru import logger +if TYPE_CHECKING: + from .module import ModuleFactory + class BaseModule(ABC): """ @@ -17,41 +22,24 @@ class BaseModule(ABC): however modules can have a .setup() method to run any setup code (e.g. logging in to a site, spinning up a browser etc.) - See BaseModule.MODULE_TYPES for the types of modules you can create, noting that + See consts.MODULE_TYPES for the types of modules you can create, noting that a subclass can be of multiple types. For example, a module that extracts data from a website and stores it in a database would be both an 'extractor' and a 'database' module. Each module is a python package, and should have a __manifest__.py file in the same directory as the module file. The __manifest__.py specifies the module information - like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the + like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the default manifest structure. """ - MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' - ] - - _DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName - 'version': '1.0', # the version of the module - 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line -} + MODULE_TYPES = CONF_MODULE_TYPES + # NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method config: Mapping[str, Any] authentication: Mapping[str, Mapping[str, str]] name: str + module_factory: ModuleFactory # this is set by the orchestrator prior to archiving tmp_dir: TemporaryDirectory = None diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 322ef6e..c3bc706 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from loguru import logger from copy import deepcopy -from .module import BaseModule +from auto_archiver.core.consts import MODULE_TYPES from typing import Any, List, Type, Tuple @@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load(""" # Auto Archiver Configuration # Steps are the modules that will be run in the order they are defined -steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \ +steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \ """ # Global configuration diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py new file mode 100644 index 0000000..0fb81fb --- /dev/null +++ b/src/auto_archiver/core/consts.py @@ -0,0 +1,23 @@ + +MODULE_TYPES = [ + 'feeder', + 'extractor', + 'enricher', + 'database', + 'storage', + 'formatter' +] + +MANIFEST_FILE = "__manifest__.py" + +DEFAULT_MANIFEST = { + 'name': '', # the display name of the module + 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! + 'type': [], # the type of the module, can be one or more of MODULE_TYPES + 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare + 'description': '', # a description of the module + 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format + 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName + 'version': '1.0', # the version of the module + 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line +} \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index c81e26a..9556621 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin from __future__ import annotations from dataclasses import dataclass -from typing import List +from typing import List, TYPE_CHECKING import shutil import ast import copy @@ -16,99 +16,113 @@ import os from os.path import join from loguru import logger import auto_archiver -from .base_module import BaseModule +from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE -_LAZY_LOADED_MODULES = {} - -MANIFEST_FILE = "__manifest__.py" +if TYPE_CHECKING: + from .base_module import BaseModule -def setup_paths(paths: list[str]) -> None: - """ - Sets up the paths for the modules to be loaded from - - This is necessary for the modules to be imported correctly - - """ - for path in paths: - # check path exists, if it doesn't, log a warning - if not os.path.exists(path): - logger.warning(f"Path '{path}' does not exist. Skipping...") - continue +HAS_SETUP_PATHS = False - # see odoo/module/module.py -> initialize_sys_path - if path not in auto_archiver.modules.__path__: - auto_archiver.modules.__path__.append(path) +class ModuleFactory: - # sort based on the length of the path, so that the longest path is last in the list - auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) + def __init__(self): + self._lazy_modules = {} -def get_module(module_name: str, config: dict) -> BaseModule: - """ - Gets and sets up a module using the provided config - - This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) - - """ - return get_module_lazy(module_name).load(config) + def setup_paths(self, paths: list[str]) -> None: + """ + Sets up the paths for the modules to be loaded from + + This is necessary for the modules to be imported correctly + + """ + global HAS_SETUP_PATHS -def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: - """ - Lazily loads a module, returning a LazyBaseModule - - This has all the information about the module, but does not load the module itself or its dependencies - - To load an actual module, call .setup() on a lazy module - - """ - if module_name in _LAZY_LOADED_MODULES: - return _LAZY_LOADED_MODULES[module_name] - - available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) - if not available: - raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?") - return available[0] - -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: - - # search through all valid 'modules' paths. Default is 'modules' in the current directory - - # see odoo/modules/module.py -> get_modules - def is_really_module(module_path): - if os.path.isfile(join(module_path, MANIFEST_FILE)): - return True - - all_modules = [] - - for module_folder in auto_archiver.modules.__path__: - # walk through each module in module_folder and check if it has a valid manifest - try: - possible_modules = os.listdir(module_folder) - except FileNotFoundError: - logger.warning(f"Module folder {module_folder} does not exist") - continue - - for possible_module in possible_modules: - if limit_to_modules and possible_module not in limit_to_modules: + for path in paths: + # check path exists, if it doesn't, log a warning + if not os.path.exists(path): + logger.warning(f"Path '{path}' does not exist. Skipping...") continue - possible_module_path = join(module_folder, possible_module) - if not is_really_module(possible_module_path): + # see odoo/module/module.py -> initialize_sys_path + if path not in auto_archiver.modules.__path__: + if HAS_SETUP_PATHS == True: + logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \ + This could lead to unexpected behaviour. It is recommended to only use a single modules path. \ + If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).") + auto_archiver.modules.__path__.append(path) + + # sort based on the length of the path, so that the longest path is last in the list + auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) + + HAS_SETUP_PATHS = True + + def get_module(self, module_name: str, config: dict) -> BaseModule: + """ + Gets and sets up a module using the provided config + + This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) + + """ + return self.get_module_lazy(module_name).load(config) + + def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: + """ + Lazily loads a module, returning a LazyBaseModule + + This has all the information about the module, but does not load the module itself or its dependencies + + To load an actual module, call .setup() on a lazy module + + """ + if module_name in self._lazy_modules: + return self._lazy_modules[module_name] + + available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) + if not available: + raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?") + return available[0] + + def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + + # search through all valid 'modules' paths. Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(module_path): + if os.path.isfile(join(module_path, MANIFEST_FILE)): + return True + + all_modules = [] + + for module_folder in auto_archiver.modules.__path__: + # walk through each module in module_folder and check if it has a valid manifest + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") continue - if _LAZY_LOADED_MODULES.get(possible_module): - continue - lazy_module = LazyBaseModule(possible_module, possible_module_path) - _LAZY_LOADED_MODULES[possible_module] = lazy_module + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue - all_modules.append(lazy_module) - - if not suppress_warnings: - for module in limit_to_modules: - if not any(module == m.name for m in all_modules): - logger.warning(f"Module '{module}' not found. Are you sure it's installed?") + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): + continue + if self._lazy_modules.get(possible_module): + continue + lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self) - return all_modules + self._lazy_modules[possible_module] = lazy_module + + all_modules.append(lazy_module) + + if not suppress_warnings: + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module '{module}' not found. Are you sure it's installed?") + + return all_modules @dataclass class LazyBaseModule: @@ -123,14 +137,16 @@ class LazyBaseModule: type: list description: str path: str + module_factory: ModuleFactory _manifest: dict = None _instance: BaseModule = None _entry_point: str = None - def __init__(self, module_name, path): + def __init__(self, module_name, path, factory: ModuleFactory): self.name = module_name self.path = path + self.module_factory = factory @property def entry_point(self): @@ -161,7 +177,7 @@ class LazyBaseModule: return self._manifest # print(f"Loading manifest for module {module_path}") # load the manifest file - manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST) + manifest = copy.deepcopy(DEFAULT_MANIFEST) with open(join(self.path, MANIFEST_FILE)) as f: try: @@ -189,13 +205,14 @@ class LazyBaseModule: # clear out any empty strings that a user may have erroneously added continue if not check(dep): - logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ + Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") exit(1) def check_python_dep(dep): # first check if it's a module: try: - m = get_module_lazy(dep, suppress_warnings=True) + m = self.module_factory.get_module_lazy(dep, suppress_warnings=True) try: # we must now load this module and set it up with the config m.load(config) @@ -230,19 +247,21 @@ class LazyBaseModule: __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) # finally, get the class instance instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = self.name - - if not getattr(instance, 'display_name', None): - instance.display_name = self.display_name - - self._instance = instance + # set the name, display name and module factory + instance.name = self.name + instance.display_name = self.display_name + instance.module_factory = self.module_factory + # merge the default config with the user config default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) + config[self.name] = default_config | config.get(self.name, {}) instance.config_setup(config) instance.setup() + + # save the instance for future easy loading + self._instance = instance return instance def __repr__(self): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 208512a..10d9215 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,7 +5,7 @@ """ from __future__ import annotations -from typing import Generator, Union, List, Type +from typing import Generator, Union, List, Type, TYPE_CHECKING from urllib.parse import urlparse from ipaddress import ip_address from copy import copy @@ -22,12 +22,14 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media from auto_archiver.version import __version__ from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser -from .module import available_modules, LazyBaseModule, get_module, setup_paths +from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher -from .module import BaseModule - +from .consts import MODULE_TYPES from loguru import logger +if TYPE_CHECKING: + from .base_module import BaseModule + from .module import LazyBaseModule DEFAULT_CONFIG_FILE = "orchestration.yaml" @@ -95,6 +97,12 @@ class UniqueAppendAction(argparse.Action): class ArchivingOrchestrator: + # instance variables + module_factory: ModuleFactory + setup_finished: bool + logger_id: int + + # instance variables, used for convenience to access modules by step feeders: List[Type[Feeder]] extractors: List[Type[Extractor]] enrichers: List[Type[Enricher]] @@ -102,6 +110,11 @@ class ArchivingOrchestrator: storages: List[Type[Storage]] formatters: List[Type[Formatter]] + def __init__(self): + self.module_factory = ModuleFactory() + self.setup_finished = False + self.logger_id = None + def setup_basic_parser(self): parser = argparse.ArgumentParser( prog="auto-archiver", @@ -133,7 +146,7 @@ class ArchivingOrchestrator: ) self.add_modules_args(modules_parser) cli_modules, unused_args = modules_parser.parse_known_args(unused_args) - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", []) parser = DefaultValidatingParser( @@ -155,15 +168,15 @@ class ArchivingOrchestrator: # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? enabled_modules = [] # first loads the modules from the config file, then from the command line - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) # clear out duplicates, but keep the order enabled_modules = list(dict.fromkeys(enabled_modules)) - avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True) + avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True) self.add_individual_module_args(avail_modules, parser) elif basic_config.mode == 'simple': - simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] + simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup] self.add_individual_module_args(simple_modules, parser) # for simple mode, we use the cli_feeder and any modules that don't require setup @@ -176,7 +189,7 @@ class ArchivingOrchestrator: yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode - self.add_individual_module_args(available_modules(with_manifest=True), parser) + self.add_individual_module_args(self.module_factory.available_modules(), parser) parser.set_defaults(**to_dot_notation(yaml_config)) @@ -206,7 +219,7 @@ class ArchivingOrchestrator: parser = self.parser # Module loading from the command line - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction) def add_additional_args(self, parser: argparse.ArgumentParser = None): @@ -232,7 +245,7 @@ class ArchivingOrchestrator: def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: if not modules: - modules = available_modules(with_manifest=True) + modules = self.module_factory.available_modules() for module in modules: @@ -274,11 +287,18 @@ class ArchivingOrchestrator: def setup_logging(self, config): # setup loguru logging - logger.remove(0) # remove the default logger + try: + logger.remove(0) # remove the default logger + except ValueError: + pass + logging_config = config['logging'] - logger.add(sys.stderr, level=logging_config['level']) - if log_file := logging_config['file']: - logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) + + # add other logging info + if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 + self.logger_id = logger.add(sys.stderr, level=logging_config['level']) + if log_file := logging_config['file']: + logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) def install_modules(self, modules_by_type): """ @@ -288,7 +308,7 @@ class ArchivingOrchestrator: """ invalid_modules = [] - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: step_items = [] modules_to_load = modules_by_type[f"{module_type}s"] @@ -333,7 +353,7 @@ class ArchivingOrchestrator: if module in invalid_modules: continue try: - loaded_module: BaseModule = get_module(module, self.config) + loaded_module: BaseModule = self.module_factory.get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") if module_type == 'extractor' and loaded_module.name == module: @@ -359,14 +379,17 @@ class ArchivingOrchestrator: def setup_config(self, args: list) -> dict: """ Sets up the configuration file, merging the default config with the user's config + + This function should only ever be run once. """ + self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) basic_config, unused_args = self.basic_parser.parse_known_args(args) # setup any custom module paths, so they'll show in the help and for arg parsing - setup_paths(basic_config.module_paths) + self.module_factory.setup_paths(basic_config.module_paths) # if help flag was called, then show the help if basic_config.help: @@ -378,16 +401,29 @@ class ArchivingOrchestrator: def setup(self, args: list): """ - Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser + Function to configure all setup of the orchestrator: setup configs and load modules. + + This method should only ever be called once """ + + if self.setup_finished: + logger.warning("The `setup_config()` function should only ever be run once. \ + If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \ + For code implementatations, you should call .setup_config() once then you may call .feed() \ + multiple times to archive multiple URLs.") + return + + self.setup_basic_parser() self.config = self.setup_config(args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") self.install_modules(self.config['steps']) # log out the modules that were loaded - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) + + self.setup_finished = True def _command_line_run(self, args: list) -> Generator[Metadata]: """ diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 15d4705..1535eab 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher -from auto_archiver.core.module import get_module + class Storage(BaseModule): """ @@ -74,7 +74,7 @@ class Storage(BaseModule): filename = random_str(24) elif filename_generator == "static": # load the hash_enricher module - he = get_module(HashEnricher, self.config) + he = self.module_factory.get_module(HashEnricher, self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index ce4e67b..deb4b44 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -10,7 +10,6 @@ from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media from auto_archiver.core import Formatter from auto_archiver.utils.misc import random_str -from auto_archiver.core.module import get_module class HtmlFormatter(Formatter): environment: Environment = None @@ -50,7 +49,7 @@ class HtmlFormatter(Formatter): final_media = Media(filename=html_path, _mimetype="text/html") # get the already instantiated hash_enricher module - he = get_module('hash_enricher', self.config) + he = self.module_factory.get_module('hash_enricher', self.config) if len(hd := he.calculate_hash(final_media.filename)): final_media.set("hash", f"{he.algorithm}:{hd}") diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 917ab85..d63d2ed 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): """ @@ -15,7 +14,7 @@ class WhisperEnricher(Enricher): def setup(self) -> None: self.stores = self.config['steps']['storages'] - self.s3 = get_module("s3_storage", self.config) + self.s3 = self.module_factory.get_module("s3_storage", self.config) if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return diff --git a/tests/conftest.py b/tests/conftest.py index 2927735..a94abcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,7 @@ import hashlib import pytest from auto_archiver.core.metadata import Metadata -from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES +from auto_archiver.core.module import ModuleFactory # Test names inserted into this list will be run last. This is useful for expensive/costly tests # that you only want to run if everything else succeeds (e.g. API calls). The order here is important @@ -22,19 +22,19 @@ TESTS_TO_RUN_LAST = ['test_twitter_api_archiver'] def setup_module(request): def _setup_module(module_name, config={}): + module_factory = ModuleFactory() + if isinstance(module_name, type): # get the module name: # if the class does not have a .name, use the name of the parent folder module_name = module_name.__module__.rsplit(".",2)[-2] - m = get_module(module_name, {module_name: config}) - + m = module_factory.get_module(module_name, {module_name: config}) # add the tmp_dir to the module tmp_dir = TemporaryDirectory() m.tmp_dir = tmp_dir.name - + def cleanup(): - _LAZY_LOADED_MODULES.pop(module_name) tmp_dir.cleanup() request.addfinalizer(cleanup) diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 4b61fc2..c2fe67a 100644 --- a/tests/enrichers/test_hash_enricher.py +++ b/tests/enrichers/test_hash_enricher.py @@ -2,7 +2,7 @@ import pytest from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media -from auto_archiver.core.module import get_module_lazy +from auto_archiver.core.module import ModuleFactory @pytest.mark.parametrize("algorithm, filename, expected_hash", [ ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), @@ -22,7 +22,7 @@ def test_default_config_values(setup_module): def test_config(): # test default config - c = get_module_lazy('hash_enricher').configs + c = ModuleFactory().get_module_lazy('hash_enricher').configs assert c["algorithm"]["default"] == "SHA-256" assert c["chunksize"]["default"] == 16000000 assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] diff --git a/tests/test_modules.py b/tests/test_modules.py index 854edb5..7a2b14d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,24 +1,18 @@ import sys import pytest -from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES +from auto_archiver.core.module import ModuleFactory, LazyBaseModule +from auto_archiver.core.base_module import BaseModule @pytest.fixture def example_module(): import auto_archiver + module_factory = ModuleFactory() + previous_path = auto_archiver.modules.__path__ auto_archiver.modules.__path__.append("tests/data/test_modules/") - module = get_module_lazy("example_module") - yield module - # cleanup - try: - del module._manifest - except AttributeError: - pass - del _LAZY_LOADED_MODULES["example_module"] - sys.modules.pop("auto_archiver.modules.example_module.example_module", None) - auto_archiver.modules.__path__ = previous_path + return module_factory.get_module_lazy("example_module") def test_get_module_lazy(example_module): assert example_module.name == "example_module" @@ -46,12 +40,14 @@ def test_module_dependency_check_loads_module(example_module): # monkey patch the manifest to include a nonexistnet dependency example_module.manifest["dependencies"]["python"] = ["hash_enricher"] + module_factory = example_module.module_factory + loaded_module = example_module.load({}) assert loaded_module is not None # check the dependency is loaded - assert _LAZY_LOADED_MODULES["hash_enricher"] is not None - assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None + assert module_factory._lazy_modules["hash_enricher"] is not None + assert module_factory._lazy_modules["hash_enricher"]._instance is not None def test_load_module(example_module): @@ -69,7 +65,7 @@ def test_load_module(example_module): @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_load_modules(module_name): # test that specific modules can be loaded - module = get_module_lazy(module_name) + module = ModuleFactory().get_module_lazy(module_name) assert module is not None assert isinstance(module, LazyBaseModule) assert module.name == module_name @@ -86,7 +82,7 @@ def test_load_modules(module_name): @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_lazy_base_module(module_name): - lazy_module = get_module_lazy(module_name) + lazy_module = ModuleFactory().get_module_lazy(module_name) assert lazy_module is not None assert isinstance(lazy_module, LazyBaseModule) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index f93f8b8..301e4d9 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.version import __version__ from auto_archiver.core.config import read_yaml, store_yaml -from auto_archiver.core.module import _LAZY_LOADED_MODULES + TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_MODULES = "tests/data/test_modules/" @@ -17,22 +17,7 @@ def test_args(): @pytest.fixture def orchestrator(): - yield ArchivingOrchestrator() - # hack - the loguru logger starts with one logger, but if orchestrator has run before - # it'll remove the default logger, add it back in: - - from loguru import logger - - if not logger._core.handlers.get(0): - logger._core.handlers_count = 0 - logger.add(sys.stderr) - # and remove the custom logger - if logger._core.handlers.get(1): - logger.remove(1) - - # delete out any loaded modules - _LAZY_LOADED_MODULES.clear() - + return ArchivingOrchestrator() @pytest.fixture def basic_parser(orchestrator) -> ArgumentParser: