From b6b085854c0f417101bafd019c8f66949883fe6c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 17:40:51 +0100 Subject: [PATCH 1/2] Switch back to using yaml with dot notation (two simple helper functions to convert between dot and dict notation) --- src/auto_archiver/core/config.py | 64 ++++++++++++++++++++------ src/auto_archiver/core/loader.py | 2 +- src/auto_archiver/core/orchestrator.py | 50 ++++++++++---------- 3 files changed, 77 insertions(+), 39 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index db5b6d2..9709be6 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -4,10 +4,13 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow flexible setup in various environments. """ -import argparse -from configparser import ConfigParser -from dataclasses import dataclass, field +import argparse +import yaml +from dataclasses import dataclass, field +from collections import OrderedDict + +from .loader import MODULE_TYPES # configurable_parents = [ # Feeder, @@ -47,21 +50,56 @@ from dataclasses import dataclass, field # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') # parser.add_argument('--version', action='version', version=__version__) - +EMPTY_CONFIG = { + "steps": dict((f"{module_type}s", []) for module_type in MODULE_TYPES) +} class LoadFromFile (argparse.Action): def __call__ (self, parser, namespace, values, option_string = None): with values as f: # parse arguments in the file and store them in the target namespace parser.parse_args(f.read().split(), namespace) -def read_config(config_filename: str) -> dict: - config = ConfigParser() - config.read(config_filename) - # setup basic format - if 'STEPS' not in config.sections(): - config.add_section("STEPS") +def to_dot_notation(yaml_conf: str) -> 
argparse.ArgumentParser: + dotdict = {} + for step, vals in yaml_conf.pop('steps', {}).items(): + if vals: + dotdict[f"{step}s"] = vals + + def process_subdict(subdict, prefix=""): + for key, value in subdict.items(): + if type(value) == dict: + process_subdict(value, f"{prefix}{key}.") + else: + dotdict[f"{prefix}{key}"] = value + + process_subdict(yaml_conf) + return dotdict + +def merge_dicts(dotdict, yaml_dict): + def process_subdict(subdict, prefix=""): + for key, value in subdict.items(): + if "." in key: + keys = key.split(".") + subdict = yaml_dict + for k in keys[:-1]: + subdict = subdict.setdefault(k, {}) + subdict[keys[-1]] = value + else: + yaml_dict[key] = value + + process_subdict(dotdict) + return yaml_dict + +def read_yaml(yaml_filename: str) -> dict: + + try: + with open(yaml_filename, "r", encoding="utf-8") as inf: + config = yaml.safe_load(inf) + except FileNotFoundError: + config = EMPTY_CONFIG + return config -def store_config(config: ConfigParser, config_filename: str): - with open(config_filename, "w", encoding="utf-8") as outf: - config.write(outf) \ No newline at end of file +def store_yaml(config: dict, yaml_filename: str): + with open(yaml_filename, "w", encoding="utf-8") as outf: + yaml.dump(config, outf, default_flow_style=False) \ No newline at end of file diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index d39f31e..4460349 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -62,7 +62,7 @@ class Module: def load_manifest(module_path): - print(f"Loading manifest for module {module_path}") + # print(f"Loading manifest for module {module_path}") # load the manifest file manifest = copy.deepcopy(_DEFAULT_MANIFEST) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 0a2273f..f8df659 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -9,7 +9,6 @@ from typing import Generator, 
Union, List from urllib.parse import urlparse from ipaddress import ip_address import argparse -import configparser import os from os.path import join, dirname @@ -25,7 +24,7 @@ from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata from ..version import __version__ -from .config import read_config, store_config +from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG from .loader import available_modules, Module, MODULE_TYPES import tempfile, traceback @@ -69,24 +68,23 @@ class ArchivingOrchestrator: parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file') self.basic_parser = parser - def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None: + def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: parser = argparse.ArgumentParser( parents = [self.basic_parser], add_help=False, ) - + self.add_steps_args(parser) + breakpoint() # check what mode we're in # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules - if ini_config: + if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config + # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? 
enabled_modules = [] for module_type in MODULE_TYPES: - try: - enabled_modules.extend(ini_config.get("STEPS", module_type)) - except configparser.NoOptionError: - pass + enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' for module_type in MODULE_TYPES: @@ -100,23 +98,25 @@ class ArchivingOrchestrator: # add them to the config for module in simple_modules: for module_type in module.type: - existing_modules = config['STEPS'] = module.name - ini_config.setdefault(f"{module_type}s", []).append(module.name) - + yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - - parser.set_defaults(**ini_config) + + breakpoint() + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them - self.config, unknown = parser.parse_known_args(unused_args) + parsed, unknown = parser.parse_known_args(unused_args) if unknown: - logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") + + # merge the new config with the old one + yaml_config = merge_dicts(vars(parsed), yaml_config) if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_config(ini_config, basic_config.config_file) + store_yaml(yaml_config, basic_config.config_file) breakpoint() logger.info(f"FEEDER: {self.config.feeders}") logger.info(f"ENRICHERS: {self.config.enrichers}") @@ -179,16 +179,16 @@ class ArchivingOrchestrator: self.show_help() # load the config file - ini_config = {} + yaml_config = {} - 
try: - ini_config = read_config(basic_config.config_file) - except FileNotFoundError: - if basic_config.config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") - exit() + if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE: + logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + exit() - self.setup_complete_parser(basic_config, ini_config, unused_args) + yaml_config = read_yaml(basic_config.config_file) + + + self.setup_complete_parser(basic_config, yaml_config, unused_args) config.parse() From ade5ea0f6f5f8715ea22aa8df32664e905e52b6a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 18:45:58 +0100 Subject: [PATCH 2/2] Tidy up imports + start on loading modules - program now starts much faster --- src/auto_archiver/__init__.py | 6 --- src/auto_archiver/__main__.py | 3 +- src/auto_archiver/core/__init__.py | 4 -- src/auto_archiver/core/config.py | 3 -- src/auto_archiver/core/loader.py | 60 +++++++++++++++++++++-- src/auto_archiver/core/media.py | 9 ++-- src/auto_archiver/core/orchestrator.py | 61 +++++++++++++----------- src/auto_archiver/databases/__init__.py | 8 +--- src/auto_archiver/enrichers/__init__.py | 12 ----- src/auto_archiver/feeders/__init__.py | 4 -- src/auto_archiver/formatters/__init__.py | 3 -- src/auto_archiver/storages/__init__.py | 7 +-- 12 files changed, 97 insertions(+), 83 deletions(-) delete mode 100644 src/auto_archiver/__init__.py diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py deleted file mode 100644 index 307716d..0000000 --- a/src/auto_archiver/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . 
import archivers, databases, enrichers, feeders, formatters, storages, utils, core - -# need to manually specify due to cyclical deps -from .core.orchestrator import ArchivingOrchestrator -# making accessible directly -from .core.metadata import Metadata diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 8b2a65a..d31ec5c 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,6 +1,5 @@ """ Entry point for the auto_archiver package. """ -from . import ArchivingOrchestrator - +from auto_archiver.core.orchestrator import ArchivingOrchestrator def main(): ArchivingOrchestrator().run() diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index b78df83..779d3ac 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,10 +1,6 @@ """ Core modules to handle things such as orchestration, metadata and configs.. """ -from .metadata import Metadata -from .media import Media -from .step import Step -from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 9709be6..f5d9fae 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -61,9 +61,6 @@ class LoadFromFile (argparse.Action): def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: dotdict = {} - for step, vals in yaml_conf.pop('steps', {}).items(): - if vals: - dotdict[f"{step}s"] = vals def process_subdict(subdict, prefix=""): for key, value in subdict.items(): diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 4460349..aa03b1f 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -4,12 +4,14 @@ import os import copy from os.path import join, dirname from typing import List - +from loguru import logger 
+import sys +import shutil MODULE_TYPES = [ 'feeder', 'enricher', - 'archiver', + 'extractor', 'database', 'storage', 'formatter' @@ -59,7 +61,44 @@ class Module: def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" +def load_modules(modules): + modules = available_modules(limit_to_modules=modules, with_manifest=True) + for module in modules: + _load_module(module) +def _load_module(module): + # first make sure that the 'depends' are installed and available in sys.modules + for dependency in module.depends: + if dependency not in sys.modules: + logger.error(f""" + Module {module.name} depends on {dependency} which is not available. + + Have you set up the '{module.name}' module correctly? See the README for more information. + """) + exit() + # then check the external dependencies: 'python' ones must be in sys.modules, 'binary' ones must be available on the path + for dep_type, deps in module.external_dependencies.items(): + if dep_type == 'python': + for dep in deps: + if dep not in sys.modules: + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. + """) + + elif dep_type == 'binary': + for dep in deps: + if not shutil.which(dep): + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. 
+ """) + # finally, load the module + logger.info(f"Loading module {module.display_name}") + module = __import__(module.entry_point, fromlist=[module.entry_point]) + logger.info(f"Module {module.display_name} loaded") def load_manifest(module_path): # print(f"Loading manifest for module {module_path}") @@ -70,7 +109,7 @@ def load_manifest(module_path): manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]: +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]: # search through all valid 'modules' paths. Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -83,7 +122,16 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals for module_folder in default_path + additional_paths: # walk through each module in module_folder and check if it has a valid manifest - for possible_module in os.listdir(module_folder): + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") + continue + + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue + possible_module_path = join(module_folder, possible_module) if not is_really_module(possible_module_path): continue @@ -93,5 +141,9 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals else: manifest = {} all_modules.append(Module(possible_module, possible_module_path, manifest)) + + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module {module} not found in available modules. 
Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index d204a6e..e5026af 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,9 +11,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -import ffmpeg -from ffmpeg._run import Error - from .context import ArchivingContext from loguru import logger @@ -106,6 +103,12 @@ class Media: return self.mimetype.startswith("image") def is_valid_video(self) -> bool: + # Note: this is intentional, to only import ffmpeg here - when the method is called + # this speeds up loading the module. We check that 'ffmpeg' is available on startup + # when we load each manifest file + import ffmpeg + from ffmpeg._run import Error + # checks for video streams with ffmpeg, or min file size for a video # self.is_video() should be used together with this method try: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f8df659..ee3a190 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -16,16 +16,10 @@ from rich_argparse import RichHelpFormatter from .context import ArchivingContext -from ..archivers import Archiver -from ..feeders import Feeder -from ..formatters import Formatter -from ..storages import Storage -from ..enrichers import Enricher -from ..databases import Database from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES +from .loader import available_modules, Module, MODULE_TYPES, load_modules import tempfile, traceback from loguru import logger @@ -74,7 +68,7 @@ class ArchivingOrchestrator: add_help=False, ) self.add_steps_args(parser) - breakpoint() + # check what mode we're in # if 
we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False @@ -91,7 +85,7 @@ class ArchivingOrchestrator: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser) + self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -103,7 +97,7 @@ class ArchivingOrchestrator: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - breakpoint() + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -114,27 +108,30 @@ class ArchivingOrchestrator: # merge the new config with the old one yaml_config = merge_dicts(vars(parsed), yaml_config) - if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): logger.info(f"Storing configuration file to {basic_config.config_file}") store_yaml(yaml_config, basic_config.config_file) - breakpoint() - logger.info(f"FEEDER: {self.config.feeders}") - logger.info(f"ENRICHERS: {self.config.enrichers}") - logger.info(f"ARCHIVERS: {self.config.archivers}") - logger.info(f"DATABASES: {self.config.databases}") - logger.info(f"STORAGES: {self.config.storages}") - logger.info(f"FORMATTER: {self.formatter.name}") + + self.config = yaml_config + + logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders'])) + logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors'])) + logger.info("ENRICHERS: " + ", 
".join(self.config['steps']['enrichers'])) + logger.info("DATABASES: " + ", ".join(self.config['steps']['databases'])) + logger.info("STORAGES: " + ", ".join(self.config['steps']['storages'])) + logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters'])) + return self.config def add_steps_args(self, parser: argparse.ArgumentParser = None): if not parser: parser = self.parser - parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use') - parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use') - parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use') - parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use') - parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use') - parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use') + parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use') + parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use') + parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use') + parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use') + parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use') def add_module_args(self, modules: list[Module] = None, parser: 
argparse.ArgumentParser = None): @@ -165,6 +162,12 @@ class ArchivingOrchestrator: self.basic_parser.print_help() exit() + + def install_modules(self): + modules = set() + [modules.update(*m) for m in self.config['steps'].values()] + + load_modules(modules) def run(self) -> None: self.setup_basic_parser() @@ -187,11 +190,10 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - + breakpoint() self.setup_complete_parser(basic_config, yaml_config, unused_args) - config.parse() - + self.install_modules() for item in self.feed(): pass @@ -201,8 +203,9 @@ class ArchivingOrchestrator: for a in self.all_archivers_for_setup(): a.cleanup() def feed(self) -> Generator[Metadata]: - for item in self.feeder: - yield self.feed_item(item) + for feeder in self.config['steps']['feeders']: + for item in feeder: + yield self.feed_item(item) self.cleanup() def feed_item(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index 4c73896..5aaa679 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -1,10 +1,4 @@ """ Databases are used to store the outputs from running the Autp Archiver. 
-""" -from .database import Database -from .gsheet_db import GsheetsDb -from .console_db import ConsoleDb -from .csv_db import CSVDb -from .api_db import AAApiDb -from .atlos_db import AtlosDb \ No newline at end of file +""" \ No newline at end of file diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py index 64ce248..67cb0e5 100644 --- a/src/auto_archiver/enrichers/__init__.py +++ b/src/auto_archiver/enrichers/__init__.py @@ -10,15 +10,3 @@ Enrichers are optional but highly useful for making the archived data more power """ -from .enricher import Enricher -from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackArchiverEnricher -from .hash_enricher import HashEnricher -from .thumbnail_enricher import ThumbnailEnricher -from .wacz_enricher import WaczArchiverEnricher -from .whisper_enricher import WhisperEnricher -from .pdq_hash_enricher import PdqHashEnricher -from .metadata_enricher import MetadataEnricher -from .meta_enricher import MetaEnricher -from .ssl_enricher import SSLEnricher -from .timestamping_enricher import TimestampingEnricher \ No newline at end of file diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py index 8117672..3eb33d7 100644 --- a/src/auto_archiver/feeders/__init__.py +++ b/src/auto_archiver/feeders/__init__.py @@ -1,7 +1,3 @@ """ Feeders handle the input of media into the Auto Archiver. """ -from.feeder import Feeder -from .gsheet_feeder import GsheetsFeeder -from .cli_feeder import CLIFeeder -from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py index af96f15..1a9dcd0 100644 --- a/src/auto_archiver/formatters/__init__.py +++ b/src/auto_archiver/formatters/__init__.py @@ -1,4 +1 @@ """ Formatters for the output of the content. 
""" -from .formatter import Formatter -from .html_formatter import HtmlFormatter -from .mute_formatter import MuteFormatter \ No newline at end of file diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py index bff83e6..0765833 100644 --- a/src/auto_archiver/storages/__init__.py +++ b/src/auto_archiver/storages/__init__.py @@ -1,8 +1,3 @@ """ This module contains the storage classes for the auto-archiver. -""" -from .storage import Storage -from .s3 import S3Storage -from .local import LocalStorage -from .gd import GDriveStorage -from .atlos import AtlosStorage \ No newline at end of file +""" \ No newline at end of file