Get module loading working properly

This commit is contained in:
Patrick Robertson
2025-01-22 23:54:21 +01:00
parent ade5ea0f6f
commit 550097ab7b
8 changed files with 100 additions and 131 deletions

View File

@@ -19,22 +19,12 @@ from ..core import Metadata, Step, ArchivingContext
@dataclass
class Archiver(Step):
class Archiver:
"""
Base class for implementing archivers in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
name = "archiver"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Archiver:
# only for typing...
return Step.init(name, config, Archiver)
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
pass

View File

@@ -1,6 +1,10 @@
""" Core modules to handle things such as orchestration, metadata and configs..
"""
from .metadata import Metadata
from .media import Media
from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -1,5 +1,7 @@
import ast
from dataclasses import dataclass, field
from typing import Type
from importlib.util import find_spec
from dataclasses import dataclass
import os
import copy
from os.path import join, dirname
@@ -22,9 +24,8 @@ _DEFAULT_MANIFEST = {
'name': '',
'author': 'Bellingcat',
'requires_setup': True,
'depends': [],
'description': '',
'external_dependencies': {},
'dependencies': {},
'entry_point': '',
'version': '1.0',
'configs': {}
@@ -35,9 +36,7 @@ class Module:
name: str
display_name: str
type: list
entry_point: str
depends: list
external_dependencies: dict
dependencies: dict
requires_setup: bool
configs: dict
description: str
@@ -51,54 +50,47 @@ class Module:
if manifest:
self.display_name = manifest['name']
self.type = manifest['type']
self.entry_point = manifest['entry_point']
self.depends = manifest['depends']
self.external_dependencies = manifest['external_dependencies']
self._entry_point = manifest['entry_point']
self.dependencies = manifest['dependencies']
self.requires_setup = manifest['requires_setup']
self.configs = manifest['configs']
self.description = manifest['description']
@property
def entry_point(self):
if not self._entry_point:
# try to create the entry point from the module name
self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
return self._entry_point
def __repr__(self):
return f"Module<'{self.display_name}' ({self.name})>"
def load_modules(modules):
modules = available_modules(limit_to_modules=modules, with_manifest=True)
for module in modules:
_load_module(module)
def load_module(module: str) -> object: # TODO: change return type to Step
# load a module by name
module = get_module(module)
if not module:
return None
# check external dependencies are installed
def check_deps(deps, check):
for dep in deps:
if not check(dep):
logger.error(f"Module '{module.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{module.name}' module? See the README for more information.")
exit(1)
check_deps(module.dependencies.get('python', []), lambda dep: find_spec(dep))
check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep))
qualname = f'auto_archiver.modules.{module.name}'
if qualname in sys.modules:
return
logger.info(f"Loading module '{module.display_name}'...")
loaded_module = __import__(qualname)
return getattr(sys.modules[qualname], module.entry_point)()
def _load_module(module):
# first make sure that the 'depends' are installed and available in sys.args
for dependency in module.depends:
if dependency not in sys.modules:
logger.error(f"""
Module {module.name} depends on {dependency} which is not available.
Have you set up the '{module.name}' module correctly? See the README for more information.
""")
exit()
# then check the external dependencies, these are binary dependencies that should be available on the path
for dep_type, deps in module.external_dependencies.items():
if dep_type == 'python':
for dep in deps:
if dep not in sys.modules:
logger.error(f"""
Module {module.name} requires {dep} which is not available.
Have you installed the required dependencies for the '{module.name}' module? See the README for more information.
""")
elif dep_type == 'binary':
for dep in deps:
if not shutil.which(dep):
logger.error(f"""
Module {module.name} requires {dep} which is not available.
Have you installed the required dependencies for the '{module.name}' module? See the README for more information.
""")
# finally, load the module
logger.info(f"Loading module {module.display_name}")
module = __import__(module.entry_point, fromlist=[module.entry_point])
logger.info(f"Module {module.display_name} loaded")
def load_manifest(module_path):
# print(f"Loading manifest for module {module_path}")
@@ -109,7 +101,14 @@ def load_manifest(module_path):
manifest.update(ast.literal_eval(f.read()))
return manifest
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]:
def get_module(module_name):
# get a module by name
try:
return available_modules(limit_to_modules=[module_name], with_manifest=True, suppress_warnings=True)[0]
except IndexError:
return None
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[Module]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
@@ -142,8 +141,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
manifest = {}
all_modules.append(Module(possible_module, possible_module_path, manifest))
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?")
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?")
return all_modules

View File

@@ -19,7 +19,7 @@ from .context import ArchivingContext
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG
from .loader import available_modules, Module, MODULE_TYPES, load_modules
from .loader import available_modules, Module, MODULE_TYPES, load_module
import tempfile, traceback
from loguru import logger
@@ -114,12 +114,6 @@ class ArchivingOrchestrator:
self.config = yaml_config
logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders']))
logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors']))
logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers']))
logger.info("DATABASES: " + ", ".join(self.config['steps']['databases']))
logger.info("STORAGES: " + ", ".join(self.config['steps']['storages']))
logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters']))
return self.config
def add_steps_args(self, parser: argparse.ArgumentParser = None):
@@ -164,10 +158,33 @@ class ArchivingOrchestrator:
exit()
def install_modules(self):
modules = set()
[modules.update(*m) for m in self.config['steps'].values()]
"""
Swaps out the previous 'strings' in the config with the actual modules
"""
for module_type in MODULE_TYPES:
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
load_modules(modules)
def check_steps_ok():
if not len(step_items):
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration file and try again. Tried to load the following modules, but none were available: {modules_to_load}")
exit()
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
exit()
for i, module in enumerate(modules_to_load):
loaded_module = load_module(module)
if loaded_module:
step_items.append(loaded_module)
check_steps_ok()
self.config['steps'][f"{module_type}s"] = step_items
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
self.config['steps'][f"{module_type}s"] = step_items
def run(self) -> None:
self.setup_basic_parser()
@@ -190,17 +207,26 @@ class ArchivingOrchestrator:
yaml_config = read_yaml(basic_config.config_file)
breakpoint()
self.setup_complete_parser(basic_config, yaml_config, unused_args)
self.install_modules()
logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders']))
logger.info("EXTRACTORS: " + ", ".join(m.name for m in self.config['steps']['extractors']))
logger.info("ENRICHERS: " + ", ".join(m.name for m in self.config['steps']['enrichers']))
logger.info("DATABASES: " + ", ".join(m.name for m in self.config['steps']['databases']))
logger.info("STORAGES: " + ", ".join(m.name for m in self.config['steps']['storages']))
logger.info("FORMATTERS: " + ", ".join(m.name for m in self.config['steps']['formatters']))
for item in self.feed():
pass
def cleanup(self)->None:
logger.info("Cleaning up")
for a in self.all_archivers_for_setup(): a.cleanup()
for e in self.config['steps']['extractors']:
breakpoint()
e.cleanup()
def feed(self) -> Generator[Metadata]:
for feeder in self.config['steps']['feeders']:
@@ -222,12 +248,12 @@ class ArchivingOrchestrator:
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.databases: d.aborted(item)
for d in self.config['steps']['databases']: d.aborted(item)
self.cleanup()
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases:
for d in self.config['steps']['databases']:
if type(e) == AssertionError: d.failed(item, str(e))
else: d.failed(item)
@@ -317,7 +343,4 @@ class ArchivingOrchestrator:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
def all_archivers_for_setup(self) -> List[Archiver]:
return self.archivers + [e for e in self.enrichers if isinstance(e, Archiver)]
assert not ip.is_private, f"Invalid IP used"

View File

@@ -5,44 +5,7 @@ by handling user configuration, validating the steps properties, and implementin
"""
from __future__ import annotations
from dataclasses import dataclass
from inspect import ClassFoundException
from typing import Type
from abc import ABC
@dataclass
class Step(ABC):
name: str = None
def __init__(self, config: dict) -> None:
# Initialises each step by reading the relevant entries
# reads the configs into object properties
# self.config = config[self.name]
for k, v in config.get(self.name, {}).items():
self.__setattr__(k, v)
@staticmethod
def configs() -> dict: return {}
def init(name: str, config: dict, child: Type[Step]) -> Step:
"""
Attempts to instantiate a subclass of the provided `child` type
matching the given `name`.
Raises ClassFoundException if no matching subclass is found.
TODO: cannot find subclasses of child.subclasses
"""
for sub in child.__subclasses__():
if sub.name == name:
return sub(config)
raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")
def assert_valid_string(self, prop: str) -> None:
"""
Receives a property name and ensures it exists and is a valid non-empty string,
raising an AssertionError if not.
TODO: replace assertions with custom exceptions
"""
assert hasattr(self, prop), f"property {prop} not found"
s = getattr(self, prop)
assert s is not None and type(s) == str and len(s) > 0, f"invalid property {prop} value '{s}', it should be a valid string"
class Step:
# TODO: try and get this name from the manifest, so we don't have to set it twice
name: str

View File

@@ -0,0 +1 @@
from .generic_extractor import GenericExtractor

View File

@@ -3,10 +3,9 @@
'version': '0.1.0',
'author': 'Bellingcat',
'type': ['extractor'],
'entry_point': 'generic_extractor:GenericExtractor',
'entry_point': 'GenericExtractor', # this class should be present in the __init__.py
'requires_setup': False,
'depends': ['core'],
'external_dependencies': {
'dependencies': {
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
},
'description': """

View File

@@ -12,17 +12,6 @@ class GenericExtractor(Archiver):
name = "youtubedl_archiver" #left as is for backwards compat
_dropins = {}
def __init__(self, config: dict) -> None:
super().__init__(config)
self.subtitles = bool(self.subtitles)
self.comments = bool(self.comments)
self.livestreams = bool(self.livestreams)
self.live_from_start = bool(self.live_from_start)
self.end_means_success = bool(self.end_means_success)
self.allow_playlist = bool(self.allow_playlist)
self.max_downloads = self.max_downloads
def suitable_extractors(self, url: str) -> list[str]:
"""
Returns a list of valid extractors for the given URL"""