Tidy up imports + start on loading modules - program now starts much faster

This commit is contained in:
Patrick Robertson
2025-01-22 18:45:58 +01:00
parent b6b085854c
commit ade5ea0f6f
12 changed files with 97 additions and 83 deletions

View File

@@ -1,10 +1,6 @@
""" Core modules to handle things such as orchestration, metadata and configs..
"""
from .metadata import Metadata
from .media import Media
from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -61,9 +61,6 @@ class LoadFromFile (argparse.Action):
def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser:
dotdict = {}
for step, vals in yaml_conf.pop('steps', {}).items():
if vals:
dotdict[f"{step}s"] = vals
def process_subdict(subdict, prefix=""):
for key, value in subdict.items():

View File

@@ -4,12 +4,14 @@ import os
import copy
from os.path import join, dirname
from typing import List
from loguru import logger
import sys
import shutil
MODULE_TYPES = [
'feeder',
'enricher',
'archiver',
'extractor',
'database',
'storage',
'formatter'
@@ -59,7 +61,44 @@ class Module:
def __repr__(self):
return f"Module<'{self.display_name}' ({self.name})>"
def load_modules(modules):
    """Look up the requested modules and load each one in turn.

    Args:
        modules: names of the modules to load. They are handed to
            ``available_modules`` as ``limit_to_modules`` (with manifests
            requested), and every module it returns is passed to
            ``_load_module``.
    """
    resolved = available_modules(limit_to_modules=modules, with_manifest=True)
    for entry in resolved:
        _load_module(entry)
def _load_module(module):
    """Verify a module's declared dependencies, then import its entry point.

    Args:
        module: the module description to load. This function reads its
            ``name``, ``display_name``, ``depends``, ``external_dependencies``
            and ``entry_point`` attributes.

    Raises:
        SystemExit: (status 1) if one of ``module.depends`` is not loaded.
        ImportError: if ``module.entry_point`` cannot be imported.
    """
    # imported locally so this function does not rely on extra file-level imports
    import importlib
    import importlib.util

    def _python_package_available(name):
        """Return True if `name` is already imported or can be found on the import path."""
        if name in sys.modules:
            return True
        try:
            return importlib.util.find_spec(name) is not None
        except (ImportError, ValueError):
            # find_spec raises when a parent package is missing or a spec is malformed
            return False

    # first make sure that the 'depends' are installed and available in sys.modules
    # NOTE(review): this only passes when the dependency has already been imported -
    # confirm that 'depends' entries are expected to be loaded before this module
    for dependency in module.depends:
        if dependency not in sys.modules:
            logger.error(f"""
Module {module.name} depends on {dependency} which is not available.
Have you set up the '{module.name}' module correctly? See the README for more information.
""")
            # a missing dependency is fatal: exit with a non-zero status
            sys.exit(1)

    # then check the external dependencies. Missing ones are reported but do not stop loading.
    for dep_type, deps in module.external_dependencies.items():
        if dep_type == 'python':
            # python packages: look them up without importing them, so packages that
            # are only imported lazily are not reported as missing
            for dep in deps:
                if not _python_package_available(dep):
                    logger.error(f"""
Module {module.name} requires {dep} which is not available.
Have you installed the required dependencies for the '{module.name}' module? See the README for more information.
""")
        elif dep_type == 'binary':
            # binary dependencies should be available on the path
            for dep in deps:
                if not shutil.which(dep):
                    logger.error(f"""
Module {module.name} requires {dep} which is not available.
Have you installed the required dependencies for the '{module.name}' module? See the README for more information.
""")

    # finally, load the module. The result is deliberately not bound to `module`:
    # the log line below still needs the original object's display_name.
    logger.info(f"Loading module {module.display_name}")
    importlib.import_module(module.entry_point)
    logger.info(f"Module {module.display_name} loaded")
def load_manifest(module_path):
# print(f"Loading manifest for module {module_path}")
@@ -70,7 +109,7 @@ def load_manifest(module_path):
manifest.update(ast.literal_eval(f.read()))
return manifest
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]:
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
@@ -83,7 +122,16 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
for module_folder in default_path + additional_paths:
# walk through each module in module_folder and check if it has a valid manifest
for possible_module in os.listdir(module_folder):
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
@@ -93,5 +141,9 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
else:
manifest = {}
all_modules.append(Module(possible_module, possible_module_path, manifest))
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?")
return all_modules

View File

@@ -11,9 +11,6 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
import ffmpeg
from ffmpeg._run import Error
from .context import ArchivingContext
from loguru import logger
@@ -106,6 +103,12 @@ class Media:
return self.mimetype.startswith("image")
def is_valid_video(self) -> bool:
# Note: ffmpeg is intentionally imported here, when the method is called, rather than
# at the top of the file - this speeds up loading the module. We check that 'ffmpeg'
# is available on startup when we load each manifest file
import ffmpeg
from ffmpeg._run import Error
# checks for video streams with ffmpeg, or min file size for a video
# self.is_video() should be used together with this method
try:

View File

@@ -16,16 +16,10 @@ from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG
from .loader import available_modules, Module, MODULE_TYPES
from .loader import available_modules, Module, MODULE_TYPES, load_modules
import tempfile, traceback
from loguru import logger
@@ -74,7 +68,7 @@ class ArchivingOrchestrator:
add_help=False,
)
self.add_steps_args(parser)
breakpoint()
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that have requires_setup = False
@@ -91,7 +85,7 @@ class ArchivingOrchestrator:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser)
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
@@ -103,7 +97,7 @@ class ArchivingOrchestrator:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
breakpoint()
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
@@ -114,27 +108,30 @@ class ArchivingOrchestrator:
# merge the new config with the old one
yaml_config = merge_dicts(vars(parsed), yaml_config)
if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(yaml_config, basic_config.config_file)
breakpoint()
logger.info(f"FEEDER: {self.config.feeders}")
logger.info(f"ENRICHERS: {self.config.enrichers}")
logger.info(f"ARCHIVERS: {self.config.archivers}")
logger.info(f"DATABASES: {self.config.databases}")
logger.info(f"STORAGES: {self.config.storages}")
logger.info(f"FORMATTER: {self.formatter.name}")
self.config = yaml_config
logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders']))
logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors']))
logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers']))
logger.info("DATABASES: " + ", ".join(self.config['steps']['databases']))
logger.info("STORAGES: " + ", ".join(self.config['steps']['storages']))
logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters']))
return self.config
def add_steps_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use')
parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use')
parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use')
parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use')
parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use')
parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use')
parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use')
parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use')
parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use')
parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use')
parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use')
parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use')
def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None):
@@ -165,6 +162,12 @@ class ArchivingOrchestrator:
self.basic_parser.print_help()
exit()
def install_modules(self):
    """Load every module named under any step of the current configuration.

    Gathers the unique module names from all values of
    ``self.config['steps']`` and passes them to ``load_modules``.
    """
    modules = set()
    for step_modules in self.config['steps'].values():
        # set.update() takes an iterable of names. Unpacking the list with '*'
        # would pass each name string as its own iterable and add its
        # individual characters instead of the name.
        # 'or []' tolerates a step that has no modules configured (None).
        modules.update(step_modules or [])
    load_modules(modules)
def run(self) -> None:
self.setup_basic_parser()
@@ -187,11 +190,10 @@ class ArchivingOrchestrator:
yaml_config = read_yaml(basic_config.config_file)
breakpoint()
self.setup_complete_parser(basic_config, yaml_config, unused_args)
config.parse()
self.install_modules()
for item in self.feed():
pass
@@ -201,8 +203,9 @@ class ArchivingOrchestrator:
for a in self.all_archivers_for_setup(): a.cleanup()
def feed(self) -> Generator[Metadata]:
for item in self.feeder:
yield self.feed_item(item)
for feeder in self.config['steps']['feeders']:
for item in feeder:
yield self.feed_item(item)
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata: