mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Tidy ups + unit tests:
1. Allow loading modules from --module_paths=/extra/path/here 2. Improved unit tests for module loading 3. Further small tidy ups/clean ups
This commit is contained in:
@@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
from loguru import logger
|
||||
|
||||
from copy import deepcopy
|
||||
from .module import MODULE_TYPES
|
||||
from .module import BaseModule
|
||||
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
@@ -21,7 +21,7 @@ EMPTY_CONFIG = yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
# Global configuration
|
||||
|
||||
@@ -16,33 +16,53 @@ from importlib.util import find_spec
|
||||
import os
|
||||
from os.path import join, dirname
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
|
||||
_LAZY_LOADED_MODULES = {}
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '',
|
||||
'author': 'Bellingcat',
|
||||
'type': [],
|
||||
'requires_setup': True,
|
||||
'description': '',
|
||||
'dependencies': {},
|
||||
'entry_point': '',
|
||||
'version': '1.0',
|
||||
'configs': {}
|
||||
}
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
"""
|
||||
Base module class. All modules should inherit from this class.
|
||||
|
||||
The exact methods a class implements will depend on the type of module it is,
|
||||
however all modules have a .setup(config: dict) method to run any setup code
|
||||
(e.g. logging in to a site, spinning up a browser etc.)
|
||||
|
||||
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
||||
a subclass can be of multiple types. For example, a module that extracts data from
|
||||
a website and stores it in a database would be both an 'extractor' and a 'database' module.
|
||||
|
||||
Each module is a python package, and should have a __manifest__.py file in the
|
||||
same directory as the module file. The __manifest__.py specifies the module information
|
||||
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
|
||||
default manifest structure.
|
||||
|
||||
"""
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
|
||||
config: dict
|
||||
name: str
|
||||
|
||||
@@ -51,15 +71,51 @@ class BaseModule(ABC):
|
||||
for key, val in config.get(self.name, {}).items():
|
||||
setattr(self, key, val)
|
||||
|
||||
def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule:
|
||||
def repr(self):
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
||||
|
||||
|
||||
def setup_paths(paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
for path in paths:
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
# sort by path length in descending order, so that the longest path is first in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
|
||||
|
||||
def get_module(module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .load(config) on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in _LAZY_LOADED_MODULES:
|
||||
return _LAZY_LOADED_MODULES[module_name]
|
||||
|
||||
module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0]
|
||||
_LAZY_LOADED_MODULES[module_name] = module
|
||||
module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0]
|
||||
return module
|
||||
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
@@ -67,10 +123,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
||||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
default_path = [join(dirname(dirname((__file__))), "modules")]
|
||||
all_modules = []
|
||||
|
||||
for module_folder in default_path + additional_paths:
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
@@ -85,8 +140,12 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
continue
|
||||
|
||||
all_modules.append(LazyBaseModule(possible_module, possible_module_path))
|
||||
if _LAZY_LOADED_MODULES.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path)
|
||||
_LAZY_LOADED_MODULES[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
@@ -97,8 +156,14 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
||||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
|
||||
"""
|
||||
A lazy module class, which only loads the manifest and does not load the module itself.
|
||||
|
||||
This is useful for getting information about a module without actually loading it.
|
||||
|
||||
"""
|
||||
name: str
|
||||
display_name: str
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
@@ -129,6 +194,10 @@ class LazyBaseModule:
|
||||
@property
|
||||
def requires_setup(self) -> bool:
|
||||
return self.manifest['requires_setup']
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
return self.manifest['name']
|
||||
|
||||
@property
|
||||
def manifest(self) -> dict:
|
||||
@@ -136,7 +205,7 @@ class LazyBaseModule:
|
||||
return self._manifest
|
||||
# print(f"Loading manifest for module {module_path}")
|
||||
# load the manifest file
|
||||
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
|
||||
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
|
||||
|
||||
with open(join(self.path, MANIFEST_FILE)) as f:
|
||||
try:
|
||||
@@ -145,7 +214,6 @@ class LazyBaseModule:
|
||||
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
self._manifest = manifest
|
||||
self.display_name = manifest['name']
|
||||
self.type = manifest['type']
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
@@ -153,7 +221,7 @@ class LazyBaseModule:
|
||||
|
||||
return manifest
|
||||
|
||||
def load(self) -> BaseModule:
|
||||
def load(self, config) -> BaseModule:
|
||||
|
||||
if self._instance:
|
||||
return self._instance
|
||||
@@ -162,10 +230,27 @@ class LazyBaseModule:
|
||||
def check_deps(deps, check):
|
||||
for dep in deps:
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
exit(1)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep))
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
try:
|
||||
m = get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
return True
|
||||
except:
|
||||
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
|
||||
return False
|
||||
except IndexError:
|
||||
# not a module, continue
|
||||
pass
|
||||
|
||||
return find_spec(dep)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), check_python_dep)
|
||||
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
||||
|
||||
|
||||
@@ -184,9 +269,8 @@ class LazyBaseModule:
|
||||
sub_qualname = f'{qualname}.{file_name}'
|
||||
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
|
||||
# finally, get the class instance
|
||||
instance = getattr(sys.modules[sub_qualname], class_name)()
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
if not getattr(instance, 'name', None):
|
||||
instance.name = self.name
|
||||
|
||||
@@ -194,6 +278,11 @@ class LazyBaseModule:
|
||||
instance.display_name = self.display_name
|
||||
|
||||
self._instance = instance
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.setup(config)
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
||||
@@ -19,7 +19,7 @@ from .context import ArchivingContext
|
||||
from .metadata import Metadata
|
||||
from ..version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module
|
||||
from .module import available_modules, LazyBaseModule, get_module, setup_paths
|
||||
from . import validators
|
||||
from .module import BaseModule
|
||||
|
||||
@@ -57,6 +57,7 @@ class ArchivingOrchestrator:
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
self.basic_parser = parser
|
||||
|
||||
@@ -72,19 +73,21 @@ class ArchivingOrchestrator:
|
||||
# if full, we'll load all modules
|
||||
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if yaml_config != EMPTY_CONFIG:
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# add in any extra modules that have been passed on the command line for 'feeders', 'extractors', 'enrichers', 'databases', 'storages', 'formatters'
|
||||
for module_type in MODULE_TYPES:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
if modules := getattr(basic_config, f"{module_type}s", []):
|
||||
enabled_modules.extend(modules)
|
||||
|
||||
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser)
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=list(dict.fromkeys(enabled_modules)), suppress_warnings=True)
|
||||
self.add_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
|
||||
self.add_module_args(simple_modules, parser)
|
||||
@@ -135,10 +138,7 @@ class ArchivingOrchestrator:
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
# additional modules
|
||||
parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None):
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
if not modules:
|
||||
modules = available_modules(with_manifest=True)
|
||||
@@ -173,7 +173,7 @@ class ArchivingOrchestrator:
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
def show_help(self):
|
||||
def show_help(self, basic_config: dict):
|
||||
# for the help message, we want to load *all* possible modules and show the help
|
||||
# add configs as arg parser arguments
|
||||
|
||||
@@ -198,7 +198,7 @@ class ArchivingOrchestrator:
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
step_items = []
|
||||
modules_to_load = self.config['steps'][f"{module_type}s"]
|
||||
|
||||
@@ -216,9 +216,8 @@ class ArchivingOrchestrator:
|
||||
for module in modules_to_load:
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
loaded_module: BaseModule = get_module(module).load()
|
||||
try:
|
||||
loaded_module.setup(self.config)
|
||||
loaded_module: BaseModule = get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor':
|
||||
@@ -249,9 +248,11 @@ class ArchivingOrchestrator:
|
||||
# load the config file to get the list of enabled items
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args()
|
||||
|
||||
setup_paths(basic_config.module_paths)
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if basic_config.help:
|
||||
self.show_help()
|
||||
self.show_help(basic_config)
|
||||
|
||||
# load the config file
|
||||
yaml_config = {}
|
||||
@@ -268,7 +269,7 @@ class ArchivingOrchestrator:
|
||||
self.install_modules()
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
|
||||
|
||||
for item in self.feed():
|
||||
|
||||
@@ -19,16 +19,6 @@ class HashEnricher(Enricher):
|
||||
Calculates hashes for Media instances
|
||||
"""
|
||||
|
||||
def __init__(self, config: dict = None):
|
||||
"""
|
||||
Initialize the HashEnricher with a configuration dictionary.
|
||||
"""
|
||||
super().__init__()
|
||||
# TODO set these from the manifest?
|
||||
# Set default values
|
||||
self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256"
|
||||
self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7)
|
||||
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
@@ -12,7 +12,7 @@ from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
@dataclass
|
||||
class HtmlFormatter(Formatter):
|
||||
@@ -53,7 +53,7 @@ class HtmlFormatter(Formatter):
|
||||
outf.write(content)
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
he = get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
|
||||
import os, json, requests
|
||||
|
||||
import os
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user