mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Merge main
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .module import BaseModule
|
||||
from .base_module import BaseModule
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
|
||||
from urllib.parse import urlparse
|
||||
from typing import Mapping, Any
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
||||
from abc import ABC
|
||||
from copy import deepcopy, copy
|
||||
from tempfile import TemporaryDirectory
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||
|
||||
from loguru import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .module import ModuleFactory
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
"""
|
||||
@@ -17,41 +22,24 @@ class BaseModule(ABC):
|
||||
however modules can have a .setup() method to run any setup code
|
||||
(e.g. logging in to a site, spinning up a browser etc.)
|
||||
|
||||
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
||||
See consts.MODULE_TYPES for the types of modules you can create, noting that
|
||||
a subclass can be of multiple types. For example, a module that extracts data from
|
||||
a website and stores it in a database would be both an 'extractor' and a 'database' module.
|
||||
|
||||
Each module is a python package, and should have a __manifest__.py file in the
|
||||
same directory as the module file. The __manifest__.py specifies the module information
|
||||
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
|
||||
like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
|
||||
default manifest structure.
|
||||
|
||||
"""
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
MODULE_TYPES = CONF_MODULE_TYPES
|
||||
|
||||
# NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method
|
||||
config: Mapping[str, Any]
|
||||
authentication: Mapping[str, Mapping[str, str]]
|
||||
name: str
|
||||
module_factory: ModuleFactory
|
||||
|
||||
# this is set by the orchestrator prior to archiving
|
||||
tmp_dir: TemporaryDirectory = None
|
||||
|
||||
@@ -10,8 +10,8 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from copy import deepcopy, copy
|
||||
from .module import BaseModule
|
||||
from copy import deepcopy
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
# Global configuration
|
||||
@@ -170,4 +170,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
_yaml.dump(config_to_save, outf)
|
||||
|
||||
23
src/auto_archiver/core/consts.py
Normal file
23
src/auto_archiver/core/consts.py
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from typing import List, TYPE_CHECKING
|
||||
import shutil
|
||||
import ast
|
||||
import copy
|
||||
@@ -16,99 +16,113 @@ import os
|
||||
from os.path import join
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
from .base_module import BaseModule
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
|
||||
|
||||
_LAZY_LOADED_MODULES = {}
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
|
||||
|
||||
def setup_paths(paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
HAS_SETUP_PATHS = False
|
||||
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
class ModuleFactory:
|
||||
|
||||
# sort based on the length of the path, so that the longest path is last in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
def __init__(self):
|
||||
self._lazy_modules = {}
|
||||
|
||||
def get_module(module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return get_module_lazy(module_name).load(config)
|
||||
def setup_paths(self, paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
global HAS_SETUP_PATHS
|
||||
|
||||
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .setup() on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in _LAZY_LOADED_MODULES:
|
||||
return _LAZY_LOADED_MODULES[module_name]
|
||||
|
||||
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
return available[0]
|
||||
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
def is_really_module(module_path):
|
||||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
all_modules = []
|
||||
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Module folder {module_folder} does not exist")
|
||||
continue
|
||||
|
||||
for possible_module in possible_modules:
|
||||
if limit_to_modules and possible_module not in limit_to_modules:
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
if HAS_SETUP_PATHS == True:
|
||||
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
# sort based on the length of the path, so that the longest path is last in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
|
||||
HAS_SETUP_PATHS = True
|
||||
|
||||
def get_module(self, module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return self.get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .setup() on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in self._lazy_modules:
|
||||
return self._lazy_modules[module_name]
|
||||
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
def is_really_module(module_path):
|
||||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
all_modules = []
|
||||
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Module folder {module_folder} does not exist")
|
||||
continue
|
||||
if _LAZY_LOADED_MODULES.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path)
|
||||
|
||||
_LAZY_LOADED_MODULES[possible_module] = lazy_module
|
||||
for possible_module in possible_modules:
|
||||
if limit_to_modules and possible_module not in limit_to_modules:
|
||||
continue
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
continue
|
||||
if self._lazy_modules.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
|
||||
|
||||
return all_modules
|
||||
self._lazy_modules[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
|
||||
|
||||
return all_modules
|
||||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
@@ -123,14 +137,16 @@ class LazyBaseModule:
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
module_factory: ModuleFactory
|
||||
|
||||
_manifest: dict = None
|
||||
_instance: BaseModule = None
|
||||
_entry_point: str = None
|
||||
|
||||
def __init__(self, module_name, path):
|
||||
def __init__(self, module_name, path, factory: ModuleFactory):
|
||||
self.name = module_name
|
||||
self.path = path
|
||||
self.module_factory = factory
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
@@ -161,7 +177,7 @@ class LazyBaseModule:
|
||||
return self._manifest
|
||||
# print(f"Loading manifest for module {module_path}")
|
||||
# load the manifest file
|
||||
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
|
||||
manifest = copy.deepcopy(DEFAULT_MANIFEST)
|
||||
|
||||
with open(join(self.path, MANIFEST_FILE)) as f:
|
||||
try:
|
||||
@@ -189,13 +205,14 @@ class LazyBaseModule:
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
exit(1)
|
||||
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
try:
|
||||
m = get_module_lazy(dep, suppress_warnings=True)
|
||||
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
@@ -230,19 +247,21 @@ class LazyBaseModule:
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
if not getattr(instance, 'name', None):
|
||||
instance.name = self.name
|
||||
|
||||
if not getattr(instance, 'display_name', None):
|
||||
instance.display_name = self.display_name
|
||||
|
||||
self._instance = instance
|
||||
|
||||
# set the name, display name and module factory
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List, Type
|
||||
from typing import Generator, Union, List, Type, TYPE_CHECKING
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
import argparse
|
||||
@@ -21,12 +21,14 @@ from rich_argparse import RichHelpFormatter
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .module import available_modules, LazyBaseModule, get_module, setup_paths
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .module import BaseModule
|
||||
|
||||
from .consts import MODULE_TYPES
|
||||
from loguru import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
@@ -87,6 +89,12 @@ class UniqueAppendAction(argparse.Action):
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
logger_id: int
|
||||
|
||||
# instance variables, used for convenience to access modules by step
|
||||
feeders: List[Type[Feeder]]
|
||||
extractors: List[Type[Extractor]]
|
||||
enrichers: List[Type[Enricher]]
|
||||
@@ -94,6 +102,11 @@ class ArchivingOrchestrator:
|
||||
storages: List[Type[Storage]]
|
||||
formatters: List[Type[Formatter]]
|
||||
|
||||
def __init__(self):
|
||||
self.module_factory = ModuleFactory()
|
||||
self.setup_finished = False
|
||||
self.logger_id = None
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="auto-archiver",
|
||||
@@ -125,7 +138,7 @@ class ArchivingOrchestrator:
|
||||
)
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
@@ -147,15 +160,15 @@ class ArchivingOrchestrator:
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||
@@ -168,7 +181,7 @@ class ArchivingOrchestrator:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
self.add_individual_module_args(available_modules(with_manifest=True), parser)
|
||||
self.add_individual_module_args(self.module_factory.available_modules(), parser)
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
@@ -198,7 +211,7 @@ class ArchivingOrchestrator:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
@@ -224,7 +237,7 @@ class ArchivingOrchestrator:
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
if not modules:
|
||||
modules = available_modules(with_manifest=True)
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
for module in modules:
|
||||
|
||||
@@ -266,11 +279,18 @@ class ArchivingOrchestrator:
|
||||
|
||||
def setup_logging(self, config):
|
||||
# setup loguru logging
|
||||
logger.remove(0) # remove the default logger
|
||||
try:
|
||||
logger.remove(0) # remove the default logger
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
logging_config = config['logging']
|
||||
logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
@@ -280,7 +300,7 @@ class ArchivingOrchestrator:
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
@@ -325,7 +345,7 @@ class ArchivingOrchestrator:
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
try:
|
||||
loaded_module: BaseModule = get_module(module, self.config)
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
@@ -351,14 +371,17 @@ class ArchivingOrchestrator:
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
|
||||
This function should only ever be run once.
|
||||
"""
|
||||
|
||||
self.setup_basic_parser()
|
||||
|
||||
# parse the known arguments for now (basically, we want the config file)
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args(args)
|
||||
|
||||
# setup any custom module paths, so they'll show in the help and for arg parsing
|
||||
setup_paths(basic_config.module_paths)
|
||||
self.module_factory.setup_paths(basic_config.module_paths)
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if basic_config.help:
|
||||
@@ -370,16 +393,29 @@ class ArchivingOrchestrator:
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
"""
|
||||
|
||||
@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str
|
||||
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
@@ -74,7 +74,7 @@ class Storage(BaseModule):
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = get_module(HashEnricher, self.config)
|
||||
he = self.module_factory.get_module(HashEnricher, self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
|
||||
@@ -1 +1 @@
|
||||
from atlos_db import AtlosDb
|
||||
from .atlos_db import AtlosDb
|
||||
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .atlos_storage import AtlosStorage
|
||||
@@ -281,7 +281,7 @@ class GenericExtractor(Extractor):
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
@@ -290,7 +290,7 @@ class GenericExtractor(Extractor):
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookie_from_browser' in auth:
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
|
||||
@@ -10,7 +10,6 @@ from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
@@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
# get the already instantiated hash_enricher module
|
||||
he = get_module('hash_enricher', self.config)
|
||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
# This may be outdated and replaced by the below message, but keeping until confirmed
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if "Media not found or unavailable" in message:
|
||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
def __init__(self, webdriver_factory=None):
|
||||
super().__init__()
|
||||
self.webdriver_factory = webdriver_factory or Webdriver
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
with self.webdriver_factory(
|
||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
|
||||
|
||||
@@ -7,8 +7,12 @@
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
"thumbnails_per_minute": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||
return
|
||||
|
||||
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
|
||||
|
||||
thumbnails_media = []
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
"""
|
||||
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.stores = self.config['steps']['storages']
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
||||
if not "s3_storage" in self.stores:
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
# TODO: this used to pass all storage items to store now
|
||||
# Now only passing S3, the rest will get added later in the usual order (?)
|
||||
# Only storing S3, the rest will get added later in the usual order (?)
|
||||
m.store(url=url, metadata=to_enrich, storages=[self.s3])
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
|
||||
@@ -46,7 +46,7 @@ def dump_payload(p):
|
||||
|
||||
|
||||
def update_nested_dict(dictionary, update_dict):
|
||||
# takes 2 dicts and overwrites the first with the second only on the changed balues
|
||||
# takes 2 dicts and overwrites the first with the second only on the changed values
|
||||
for key, value in update_dict.items():
|
||||
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
|
||||
update_nested_dict(dictionary[key], value)
|
||||
|
||||
Reference in New Issue
Block a user