mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Add documentation, pre-commit hook, more make commands and
This commit is contained in:
@@ -1,25 +1,19 @@
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
"name": "", # the display name of the module
|
||||
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
|
||||
"type": [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
"description": "", # a description of the module
|
||||
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
"version": "1.0", # the version of the module
|
||||
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
"""Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
@@ -19,8 +19,17 @@ import requests
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .config import (
|
||||
read_yaml,
|
||||
store_yaml,
|
||||
to_dot_notation,
|
||||
merge_dicts,
|
||||
is_valid_config,
|
||||
DefaultValidatingParser,
|
||||
UniqueAppendAction,
|
||||
AuthenticationJsonParseAction,
|
||||
DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES, SetupError
|
||||
@@ -30,8 +39,8 @@ if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
@@ -61,30 +70,63 @@ class ArchivingOrchestrator:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
action="store",
|
||||
dest="config_file",
|
||||
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||
default=DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
action="store",
|
||||
dest="mode",
|
||||
type=str,
|
||||
choices=["simple", "full"],
|
||||
help="the mode to run the archiver in",
|
||||
default="simple",
|
||||
)
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--store",
|
||||
dest="store",
|
||||
default=False,
|
||||
help="Store the created config in the config file",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--module_paths",
|
||||
dest="module_paths",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="additional paths to search for modules",
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError("As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
if not config["steps"].get(f"{module_type}s", []):
|
||||
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||
raise SetupError(
|
||||
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||
)
|
||||
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||
raise SetupError(
|
||||
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||
)
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||
"steps"
|
||||
].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
@@ -115,30 +159,32 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
avail_modules = self.module_factory.available_modules(
|
||||
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||
)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
elif basic_config.mode == "simple":
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
@@ -164,43 +210,76 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
f"--{module_type}s",
|
||||
dest=f"{module_type}s",
|
||||
nargs="+",
|
||||
help=f"the {module_type}s to use",
|
||||
default=[],
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
parser.add_argument(
|
||||
"--authentication",
|
||||
dest="authentication",
|
||||
help="A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction,
|
||||
)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
parser.add_argument(
|
||||
"--logging.level",
|
||||
action="store",
|
||||
dest="logging.level",
|
||||
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||
help="the logging level to use",
|
||||
default="INFO",
|
||||
type=str.upper,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.rotation",
|
||||
action="store",
|
||||
dest="logging.rotation",
|
||||
help="the logging rotation to use",
|
||||
default=None,
|
||||
)
|
||||
|
||||
def add_individual_module_args(
|
||||
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||
) -> None:
|
||||
if not modules:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
|
||||
for module in modules:
|
||||
if module.name == 'cli_feeder':
|
||||
if module.name == "cli_feeder":
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
parser.add_argument(
|
||||
"urls",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
)
|
||||
continue
|
||||
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
if not kwargs.get('metavar', None):
|
||||
if not kwargs.get("metavar", None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
kwargs["metavar"] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
if kwargs.get("required", False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
kwargs.pop("default", None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
kwargs.pop("cli_set", None)
|
||||
should_store = kwargs.pop("should_store", False)
|
||||
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||
except AttributeError:
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
@@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
logging_config = config["logging"]
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
if logging_config.get("enabled", True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
logger.disable("auto_archiver")
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
@@ -253,38 +331,45 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
pass
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||
if log_file := logging_config["file"]:
|
||||
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||
log_file, rotation=logging_config["rotation"]
|
||||
)
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
logger.error(
|
||||
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||
)
|
||||
raise SetupError(
|
||||
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||
)
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||
raise SetupError(
|
||||
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||
)
|
||||
|
||||
for module in modules_to_load:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
@@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == 'extractor':
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
@@ -308,11 +393,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
logger.error(
|
||||
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||
)
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
@@ -335,13 +422,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response['info']['version']
|
||||
latest_version = response["info"]["version"]
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
@@ -351,33 +438,36 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
logger.warning(
|
||||
"The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
multiple times to archive multiple URLs."
|
||||
)
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
self.install_modules(self.config["steps"])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
logger.info(
|
||||
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||
)
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
@@ -385,9 +475,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
@@ -396,7 +486,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(e, exc_info=True)
|
||||
exit(1)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
@@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
@@ -436,7 +525,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
for d in self.databases:
|
||||
if isinstance(e, AssertionError):
|
||||
d.failed(item, str(e))
|
||||
@@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
@@ -528,7 +617,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
@@ -537,7 +625,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
authentication = config.get("authentication", {})
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
@@ -546,8 +634,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
|
||||
config["authentication"] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@@ -32,16 +32,16 @@ from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
|
||||
@@ -73,18 +73,18 @@ class Storage(BaseModule):
|
||||
This method should not be called directly, but instead be called through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
with open(media.filename, "rb") as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
|
||||
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
# media key is already set
|
||||
return
|
||||
|
||||
folder = metadata.get_context('folder', '')
|
||||
folder = metadata.get_context("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
@@ -104,12 +104,11 @@ class Storage(BaseModule):
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
media._key = key
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media._key = key
|
||||
|
||||
Reference in New Issue
Block a user