From 4830f9930015d14002001b3c075bb0b470d682d9 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 20:03:10 +0100 Subject: [PATCH] Get parsing of manifest and combining with config file working --- src/auto_archiver/core/config.py | 120 +++++------------- src/auto_archiver/core/loader.py | 23 +++- src/auto_archiver/core/orchestrator.py | 69 ++++++++-- .../modules/generic_extractor/__manifest__.py | 15 ++- 4 files changed, 121 insertions(+), 106 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index ef012c9..66c2eb5 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -5,22 +5,9 @@ flexible setup in various environments. """ -import importlib import argparse import yaml from dataclasses import dataclass, field -from typing import List -from collections import defaultdict -from loguru import logger - -from ..archivers import Archiver -from ..feeders import Feeder -from ..databases import Database -from ..formatters import Formatter -from ..storages import Storage -from ..enrichers import Enricher -from . import Step -from ..utils import update_nested_dict # @dataclass @@ -46,84 +33,45 @@ from ..utils import update_nested_dict # self.cli_ops = {} # self.config = {} -# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): -# """ -# if yaml_config_filename is provided, the --config argument is ignored, -# useful for library usage when the config values are preloaded -# overwrite_configs is a dict that overwrites the yaml file contents -# """ -# # 1. parse CLI values -# if use_cli: -# parser = argparse.ArgumentParser( -# # prog = "auto-archiver", -# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", -# epilog="Check the code at https://github.com/bellingcat/auto-archiver" -# ) + # def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): + # """ + # if yaml_config_filename is provided, the --config argument is ignored, + # useful for library usage when the config values are preloaded + # overwrite_configs is a dict that overwrites the yaml file contents + # """ + # # 1. parse CLI values + # if use_cli: + # parser = argparse.ArgumentParser( + # # prog = "auto-archiver", + # description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", + # epilog="Check the code at https://github.com/bellingcat/auto-archiver" + # ) -# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') -# parser.add_argument('--version', action='version', version=__version__) + # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') + # parser.add_argument('--version', action='version', version=__version__) -# # Iterate over all step subclasses to gather default configs and CLI arguments -# for configurable in self.configurable_parents: -# child: Step -# for child in configurable.__subclasses__(): -# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." -# for config, details in child.configs().items(): -# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" -# assert "." not in config, f"config property cannot contain dots('.'): {config}" -# config_path = f"{child.name}.{config}" +def format_config(config: dict) -> dict: + # Iterate over all step subclasses to gather default configs and CLI arguments + new_config = {} + for step, values in config['steps'].items(): + new_config[f"--{step}"] = values + + # format configurations + for name, confg_vals in config['configurations'].items(): + for key, value in confg_vals.items(): + assert "." not in key, "config key cannot contain '.'" + config_path = f"--{name}.{key}" + new_config[config_path] = value -# if use_cli: -# try: -# parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) -# except argparse.ArgumentError: -# # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver -# pass + return new_config -# self.defaults[config_path] = details["default"] -# if "cli_set" in details: -# self.cli_ops[config_path] = details["cli_set"] -# if use_cli: -# args = parser.parse_args() -# yaml_config_filename = yaml_config_filename or getattr(args, "config") -# else: args = {} - -# # 2. read YAML config file (or use provided value) -# self.yaml_config = self.read_yaml(yaml_config_filename) -# update_nested_dict(self.yaml_config, overwrite_configs) - -# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default -# self.config = defaultdict(dict) -# for config_path, default in self.defaults.items(): -# child, config = tuple(config_path.split(".")) -# val = getattr(args, config_path, None) -# if val is not None and config_path in self.cli_ops: -# val = self.cli_ops[config_path](val, default) -# if val is None: -# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) -# self.config[child][config] = val -# self.config = dict(self.config) - -# # 4. STEPS: read steps and validate they exist -# steps = self.yaml_config.get("steps", {}) -# assert "archivers" in steps, "your configuration steps are missing the archivers property" -# assert "storages" in steps, "your configuration steps are missing the storages property" - -# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) -# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) -# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] -# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] -# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] -# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] - -# logger.info(f"FEEDER: {self.feeder.name}") -# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") -# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") -# logger.info(f"DATABASES: {[x.name for x in self.databases]}") -# logger.info(f"STORAGES: {[x.name for x in self.storages]}") -# logger.info(f"FORMATTER: {self.formatter.name}") +class LoadFromFile (argparse.Action): + def __call__ (self, parser, namespace, values, option_string = None): + with values as f: + # parse arguments in the file and store them in the target namespace + parser.parse_args(f.read().split(), namespace) def read_yaml(yaml_filename: str) -> dict: with open(yaml_filename, "r", encoding="utf-8") as inf: - return yaml.safe_load(inf) + return format_config(yaml.safe_load(inf)) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index e9de8c5..8b96198 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -1,4 +1,6 @@ +import ast import os +import copy from os.path import join, dirname from typing import List @@ -11,15 +13,18 @@ _DEFAULT_MANIFEST = { 'external_dependencies': {}, 'entry_point': '', 'version': '1.0', + 'config': {} } -def load_manifest(self, module): +def load_manifest(module): # load the manifest file + manifest = copy.deepcopy(_DEFAULT_MANIFEST) + with open(join(module, MANIFEST_FILE)) as f: - manifest = f.read() + manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(self, additional_paths: List[str] = []) -> List[dict]: +def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]: # search through all valid 'modules' paths. Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -32,11 +37,15 @@ def available_modules(self, additional_paths: List[str] = []) -> List[dict]: for module_folder in default_path + additional_paths: # walk through each module in module_folder and check if it has a valid manifest - for folder in os.listdir(module_folder): - possible_module = join(module_folder, folder) - if not is_really_module(possible_module): + for possible_module in os.listdir(module_folder): + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): continue # parse manifest and add to list of available modules - all_modules.append(possible_module) + if with_manifest: + manifest = load_manifest(possible_module_path) + else: + manifest = {} + all_modules.append((possible_module, possible_module_path, manifest)) return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a18da0e..f788203 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,9 +5,6 @@ """ from __future__ import annotations -import ast -import os -from os.path import dirname, join from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address @@ -51,23 +48,67 @@ class ArchivingOrchestrator: def setup_parser(self): parser = argparse.ArgumentParser( # prog = "auto-archiver", + add_help=False, description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", epilog="Check the code at https://github.com/bellingcat/auto-archiver" ) parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') + # override the default 'help' so we can inject all the configs and show those + parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') self.parser = parser - def setup_config(self): + def add_module_args(self, modules: list = None): + if not modules: + modules = available_modules(with_manifest=True) + + for module_name, module_path, manifest in modules: + for name, kwargs in manifest['config'].items(): + kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}" + self.parser.add_argument(f"--{module_name}.{name}", **kwargs) + + def show_help(self): + # for the help message, we want to load *all* possible modules and show the help + # add configs as arg parser arguments + self.add_module_args() + + self.parser.print_help() + exit() + + def setup_config(self, config: dict) -> None: # check what mode we're in # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules if self.config.mode == 'simple': - for module in available_modules(): - # load the module - manifest = load_manifest(module) - + simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']] + self.add_module_args(simple_modules) + + # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required" + self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use') + self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use') + self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use') + self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use') + self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use') + self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use') + + + config.update(self.config.__dict__) + # reload the parser with the new arguments, now that we have them + self.config, unknown = self.parser.parse_known_args(config) + logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + + breakpoint() + + + logger.info(f"FEEDER: {self.config.feeders}") + logger.info(f"ENRICHERS: {self.config.enrichers}") + logger.info(f"ARCHIVERS: {self.config.archivers}") + logger.info(f"DATABASES: {self.config.databases}") + logger.info(f"STORAGES: {self.config.storages}") + logger.info(f"FORMATTER: {self.formatter.name}") + + def run(self) -> None: self.setup_parser() @@ -77,17 +118,21 @@ class ArchivingOrchestrator: # load the config file to get the list of enabled items self.config, _ = self.parser.parse_known_args() + # if help flag was called, then show the help + if self.config.help: + self.show_help() # load the config file + config = {} + try: config = read_yaml(self.config.config_file) except FileNotFoundError: - if self.settings.config == DEFAULT_CONFIG_FILE: - # no config file found, let's do the setup with the default values - self.setup_config() - else: + if self.config.config_file != DEFAULT_CONFIG_FILE: logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") exit() + self.setup_config(config) + breakpoint() config.parse() diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index bae5f36..673399e 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -29,5 +29,18 @@ the broader archiving framework. metadata objects. Some dropins are included in this generic_archiver by default, but custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). -""" +""", + 'config': { + "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, + "subtitles": {"default": True, "help": "download subtitles if available"}, + "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, + "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"}, + "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."}, + "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"}, + "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, + 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, + "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, + "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, + "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, + } } \ No newline at end of file