diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py index e9fe79f..307716d 100644 --- a/src/auto_archiver/__init__.py +++ b/src/auto_archiver/__init__.py @@ -2,6 +2,5 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut # need to manually specify due to cyclical deps from .core.orchestrator import ArchivingOrchestrator -from .core.config import Config # making accessible directly from .core.metadata import Metadata diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 1254ec4..8b2a65a 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,13 +1,9 @@ """ Entry point for the auto_archiver package. """ -from . import Config from . import ArchivingOrchestrator -def main(): - config = Config() - config.parse() - orchestrator = ArchivingOrchestrator(config) - for r in orchestrator.feed(): pass +def main(): + ArchivingOrchestrator().run() if __name__ == "__main__": main() diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index 5733290..7519a8e 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -12,5 +12,4 @@ from .instagram_archiver import InstagramArchiver from .instagram_tbot_archiver import InstagramTbotArchiver from .telegram_archiver import TelegramArchiver from .vk_archiver import VkArchiver -from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver from .instagram_api_archiver import InstagramAPIArchiver diff --git a/src/auto_archiver/archivers/generic_archiver/__init__.py b/src/auto_archiver/archivers/generic_archiver/__init__.py deleted file mode 100644 index 0788ae0..0000000 --- a/src/auto_archiver/archivers/generic_archiver/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .generic_archiver import GenericArchiver \ No newline at end of file diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 038cbeb..ef012c9 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -21,110 +21,109 @@ from ..storages import Storage from ..enrichers import Enricher from . import Step from ..utils import update_nested_dict -from ..version import __version__ -@dataclass -class Config: - configurable_parents = [ - Feeder, - Enricher, - Archiver, - Database, - Storage, - Formatter - # Util - ] - feeder: Feeder - formatter: Formatter - archivers: List[Archiver] = field(default_factory=[]) - enrichers: List[Enricher] = field(default_factory=[]) - storages: List[Storage] = field(default_factory=[]) - databases: List[Database] = field(default_factory=[]) +# @dataclass +# class Config: +# configurable_parents = [ +# Feeder, +# Enricher, +# Archiver, +# Database, +# Storage, +# Formatter +# # Util +# ] +# feeder: Feeder +# formatter: Formatter +# archivers: List[Archiver] = field(default_factory=[]) +# enrichers: List[Enricher] = field(default_factory=[]) +# storages: List[Storage] = field(default_factory=[]) +# databases: List[Database] = field(default_factory=[]) - def __init__(self) -> None: - self.defaults = {} - self.cli_ops = {} - self.config = {} +# def __init__(self) -> None: +# self.defaults = {} +# self.cli_ops = {} +# self.config = {} - def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): - """ - if yaml_config_filename is provided, the --config argument is ignored, - useful for library usage when the config values are preloaded - overwrite_configs is a dict that overwrites the yaml file contents - """ - # 1. parse CLI values - if use_cli: - parser = argparse.ArgumentParser( - # prog = "auto-archiver", - description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", - epilog="Check the code at https://github.com/bellingcat/auto-archiver" - ) +# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): +# """ +# if yaml_config_filename is provided, the --config argument is ignored, +# useful for library usage when the config values are preloaded +# overwrite_configs is a dict that overwrites the yaml file contents +# """ +# # 1. parse CLI values +# if use_cli: +# parser = argparse.ArgumentParser( +# # prog = "auto-archiver", +# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", +# epilog="Check the code at https://github.com/bellingcat/auto-archiver" +# ) - parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') - parser.add_argument('--version', action='version', version=__version__) +# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') +# parser.add_argument('--version', action='version', version=__version__) - # Iterate over all step subclasses to gather default configs and CLI arguments - for configurable in self.configurable_parents: - child: Step - for child in configurable.__subclasses__(): - assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." - for config, details in child.configs().items(): - assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" - assert "." not in config, f"config property cannot contain dots('.'): {config}" - config_path = f"{child.name}.{config}" +# # Iterate over all step subclasses to gather default configs and CLI arguments +# for configurable in self.configurable_parents: +# child: Step +# for child in configurable.__subclasses__(): +# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." +# for config, details in child.configs().items(): +# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" +# assert "." not in config, f"config property cannot contain dots('.'): {config}" +# config_path = f"{child.name}.{config}" - if use_cli: - try: - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) - except argparse.ArgumentError: - # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver - pass +# if use_cli: +# try: +# parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) +# except argparse.ArgumentError: +# # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver +# pass - self.defaults[config_path] = details["default"] - if "cli_set" in details: - self.cli_ops[config_path] = details["cli_set"] +# self.defaults[config_path] = details["default"] +# if "cli_set" in details: +# self.cli_ops[config_path] = details["cli_set"] - if use_cli: - args = parser.parse_args() - yaml_config_filename = yaml_config_filename or getattr(args, "config") - else: args = {} +# if use_cli: +# args = parser.parse_args() +# yaml_config_filename = yaml_config_filename or getattr(args, "config") +# else: args = {} - # 2. read YAML config file (or use provided value) - self.yaml_config = self.read_yaml(yaml_config_filename) - update_nested_dict(self.yaml_config, overwrite_configs) +# # 2. read YAML config file (or use provided value) +# self.yaml_config = self.read_yaml(yaml_config_filename) +# update_nested_dict(self.yaml_config, overwrite_configs) - # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default - self.config = defaultdict(dict) - for config_path, default in self.defaults.items(): - child, config = tuple(config_path.split(".")) - val = getattr(args, config_path, None) - if val is not None and config_path in self.cli_ops: - val = self.cli_ops[config_path](val, default) - if val is None: - val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) - self.config[child][config] = val - self.config = dict(self.config) +# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default +# self.config = defaultdict(dict) +# for config_path, default in self.defaults.items(): +# child, config = tuple(config_path.split(".")) +# val = getattr(args, config_path, None) +# if val is not None and config_path in self.cli_ops: +# val = self.cli_ops[config_path](val, default) +# if val is None: +# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) +# self.config[child][config] = val +# self.config = dict(self.config) - # 4. STEPS: read steps and validate they exist - steps = self.yaml_config.get("steps", {}) - assert "archivers" in steps, "your configuration steps are missing the archivers property" - assert "storages" in steps, "your configuration steps are missing the storages property" +# # 4. STEPS: read steps and validate they exist +# steps = self.yaml_config.get("steps", {}) +# assert "archivers" in steps, "your configuration steps are missing the archivers property" +# assert "storages" in steps, "your configuration steps are missing the storages property" - self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) - self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) - self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] - self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] - self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] - self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] +# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) +# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) +# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] +# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] +# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] +# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] - logger.info(f"FEEDER: {self.feeder.name}") - logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") - logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") - logger.info(f"DATABASES: {[x.name for x in self.databases]}") - logger.info(f"STORAGES: {[x.name for x in self.storages]}") - logger.info(f"FORMATTER: {self.formatter.name}") +# logger.info(f"FEEDER: {self.feeder.name}") +# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") +# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") +# logger.info(f"DATABASES: {[x.name for x in self.databases]}") +# logger.info(f"STORAGES: {[x.name for x in self.storages]}") +# logger.info(f"FORMATTER: {self.formatter.name}") - def read_yaml(self, yaml_filename: str) -> dict: - with open(yaml_filename, "r", encoding="utf-8") as inf: - return yaml.safe_load(inf) +def read_yaml(yaml_filename: str) -> dict: + with open(yaml_filename, "r", encoding="utf-8") as inf: + return yaml.safe_load(inf) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py new file mode 100644 index 0000000..e9de8c5 --- /dev/null +++ b/src/auto_archiver/core/loader.py @@ -0,0 +1,42 @@ +import os +from os.path import join, dirname +from typing import List + +MANIFEST_FILE = "__manifest__.py" +_DEFAULT_MANIFEST = { + 'author': 'Bellingcat', + 'requires_setup': True, + 'depends': [], + 'description': '', + 'external_dependencies': {}, + 'entry_point': '', + 'version': '1.0', +} + +def load_manifest(self, module): + # load the manifest file + with open(join(module, MANIFEST_FILE)) as f: + manifest = f.read() + return manifest + +def available_modules(self, additional_paths: List[str] = []) -> List[dict]: + # search through all valid 'modules' paths. Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(name): + if os.path.isfile(join(name, MANIFEST_FILE)): + return True + + default_path = [join(dirname(dirname((__file__))), "modules")] + all_modules = [] + + for module_folder in default_path + additional_paths: + # walk through each module in module_folder and check if it has a valid manifest + for folder in os.listdir(module_folder): + possible_module = join(module_folder, folder) + if not is_really_module(possible_module): + continue + # parse manifest and add to list of available modules + all_modules.append(possible_module) + + return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 3290070..a18da0e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,9 +5,13 @@ """ from __future__ import annotations +import ast +import os +from os.path import dirname, join from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address +import argparse from .context import ArchivingContext @@ -18,27 +22,78 @@ from ..storages import Storage from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata +from ..version import __version__ +from .config import read_yaml +from .loader import available_modules, load_manifest import tempfile, traceback from loguru import logger +DEFAULT_CONFIG_FILE = "orchestration.yaml" class ArchivingOrchestrator: - def __init__(self, config) -> None: - self.feeder: Feeder = config.feeder - self.formatter: Formatter = config.formatter - self.enrichers: List[Enricher] = config.enrichers - self.archivers: List[Archiver] = config.archivers - self.databases: List[Database] = config.databases - self.storages: List[Storage] = config.storages - ArchivingContext.set("storages", self.storages, keep_on_reset=True) - try: - for a in self.all_archivers_for_setup(): a.setup() - except (KeyboardInterrupt, Exception) as e: - logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") - self.cleanup() + # def __init__(self, config: Config) -> None: + # self.feeder: Feeder = config.feeder + # self.formatter: Formatter = config.formatter + # self.enrichers: List[Enricher] = config.enrichers + # self.archivers: List[Archiver] = config.archivers + # self.databases: List[Database] = config.databases + # self.storages: List[Storage] = config.storages + # ArchivingContext.set("storages", self.storages, keep_on_reset=True) + # try: + # for a in self.all_archivers_for_setup(): a.setup() + # except (KeyboardInterrupt, Exception) as e: + # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") + # self.cleanup() + + def setup_parser(self): + parser = argparse.ArgumentParser( + # prog = "auto-archiver", + description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", + epilog="Check the code at https://github.com/bellingcat/auto-archiver" + ) + parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) + parser.add_argument('--version', action='version', version=__version__) + parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') + self.parser = parser + + def setup_config(self): + # check what mode we're in + # if simple, we'll load just the modules that has requires_setup = False + # if full, we'll load all modules + if self.config.mode == 'simple': + for module in available_modules(): + # load the module + manifest = load_manifest(module) + + + def run(self) -> None: + self.setup_parser() + + # parse the known arguments for now (basically, we want the config file) + + # load the config file to get the list of enabled items + self.config, _ = self.parser.parse_known_args() + + # load the config file + try: + config = read_yaml(self.config.config_file) + except FileNotFoundError: + if self.settings.config == DEFAULT_CONFIG_FILE: + # no config file found, let's do the setup with the default values + self.setup_config() + else: + logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + exit() + + breakpoint() + config.parse() + + + for item in self.feed(): + pass def cleanup(self)->None: logger.info("Cleaning up") diff --git a/src/auto_archiver/modules/generic_extractor/__init__.py b/src/auto_archiver/modules/generic_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/archivers/generic_archiver/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py similarity index 85% rename from src/auto_archiver/archivers/generic_archiver/__manifest__.py rename to src/auto_archiver/modules/generic_extractor/__manifest__.py index 67c75f2..bae5f36 100644 --- a/src/auto_archiver/archivers/generic_archiver/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -1,15 +1,16 @@ { - 'name': 'Generic Archiver', + 'name': 'Generic Extractor', 'version': '0.1.0', 'author': 'Bellingcat', - 'type': ['archiver'], + 'type': ['extractor'], + 'entry_point': 'generic_extractor:GenericExtractor', 'requires_setup': False, 'depends': ['core'], 'external_dependencies': { 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], }, 'description': """ -This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood. +This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood. This module is responsible for downloading and processing media content from platforms supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality diff --git a/src/auto_archiver/archivers/generic_archiver/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/bluesky.py rename to src/auto_archiver/modules/generic_extractor/bluesky.py diff --git a/src/auto_archiver/archivers/generic_archiver/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/dropin.py rename to src/auto_archiver/modules/generic_extractor/dropin.py diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py similarity index 99% rename from src/auto_archiver/archivers/generic_archiver/generic_archiver.py rename to src/auto_archiver/modules/generic_extractor/generic_extractor.py index bf423e0..1fd6a18 100644 --- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -8,7 +8,7 @@ from loguru import logger from auto_archiver.archivers.archiver import Archiver from ...core import Metadata, Media, ArchivingContext -class GenericArchiver(Archiver): +class GenericExtractor(Archiver): name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} diff --git a/src/auto_archiver/archivers/generic_archiver/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/truth.py rename to src/auto_archiver/modules/generic_extractor/truth.py diff --git a/src/auto_archiver/archivers/generic_archiver/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/twitter.py rename to src/auto_archiver/modules/generic_extractor/twitter.py diff --git a/tests/archivers/test_generic_archiver.py b/tests/archivers/test_generic_archiver.py index 6e249e8..b0190b6 100644 --- a/tests/archivers/test_generic_archiver.py +++ b/tests/archivers/test_generic_archiver.py @@ -6,13 +6,13 @@ from os.path import dirname import pytest -from auto_archiver.archivers.generic_archiver import GenericArchiver +from auto_archiver.archivers.generic_extractor.generic_extractor import GenericExtractor from .test_archiver_base import TestArchiverBase -class TestGenericArchiver(TestArchiverBase): +class TestGenericExtractor(TestArchiverBase): """Tests Base Archiver """ - archiver_class = GenericArchiver + archiver_class = GenericExtractor config = { 'subtitles': False, 'comments': False,