From c41d93a634a1d2164b6ab144eb12b5658882b916 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 17:53:37 +0100 Subject: [PATCH 001/110] Use already implemented helper to get version --- src/auto_archiver/core/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index c6a2209..038cbeb 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -21,6 +21,7 @@ from ..storages import Storage from ..enrichers import Enricher from . import Step from ..utils import update_nested_dict +from ..version import __version__ @dataclass @@ -61,7 +62,7 @@ class Config: ) parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') - parser.add_argument('--version', action='version', version=importlib.metadata.version('auto_archiver')) + parser.add_argument('--version', action='version', version=__version__) # Iterate over all step subclasses to gather default configs and CLI arguments for configurable in self.configurable_parents: From bdfc855297f0e95f3e69af72d98f19f8840c1220 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 17:59:52 +0100 Subject: [PATCH 002/110] Ignore pylint statements for manifest files --- .pylintrc | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..4a5b7f0 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MAIN] + +ignore-patterns=(.*tests.*.py, __manifest__.py) \ No newline at end of file From 03f377022333ac3afd6c452650c68960690d07f3 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 18:00:45 +0100 Subject: [PATCH 003/110] Add __manifest__.py for generic_extractor --- .../generic_archiver/__manifest__.py | 32 +++++++++++++++++++ .../generic_archiver/generic_archiver.py | 24 
-------------- 2 files changed, 32 insertions(+), 24 deletions(-) create mode 100644 src/auto_archiver/archivers/generic_archiver/__manifest__.py diff --git a/src/auto_archiver/archivers/generic_archiver/__manifest__.py b/src/auto_archiver/archivers/generic_archiver/__manifest__.py new file mode 100644 index 0000000..67c75f2 --- /dev/null +++ b/src/auto_archiver/archivers/generic_archiver/__manifest__.py @@ -0,0 +1,32 @@ +{ + 'name': 'Generic Archiver', + 'version': '0.1.0', + 'author': 'Bellingcat', + 'type': ['archiver'], + 'requires_setup': False, + 'depends': ['core'], + 'external_dependencies': { + 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], + }, + 'description': """ +This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood. + +This module is responsible for downloading and processing media content from platforms +supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality +for retrieving videos, subtitles, comments, and other metadata, and it integrates with +the broader archiving framework. + +### Features +- Supports downloading videos and playlists. +- Retrieves metadata like titles, descriptions, upload dates, and durations. +- Downloads subtitles and comments when enabled. +- Configurable options for handling live streams, proxies, and more. + +### Dropins +- For websites supported by `yt-dlp` that also contain posts in addition to videos + (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create + metadata objects. Some dropins are included in this generic_archiver by default, but +custom dropins can be created to handle additional websites and passed to the archiver +via the command line using the `--dropins` option (TODO!). 
+""" +} \ No newline at end of file diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py index 729d6ef..bf423e0 100644 --- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py +++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py @@ -1,27 +1,3 @@ -""" -This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood. - -This module is responsible for downloading and processing media content from platforms -supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality -for retrieving videos, subtitles, comments, and other metadata, and it integrates with -the broader archiving framework. - -### Features -- Supports downloading videos and playlists. -- Retrieves metadata like titles, descriptions, upload dates, and durations. -- Downloads subtitles and comments when enabled. -- Configurable options for handling live streams, proxies, and more. - -### Dropins -- For websites supported by `yt-dlp` that also contain posts in addition to videos - (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create - metadata objects. Some dropins are included in this generic_archiver by default, but -custom dropins can be created to handle additional websites and passed to the archiver -via the command line using the `--dropins` option (TODO!). 
- -""" - - import datetime, os, yt_dlp, pysubs2 import importlib from typing import Type From 241b35002c27c34c51b813f40ada41875de8abc8 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 19:02:38 +0100 Subject: [PATCH 004/110] Initial changes to move to '__manifest__' format --- src/auto_archiver/__init__.py | 1 - src/auto_archiver/__main__.py | 8 +- src/auto_archiver/archivers/__init__.py | 1 - .../archivers/generic_archiver/__init__.py | 1 - src/auto_archiver/core/config.py | 183 +++++++++--------- src/auto_archiver/core/loader.py | 42 ++++ src/auto_archiver/core/orchestrator.py | 81 ++++++-- .../modules/generic_extractor/__init__.py | 0 .../generic_extractor}/__manifest__.py | 7 +- .../generic_extractor}/bluesky.py | 0 .../generic_extractor}/dropin.py | 0 .../generic_extractor/generic_extractor.py} | 2 +- .../generic_extractor}/truth.py | 0 .../generic_extractor}/twitter.py | 0 tests/archivers/test_generic_archiver.py | 6 +- 15 files changed, 211 insertions(+), 121 deletions(-) delete mode 100644 src/auto_archiver/archivers/generic_archiver/__init__.py create mode 100644 src/auto_archiver/core/loader.py create mode 100644 src/auto_archiver/modules/generic_extractor/__init__.py rename src/auto_archiver/{archivers/generic_archiver => modules/generic_extractor}/__manifest__.py (85%) rename src/auto_archiver/{archivers/generic_archiver => modules/generic_extractor}/bluesky.py (100%) rename src/auto_archiver/{archivers/generic_archiver => modules/generic_extractor}/dropin.py (100%) rename src/auto_archiver/{archivers/generic_archiver/generic_archiver.py => modules/generic_extractor/generic_extractor.py} (99%) rename src/auto_archiver/{archivers/generic_archiver => modules/generic_extractor}/truth.py (100%) rename src/auto_archiver/{archivers/generic_archiver => modules/generic_extractor}/twitter.py (100%) diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py index e9fe79f..307716d 100644 --- a/src/auto_archiver/__init__.py +++ 
b/src/auto_archiver/__init__.py @@ -2,6 +2,5 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut # need to manually specify due to cyclical deps from .core.orchestrator import ArchivingOrchestrator -from .core.config import Config # making accessible directly from .core.metadata import Metadata diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 1254ec4..8b2a65a 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,13 +1,9 @@ """ Entry point for the auto_archiver package. """ -from . import Config from . import ArchivingOrchestrator -def main(): - config = Config() - config.parse() - orchestrator = ArchivingOrchestrator(config) - for r in orchestrator.feed(): pass +def main(): + ArchivingOrchestrator().run() if __name__ == "__main__": main() diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index 5733290..7519a8e 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -12,5 +12,4 @@ from .instagram_archiver import InstagramArchiver from .instagram_tbot_archiver import InstagramTbotArchiver from .telegram_archiver import TelegramArchiver from .vk_archiver import VkArchiver -from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver from .instagram_api_archiver import InstagramAPIArchiver diff --git a/src/auto_archiver/archivers/generic_archiver/__init__.py b/src/auto_archiver/archivers/generic_archiver/__init__.py deleted file mode 100644 index 0788ae0..0000000 --- a/src/auto_archiver/archivers/generic_archiver/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .generic_archiver import GenericArchiver \ No newline at end of file diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 038cbeb..ef012c9 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -21,110 +21,109 @@ from ..storages import Storage 
from ..enrichers import Enricher from . import Step from ..utils import update_nested_dict -from ..version import __version__ -@dataclass -class Config: - configurable_parents = [ - Feeder, - Enricher, - Archiver, - Database, - Storage, - Formatter - # Util - ] - feeder: Feeder - formatter: Formatter - archivers: List[Archiver] = field(default_factory=[]) - enrichers: List[Enricher] = field(default_factory=[]) - storages: List[Storage] = field(default_factory=[]) - databases: List[Database] = field(default_factory=[]) +# @dataclass +# class Config: +# configurable_parents = [ +# Feeder, +# Enricher, +# Archiver, +# Database, +# Storage, +# Formatter +# # Util +# ] +# feeder: Feeder +# formatter: Formatter +# archivers: List[Archiver] = field(default_factory=[]) +# enrichers: List[Enricher] = field(default_factory=[]) +# storages: List[Storage] = field(default_factory=[]) +# databases: List[Database] = field(default_factory=[]) - def __init__(self) -> None: - self.defaults = {} - self.cli_ops = {} - self.config = {} +# def __init__(self) -> None: +# self.defaults = {} +# self.cli_ops = {} +# self.config = {} - def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): - """ - if yaml_config_filename is provided, the --config argument is ignored, - useful for library usage when the config values are preloaded - overwrite_configs is a dict that overwrites the yaml file contents - """ - # 1. 
parse CLI values - if use_cli: - parser = argparse.ArgumentParser( - # prog = "auto-archiver", - description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", - epilog="Check the code at https://github.com/bellingcat/auto-archiver" - ) +# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): +# """ +# if yaml_config_filename is provided, the --config argument is ignored, +# useful for library usage when the config values are preloaded +# overwrite_configs is a dict that overwrites the yaml file contents +# """ +# # 1. parse CLI values +# if use_cli: +# parser = argparse.ArgumentParser( +# # prog = "auto-archiver", +# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", +# epilog="Check the code at https://github.com/bellingcat/auto-archiver" +# ) - parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') - parser.add_argument('--version', action='version', version=__version__) +# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') +# parser.add_argument('--version', action='version', version=__version__) - # Iterate over all step subclasses to gather default configs and CLI arguments - for configurable in self.configurable_parents: - child: Step - for child in configurable.__subclasses__(): - assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." 
- for config, details in child.configs().items(): - assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" - assert "." not in config, f"config property cannot contain dots('.'): {config}" - config_path = f"{child.name}.{config}" +# # Iterate over all step subclasses to gather default configs and CLI arguments +# for configurable in self.configurable_parents: +# child: Step +# for child in configurable.__subclasses__(): +# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." +# for config, details in child.configs().items(): +# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" +# assert "." not in config, f"config property cannot contain dots('.'): {config}" +# config_path = f"{child.name}.{config}" - if use_cli: - try: - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) - except argparse.ArgumentError: - # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver - pass +# if use_cli: +# try: +# parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) +# except argparse.ArgumentError: +# # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver +# pass - self.defaults[config_path] = details["default"] - if "cli_set" in details: - self.cli_ops[config_path] = details["cli_set"] +# self.defaults[config_path] = details["default"] +# if "cli_set" in details: +# self.cli_ops[config_path] = details["cli_set"] - if use_cli: - args = parser.parse_args() - yaml_config_filename = yaml_config_filename or getattr(args, "config") - else: args = {} +# if use_cli: +# args = parser.parse_args() +# yaml_config_filename = 
yaml_config_filename or getattr(args, "config") +# else: args = {} - # 2. read YAML config file (or use provided value) - self.yaml_config = self.read_yaml(yaml_config_filename) - update_nested_dict(self.yaml_config, overwrite_configs) +# # 2. read YAML config file (or use provided value) +# self.yaml_config = self.read_yaml(yaml_config_filename) +# update_nested_dict(self.yaml_config, overwrite_configs) - # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default - self.config = defaultdict(dict) - for config_path, default in self.defaults.items(): - child, config = tuple(config_path.split(".")) - val = getattr(args, config_path, None) - if val is not None and config_path in self.cli_ops: - val = self.cli_ops[config_path](val, default) - if val is None: - val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) - self.config[child][config] = val - self.config = dict(self.config) +# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default +# self.config = defaultdict(dict) +# for config_path, default in self.defaults.items(): +# child, config = tuple(config_path.split(".")) +# val = getattr(args, config_path, None) +# if val is not None and config_path in self.cli_ops: +# val = self.cli_ops[config_path](val, default) +# if val is None: +# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) +# self.config[child][config] = val +# self.config = dict(self.config) - # 4. STEPS: read steps and validate they exist - steps = self.yaml_config.get("steps", {}) - assert "archivers" in steps, "your configuration steps are missing the archivers property" - assert "storages" in steps, "your configuration steps are missing the storages property" +# # 4. 
STEPS: read steps and validate they exist +# steps = self.yaml_config.get("steps", {}) +# assert "archivers" in steps, "your configuration steps are missing the archivers property" +# assert "storages" in steps, "your configuration steps are missing the storages property" - self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) - self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) - self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] - self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] - self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] - self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] +# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) +# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) +# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] +# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] +# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] +# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] - logger.info(f"FEEDER: {self.feeder.name}") - logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") - logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") - logger.info(f"DATABASES: {[x.name for x in self.databases]}") - logger.info(f"STORAGES: {[x.name for x in self.storages]}") - logger.info(f"FORMATTER: {self.formatter.name}") +# logger.info(f"FEEDER: {self.feeder.name}") +# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") +# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") +# logger.info(f"DATABASES: {[x.name for x in self.databases]}") +# logger.info(f"STORAGES: {[x.name for x in self.storages]}") +# logger.info(f"FORMATTER: 
{self.formatter.name}") - def read_yaml(self, yaml_filename: str) -> dict: - with open(yaml_filename, "r", encoding="utf-8") as inf: - return yaml.safe_load(inf) +def read_yaml(yaml_filename: str) -> dict: + with open(yaml_filename, "r", encoding="utf-8") as inf: + return yaml.safe_load(inf) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py new file mode 100644 index 0000000..e9de8c5 --- /dev/null +++ b/src/auto_archiver/core/loader.py @@ -0,0 +1,42 @@ +import os +from os.path import join, dirname +from typing import List + +MANIFEST_FILE = "__manifest__.py" +_DEFAULT_MANIFEST = { + 'author': 'Bellingcat', + 'requires_setup': True, + 'depends': [], + 'description': '', + 'external_dependencies': {}, + 'entry_point': '', + 'version': '1.0', +} + +def load_manifest(self, module): + # load the manifest file + with open(join(module, MANIFEST_FILE)) as f: + manifest = f.read() + return manifest + +def available_modules(self, additional_paths: List[str] = []) -> List[dict]: + # search through all valid 'modules' paths. 
Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(name): + if os.path.isfile(join(name, MANIFEST_FILE)): + return True + + default_path = [join(dirname(dirname((__file__))), "modules")] + all_modules = [] + + for module_folder in default_path + additional_paths: + # walk through each module in module_folder and check if it has a valid manifest + for folder in os.listdir(module_folder): + possible_module = join(module_folder, folder) + if not is_really_module(possible_module): + continue + # parse manifest and add to list of available modules + all_modules.append(possible_module) + + return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 3290070..a18da0e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,9 +5,13 @@ """ from __future__ import annotations +import ast +import os +from os.path import dirname, join from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address +import argparse from .context import ArchivingContext @@ -18,27 +22,78 @@ from ..storages import Storage from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata +from ..version import __version__ +from .config import read_yaml +from .loader import available_modules, load_manifest import tempfile, traceback from loguru import logger +DEFAULT_CONFIG_FILE = "orchestration.yaml" class ArchivingOrchestrator: - def __init__(self, config) -> None: - self.feeder: Feeder = config.feeder - self.formatter: Formatter = config.formatter - self.enrichers: List[Enricher] = config.enrichers - self.archivers: List[Archiver] = config.archivers - self.databases: List[Database] = config.databases - self.storages: List[Storage] = config.storages - ArchivingContext.set("storages", self.storages, keep_on_reset=True) - try: - for 
a in self.all_archivers_for_setup(): a.setup() - except (KeyboardInterrupt, Exception) as e: - logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") - self.cleanup() + # def __init__(self, config: Config) -> None: + # self.feeder: Feeder = config.feeder + # self.formatter: Formatter = config.formatter + # self.enrichers: List[Enricher] = config.enrichers + # self.archivers: List[Archiver] = config.archivers + # self.databases: List[Database] = config.databases + # self.storages: List[Storage] = config.storages + # ArchivingContext.set("storages", self.storages, keep_on_reset=True) + # try: + # for a in self.all_archivers_for_setup(): a.setup() + # except (KeyboardInterrupt, Exception) as e: + # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") + # self.cleanup() + + def setup_parser(self): + parser = argparse.ArgumentParser( + # prog = "auto-archiver", + description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", + epilog="Check the code at https://github.com/bellingcat/auto-archiver" + ) + parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) + parser.add_argument('--version', action='version', version=__version__) + parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') + self.parser = parser + + def setup_config(self): + # check what mode we're in + # if simple, we'll load just the modules that has requires_setup = False + # if full, we'll load all modules + if self.config.mode == 'simple': + for module in available_modules(): + # load the module + manifest = load_manifest(module) + + + def run(self) -> None: + 
self.setup_parser() + + # parse the known arguments for now (basically, we want the config file) + + # load the config file to get the list of enabled items + self.config, _ = self.parser.parse_known_args() + + # load the config file + try: + config = read_yaml(self.config.config_file) + except FileNotFoundError: + if self.settings.config == DEFAULT_CONFIG_FILE: + # no config file found, let's do the setup with the default values + self.setup_config() + else: + logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + exit() + + breakpoint() + config.parse() + + + for item in self.feed(): + pass def cleanup(self)->None: logger.info("Cleaning up") diff --git a/src/auto_archiver/modules/generic_extractor/__init__.py b/src/auto_archiver/modules/generic_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/archivers/generic_archiver/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py similarity index 85% rename from src/auto_archiver/archivers/generic_archiver/__manifest__.py rename to src/auto_archiver/modules/generic_extractor/__manifest__.py index 67c75f2..bae5f36 100644 --- a/src/auto_archiver/archivers/generic_archiver/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -1,15 +1,16 @@ { - 'name': 'Generic Archiver', + 'name': 'Generic Extractor', 'version': '0.1.0', 'author': 'Bellingcat', - 'type': ['archiver'], + 'type': ['extractor'], + 'entry_point': 'generic_extractor:GenericExtractor', 'requires_setup': False, 'depends': ['core'], 'external_dependencies': { 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], }, 'description': """ -This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood. +This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood. 
This module is responsible for downloading and processing media content from platforms supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality diff --git a/src/auto_archiver/archivers/generic_archiver/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/bluesky.py rename to src/auto_archiver/modules/generic_extractor/bluesky.py diff --git a/src/auto_archiver/archivers/generic_archiver/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/dropin.py rename to src/auto_archiver/modules/generic_extractor/dropin.py diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py similarity index 99% rename from src/auto_archiver/archivers/generic_archiver/generic_archiver.py rename to src/auto_archiver/modules/generic_extractor/generic_extractor.py index bf423e0..1fd6a18 100644 --- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -8,7 +8,7 @@ from loguru import logger from auto_archiver.archivers.archiver import Archiver from ...core import Metadata, Media, ArchivingContext -class GenericArchiver(Archiver): +class GenericExtractor(Archiver): name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} diff --git a/src/auto_archiver/archivers/generic_archiver/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py similarity index 100% rename from src/auto_archiver/archivers/generic_archiver/truth.py rename to src/auto_archiver/modules/generic_extractor/truth.py diff --git a/src/auto_archiver/archivers/generic_archiver/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py similarity index 100% rename from 
src/auto_archiver/archivers/generic_archiver/twitter.py rename to src/auto_archiver/modules/generic_extractor/twitter.py diff --git a/tests/archivers/test_generic_archiver.py b/tests/archivers/test_generic_archiver.py index 6e249e8..b0190b6 100644 --- a/tests/archivers/test_generic_archiver.py +++ b/tests/archivers/test_generic_archiver.py @@ -6,13 +6,13 @@ from os.path import dirname import pytest -from auto_archiver.archivers.generic_archiver import GenericArchiver +from auto_archiver.archivers.generic_extractor.generic_extractor import GenericExtractor from .test_archiver_base import TestArchiverBase -class TestGenericArchiver(TestArchiverBase): +class TestGenericExtractor(TestArchiverBase): """Tests Base Archiver """ - archiver_class = GenericArchiver + archiver_class = GenericExtractor config = { 'subtitles': False, 'comments': False, From 4830f9930015d14002001b3c075bb0b470d682d9 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 21 Jan 2025 20:03:10 +0100 Subject: [PATCH 005/110] Get parsing of manifest and combining with config file working --- src/auto_archiver/core/config.py | 120 +++++------------- src/auto_archiver/core/loader.py | 23 +++- src/auto_archiver/core/orchestrator.py | 69 ++++++++-- .../modules/generic_extractor/__manifest__.py | 15 ++- 4 files changed, 121 insertions(+), 106 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index ef012c9..66c2eb5 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -5,22 +5,9 @@ flexible setup in various environments. """ -import importlib import argparse import yaml from dataclasses import dataclass, field -from typing import List -from collections import defaultdict -from loguru import logger - -from ..archivers import Archiver -from ..feeders import Feeder -from ..databases import Database -from ..formatters import Formatter -from ..storages import Storage -from ..enrichers import Enricher -from . 
import Step -from ..utils import update_nested_dict # @dataclass @@ -46,84 +33,45 @@ from ..utils import update_nested_dict # self.cli_ops = {} # self.config = {} -# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): -# """ -# if yaml_config_filename is provided, the --config argument is ignored, -# useful for library usage when the config values are preloaded -# overwrite_configs is a dict that overwrites the yaml file contents -# """ -# # 1. parse CLI values -# if use_cli: -# parser = argparse.ArgumentParser( -# # prog = "auto-archiver", -# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", -# epilog="Check the code at https://github.com/bellingcat/auto-archiver" -# ) + # def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): + # """ + # if yaml_config_filename is provided, the --config argument is ignored, + # useful for library usage when the config values are preloaded + # overwrite_configs is a dict that overwrites the yaml file contents + # """ + # # 1. 
parse CLI values + # if use_cli: + # parser = argparse.ArgumentParser( + # # prog = "auto-archiver", + # description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", + # epilog="Check the code at https://github.com/bellingcat/auto-archiver" + # ) -# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') -# parser.add_argument('--version', action='version', version=__version__) + # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') + # parser.add_argument('--version', action='version', version=__version__) -# # Iterate over all step subclasses to gather default configs and CLI arguments -# for configurable in self.configurable_parents: -# child: Step -# for child in configurable.__subclasses__(): -# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." -# for config, details in child.configs().items(): -# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" -# assert "." not in config, f"config property cannot contain dots('.'): {config}" -# config_path = f"{child.name}.{config}" +def format_config(config: dict) -> dict: + # Iterate over all step subclasses to gather default configs and CLI arguments + new_config = {} + for step, values in config['steps'].items(): + new_config[f"--{step}"] = values + + # format configurations + for name, confg_vals in config['configurations'].items(): + for key, value in confg_vals.items(): + assert "." 
not in key, "config key cannot contain '.'" + config_path = f"--{name}.{key}" + new_config[config_path] = value -# if use_cli: -# try: -# parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) -# except argparse.ArgumentError: -# # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver -# pass + return new_config -# self.defaults[config_path] = details["default"] -# if "cli_set" in details: -# self.cli_ops[config_path] = details["cli_set"] -# if use_cli: -# args = parser.parse_args() -# yaml_config_filename = yaml_config_filename or getattr(args, "config") -# else: args = {} - -# # 2. read YAML config file (or use provided value) -# self.yaml_config = self.read_yaml(yaml_config_filename) -# update_nested_dict(self.yaml_config, overwrite_configs) - -# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default -# self.config = defaultdict(dict) -# for config_path, default in self.defaults.items(): -# child, config = tuple(config_path.split(".")) -# val = getattr(args, config_path, None) -# if val is not None and config_path in self.cli_ops: -# val = self.cli_ops[config_path](val, default) -# if val is None: -# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) -# self.config[child][config] = val -# self.config = dict(self.config) - -# # 4. 
STEPS: read steps and validate they exist -# steps = self.yaml_config.get("steps", {}) -# assert "archivers" in steps, "your configuration steps are missing the archivers property" -# assert "storages" in steps, "your configuration steps are missing the storages property" - -# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) -# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config) -# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] -# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] -# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] -# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] - -# logger.info(f"FEEDER: {self.feeder.name}") -# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}") -# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}") -# logger.info(f"DATABASES: {[x.name for x in self.databases]}") -# logger.info(f"STORAGES: {[x.name for x in self.storages]}") -# logger.info(f"FORMATTER: {self.formatter.name}") +class LoadFromFile (argparse.Action): + def __call__ (self, parser, namespace, values, option_string = None): + with values as f: + # parse arguments in the file and store them in the target namespace + parser.parse_args(f.read().split(), namespace) def read_yaml(yaml_filename: str) -> dict: with open(yaml_filename, "r", encoding="utf-8") as inf: - return yaml.safe_load(inf) + return format_config(yaml.safe_load(inf)) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index e9de8c5..8b96198 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -1,4 +1,6 @@ +import ast import os +import copy from os.path import join, dirname from typing import List @@ -11,15 +13,18 @@ _DEFAULT_MANIFEST = { 'external_dependencies': {}, 'entry_point': '', 'version': '1.0', + 'config': {} } 
-def load_manifest(self, module): +def load_manifest(module): # load the manifest file + manifest = copy.deepcopy(_DEFAULT_MANIFEST) + with open(join(module, MANIFEST_FILE)) as f: - manifest = f.read() + manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(self, additional_paths: List[str] = []) -> List[dict]: +def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]: # search through all valid 'modules' paths. Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -32,11 +37,15 @@ def available_modules(self, additional_paths: List[str] = []) -> List[dict]: for module_folder in default_path + additional_paths: # walk through each module in module_folder and check if it has a valid manifest - for folder in os.listdir(module_folder): - possible_module = join(module_folder, folder) - if not is_really_module(possible_module): + for possible_module in os.listdir(module_folder): + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): continue # parse manifest and add to list of available modules - all_modules.append(possible_module) + if with_manifest: + manifest = load_manifest(possible_module_path) + else: + manifest = {} + all_modules.append((possible_module, possible_module_path, manifest)) return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a18da0e..f788203 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,9 +5,6 @@ """ from __future__ import annotations -import ast -import os -from os.path import dirname, join from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address @@ -51,23 +48,67 @@ class ArchivingOrchestrator: def setup_parser(self): parser = argparse.ArgumentParser( # prog = "auto-archiver", + 
add_help=False, description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", epilog="Check the code at https://github.com/bellingcat/auto-archiver" ) parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') + # override the default 'help' so we can inject all the configs and show those + parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') self.parser = parser - def setup_config(self): + def add_module_args(self, modules: list = None): + if not modules: + modules = available_modules(with_manifest=True) + + for module_name, module_path, manifest in modules: + for name, kwargs in manifest['config'].items(): + kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}" + self.parser.add_argument(f"--{module_name}.{name}", **kwargs) + + def show_help(self): + # for the help message, we want to load *all* possible modules and show the help + # add configs as arg parser arguments + self.add_module_args() + + self.parser.print_help() + exit() + + def setup_config(self, config: dict) -> None: # check what mode we're in # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules if self.config.mode == 'simple': - for module in available_modules(): - # load the module - manifest = load_manifest(module) - + simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']] + self.add_module_args(simple_modules) + 
+ # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required" + self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use') + self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use') + self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use') + self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use') + self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use') + self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use') + + + config.update(self.config.__dict__) + # reload the parser with the new arguments, now that we have them + self.config, unknown = self.parser.parse_known_args(config) + logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + + breakpoint() + + + logger.info(f"FEEDER: {self.config.feeders}") + logger.info(f"ENRICHERS: {self.config.enrichers}") + logger.info(f"ARCHIVERS: {self.config.archivers}") + logger.info(f"DATABASES: {self.config.databases}") + logger.info(f"STORAGES: {self.config.storages}") + logger.info(f"FORMATTER: {self.formatter.name}") + + def run(self) -> None: self.setup_parser() @@ -77,17 +118,21 @@ class ArchivingOrchestrator: # load the config file to get the list of enabled items self.config, _ = self.parser.parse_known_args() + # if help flag was called, then show the help + if self.config.help: + self.show_help() # load the config file + config = {} + try: config = read_yaml(self.config.config_file) except FileNotFoundError: - if self.settings.config == DEFAULT_CONFIG_FILE: - # no config file found, let's do the setup with the default values - 
self.setup_config() - else: + if self.config.config_file != DEFAULT_CONFIG_FILE: logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") exit() + self.setup_config(config) + breakpoint() config.parse() diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index bae5f36..673399e 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -29,5 +29,18 @@ the broader archiving framework. metadata objects. Some dropins are included in this generic_archiver by default, but custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). -""" +""", + 'config': { + "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, + "subtitles": {"default": True, "help": "download subtitles if available"}, + "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, + "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"}, + "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."}, + "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"}, + "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or 
text only pages that the subsequent archivers can retrieve."}, + 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, + "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, + "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, + "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, + } } \ No newline at end of file From 7b3a1468cd28808fd34ed002b27116b2c1b24f5b Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 21 Jan 2025 22:29:50 +0000 Subject: [PATCH 006/110] Create manifest files for archiver modules. 
--- src/auto_archiver/archivers/__init__.py | 7 - .../archivers/youtubedl_archiver.py | 2 - .../instagram_api_archiver/__init__.py | 0 .../instagram_api_archiver/__manifest__.py | 30 ++ .../instagram_api_archiver.py | 289 ++++++++++++------ .../modules/instagram_archiver/__init__.py | 0 .../instagram_archiver/__manifest__.py | 33 ++ .../instagram_archiver}/instagram_archiver.py | 6 +- .../instagram_tbot_archiver/__init__.py | 0 .../instagram_tbot_archiver/__manifest__.py | 35 +++ .../instagram_tbot_archiver.py | 15 +- .../modules/telegram_archiver/__init__.py | 0 .../modules/telegram_archiver/__manifest__.py | 26 ++ .../telegram_archiver}/telegram_archiver.py | 7 +- .../modules/telethon_archiver/__init__.py | 0 .../modules/telethon_archiver/__manifest__.py | 48 +++ .../telethon_archiver}/telethon_archiver.py | 6 +- .../modules/twitter_api_archiver/__init__.py | 0 .../twitter_api_archiver/__manifest__.py | 45 +++ .../twitter_api_archiver.py | 4 +- .../modules/vk_archiver/__init__.py | 0 .../modules/vk_archiver/__manifest__.py | 37 +++ .../vk_archiver}/vk_archiver.py | 6 +- 23 files changed, 467 insertions(+), 129 deletions(-) delete mode 100644 src/auto_archiver/archivers/youtubedl_archiver.py create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__init__.py create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_api_archiver}/instagram_api_archiver.py (59%) create mode 100644 src/auto_archiver/modules/instagram_archiver/__init__.py create mode 100644 src/auto_archiver/modules/instagram_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_archiver}/instagram_archiver.py (98%) create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__init__.py create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_tbot_archiver}/instagram_tbot_archiver.py 
(96%) create mode 100644 src/auto_archiver/modules/telegram_archiver/__init__.py create mode 100644 src/auto_archiver/modules/telegram_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/telegram_archiver}/telegram_archiver.py (92%) create mode 100644 src/auto_archiver/modules/telethon_archiver/__init__.py create mode 100644 src/auto_archiver/modules/telethon_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/telethon_archiver}/telethon_archiver.py (98%) create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__init__.py create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/twitter_api_archiver}/twitter_api_archiver.py (98%) create mode 100644 src/auto_archiver/modules/vk_archiver/__init__.py create mode 100644 src/auto_archiver/modules/vk_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/vk_archiver}/vk_archiver.py (91%) diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index 7519a8e..54515ec 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -6,10 +6,3 @@ collect and preserve a variety of content types, such as posts, images, videos a """ from .archiver import Archiver -from .telethon_archiver import TelethonArchiver -from .twitter_api_archiver import TwitterApiArchiver -from .instagram_archiver import InstagramArchiver -from .instagram_tbot_archiver import InstagramTbotArchiver -from .telegram_archiver import TelegramArchiver -from .vk_archiver import VkArchiver -from .instagram_api_archiver import InstagramAPIArchiver diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py deleted file mode 100644 index 8b61974..0000000 --- a/src/auto_archiver/archivers/youtubedl_archiver.py +++ /dev/null @@ -1,2 +0,0 @@ -# temporary hack, as we implement module -from 
.generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver diff --git a/src/auto_archiver/modules/instagram_api_archiver/__init__.py b/src/auto_archiver/modules/instagram_api_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py new file mode 100644 index 0000000..2bb3f67 --- /dev/null +++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Instagram API Archiver", + "type": ["extractor"], + "entry_point": "instagram_api_archiver:InstagramApiArchiver", + "depends": ["core"], + "external_dependencies": + {"python": ["requests", + "loguru", + "retrying", + "tqdm",], + }, + "no_setup_required": False, + "configs": { + "access_token": {"default": None, "help": "a valid instagrapi-api token"}, + "api_endpoint": {"default": None, "help": "API endpoint to use"}, + "full_profile": { + "default": False, + "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", + }, + "full_profile_max_posts": { + "default": 0, + "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", + }, + "minimize_json_output": { + "default": True, + "help": "if true, will remove empty values from the json output", + }, + }, + "description": "", +} diff --git a/src/auto_archiver/archivers/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py similarity index 59% rename from src/auto_archiver/archivers/instagram_api_archiver.py rename to src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py index d0e7e87..cc6e074 100644 --- a/src/auto_archiver/archivers/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py @@ -9,32 +9,38 @@ data, reducing JSON output size, and handling large profiles. """ import re -import requests from datetime import datetime + +import requests from loguru import logger from retrying import retry from tqdm import tqdm -from . import Archiver -from ..core import Metadata -from ..core import Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Media +from auto_archiver.core import Metadata + class InstagramAPIArchiver(Archiver): """ Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data - + # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ + name = "instagram_api_archiver" - global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?") + global_pattern = re.compile( + r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
+ ) def __init__(self, config: dict) -> None: super().__init__(config) self.assert_valid_string("access_token") self.assert_valid_string("api_endpoint") self.full_profile_max_posts = int(self.full_profile_max_posts) - if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] + if self.api_endpoint[-1] == "/": + self.api_endpoint = self.api_endpoint[:-1] self.full_profile = bool(self.full_profile) self.minimize_json_output = bool(self.minimize_json_output) @@ -44,52 +50,74 @@ class InstagramAPIArchiver(Archiver): return { "access_token": {"default": None, "help": "a valid instagrapi-api token"}, "api_endpoint": {"default": None, "help": "API endpoint to use"}, - "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."}, - "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"}, - "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"}, + "full_profile": { + "default": False, + "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", + }, + "full_profile_max_posts": { + "default": 0, + "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", + }, + "minimize_json_output": { + "default": True, + "help": "if true, will remove empty values from the json output", + }, } - + def download(self, item: Metadata) -> Metadata: url = item.get_url() - url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") + url.replace("instagr.com", "instagram.com").replace( + "instagr.am", "instagram.com" + ) insta_matches = self.global_pattern.findall(url) logger.info(f"{insta_matches=}") - if not len(insta_matches) or len(insta_matches[0])!=3: return - if len(insta_matches) > 1: - logger.warning(f"Multiple instagram matches found in {url=}, using the first one") + if not len(insta_matches) or len(insta_matches[0]) != 3: + return + if len(insta_matches) > 1: + logger.warning( + f"Multiple instagram matches found in {url=}, using the first one" + ) return g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] - if g1 == "": return self.download_profile(item, g2) - elif g1 == "p": return self.download_post(item, g2, context="post") - elif g1 == "reel": return self.download_post(item, g2, context="reel") - elif g1 == "stories/highlights": return self.download_highlights(item, g2) - elif g1 == "stories": - if len(g3): return self.download_post(item, id=g3, context="story") + if g1 == "": + return self.download_profile(item, g2) + elif g1 == "p": + return self.download_post(item, g2, context="post") + elif g1 == "reel": + return self.download_post(item, g2, context="reel") + elif g1 == "stories/highlights": + return self.download_highlights(item, g2) + elif g1 == "stories": + if len(g3): + return self.download_post(item, id=g3, context="story") return self.download_stories(item, g2) - else: + else: logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}") return - + @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) def call_api(self, 
path: str, params: dict) -> dict: - headers = { - "accept": "application/json", - "x-access-key": self.access_token - } + headers = {"accept": "application/json", "x-access-key": self.access_token} logger.debug(f"calling {self.api_endpoint}/{path} with {params=}") - return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json() + return requests.get( + f"{self.api_endpoint}/{path}", headers=headers, params=params + ).json() def cleanup_dict(self, d: dict | list) -> dict: # repeats 3 times to remove nested empty values - if not self.minimize_json_output: return d - if type(d) == list: return [self.cleanup_dict(v) for v in d] - if type(d) != dict: return d + if not self.minimize_json_output: + return d + if type(d) == list: + return [self.cleanup_dict(v) for v in d] + if type(d) != dict: + return d return { - k: clean_v - for k, v in d.items() - if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and - k not in ["x", "y", "width", "height"] + k: clean_v + for k, v in d.items() + if (clean_v := self.cleanup_dict(v)) + not in [0.0, 0, [], {}, "", None, "null"] + and k not in ["x", "y", "width", "height"] } def download_profile(self, result: Metadata, username: str) -> Metadata: @@ -125,7 +153,9 @@ class InstagramAPIArchiver(Archiver): try: self.download_all_tagged(result, user_id) except Exception as e: - result.append("errors", f"Error downloading tagged posts for {username}") + result.append( + "errors", f"Error downloading tagged posts for {username}" + ) logger.error(f"Error downloading tagged posts for {username}: {e}") # download all highlights @@ -135,26 +165,37 @@ class InstagramAPIArchiver(Archiver): result.append("errors", f"Error downloading highlights for {username}") logger.error(f"Error downloading highlights for {username}: {e}") - - result.set_url(url) # reset as scrape_item modifies it + result.set_url(url) # reset as scrape_item modifies it return result.success("insta profile") def 
download_all_highlights(self, result, username, user_id): count_highlights = 0 highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id}) for h in highlights: - try: + try: h_info = self._download_highlights_reusable(result, h.get("pk")) count_highlights += len(h_info.get("items", [])) except Exception as e: - result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}") - logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}") - if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts: - logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}") + result.append( + "errors", + f"Error downloading highlight id{h.get('pk')} for {username}", + ) + logger.error( + f"Error downloading highlight id{h.get('pk')} for {username}: {e}" + ) + if ( + self.full_profile_max_posts + and count_highlights >= self.full_profile_max_posts + ): + logger.info( + f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#highlights", count_highlights) - def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata: + def download_post( + self, result: Metadata, code: str = None, id: str = None, context: str = None + ) -> Metadata: if id: post = self.call_api(f"v1/media/by/id", {"id": id}) else: @@ -166,7 +207,8 @@ class InstagramAPIArchiver(Archiver): post = self.scrape_item(result, post, context) - if post.get("taken_at"): result.set_timestamp(post.get("taken_at")) + if post.get("taken_at"): + result.set_timestamp(post.get("taken_at")) return result.success(f"insta {context or 'post'}") def download_highlights(self, result: Metadata, id: str) -> Metadata: @@ -175,96 +217,127 @@ class InstagramAPIArchiver(Archiver): del h_info["items"] result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items) return result.success("insta highlights") - - def 
_download_highlights_reusable(self, result: Metadata, id: str) ->dict: + + def _download_highlights_reusable(self, result: Metadata, id: str) -> dict: full_h = self.call_api(f"v2/highlight/by/id", {"id": id}) h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}") assert h_info, f"Highlight {id} not found: {full_h=}" - if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"): + if ( + cover_media := h_info.get("cover_media", {}) + .get("cropped_image_version", {}) + .get("url") + ): filename = self.download_from_url(cover_media) result.add_media(Media(filename=filename), id=f"cover_media highlight {id}") - items = h_info.get("items", [])[::-1] # newest to oldest + items = h_info.get("items", [])[::-1] # newest to oldest for h in tqdm(items, desc="downloading highlights", unit="highlight"): - try: self.scrape_item(result, h, "highlight") + try: + self.scrape_item(result, h, "highlight") except Exception as e: result.append("errors", f"Error downloading highlight {h.get('id')}") - logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}") - + logger.error( + f"Error downloading highlight, skipping {h.get('id')}: {e}" + ) + return h_info - + def download_stories(self, result: Metadata, username: str) -> Metadata: now = datetime.now().strftime("%Y-%m-%d_%H-%M") stories = self._download_stories_reusable(result, username) - if stories == []: return result.success("insta no story") + if stories == []: + return result.success("insta no story") result.set_title(f"stories {username} at {now}").set("#stories", len(stories)) return result.success(f"insta stories {now}") - + def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]: stories = self.call_api(f"v1/user/stories/by/username", {"username": username}) - if not stories or not len(stories): return [] - stories = stories[::-1] # newest to oldest + if not stories or not len(stories): + return [] + stories = stories[::-1] # 
newest to oldest for s in tqdm(stories, desc="downloading stories", unit="story"): - try: self.scrape_item(result, s, "story") + try: + self.scrape_item(result, s, "story") except Exception as e: result.append("errors", f"Error downloading story {s.get('id')}") logger.error(f"Error downloading story, skipping {s.get('id')}: {e}") return stories - + def download_all_posts(self, result: Metadata, user_id: str): end_cursor = None pbar = tqdm(desc="downloading posts") post_count = 0 while end_cursor != "": - posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}) - if not len(posts) or not type(posts) == list or len(posts) != 2: break + posts = self.call_api( + f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor} + ) + if not len(posts) or not type(posts) == list or len(posts) != 2: + break posts, end_cursor = posts[0], posts[1] logger.info(f"parsing {len(posts)} posts, next {end_cursor=}") for p in posts: - try: self.scrape_item(result, p, "post") + try: + self.scrape_item(result, p, "post") except Exception as e: result.append("errors", f"Error downloading post {p.get('id')}") logger.error(f"Error downloading post, skipping {p.get('id')}: {e}") pbar.update(1) - post_count+=1 - if self.full_profile_max_posts and post_count >= self.full_profile_max_posts: - logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}") + post_count += 1 + if ( + self.full_profile_max_posts + and post_count >= self.full_profile_max_posts + ): + logger.info( + f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#posts", post_count) - + def download_all_tagged(self, result: Metadata, user_id: str): next_page_id = "" pbar = tqdm(desc="downloading tagged posts") tagged_count = 0 while next_page_id != None: - resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}) + resp = self.call_api( + f"v2/user/tag/medias", {"user_id": user_id, 
"page_id": next_page_id} + ) posts = resp.get("response", {}).get("items", []) - if not len(posts): break + if not len(posts): + break next_page_id = resp.get("next_page_id") - + logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}") for p in posts: - try: self.scrape_item(result, p, "tagged") + try: + self.scrape_item(result, p, "tagged") except Exception as e: - result.append("errors", f"Error downloading tagged post {p.get('id')}") - logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}") + result.append( + "errors", f"Error downloading tagged post {p.get('id')}" + ) + logger.error( + f"Error downloading tagged post, skipping {p.get('id')}: {e}" + ) pbar.update(1) - tagged_count+=1 - if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts: - logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}") + tagged_count += 1 + if ( + self.full_profile_max_posts + and tagged_count >= self.full_profile_max_posts + ): + logger.info( + f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#tagged", tagged_count) + ### reusable parsing utils below -### reusable parsing utils below - - def scrape_item(self, result:Metadata, item:dict, context:str=None) -> dict: + def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict: """ receives a Metadata and an API dict response fetches the media and adds it to the Metadata @@ -272,23 +345,25 @@ class InstagramAPIArchiver(Archiver): context can be used to give specific id prefixes to media """ if "clips_metadata" in item: - if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"): + if reusable_text := item.get("clips_metadata", {}).get( + "reusable_text_attribute_string" + ): item["clips_metadata_text"] = reusable_text - if self.minimize_json_output: + if self.minimize_json_output: del item["clips_metadata"] - if code := item.get("code") and not 
result.get("url"): + if code := item.get("code") and not result.get("url"): result.set_url(f"https://www.instagram.com/p/{code}/") - + resources = item.get("resources", item.get("carousel_media", [])) item, media, media_id = self.scrape_media(item, context) # if resources are present take the main media from the first resource if not media and len(resources): _, media, media_id = self.scrape_media(resources[0], context) resources = resources[1:] - + assert media, f"Image/video not found in {item=}" - + # posts with multiple items contain a resources list resources_metadata = Metadata() for r in resources: @@ -298,40 +373,54 @@ class InstagramAPIArchiver(Archiver): result.add_media(media, id=media_id) return item - - def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]: + + def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]: # remove unnecessary info - if self.minimize_json_output: - for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]: - if k in item: del item[k] + if self.minimize_json_output: + for k in [ + "image_versions", + "video_versions", + "video_dash_manifest", + "image_versions2", + "video_versions2", + ]: + if k in item: + del item[k] item = self.cleanup_dict(item) image_media = None if image_url := item.get("thumbnail_url"): filename = self.download_from_url(image_url, verbose=False) image_media = Media(filename=filename) - + # retrieve video info - best_id = item.get('id', item.get('pk')) + best_id = item.get("id", item.get("pk")) taken_at = item.get("taken_at", item.get("taken_at_ts")) code = item.get("code") caption_text = item.get("caption_text") - if "carousel_media" in item: del item["carousel_media"] + if "carousel_media" in item: + del item["carousel_media"] if video_url := item.get("video_url"): filename = self.download_from_url(video_url, verbose=False) video_media = Media(filename=filename) - if taken_at: video_media.set("date", taken_at) - 
if code: video_media.set("url", f"https://www.instagram.com/p/{code}") - if caption_text: video_media.set("text", caption_text) + if taken_at: + video_media.set("date", taken_at) + if code: + video_media.set("url", f"https://www.instagram.com/p/{code}") + if caption_text: + video_media.set("text", caption_text) video_media.set("preview", [image_media]) video_media.set("data", [item]) return item, video_media, f"{context or 'video'} {best_id}" elif image_media: - if taken_at: image_media.set("date", taken_at) - if code: image_media.set("url", f"https://www.instagram.com/p/{code}") - if caption_text: image_media.set("text", caption_text) + if taken_at: + image_media.set("date", taken_at) + if code: + image_media.set("url", f"https://www.instagram.com/p/{code}") + if caption_text: + image_media.set("text", caption_text) image_media.set("data", [item]) return item, image_media, f"{context or 'image'} {best_id}" - - return item, None, None \ No newline at end of file + + return item, None, None diff --git a/src/auto_archiver/modules/instagram_archiver/__init__.py b/src/auto_archiver/modules/instagram_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py new file mode 100644 index 0000000..bd63ab4 --- /dev/null +++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py @@ -0,0 +1,33 @@ +{ + "name": "Instagram Archiver", + "type": ["extractor"], + "entry_point": "instagram_archiver:InstagramArchiver", + "depends": ["core"], + "external_dependencies": { + "python": ["instaloader", + "loguru",], + }, + "no_setup_required": False, + "configs": { + "username": {"default": None, "help": "a valid Instagram username"}, + "password": { + "default": None, + "help": "the corresponding Instagram account password", + }, + "download_folder": { + "default": "instaloader", + "help": "name of a folder to temporarily download content 
to", + }, + "session_file": { + "default": "secrets/instaloader.session", + "help": "path to the instagram session which saves session credentials", + }, + # TODO: fine-grain + # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, + }, + "description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts + and user profiles, downloading as much information as possible, including images, videos, text, stories, + highlights, and tagged posts. Authentication is required via username/password or a session file. + + """, +} diff --git a/src/auto_archiver/archivers/instagram_archiver.py b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py similarity index 98% rename from src/auto_archiver/archivers/instagram_archiver.py rename to src/auto_archiver/modules/instagram_archiver/instagram_archiver.py index 94a8fc0..4cf001d 100644 --- a/src/auto_archiver/archivers/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py @@ -7,9 +7,9 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from . 
import Archiver -from ..core import Metadata -from ..core import Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata +from auto_archiver.core import Media class InstagramArchiver(Archiver): """ diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py new file mode 100644 index 0000000..cadb729 --- /dev/null +++ b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py @@ -0,0 +1,35 @@ +{ + "name": "Instagram Telegram Bot Archiver", + "type": ["extractor"], + "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver", + "depends": ["core", "utils"], + "external_dependencies": {"python": ["loguru", + "telethon",], + }, + "requires_setup": True, + "configs": { + "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, + "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, + "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, + "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, + }, + "description": """ +The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, +such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs +to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and +returned as part of a `Metadata` object. + +### Features +- Supports archiving Instagram posts and stories through the Telegram bot. 
+- Downloads and saves media files (e.g., images, videos) in a temporary directory. +- Captures and returns metadata, including titles and descriptions, as a `Metadata` object. +- Automatically manages Telegram session files for secure access. + +### Setup + +To use the `InstagramTbotArchiver`, you need to provide the following configuration settings: +- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps). +- **Session File**: Optional path to store the Telegram session file for future use. + + """, +} diff --git a/src/auto_archiver/archivers/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py similarity index 96% rename from src/auto_archiver/archivers/instagram_tbot_archiver.py rename to src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py index 01b1614..9fdc208 100644 --- a/src/auto_archiver/archivers/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py @@ -7,14 +7,17 @@ relevant media and metadata. The fetched content is saved as `Media` objects in `Metadata` object. """ +import os import shutil -from telethon.sync import TelegramClient -from loguru import logger -import time, os +import time from sqlite3 import OperationalError -from . 
import Archiver -from ..core import Metadata, Media, ArchivingContext -from ..utils import random_str + +from loguru import logger +from telethon.sync import TelegramClient + +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import random_str class InstagramTbotArchiver(Archiver): diff --git a/src/auto_archiver/modules/telegram_archiver/__init__.py b/src/auto_archiver/modules/telegram_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_archiver/__manifest__.py new file mode 100644 index 0000000..b56477a --- /dev/null +++ b/src/auto_archiver/modules/telegram_archiver/__manifest__.py @@ -0,0 +1,26 @@ +{ + "name": "Telegram Archiver", + "type": ["extractor"], + "entry_point": "telegram_archiver:TelegramArchiver", + "requires_setup": False, + "depends": ["core"], + "external_dependencies": { + "python": [ + "requests", + "bs4", + "loguru", + ], + }, + "description": """ + The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials. + It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` + and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` + is advised for more comprehensive functionality. + + ### Features +- Extracts images and videos from public Telegram message links (`t.me`). +- Processes HTML content of messages to retrieve embedded media. +- Sets structured metadata, including timestamps, content, and media details. +- Does not require user authentication for Telegram. 
+ """, +} diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py similarity index 92% rename from src/auto_archiver/archivers/telegram_archiver.py rename to src/auto_archiver/modules/telegram_archiver/telegram_archiver.py index ed57927..c793095 100644 --- a/src/auto_archiver/archivers/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py @@ -2,13 +2,14 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from . import Archiver -from ..core import Metadata, Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media class TelegramArchiver(Archiver): """ - Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found + Archiver for telegram that does not require login, but the telethon_archiver is much more advised, + will only return if at least one image or one video is found """ name = "telegram_archiver" diff --git a/src/auto_archiver/modules/telethon_archiver/__init__.py b/src/auto_archiver/modules/telethon_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py new file mode 100644 index 0000000..82d56ba --- /dev/null +++ b/src/auto_archiver/modules/telethon_archiver/__manifest__.py @@ -0,0 +1,48 @@ +# TODO rm dependency on json +{ + "name": "telethon_archiver", + "type": ["extractor"], + "entry_point": "telethon_archiver:TelethonArchiver", + "requires_setup": True, + "depends": [""], + "external_dependencies": { + "python": ["telethon", + "loguru", + "tqdm", + ], + "bin": [""] + }, + "configs": { + "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, + "api_hash": {"default": None, "help": 
"telegram API_HASH value, go to https://my.telegram.org/apps"}, + "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"}, + "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, + "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"}, + "channel_invites": { + "default": {}, + "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", + # TODO + #"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + } + }, + "description": """ +The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups. +It supports private and public channels, downloading grouped posts with media, and can join channels using invite links +if provided in the configuration. + +### Features +- Fetches posts and metadata from Telegram channels and groups, including private channels. +- Downloads media attachments (e.g., images, videos, audio) from individual posts or grouped posts. +- Handles channel invites to join channels dynamically during setup. +- Utilizes Telethon's capabilities for reliable Telegram interactions. +- Outputs structured metadata and media using `Metadata` and `Media` objects. + +### Setup +To use the `TelethonArchiver`, you must configure the following: +- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps). +- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`). 
+- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving. +- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup. + +""" +} diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py similarity index 98% rename from src/auto_archiver/archivers/telethon_archiver.py rename to src/auto_archiver/modules/telethon_archiver/telethon_archiver.py index 2e2305d..89668f3 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py @@ -8,9 +8,9 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from . import Archiver -from ..core import Metadata, Media, ArchivingContext -from ..utils import random_str +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import random_str class TelethonArchiver(Archiver): diff --git a/src/auto_archiver/modules/twitter_api_archiver/__init__.py b/src/auto_archiver/modules/twitter_api_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py new file mode 100644 index 0000000..f4eb2b9 --- /dev/null +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -0,0 +1,45 @@ +{ + "name": "Twitter API Archiver", + "type": ["extractor"], + "entry_point": "twitter_api_archiver:TwitterApiArchiver", + "requires_setup": True, + "depends": ["core"], + "external_dependencies": { + "python": ["requests", + "loguru", + "pytwitter", + "slugify",], + "bin": [""] + }, + "configs": { + "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need 
consumer_key, consumer_secret, access_token, access_secret"}, + "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "consumer_key": {"default": None, "help": "twitter API consumer_key"}, + "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, + "access_token": {"default": None, "help": "twitter API access_token"}, + "access_secret": {"default": None, "help": "twitter API access_secret"}, + }, + "description": """ + The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API. + It supports multiple API configurations for extended rate limits and reliable access. + Features include URL expansion, media downloads (e.g., images, videos), and structured output + via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens + or consumer key/secret and access token/secret. + + ### Features + - Fetches tweets and their metadata, including text, creation timestamp, and author information. + - Downloads media attachments (e.g., images, videos) in high quality. + - Supports multiple API configurations for improved rate limiting. + - Expands shortened URLs (e.g., `t.co` links). + - Outputs structured metadata and media using `Metadata` and `Media` objects. + + ### Setup + To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration: + - **Bearer Token(s)**: A single token or a list for rate-limited API access. + - **Consumer Key and Secret**: Required for user-authenticated API access. + - **Access Token and Secret**: Complements the consumer key for enhanced API capabilities. 
+ + Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en). + """ +, +} diff --git a/src/auto_archiver/archivers/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py similarity index 98% rename from src/auto_archiver/archivers/twitter_api_archiver.py rename to src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py index d1e4dee..eb607cc 100644 --- a/src/auto_archiver/archivers/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py @@ -8,8 +8,8 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from . import Archiver -from ..core import Metadata,Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata,Media class TwitterApiArchiver(Archiver): name = "twitter_api_archiver" diff --git a/src/auto_archiver/modules/vk_archiver/__init__.py b/src/auto_archiver/modules/vk_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/vk_archiver/__manifest__.py b/src/auto_archiver/modules/vk_archiver/__manifest__.py new file mode 100644 index 0000000..69bf162 --- /dev/null +++ b/src/auto_archiver/modules/vk_archiver/__manifest__.py @@ -0,0 +1,37 @@ +{ + "name": "VKontakte Archiver", + "type": ["extractor"], + "entry_point": "vk_archiver:VKArchiver", + "requires_setup": True, + "depends": ["core", "utils"], + "external_dependencies": { + "python": ["loguru", + "vk_url_scraper"], + }, + "configs": { + "username": {"default": None, "help": "valid VKontakte username"}, + "password": {"default": None, "help": "valid VKontakte password"}, + "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, + }, + "description": """ +The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages. 
+This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract +and download content. Note that VK videos are handled separately by the `YTDownloader`. + +### Features +- Extracts text, timestamps, and metadata from VK `/wall` posts. +- Downloads associated images and attaches them to the resulting `Metadata` object. +- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo). +- Outputs structured metadata and media using `Metadata` and `Media` objects. + +### Setup +To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information: +- **Username**: A valid VKontakte account username. +- **Password**: The corresponding password for the VKontakte account. +- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login. + +Credentials can be set in the configuration file or directly via environment variables. Ensure you +have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/). +""" +, +} diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/modules/vk_archiver/vk_archiver.py similarity index 91% rename from src/auto_archiver/archivers/vk_archiver.py rename to src/auto_archiver/modules/vk_archiver/vk_archiver.py index f8bb60a..3cfb446 100644 --- a/src/auto_archiver/archivers/vk_archiver.py +++ b/src/auto_archiver/modules/vk_archiver/vk_archiver.py @@ -1,9 +1,9 @@ from loguru import logger from vk_url_scraper import VkScraper -from ..utils.misc import dump_payload -from . 
import Archiver -from ..core import Metadata, Media, ArchivingContext +from auto_archiver.utils.misc import dump_payload +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext class VkArchiver(Archiver): From 54995ad6ab8fc94893a9a0c3ea9506ee28c3d278 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 13:11:43 +0100 Subject: [PATCH 007/110] Further tweaks based on __manifest__.py files Loading configs now works --- poetry.lock | 43 ++++- pyproject.toml | 1 + src/auto_archiver/core/config.py | 34 ++-- src/auto_archiver/core/loader.py | 58 ++++++- src/auto_archiver/core/orchestrator.py | 158 ++++++++++++------ .../modules/generic_extractor/__manifest__.py | 4 +- .../twitter_api_archiver/__manifest__.py | 2 +- 7 files changed, 214 insertions(+), 86 deletions(-) diff --git a/poetry.lock b/poetry.lock index 40d108a..bbfb975 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1043,7 +1043,7 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -1179,7 +1179,7 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -1654,7 +1654,7 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -2031,6 +2031,41 @@ files = [ [package.dependencies] six = ">=1.7.0" +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["main"] +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "rich-argparse" +version = "1.6.0" +description = "Rich help formatters for argparse and optparse" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"}, + {file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"}, +] + +[package.dependencies] +rich = ">=11.0.0" + [[package]] name = "rsa" version = "4.9" @@ -2966,4 +3001,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "462c7c5f9d1fbae895d6299ba0b690b6e24d0655a4c9fc79f75ddef4eec222f8" +content-hash = "911543169cbd6c68ab3392a052ea58917539acdfbc6511e591f8a2b497443cdc" diff 
--git a/pyproject.toml b/pyproject.toml index ccfcae6..4f20c8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dependencies = [ "retrying (>=0.0.0)", "tsp-client (>=0.0.0)", "certvalidator (>=0.0.0)", + "rich-argparse (>=1.6.0,<2.0.0)", ] [tool.poetry.group.dev.dependencies] diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 66c2eb5..db5b6d2 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -4,14 +4,11 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow flexible setup in various environments. """ - import argparse -import yaml +from configparser import ConfigParser from dataclasses import dataclass, field -# @dataclass -# class Config: # configurable_parents = [ # Feeder, # Enricher, @@ -50,21 +47,6 @@ from dataclasses import dataclass, field # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') # parser.add_argument('--version', action='version', version=__version__) -def format_config(config: dict) -> dict: - # Iterate over all step subclasses to gather default configs and CLI arguments - new_config = {} - for step, values in config['steps'].items(): - new_config[f"--{step}"] = values - - # format configurations - for name, confg_vals in config['configurations'].items(): - for key, value in confg_vals.items(): - assert "." 
not in key, "config key cannot contain '.'" - config_path = f"--{name}.{key}" - new_config[config_path] = value - - return new_config - class LoadFromFile (argparse.Action): def __call__ (self, parser, namespace, values, option_string = None): @@ -72,6 +54,14 @@ class LoadFromFile (argparse.Action): # parse arguments in the file and store them in the target namespace parser.parse_args(f.read().split(), namespace) -def read_yaml(yaml_filename: str) -> dict: - with open(yaml_filename, "r", encoding="utf-8") as inf: - return format_config(yaml.safe_load(inf)) +def read_config(config_filename: str) -> dict: + config = ConfigParser() + config.read(config_filename) + # setup basic format + if 'STEPS' not in config.sections(): + config.add_section("STEPS") + return config + +def store_config(config: ConfigParser, config_filename: str): + with open(config_filename, "w", encoding="utf-8") as outf: + config.write(outf) \ No newline at end of file diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 8b96198..d39f31e 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -1,11 +1,23 @@ import ast +from dataclasses import dataclass, field import os import copy from os.path import join, dirname from typing import List + +MODULE_TYPES = [ + 'feeder', + 'enricher', + 'archiver', + 'database', + 'storage', + 'formatter' +] + MANIFEST_FILE = "__manifest__.py" _DEFAULT_MANIFEST = { + 'name': '', 'author': 'Bellingcat', 'requires_setup': True, 'depends': [], @@ -13,20 +25,54 @@ _DEFAULT_MANIFEST = { 'external_dependencies': {}, 'entry_point': '', 'version': '1.0', - 'config': {} + 'configs': {} } -def load_manifest(module): +@dataclass +class Module: + name: str + display_name: str + type: list + entry_point: str + depends: list + external_dependencies: dict + requires_setup: bool + configs: dict + description: str + path: str + manifest: dict + + def __init__(self, module_name, path, manifest): + self.name = 
module_name + self.path = path + self.manifest = manifest + if manifest: + self.display_name = manifest['name'] + self.type = manifest['type'] + self.entry_point = manifest['entry_point'] + self.depends = manifest['depends'] + self.external_dependencies = manifest['external_dependencies'] + self.requires_setup = manifest['requires_setup'] + self.configs = manifest['configs'] + self.description = manifest['description'] + + def __repr__(self): + return f"Module<'{self.display_name}' ({self.name})>" + + + +def load_manifest(module_path): + print(f"Loading manifest for module {module_path}") # load the manifest file manifest = copy.deepcopy(_DEFAULT_MANIFEST) - with open(join(module, MANIFEST_FILE)) as f: + with open(join(module_path, MANIFEST_FILE)) as f: manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]: +def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]: # search through all valid 'modules' paths. 
Default is 'modules' in the current directory - + # see odoo/modules/module.py -> get_modules def is_really_module(name): if os.path.isfile(join(name, MANIFEST_FILE)): @@ -46,6 +92,6 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals manifest = load_manifest(possible_module_path) else: manifest = {} - all_modules.append((possible_module, possible_module_path, manifest)) + all_modules.append(Module(possible_module, possible_module_path, manifest)) return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f788203..0a2273f 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -9,6 +9,11 @@ from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address import argparse +import configparser +import os +from os.path import join, dirname + +from rich_argparse import RichHelpFormatter from .context import ArchivingContext @@ -20,14 +25,15 @@ from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata from ..version import __version__ -from .config import read_yaml -from .loader import available_modules, load_manifest +from .config import read_config, store_config +from .loader import available_modules, Module, MODULE_TYPES import tempfile, traceback from loguru import logger DEFAULT_CONFIG_FILE = "orchestration.yaml" + class ArchivingOrchestrator: # def __init__(self, config: Config) -> None: @@ -45,95 +51,145 @@ class ArchivingOrchestrator: # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") # self.cleanup() - def setup_parser(self): + def setup_basic_parser(self): parser = argparse.ArgumentParser( - # prog = "auto-archiver", add_help=False, - description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); 
and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", - epilog="Check the code at https://github.com/bellingcat/auto-archiver" + description=""" + Auto Archiver is a CLI tool to archive media/metadata from online URLs; + it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)! + """, + epilog="Check the code at https://github.com/bellingcat/auto-archiver", + formatter_class=RichHelpFormatter, ) - parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) + parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') # override the default 'help' so we can inject all the configs and show those parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') - self.parser = parser - - def add_module_args(self, modules: list = None): - if not modules: - modules = available_modules(with_manifest=True) + parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file') + self.basic_parser = parser - for module_name, module_path, manifest in modules: - for name, kwargs in manifest['config'].items(): - kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}" - self.parser.add_argument(f"--{module_name}.{name}", **kwargs) + def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None: + parser = argparse.ArgumentParser( + parents = [self.basic_parser], + 
add_help=False, + ) - def show_help(self): - # for the help message, we want to load *all* possible modules and show the help - # add configs as arg parser arguments - self.add_module_args() - - self.parser.print_help() - exit() - - def setup_config(self, config: dict) -> None: # check what mode we're in + # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules - if self.config.mode == 'simple': - simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']] - self.add_module_args(simple_modules) + if ini_config: + # only load the modules enabled in config + enabled_modules = [] + for module_type in MODULE_TYPES: + try: + enabled_modules.extend(ini_config.get("STEPS", module_type)) + except configparser.NoOptionError: + pass - # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required" - self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use') - self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use') - self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use') - self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use') - self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use') - self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use') + # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' + for module_type in MODULE_TYPES: + if 
modules := getattr(basic_config, f"{module_type}s", []): + enabled_modules.extend(modules) + + self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser) + elif basic_config.mode == 'simple': + simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] + self.add_module_args(simple_modules, parser) + # add them to the config + for module in simple_modules: + for module_type in module.type: + existing_modules = config['STEPS'] = module.name + ini_config.setdefault(f"{module_type}s", []).append(module.name) + + else: + # load all modules, they're not using the 'simple' mode + self.add_module_args(available_modules(with_manifest=True), parser) + + parser.set_defaults(**ini_config) - - config.update(self.config.__dict__) # reload the parser with the new arguments, now that we have them - self.config, unknown = self.parser.parse_known_args(config) - logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + self.config, unknown = parser.parse_known_args(unused_args) + if unknown: + logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + logger.info(f"Storing configuration file to {basic_config.config_file}") + store_config(ini_config, basic_config.config_file) breakpoint() - - logger.info(f"FEEDER: {self.config.feeders}") logger.info(f"ENRICHERS: {self.config.enrichers}") logger.info(f"ARCHIVERS: {self.config.archivers}") logger.info(f"DATABASES: {self.config.databases}") logger.info(f"STORAGES: {self.config.storages}") logger.info(f"FORMATTER: {self.formatter.name}") - + + def add_steps_args(self, parser: argparse.ArgumentParser = None): + if not parser: + parser = self.parser + parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='enrichers', 
nargs='+', required=True, help='the enrichers to use') + parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use') + parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use') + parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use') + parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use') + + def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): + + if not modules: + modules = available_modules(with_manifest=True) + + for module in modules: + if not module.configs: + # this module has no configs, don't show anything in the help + # (TODO: do we want to show something about this module though, like a description?) + continue + + group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") + for name, kwargs in module.configs.items(): + # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set + # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something + kwargs.pop('cli_set', None) + kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" + kwargs['type'] = type(kwargs.get('type', 'str')) + group.add_argument(f"--{module.name}.{name}", **kwargs) + + def show_help(self): + # for the help message, we want to load *all* possible modules and show the help + # add configs as arg parser arguments + + self.add_steps_args(self.basic_parser) + self.add_module_args(parser=self.basic_parser) + + self.basic_parser.print_help() + exit() def run(self) -> None: - self.setup_parser() + self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) # load the config file to get the list of enabled items - self.config, _ = 
self.parser.parse_known_args() + basic_config, unused_args = self.basic_parser.parse_known_args() # if help flag was called, then show the help - if self.config.help: + if basic_config.help: self.show_help() + # load the config file - config = {} + ini_config = {} try: - config = read_yaml(self.config.config_file) + ini_config = read_config(basic_config.config_file) except FileNotFoundError: - if self.config.config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + if basic_config.config_file != DEFAULT_CONFIG_FILE: + logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") exit() - self.setup_config(config) + self.setup_complete_parser(basic_config, ini_config, unused_args) - breakpoint() config.parse() diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 673399e..d9d0669 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -30,7 +30,7 @@ the broader archiving framework. custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). """, - 'config': { + 'configs': { "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, "subtitles": {"default": True, "help": "download subtitles if available"}, "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, @@ -40,7 +40,7 @@ via the command line using the `--dropins` option (TODO!). 
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, - "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, + "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, } } \ No newline at end of file diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py index f4eb2b9..b415679 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -13,7 +13,7 @@ }, "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will 
need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, From b6b085854c0f417101bafd019c8f66949883fe6c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 17:40:51 +0100 Subject: [PATCH 008/110] Switch back to using yaml with dot notation (two simple helper functions to convert between dot and dict notation) --- src/auto_archiver/core/config.py | 64 ++++++++++++++++++++------ src/auto_archiver/core/loader.py | 2 +- src/auto_archiver/core/orchestrator.py | 50 ++++++++++---------- 3 files changed, 77 insertions(+), 39 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index db5b6d2..9709be6 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -4,10 +4,13 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow flexible setup in various environments. 
""" -import argparse -from configparser import ConfigParser -from dataclasses import dataclass, field +import argparse +import yaml +from dataclasses import dataclass, field +from collections import OrderedDict + +from .loader import MODULE_TYPES # configurable_parents = [ # Feeder, @@ -47,21 +50,56 @@ from dataclasses import dataclass, field # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') # parser.add_argument('--version', action='version', version=__version__) - +EMPTY_CONFIG = { + "steps": dict((f"{module_type}s", []) for module_type in MODULE_TYPES) +} class LoadFromFile (argparse.Action): def __call__ (self, parser, namespace, values, option_string = None): with values as f: # parse arguments in the file and store them in the target namespace parser.parse_args(f.read().split(), namespace) -def read_config(config_filename: str) -> dict: - config = ConfigParser() - config.read(config_filename) - # setup basic format - if 'STEPS' not in config.sections(): - config.add_section("STEPS") +def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: + dotdict = {} + for step, vals in yaml_conf.pop('steps', {}).items(): + if vals: + dotdict[f"{step}s"] = vals + + def process_subdict(subdict, prefix=""): + for key, value in subdict.items(): + if type(value) == dict: + process_subdict(value, f"{prefix}{key}.") + else: + dotdict[f"{prefix}{key}"] = value + + process_subdict(yaml_conf) + return dotdict + +def merge_dicts(dotdict, yaml_dict): + def process_subdict(subdict, prefix=""): + for key, value in subdict.items(): + if "." 
in key: + keys = key.split(".") + subdict = yaml_dict + for k in keys[:-1]: + subdict = subdict.setdefault(k, {}) + subdict[keys[-1]] = value + else: + yaml_dict[key] = value + + process_subdict(dotdict) + return yaml_dict + +def read_yaml(yaml_filename: str) -> dict: + + try: + with open(yaml_filename, "r", encoding="utf-8") as inf: + config = yaml.safe_load(inf) + except FileNotFoundError: + config = EMPTY_CONFIG + return config -def store_config(config: ConfigParser, config_filename: str): - with open(config_filename, "w", encoding="utf-8") as outf: - config.write(outf) \ No newline at end of file +def store_yaml(config: dict, yaml_filename: str): + with open(yaml_filename, "w", encoding="utf-8") as outf: + yaml.dump(config, outf, default_flow_style=False) \ No newline at end of file diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index d39f31e..4460349 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -62,7 +62,7 @@ class Module: def load_manifest(module_path): - print(f"Loading manifest for module {module_path}") + # print(f"Loading manifest for module {module_path}") # load the manifest file manifest = copy.deepcopy(_DEFAULT_MANIFEST) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 0a2273f..f8df659 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -9,7 +9,6 @@ from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address import argparse -import configparser import os from os.path import join, dirname @@ -25,7 +24,7 @@ from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata from ..version import __version__ -from .config import read_config, store_config +from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG from .loader import available_modules, Module, MODULE_TYPES import 
tempfile, traceback @@ -69,24 +68,23 @@ class ArchivingOrchestrator: parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file') self.basic_parser = parser - def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None: + def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: parser = argparse.ArgumentParser( parents = [self.basic_parser], add_help=False, ) - + self.add_steps_args(parser) + breakpoint() # check what mode we're in # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules - if ini_config: + if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config + # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? enabled_modules = [] for module_type in MODULE_TYPES: - try: - enabled_modules.extend(ini_config.get("STEPS", module_type)) - except configparser.NoOptionError: - pass + enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' for module_type in MODULE_TYPES: @@ -100,23 +98,25 @@ class ArchivingOrchestrator: # add them to the config for module in simple_modules: for module_type in module.type: - existing_modules = config['STEPS'] = module.name - ini_config.setdefault(f"{module_type}s", []).append(module.name) - + yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - - parser.set_defaults(**ini_config) + + breakpoint() + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the 
parser with the new arguments, now that we have them - self.config, unknown = parser.parse_known_args(unused_args) + parsed, unknown = parser.parse_known_args(unused_args) if unknown: - logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") + + # merge the new config with the old one + yaml_config = merge_dicts(vars(parsed), yaml_config) if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_config(ini_config, basic_config.config_file) + store_yaml(yaml_config, basic_config.config_file) breakpoint() logger.info(f"FEEDER: {self.config.feeders}") logger.info(f"ENRICHERS: {self.config.enrichers}") @@ -179,16 +179,16 @@ class ArchivingOrchestrator: self.show_help() # load the config file - ini_config = {} + yaml_config = {} - try: - ini_config = read_config(basic_config.config_file) - except FileNotFoundError: - if basic_config.config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") - exit() + if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE: + logger.error(f"The configuration file {basic_config.config_file} was not found. 
Make sure the file exists and try again, or run without the --config file to use the default settings.") + exit() - self.setup_complete_parser(basic_config, ini_config, unused_args) + yaml_config = read_yaml(basic_config.config_file) + + + self.setup_complete_parser(basic_config, yaml_config, unused_args) config.parse() From ade5ea0f6f5f8715ea22aa8df32664e905e52b6a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 18:45:58 +0100 Subject: [PATCH 009/110] Tidy up imports + start on loading modules - program now starts much faster --- src/auto_archiver/__init__.py | 6 --- src/auto_archiver/__main__.py | 3 +- src/auto_archiver/core/__init__.py | 4 -- src/auto_archiver/core/config.py | 3 -- src/auto_archiver/core/loader.py | 60 +++++++++++++++++++++-- src/auto_archiver/core/media.py | 9 ++-- src/auto_archiver/core/orchestrator.py | 61 +++++++++++++----------- src/auto_archiver/databases/__init__.py | 8 +--- src/auto_archiver/enrichers/__init__.py | 12 ----- src/auto_archiver/feeders/__init__.py | 4 -- src/auto_archiver/formatters/__init__.py | 3 -- src/auto_archiver/storages/__init__.py | 7 +-- 12 files changed, 97 insertions(+), 83 deletions(-) delete mode 100644 src/auto_archiver/__init__.py diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py deleted file mode 100644 index 307716d..0000000 --- a/src/auto_archiver/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core - -# need to manually specify due to cyclical deps -from .core.orchestrator import ArchivingOrchestrator -# making accessible directly -from .core.metadata import Metadata diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 8b2a65a..d31ec5c 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,6 +1,5 @@ """ Entry point for the auto_archiver package. """ -from . 
import ArchivingOrchestrator - +from auto_archiver.core.orchestrator import ArchivingOrchestrator def main(): ArchivingOrchestrator().run() diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index b78df83..779d3ac 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,10 +1,6 @@ """ Core modules to handle things such as orchestration, metadata and configs.. """ -from .metadata import Metadata -from .media import Media -from .step import Step -from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 9709be6..f5d9fae 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -61,9 +61,6 @@ class LoadFromFile (argparse.Action): def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: dotdict = {} - for step, vals in yaml_conf.pop('steps', {}).items(): - if vals: - dotdict[f"{step}s"] = vals def process_subdict(subdict, prefix=""): for key, value in subdict.items(): diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 4460349..aa03b1f 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -4,12 +4,14 @@ import os import copy from os.path import join, dirname from typing import List - +from loguru import logger +import sys +import shutil MODULE_TYPES = [ 'feeder', 'enricher', - 'archiver', + 'extractor', 'database', 'storage', 'formatter' @@ -59,7 +61,44 @@ class Module: def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" +def load_modules(modules): + modules = available_modules(limit_to_modules=modules, with_manifest=True) + for module in modules: + _load_module(module) +def _load_module(module): + # first make sure that the 'depends' are installed and available in sys.args + for dependency 
in module.depends: + if dependency not in sys.modules: + logger.error(f""" + Module {module.name} depends on {dependency} which is not available. + + Have you set up the '{module.name}' module correctly? See the README for more information. + """) + exit() + # then check the external dependencies, these are binary dependencies that should be available on the path + for dep_type, deps in module.external_dependencies.items(): + if dep_type == 'python': + for dep in deps: + if dep not in sys.modules: + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. + """) + + elif dep_type == 'binary': + for dep in deps: + if not shutil.which(dep): + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. + """) + # finally, load the module + logger.info(f"Loading module {module.display_name}") + module = __import__(module.entry_point, fromlist=[module.entry_point]) + logger.info(f"Module {module.display_name} loaded") def load_manifest(module_path): # print(f"Loading manifest for module {module_path}") @@ -70,7 +109,7 @@ def load_manifest(module_path): manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]: +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]: # search through all valid 'modules' paths. 
Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -83,7 +122,16 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals for module_folder in default_path + additional_paths: # walk through each module in module_folder and check if it has a valid manifest - for possible_module in os.listdir(module_folder): + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") + continue + + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue + possible_module_path = join(module_folder, possible_module) if not is_really_module(possible_module_path): continue @@ -93,5 +141,9 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals else: manifest = {} all_modules.append(Module(possible_module, possible_module_path, manifest)) + + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index d204a6e..e5026af 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,9 +11,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -import ffmpeg -from ffmpeg._run import Error - from .context import ArchivingContext from loguru import logger @@ -106,6 +103,12 @@ class Media: return self.mimetype.startswith("image") def is_valid_video(self) -> bool: + # Note: this is intentional, to only import ffmpeg here - when the method is called + # this speeds up loading the module. 
We check that 'ffmpeg' is available on startup + # when we load each manifest file + import ffmpeg + from ffmpeg._run import Error + # checks for video streams with ffmpeg, or min file size for a video # self.is_video() should be used together with this method try: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f8df659..ee3a190 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -16,16 +16,10 @@ from rich_argparse import RichHelpFormatter from .context import ArchivingContext -from ..archivers import Archiver -from ..feeders import Feeder -from ..formatters import Formatter -from ..storages import Storage -from ..enrichers import Enricher -from ..databases import Database from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES +from .loader import available_modules, Module, MODULE_TYPES, load_modules import tempfile, traceback from loguru import logger @@ -74,7 +68,7 @@ class ArchivingOrchestrator: add_help=False, ) self.add_steps_args(parser) - breakpoint() + # check what mode we're in # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False @@ -91,7 +85,7 @@ class ArchivingOrchestrator: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser) + self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -103,7 +97,7 @@ class ArchivingOrchestrator: # load all modules, they're not 
using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - breakpoint() + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -114,27 +108,30 @@ class ArchivingOrchestrator: # merge the new config with the old one yaml_config = merge_dicts(vars(parsed), yaml_config) - if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): logger.info(f"Storing configuration file to {basic_config.config_file}") store_yaml(yaml_config, basic_config.config_file) - breakpoint() - logger.info(f"FEEDER: {self.config.feeders}") - logger.info(f"ENRICHERS: {self.config.enrichers}") - logger.info(f"ARCHIVERS: {self.config.archivers}") - logger.info(f"DATABASES: {self.config.databases}") - logger.info(f"STORAGES: {self.config.storages}") - logger.info(f"FORMATTER: {self.formatter.name}") + + self.config = yaml_config + + logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders'])) + logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors'])) + logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers'])) + logger.info("DATABASES: " + ", ".join(self.config['steps']['databases'])) + logger.info("STORAGES: " + ", ".join(self.config['steps']['storages'])) + logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters'])) + return self.config def add_steps_args(self, parser: argparse.ArgumentParser = None): if not parser: parser = self.parser - parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use') - parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use') - parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the 
archivers to use') - parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use') - parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use') - parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use') + parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use') + parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use') + parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use') + parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use') + parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use') def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): @@ -165,6 +162,12 @@ class ArchivingOrchestrator: self.basic_parser.print_help() exit() + + def install_modules(self): + modules = set() + [modules.update(*m) for m in self.config['steps'].values()] + + load_modules(modules) def run(self) -> None: self.setup_basic_parser() @@ -187,11 +190,10 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - + breakpoint() self.setup_complete_parser(basic_config, yaml_config, unused_args) - config.parse() - + self.install_modules() for item in self.feed(): pass @@ -201,8 +203,9 @@ class ArchivingOrchestrator: for a in self.all_archivers_for_setup(): a.cleanup() def feed(self) -> Generator[Metadata]: - for item in self.feeder: - yield 
self.feed_item(item) + for feeder in self.config['steps']['feeders']: + for item in feeder: + yield self.feed_item(item) self.cleanup() def feed_item(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index 4c73896..5aaa679 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -1,10 +1,4 @@ """ Databases are used to store the outputs from running the Autp Archiver. -""" -from .database import Database -from .gsheet_db import GsheetsDb -from .console_db import ConsoleDb -from .csv_db import CSVDb -from .api_db import AAApiDb -from .atlos_db import AtlosDb \ No newline at end of file +""" \ No newline at end of file diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py index 64ce248..67cb0e5 100644 --- a/src/auto_archiver/enrichers/__init__.py +++ b/src/auto_archiver/enrichers/__init__.py @@ -10,15 +10,3 @@ Enrichers are optional but highly useful for making the archived data more power """ -from .enricher import Enricher -from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackArchiverEnricher -from .hash_enricher import HashEnricher -from .thumbnail_enricher import ThumbnailEnricher -from .wacz_enricher import WaczArchiverEnricher -from .whisper_enricher import WhisperEnricher -from .pdq_hash_enricher import PdqHashEnricher -from .metadata_enricher import MetadataEnricher -from .meta_enricher import MetaEnricher -from .ssl_enricher import SSLEnricher -from .timestamping_enricher import TimestampingEnricher \ No newline at end of file diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py index 8117672..3eb33d7 100644 --- a/src/auto_archiver/feeders/__init__.py +++ b/src/auto_archiver/feeders/__init__.py @@ -1,7 +1,3 @@ """ Feeders handle the input of media into the Auto Archiver. 
""" -from.feeder import Feeder -from .gsheet_feeder import GsheetsFeeder -from .cli_feeder import CLIFeeder -from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py index af96f15..1a9dcd0 100644 --- a/src/auto_archiver/formatters/__init__.py +++ b/src/auto_archiver/formatters/__init__.py @@ -1,4 +1 @@ """ Formatters for the output of the content. """ -from .formatter import Formatter -from .html_formatter import HtmlFormatter -from .mute_formatter import MuteFormatter \ No newline at end of file diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py index bff83e6..0765833 100644 --- a/src/auto_archiver/storages/__init__.py +++ b/src/auto_archiver/storages/__init__.py @@ -1,8 +1,3 @@ """ This module contains the storage classes for the auto-archiver. -""" -from .storage import Storage -from .s3 import S3Storage -from .local import LocalStorage -from .gd import GDriveStorage -from .atlos import AtlosStorage \ No newline at end of file +""" \ No newline at end of file From 99c8c690852a907db91fdf256451a7c348c69f69 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 22 Jan 2025 18:18:13 +0000 Subject: [PATCH 010/110] Manifests for databases --- src/auto_archiver/databases/__init__.py | 10 +- .../databases/api_db/__init__.py | 0 .../databases/api_db/__manifest__.py | 33 ++++++ src/auto_archiver/databases/api_db/api_db.py | 70 +++++++++++ .../databases/atlos_db/__init__.py | 0 .../databases/atlos_db/__manifest__.py | 26 ++++ .../databases/atlos_db/atlos_db.py | 79 ++++++++++++ .../databases/console_db/__init__.py | 0 .../databases/console_db/__manifest__.py | 22 ++++ .../databases/console_db/console_db.py | 32 +++++ .../databases/csv_db/__init__.py | 0 .../databases/csv_db/__manifest__.py | 22 ++++ src/auto_archiver/databases/csv_db/csv_db.py | 34 ++++++ .../databases/gsheet_db/__init__.py | 0 
.../databases/gsheet_db/__manifest__.py | 21 ++++ .../databases/gsheet_db/gsheet_db.py | 112 ++++++++++++++++++ .../instagram_api_archiver/__manifest__.py | 1 - .../instagram_archiver/__manifest__.py | 1 - .../instagram_tbot_archiver/__manifest__.py | 1 - .../modules/telegram_archiver/__manifest__.py | 1 - .../modules/telethon_archiver/__manifest__.py | 1 - .../twitter_api_archiver/__manifest__.py | 1 - 22 files changed, 456 insertions(+), 11 deletions(-) create mode 100644 src/auto_archiver/databases/api_db/__init__.py create mode 100644 src/auto_archiver/databases/api_db/__manifest__.py create mode 100644 src/auto_archiver/databases/api_db/api_db.py create mode 100644 src/auto_archiver/databases/atlos_db/__init__.py create mode 100644 src/auto_archiver/databases/atlos_db/__manifest__.py create mode 100644 src/auto_archiver/databases/atlos_db/atlos_db.py create mode 100644 src/auto_archiver/databases/console_db/__init__.py create mode 100644 src/auto_archiver/databases/console_db/__manifest__.py create mode 100644 src/auto_archiver/databases/console_db/console_db.py create mode 100644 src/auto_archiver/databases/csv_db/__init__.py create mode 100644 src/auto_archiver/databases/csv_db/__manifest__.py create mode 100644 src/auto_archiver/databases/csv_db/csv_db.py create mode 100644 src/auto_archiver/databases/gsheet_db/__init__.py create mode 100644 src/auto_archiver/databases/gsheet_db/__manifest__.py create mode 100644 src/auto_archiver/databases/gsheet_db/gsheet_db.py diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index 4c73896..d6de470 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -3,8 +3,8 @@ """ from .database import Database -from .gsheet_db import GsheetsDb -from .console_db import ConsoleDb -from .csv_db import CSVDb -from .api_db import AAApiDb -from .atlos_db import AtlosDb \ No newline at end of file +from .gsheet_db.gsheet_db import GsheetsDb +from 
.console_db.console_db import ConsoleDb +from .csv_db.csv_db import CSVDb +from .api_db.api_db import AAApiDb +from .atlos_db.atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/databases/api_db/__init__.py b/src/auto_archiver/databases/api_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/api_db/__manifest__.py b/src/auto_archiver/databases/api_db/__manifest__.py new file mode 100644 index 0000000..a55f26c --- /dev/null +++ b/src/auto_archiver/databases/api_db/__manifest__.py @@ -0,0 +1,33 @@ +{ + "name": "Auto-Archiver API Database", + "type": ["database"], + "entry_point": "api_db:AAApiDb", + "requires_setup": True, + "external_dependencies": { + "python": ["requests", + "loguru"], + }, + "configs": { + "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, + "api_token": {"default": None, "help": "API Bearer token."}, + "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, + "author_id": {"default": None, "help": "which email to assign as author"}, + "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, + "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, + "store_results": {"default": True, "help": "when set, will send the results to the API database."}, + "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, + }, + "description": """ + Provides integration with the Auto-Archiver API for querying and storing archival data. + +### Features +- **API Integration**: Supports querying for existing archives and submitting results. +- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled. 
+- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions. +- **Tagging and Metadata**: Adds tags and manages metadata for archives. +- **Optional Storage**: Archives results conditionally based on configuration. + +### Setup +Requires access to an Auto-Archiver API instance and a valid API token. + """, +} diff --git a/src/auto_archiver/databases/api_db/api_db.py b/src/auto_archiver/databases/api_db/api_db.py new file mode 100644 index 0000000..84bdfcb --- /dev/null +++ b/src/auto_archiver/databases/api_db/api_db.py @@ -0,0 +1,70 @@ +from typing import Union +import requests, os +from loguru import logger + +from .. import Database +from ...core import Metadata + + +class AAApiDb(Database): + """ + Connects to auto-archiver-api instance + """ + name = "auto_archiver_api_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.allow_rearchive = bool(self.allow_rearchive) + self.store_results = bool(self.store_results) + self.assert_valid_string("api_endpoint") + + @staticmethod + def configs() -> dict: + return { + "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, + "api_token": {"default": None, "help": "API Bearer token."}, + "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, + "author_id": {"default": None, "help": "which email to assign as author"}, + "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, + "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, + "store_results": {"default": True, "help": "when set, will send the results to the API database."}, + "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: 
set(cli_val.split(","))}, + } + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """ query the database for the existence of this item. + Helps avoid re-archiving the same URL multiple times. + """ + if not self.allow_rearchive: return + + params = {"url": item.get_url(), "limit": 15} + headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} + response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) + + if response.status_code == 200: + if len(response.json()): + logger.success(f"API returned {len(response.json())} previously archived instance(s)") + fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()] + return Metadata.choose_most_complete(fetched_metadata) + else: + logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") + return False + + + def done(self, item: Metadata, cached: bool=False) -> None: + """archival result ready - should be saved to DB""" + if not self.store_results: return + if cached: + logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") + return + logger.debug(f"saving archive of {item.get_url()} to the AA API.") + + payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} + headers = {"Authorization": f"Bearer {self.api_token}"} + response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers) + + if response.status_code == 200: + logger.success(f"AA API: {response.json()}") + else: + logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") + diff --git a/src/auto_archiver/databases/atlos_db/__init__.py b/src/auto_archiver/databases/atlos_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/atlos_db/__manifest__.py b/src/auto_archiver/databases/atlos_db/__manifest__.py new file 
mode 100644 index 0000000..1e2b676 --- /dev/null +++ b/src/auto_archiver/databases/atlos_db/__manifest__.py @@ -0,0 +1,26 @@ +{ + "name": "Atlos Database", + "type": ["database"], + "entry_point": "atlos_db:AtlosDb", + "requires_setup": True, + "external_dependencies": + {"python": ["loguru", + ""], + "bin": [""]}, + "configs": {}, + "description": """ +Handles integration with the Atlos platform for managing archival results. + +### Features +- Outputs archival results to the Atlos API for storage and tracking. +- Updates failure status with error details when archiving fails. +- Processes and formats metadata, including ISO formatting for datetime fields. +- Skips processing for items without an Atlos ID. + +### Setup +Required configs: +- atlos_url: Base URL for the Atlos API. +- api_token: Authentication token for API access. +""" +, +} diff --git a/src/auto_archiver/databases/atlos_db/atlos_db.py b/src/auto_archiver/databases/atlos_db/atlos_db.py new file mode 100644 index 0000000..4a00b9d --- /dev/null +++ b/src/auto_archiver/databases/atlos_db/atlos_db.py @@ -0,0 +1,79 @@ +import os +from typing import Union +from loguru import logger +from csv import DictWriter +from dataclasses import asdict +import requests + +from .. 
import Database +from ...core import Metadata +from ...utils import get_atlos_config_options + + +class AtlosDb(Database): + """ + Outputs results to Atlos + """ + + name = "atlos_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return get_atlos_config_options() + + def failed(self, item: Metadata, reason: str) -> None: + """Update DB accordingly for failure""" + # If the item has no Atlos ID, there's nothing for us to do + if not item.metadata.get("atlos_id"): + logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + return + + requests.post( + f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", + headers={"Authorization": f"Bearer {self.api_token}"}, + json={"metadata": {"processed": True, "status": "error", "error": reason}}, + ).raise_for_status() + logger.info( + f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" + ) + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check and fetch if the given item has been archived already, each + database should handle its own caching, and configuration mechanisms""" + return False + + def _process_metadata(self, item: Metadata) -> dict: + """Process metadata for storage on Atlos. 
Will convert any datetime + objects to ISO format.""" + + return { + k: v.isoformat() if hasattr(v, "isoformat") else v + for k, v in item.metadata.items() + } + + def done(self, item: Metadata, cached: bool = False) -> None: + """archival result ready - should be saved to DB""" + + if not item.metadata.get("atlos_id"): + logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + return + + requests.post( + f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", + headers={"Authorization": f"Bearer {self.api_token}"}, + json={ + "metadata": dict( + processed=True, + status="success", + results=self._process_metadata(item), + ) + }, + ).raise_for_status() + + logger.info( + f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" + ) diff --git a/src/auto_archiver/databases/console_db/__init__.py b/src/auto_archiver/databases/console_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/console_db/__manifest__.py b/src/auto_archiver/databases/console_db/__manifest__.py new file mode 100644 index 0000000..cd40496 --- /dev/null +++ b/src/auto_archiver/databases/console_db/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "Console Database", + "type": ["database"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "description": """ +Provides a simple database implementation that outputs archival results and status updates to the console. + +### Features +- Logs the status of archival tasks directly to the console, including: + - started + - failed (with error details) + - aborted + - done (with optional caching status) +- Useful for debugging or lightweight setups where no external database is required. + +### Setup +No additional configuration is required. 
+""", +} diff --git a/src/auto_archiver/databases/console_db/console_db.py b/src/auto_archiver/databases/console_db/console_db.py new file mode 100644 index 0000000..a5e648b --- /dev/null +++ b/src/auto_archiver/databases/console_db/console_db.py @@ -0,0 +1,32 @@ +from loguru import logger + +from .. import Database +from ...core import Metadata + + +class ConsoleDb(Database): + """ + Outputs results to the console + """ + name = "console_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + + def failed(self, item: Metadata, reason:str) -> None: + logger.error(f"FAILED {item}: {reason}") + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + + def done(self, item: Metadata, cached: bool=False) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item}") \ No newline at end of file diff --git a/src/auto_archiver/databases/csv_db/__init__.py b/src/auto_archiver/databases/csv_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/csv_db/__manifest__.py b/src/auto_archiver/databases/csv_db/__manifest__.py new file mode 100644 index 0000000..1fe2d7d --- /dev/null +++ b/src/auto_archiver/databases/csv_db/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "csv_db", + "type": ["database"], + "requires_setup": False, + "external_dependencies": {"python": ["loguru"] + }, + "configs": { + "csv_file": {"default": "db.csv", "help": "CSV file name"} + }, + "description": """ +Handles exporting archival results to a CSV file. + +### Features +- Saves archival metadata as rows in a CSV file. +- Automatically creates the CSV file with a header if it does not exist. +- Appends new metadata entries to the existing file. 
+ +### Setup +Required config: +- csv_file: Path to the CSV file where results will be stored (default: "db.csv"). +""", +} diff --git a/src/auto_archiver/databases/csv_db/csv_db.py b/src/auto_archiver/databases/csv_db/csv_db.py new file mode 100644 index 0000000..e24306f --- /dev/null +++ b/src/auto_archiver/databases/csv_db/csv_db.py @@ -0,0 +1,34 @@ +import os +from loguru import logger +from csv import DictWriter +from dataclasses import asdict + +from .. import Database +from ...core import Metadata + + +class CSVDb(Database): + """ + Outputs results to a CSV file + """ + name = "csv_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.assert_valid_string("csv_file") + + @staticmethod + def configs() -> dict: + return { + "csv_file": {"default": "db.csv", "help": "CSV file name"} + } + + def done(self, item: Metadata, cached: bool=False) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item}") + is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 + with open(self.csv_file, "a", encoding="utf-8") as outf: + writer = DictWriter(outf, fieldnames=asdict(Metadata())) + if is_empty: writer.writeheader() + writer.writerow(asdict(item)) diff --git a/src/auto_archiver/databases/gsheet_db/__init__.py b/src/auto_archiver/databases/gsheet_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/gsheet_db/__manifest__.py b/src/auto_archiver/databases/gsheet_db/__manifest__.py new file mode 100644 index 0000000..f4db93b --- /dev/null +++ b/src/auto_archiver/databases/gsheet_db/__manifest__.py @@ -0,0 +1,21 @@ +# TODO merge with feeder manifest? +{ + "name": "gsheet_db", + "type": ["database"], + "requires_setup": True, + "external_dependencies": {"python": [" loguru"], + }, + "description": """ +Handles integration with Google Sheets for tracking archival tasks. 
+ +### Features +- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. +- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. +- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. +- Skips redundant updates for empty or invalid data fields. + +### Notes +- Currently works only with metadata provided by GsheetFeeder. +- Requires configuration of a linked Google Sheet and appropriate API credentials. +""", +} diff --git a/src/auto_archiver/databases/gsheet_db/gsheet_db.py b/src/auto_archiver/databases/gsheet_db/gsheet_db.py new file mode 100644 index 0000000..631a554 --- /dev/null +++ b/src/auto_archiver/databases/gsheet_db/gsheet_db.py @@ -0,0 +1,112 @@ +from typing import Union, Tuple +import datetime +from urllib.parse import quote + +from loguru import logger + +from .. import Database +from ...core import Metadata, Media, ArchivingContext +from ...utils import GWorksheet + + +class GsheetsDb(Database): + """ + NB: only works if GsheetFeeder is used. 
+ could be updated in the future to support non-GsheetFeeder metadata + """ + name = "gsheet_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, 'status', 'Archive in progress') + + def failed(self, item: Metadata, reason:str) -> None: + logger.error(f"FAILED {item}") + self._safe_status_update(item, f'Archive failed {reason}') + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + self._safe_status_update(item, '') + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + return False + + def done(self, item: Metadata, cached: bool=False) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item.get_url()}") + gw, row = self._retrieve_gsheet(item) + # self._safe_status_update(item, 'done') + + cell_updates = [] + row_values = gw.get_row(row) + + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + try: + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + cell_updates.append((row, col, final_value)) + except Exception as e: + logger.error(f"Unable to batch {col}={final_value} due to {e}") + status_message = item.status + if cached: + status_message = f"[cached] {status_message}" + cell_updates.append((row, 'status', status_message)) + + media: Media = item.get_final_media() + if hasattr(media, "urls"): + batch_if_valid('archive', "\n".join(media.urls)) + batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) + batch_if_valid('title', item.get_title()) + batch_if_valid('text', item.get("content", "")) + batch_if_valid('timestamp', item.get_timestamp()) + if 
media: batch_if_valid('hash', media.get("hash", "not-calculated")) + + # merge all pdq hashes into a single string, if present + pdq_hashes = [] + all_media = item.get_all_media() + for m in all_media: + if pdq := m.get("pdq_hash"): + pdq_hashes.append(pdq) + if len(pdq_hashes): + batch_if_valid('pdq_hash', ",".join(pdq_hashes)) + + if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): + batch_if_valid('screenshot', "\n".join(screenshot.urls)) + + if (thumbnail := item.get_first_image("thumbnail")): + if hasattr(thumbnail, "urls"): + batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') + + if (browsertrix := item.get_media_by_id("browsertrix")): + batch_if_valid('wacz', "\n".join(browsertrix.urls)) + batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) + + gw.batch_set_cell(cell_updates) + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: + try: + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, 'status', new_status) + except Exception as e: + logger.debug(f"Unable to update sheet: {e}") + + def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now + if gsheet := ArchivingContext.get("gsheet"): + gw: GWorksheet = gsheet.get("worksheet") + row: int = gsheet.get("row") + elif self.sheet_id: + print(self.sheet_id) + + + return gw, row diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py index 2bb3f67..fb12dc2 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py @@ -2,7 +2,6 @@ "name": 
"Instagram API Archiver", "type": ["extractor"], "entry_point": "instagram_api_archiver:InstagramApiArchiver", - "depends": ["core"], "external_dependencies": {"python": ["requests", "loguru", diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py index bd63ab4..bb143b3 100644 --- a/src/auto_archiver/modules/instagram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py @@ -2,7 +2,6 @@ "name": "Instagram Archiver", "type": ["extractor"], "entry_point": "instagram_archiver:InstagramArchiver", - "depends": ["core"], "external_dependencies": { "python": ["instaloader", "loguru",], diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py index cadb729..6e934b0 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py @@ -2,7 +2,6 @@ "name": "Instagram Telegram Bot Archiver", "type": ["extractor"], "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver", - "depends": ["core", "utils"], "external_dependencies": {"python": ["loguru", "telethon",], }, diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_archiver/__manifest__.py index b56477a..f3950b5 100644 --- a/src/auto_archiver/modules/telegram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telegram_archiver/__manifest__.py @@ -3,7 +3,6 @@ "type": ["extractor"], "entry_point": "telegram_archiver:TelegramArchiver", "requires_setup": False, - "depends": ["core"], "external_dependencies": { "python": [ "requests", diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py index 82d56ba..e7359d7 100644 --- a/src/auto_archiver/modules/telethon_archiver/__manifest__.py +++ 
b/src/auto_archiver/modules/telethon_archiver/__manifest__.py @@ -4,7 +4,6 @@ "type": ["extractor"], "entry_point": "telethon_archiver:TelethonArchiver", "requires_setup": True, - "depends": [""], "external_dependencies": { "python": ["telethon", "loguru", diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py index b415679..203eee9 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -3,7 +3,6 @@ "type": ["extractor"], "entry_point": "twitter_api_archiver:TwitterApiArchiver", "requires_setup": True, - "depends": ["core"], "external_dependencies": { "python": ["requests", "loguru", From 550097ab7b927cb90e6ff46e2b35a7e07a319c99 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 23:54:21 +0100 Subject: [PATCH 011/110] Get module loading working properly --- src/auto_archiver/archivers/archiver.py | 12 +-- src/auto_archiver/core/__init__.py | 4 + src/auto_archiver/core/loader.py | 96 +++++++++---------- src/auto_archiver/core/orchestrator.py | 59 ++++++++---- src/auto_archiver/core/step.py | 43 +-------- .../modules/generic_extractor/__init__.py | 1 + .../modules/generic_extractor/__manifest__.py | 5 +- .../generic_extractor/generic_extractor.py | 11 --- 8 files changed, 100 insertions(+), 131 deletions(-) diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index 7ec699e..b5f3f40 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -19,22 +19,12 @@ from ..core import Metadata, Step, ArchivingContext @dataclass -class Archiver(Step): +class Archiver: """ Base class for implementing archivers in the media archiving framework. Subclasses must implement the `download` method to define platform-specific behavior. 
""" - name = "archiver" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - def init(name: str, config: dict) -> Archiver: - # only for typing... - return Step.init(name, config, Archiver) - def setup(self) -> None: # used when archivers need to login or do other one-time setup pass diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 779d3ac..b78df83 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,6 +1,10 @@ """ Core modules to handle things such as orchestration, metadata and configs.. """ +from .metadata import Metadata +from .media import Media +from .step import Step +from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index aa03b1f..f0a6ee7 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -1,5 +1,7 @@ import ast -from dataclasses import dataclass, field +from typing import Type +from importlib.util import find_spec +from dataclasses import dataclass import os import copy from os.path import join, dirname @@ -22,9 +24,8 @@ _DEFAULT_MANIFEST = { 'name': '', 'author': 'Bellingcat', 'requires_setup': True, - 'depends': [], 'description': '', - 'external_dependencies': {}, + 'dependencies': {}, 'entry_point': '', 'version': '1.0', 'configs': {} @@ -35,9 +36,7 @@ class Module: name: str display_name: str type: list - entry_point: str - depends: list - external_dependencies: dict + dependencies: dict requires_setup: bool configs: dict description: str @@ -51,54 +50,47 @@ class Module: if manifest: self.display_name = manifest['name'] self.type = manifest['type'] - self.entry_point = manifest['entry_point'] - self.depends = manifest['depends'] - self.external_dependencies = 
manifest['external_dependencies'] + self._entry_point = manifest['entry_point'] + self.dependencies = manifest['dependencies'] self.requires_setup = manifest['requires_setup'] self.configs = manifest['configs'] self.description = manifest['description'] + + @property + def entry_point(self): + if not self._entry_point: + # try to create the entry point from the module name + self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}" + return self._entry_point def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" -def load_modules(modules): - modules = available_modules(limit_to_modules=modules, with_manifest=True) - for module in modules: - _load_module(module) +def load_module(module: str) -> object: # TODO: change return type to Step + + # load a module by name + module = get_module(module) + if not module: + return None + # check external dependencies are installed + def check_deps(deps, check): + for dep in deps: + if not check(dep): + logger.error(f"Module '{module.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{module.name}' module? See the README for more information.") + exit(1) + + check_deps(module.dependencies.get('python', []), lambda dep: find_spec(dep)) + check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep)) + + qualname = f'auto_archiver.modules.{module.name}' + if qualname in sys.modules: + return + logger.info(f"Loading module '{module.display_name}'...") + loaded_module = __import__(qualname) + return getattr(sys.modules[qualname], module.entry_point)() -def _load_module(module): - # first make sure that the 'depends' are installed and available in sys.args - for dependency in module.depends: - if dependency not in sys.modules: - logger.error(f""" - Module {module.name} depends on {dependency} which is not available. - - Have you set up the '{module.name}' module correctly? 
See the README for more information. - """) - exit() - # then check the external dependencies, these are binary dependencies that should be available on the path - for dep_type, deps in module.external_dependencies.items(): - if dep_type == 'python': - for dep in deps: - if dep not in sys.modules: - logger.error(f""" - Module {module.name} requires {dep} which is not available. - - Have you installed the required dependencies for the '{module.name}' module? See the README for more information. - """) - elif dep_type == 'binary': - for dep in deps: - if not shutil.which(dep): - logger.error(f""" - Module {module.name} requires {dep} which is not available. - - Have you installed the required dependencies for the '{module.name}' module? See the README for more information. - """) # finally, load the module - logger.info(f"Loading module {module.display_name}") - module = __import__(module.entry_point, fromlist=[module.entry_point]) - logger.info(f"Module {module.display_name} loaded") def load_manifest(module_path): # print(f"Loading manifest for module {module_path}") @@ -109,7 +101,14 @@ def load_manifest(module_path): manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]: +def get_module(module_name): + # get a module by name + try: + return available_modules(limit_to_modules=[module_name], with_manifest=True, suppress_warnings=True)[0] + except IndexError: + return None + +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[Module]: # search through all valid 'modules' paths. 
Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -142,8 +141,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] manifest = {} all_modules.append(Module(possible_module, possible_module_path, manifest)) - for module in limit_to_modules: - if not any(module == m.name for m in all_modules): - logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?") + if not suppress_warnings: + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ee3a190..214c704 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,7 +19,7 @@ from .context import ArchivingContext from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES, load_modules +from .loader import available_modules, Module, MODULE_TYPES, load_module import tempfile, traceback from loguru import logger @@ -114,12 +114,6 @@ class ArchivingOrchestrator: self.config = yaml_config - logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders'])) - logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors'])) - logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers'])) - logger.info("DATABASES: " + ", ".join(self.config['steps']['databases'])) - logger.info("STORAGES: " + ", ".join(self.config['steps']['storages'])) - logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters'])) return self.config def add_steps_args(self, parser: argparse.ArgumentParser = None): @@ -164,10 +158,33 @@ class 
ArchivingOrchestrator: exit() def install_modules(self): - modules = set() - [modules.update(*m) for m in self.config['steps'].values()] + """ + Swaps out the previous 'strings' in the config with the actual modules + """ + + for module_type in MODULE_TYPES: + step_items = [] + modules_to_load = self.config['steps'][f"{module_type}s"] - load_modules(modules) + def check_steps_ok(): + if not len(step_items): + logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration file and try again. Tried to load the following modules, but none were available: {modules_to_load}") + exit() + + if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: + logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") + exit() + + for i, module in enumerate(modules_to_load): + loaded_module = load_module(module) + if loaded_module: + step_items.append(loaded_module) + check_steps_ok() + self.config['steps'][f"{module_type}s"] = step_items + + + assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again." 
+ self.config['steps'][f"{module_type}s"] = step_items def run(self) -> None: self.setup_basic_parser() @@ -190,17 +207,26 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - breakpoint() + self.setup_complete_parser(basic_config, yaml_config, unused_args) self.install_modules() + logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders'])) + logger.info("EXTRACTORS: " + ", ".join(m.name for m in self.config['steps']['extractors'])) + logger.info("ENRICHERS: " + ", ".join(m.name for m in self.config['steps']['enrichers'])) + logger.info("DATABASES: " + ", ".join(m.name for m in self.config['steps']['databases'])) + logger.info("STORAGES: " + ", ".join(m.name for m in self.config['steps']['storages'])) + logger.info("FORMATTERS: " + ", ".join(m.name for m in self.config['steps']['formatters'])) + for item in self.feed(): pass def cleanup(self)->None: logger.info("Cleaning up") - for a in self.all_archivers_for_setup(): a.cleanup() + for e in self.config['steps']['extractors']: + breakpoint() + e.cleanup() def feed(self) -> Generator[Metadata]: for feeder in self.config['steps']['feeders']: @@ -222,12 +248,12 @@ class ArchivingOrchestrator: except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit logger.warning(f"caught interrupt on {item=}") - for d in self.databases: d.aborted(item) + for d in self.config['steps']['databases']: d.aborted(item) self.cleanup() exit() except Exception as e: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') - for d in self.databases: + for d in self.config['steps']['databases']: if type(e) == AssertionError: d.failed(item, str(e)) else: d.failed(item) @@ -317,7 +343,4 @@ class ArchivingOrchestrator: assert ip.is_global, f"Invalid IP used" assert not ip.is_reserved, f"Invalid IP used" assert not ip.is_link_local, f"Invalid IP used" - assert not ip.is_private, f"Invalid IP used" - - def all_archivers_for_setup(self) -> 
List[Archiver]: - return self.archivers + [e for e in self.enrichers if isinstance(e, Archiver)] \ No newline at end of file + assert not ip.is_private, f"Invalid IP used" \ No newline at end of file diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py index 9f294fe..0c14381 100644 --- a/src/auto_archiver/core/step.py +++ b/src/auto_archiver/core/step.py @@ -5,44 +5,7 @@ by handling user configuration, validating the steps properties, and implementin """ from __future__ import annotations -from dataclasses import dataclass -from inspect import ClassFoundException -from typing import Type -from abc import ABC - -@dataclass -class Step(ABC): - name: str = None - - def __init__(self, config: dict) -> None: - # Initialises each step by reading the relevant entries - # reads the configs into object properties - # self.config = config[self.name] - for k, v in config.get(self.name, {}).items(): - self.__setattr__(k, v) - - @staticmethod - def configs() -> dict: return {} - - def init(name: str, config: dict, child: Type[Step]) -> Step: - """ - Attempts to instantiate a subclass of the provided `child` type - matching the given `name`. - Raises ClassFoundException if no matching subclass is found. - TODO: cannot find subclasses of child.subclasses - """ - for sub in child.__subclasses__(): - if sub.name == name: - return sub(config) - raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py") - - def assert_valid_string(self, prop: str) -> None: - """ - Receives a property name and ensures it exists and is a valid non-empty string, - raising an AssertionError if not. 
- TODO: replace assertions with custom exceptions - """ - assert hasattr(self, prop), f"property {prop} not found" - s = getattr(self, prop) - assert s is not None and type(s) == str and len(s) > 0, f"invalid property {prop} value '{s}', it should be a valid string" +class Step: + # TODO: try and get this name from the manifest, so we don't have to set it twice + name: str \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/__init__.py b/src/auto_archiver/modules/generic_extractor/__init__.py index e69de29..5bfcd01 100644 --- a/src/auto_archiver/modules/generic_extractor/__init__.py +++ b/src/auto_archiver/modules/generic_extractor/__init__.py @@ -0,0 +1 @@ +from .generic_extractor import GenericExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index d9d0669..6c5a9b8 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -3,10 +3,9 @@ 'version': '0.1.0', 'author': 'Bellingcat', 'type': ['extractor'], - 'entry_point': 'generic_extractor:GenericExtractor', + 'entry_point': 'GenericExtractor', # this class should be present in the __init__.py 'requires_setup': False, - 'depends': ['core'], - 'external_dependencies': { + 'dependencies': { 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], }, 'description': """ diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 1fd6a18..27fe157 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -12,17 +12,6 @@ class GenericExtractor(Archiver): name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} - def __init__(self, config: dict) -> None: - super().__init__(config) - 
self.subtitles = bool(self.subtitles) - self.comments = bool(self.comments) - self.livestreams = bool(self.livestreams) - self.live_from_start = bool(self.live_from_start) - self.end_means_success = bool(self.end_means_success) - self.allow_playlist = bool(self.allow_playlist) - self.max_downloads = self.max_downloads - - def suitable_extractors(self, url: str) -> list[str]: """ Returns a list of valid extractors for the given URL""" From 65ef46d01eb4c7094f12b345031ba8ff960fb121 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 23 Jan 2025 00:09:39 +0100 Subject: [PATCH 012/110] Fix loading already loaded modules - don't load them twice --- src/auto_archiver/core/loader.py | 13 +++++++---- src/auto_archiver/core/orchestrator.py | 23 +++++++++++-------- .../modules/generic_extractor/__manifest__.py | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index f0a6ee7..1ae9810 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -10,6 +10,8 @@ from loguru import logger import sys import shutil +_LOADED_MODULES = {} + MODULE_TYPES = [ 'feeder', 'enricher', @@ -68,6 +70,9 @@ class Module: def load_module(module: str) -> object: # TODO: change return type to Step + if module in _LOADED_MODULES: + return _LOADED_MODULES[module] + # load a module by name module = get_module(module) if not module: @@ -83,11 +88,11 @@ def load_module(module: str) -> object: # TODO: change return type to Step check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep)) qualname = f'auto_archiver.modules.{module.name}' - if qualname in sys.modules: - return + logger.info(f"Loading module '{module.display_name}'...") loaded_module = __import__(qualname) - return getattr(sys.modules[qualname], module.entry_point)() + _LOADED_MODULES[module.name] = getattr(sys.modules[qualname], module.entry_point)() + return _LOADED_MODULES[module.name] # finally, load 
the module @@ -144,6 +149,6 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] if not suppress_warnings: for module in limit_to_modules: if not any(module == m.name for m in all_modules): - logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?") + logger.warning(f"Module '{module}' not found in available modules. Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 214c704..1b4fee0 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -85,7 +85,7 @@ class ArchivingOrchestrator: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser) + self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules)), parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -98,6 +98,7 @@ class ArchivingOrchestrator: self.add_module_args(available_modules(with_manifest=True), parser) + breakpoint() parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -106,13 +107,13 @@ class ArchivingOrchestrator: logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") # merge the new config with the old one - yaml_config = merge_dicts(vars(parsed), yaml_config) + merged_yaml_config = merge_dicts(vars(parsed), yaml_config) - if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + if (merged_yaml_config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): 
logger.info(f"Storing configuration file to {basic_config.config_file}") store_yaml(yaml_config, basic_config.config_file) - self.config = yaml_config + self.config = merged_yaml_config return self.config @@ -120,12 +121,12 @@ class ArchivingOrchestrator: if not parser: parser = self.parser - parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use') - parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use') - parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use') - parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use') - parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use') - parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use') + parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', help='the enrichers to use') + parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', help='the extractors to use') + parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', help='the databases to use') + parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', help='the storages to use') + parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', help='the formatter to use') def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): @@ -163,6 +164,8 @@ class ArchivingOrchestrator: """ for module_type in MODULE_TYPES: + if module_type == 'enricher': + 
breakpoint() step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 6c5a9b8..6f469c9 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -2,7 +2,7 @@ 'name': 'Generic Extractor', 'version': '0.1.0', 'author': 'Bellingcat', - 'type': ['extractor'], + 'type': ['extractor', 'feeder', 'enricher'], 'entry_point': 'GenericExtractor', # this class should be present in the __init__.py 'requires_setup': False, 'dependencies': { From 79684f834856ceebd525981429e387d10a3186f6 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 23 Jan 2025 09:16:42 +0000 Subject: [PATCH 013/110] Set up feeder manifests (not merged by source yet) --- poetry.lock | 188 +++++++++--------- src/auto_archiver/core/__init__.py | 7 +- src/auto_archiver/core/orchestrator.py | 1 - src/auto_archiver/databases/__init__.py | 5 - src/auto_archiver/databases/api_db/api_db.py | 70 ------- .../databases/atlos_db/atlos_db.py | 79 -------- .../databases/console_db/console_db.py | 32 --- src/auto_archiver/databases/csv_db/csv_db.py | 34 ---- .../databases/gsheet_db/gsheet_db.py | 112 ----------- .../{databases => modules}/api_db/__init__.py | 0 .../api_db/__manifest__.py | 0 .../{databases => modules/api_db}/api_db.py | 17 +- .../atlos_db/__init__.py | 0 .../atlos_db/__manifest__.py | 13 +- .../atlos_db}/atlos_db.py | 7 +- .../atlos_feeder}/__init__.py | 0 .../modules/atlos_feeder/__manifest__.py | 34 ++++ .../atlos_feeder}/atlos_feeder.py | 7 +- .../csv_db => modules/cli_feeder}/__init__.py | 0 .../modules/cli_feeder/__manifest__.py | 24 +++ .../cli_feeder}/cli_feeder.py | 22 +- .../console_db}/__init__.py | 0 .../console_db/__manifest__.py | 0 .../console_db}/console_db.py | 8 +- src/auto_archiver/modules/csv_db/__init__.py | 0 .../csv_db/__manifest__.py | 0 
.../{databases => modules/csv_db}/csv_db.py | 9 +- .../modules/csv_feeder/__init__.py | 0 .../modules/csv_feeder/__manifest__.py | 33 +++ .../csv_feeder}/csv_feeder.py | 9 +- .../modules/gsheet_db/__init__.py | 0 .../gsheet_db/__manifest__.py | 0 .../gsheet_db}/gsheet_db.py | 10 +- .../modules/gsheet_feeder/__init__.py | 0 .../modules/gsheet_feeder/__manifest__.py | 40 ++++ .../gsheet_feeder}/gsheet_feeder.py | 46 ++--- .../modules/hash_enricher/__init__.py | 0 .../modules/hash_enricher/__manifest__.py | 27 +++ .../hash_enricher}/hash_enricher.py | 11 +- .../instagram_api_archiver/__manifest__.py | 21 +- .../instagram_api_archiver.py | 19 -- .../instagram_archiver/__manifest__.py | 8 +- .../instagram_archiver/instagram_archiver.py | 11 +- .../instagram_tbot_archiver.py | 9 - .../modules/meta_enricher/__init__.py | 0 .../modules/meta_enricher/__manifest__.py | 22 ++ .../meta_enricher}/meta_enricher.py | 14 +- .../modules/metadata_enricher/__init__.py | 0 .../modules/metadata_enricher/__manifest__.py | 22 ++ .../metadata_enricher}/metadata_enricher.py | 7 +- .../modules/pdq_hash_enricher/__init__.py | 0 .../modules/pdq_hash_enricher/__manifest__.py | 21 ++ .../pdq_hash_enricher}/pdq_hash_enricher.py | 8 +- .../modules/screenshot_enricher/__init__.py | 0 .../screenshot_enricher/__manifest__.py | 30 +++ .../screenshot_enricher.py | 34 ++-- .../modules/ssl_enricher/__init__.py | 0 .../modules/ssl_enricher/__manifest__.py | 22 ++ .../ssl_enricher}/ssl_enricher.py | 12 +- .../telegram_archiver/telegram_archiver.py | 3 - .../modules/telethon_archiver/__manifest__.py | 2 +- .../telethon_archiver/telethon_archiver.py | 14 -- .../modules/thumbnail_enricher/__init__.py | 0 .../thumbnail_enricher/__manifest__.py | 27 +++ .../thumbnail_enricher}/thumbnail_enricher.py | 13 +- .../modules/timestamping_enricher/__init__.py | 0 .../timestamping_enricher/__manifest__.py | 40 ++++ .../timestamping_enricher.py | 66 +++--- .../twitter_api_archiver/__manifest__.py | 3 +- 
.../twitter_api_archiver.py | 11 - .../modules/vk_archiver/vk_archiver.py | 8 - .../modules/wacz_enricher/__init__.py | 0 .../modules/wacz_enricher/__manifest__.py | 39 ++++ .../wacz_enricher}/wacz_enricher.py | 21 +- .../modules/wayback_enricher/__init__.py | 0 .../modules/wayback_enricher/__manifest__.py | 29 +++ .../wayback_enricher}/wayback_enricher.py | 19 +- .../modules/whisper_enricher/__init__.py | 0 .../modules/whisper_enricher/__manifest__.py | 30 +++ .../whisper_enricher}/whisper_enricher.py | 17 +- tests/databases/test_csv_db.py | 2 +- tests/enrichers/test_hash_enricher.py | 2 +- 82 files changed, 721 insertions(+), 730 deletions(-) delete mode 100644 src/auto_archiver/databases/api_db/api_db.py delete mode 100644 src/auto_archiver/databases/atlos_db/atlos_db.py delete mode 100644 src/auto_archiver/databases/console_db/console_db.py delete mode 100644 src/auto_archiver/databases/csv_db/csv_db.py delete mode 100644 src/auto_archiver/databases/gsheet_db/gsheet_db.py rename src/auto_archiver/{databases => modules}/api_db/__init__.py (100%) rename src/auto_archiver/{databases => modules}/api_db/__manifest__.py (100%) rename src/auto_archiver/{databases => modules/api_db}/api_db.py (69%) rename src/auto_archiver/{databases => modules}/atlos_db/__init__.py (100%) rename src/auto_archiver/{databases => modules}/atlos_db/__manifest__.py (59%) rename src/auto_archiver/{databases => modules/atlos_db}/atlos_db.py (94%) rename src/auto_archiver/{databases/console_db => modules/atlos_feeder}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/atlos_feeder/__manifest__.py rename src/auto_archiver/{feeders => modules/atlos_feeder}/atlos_feeder.py (91%) rename src/auto_archiver/{databases/csv_db => modules/cli_feeder}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/cli_feeder/__manifest__.py rename src/auto_archiver/{feeders => modules/cli_feeder}/cli_feeder.py (57%) rename src/auto_archiver/{databases/gsheet_db => 
modules/console_db}/__init__.py (100%) rename src/auto_archiver/{databases => modules}/console_db/__manifest__.py (100%) rename src/auto_archiver/{databases => modules/console_db}/console_db.py (86%) create mode 100644 src/auto_archiver/modules/csv_db/__init__.py rename src/auto_archiver/{databases => modules}/csv_db/__manifest__.py (100%) rename src/auto_archiver/{databases => modules/csv_db}/csv_db.py (81%) create mode 100644 src/auto_archiver/modules/csv_feeder/__init__.py create mode 100644 src/auto_archiver/modules/csv_feeder/__manifest__.py rename src/auto_archiver/{feeders => modules/csv_feeder}/csv_feeder.py (88%) create mode 100644 src/auto_archiver/modules/gsheet_db/__init__.py rename src/auto_archiver/{databases => modules}/gsheet_db/__manifest__.py (100%) rename src/auto_archiver/{databases => modules/gsheet_db}/gsheet_db.py (96%) create mode 100644 src/auto_archiver/modules/gsheet_feeder/__init__.py create mode 100644 src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename src/auto_archiver/{feeders => modules/gsheet_feeder}/gsheet_feeder.py (74%) create mode 100644 src/auto_archiver/modules/hash_enricher/__init__.py create mode 100644 src/auto_archiver/modules/hash_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/hash_enricher}/hash_enricher.py (85%) create mode 100644 src/auto_archiver/modules/meta_enricher/__init__.py create mode 100644 src/auto_archiver/modules/meta_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/meta_enricher}/meta_enricher.py (93%) create mode 100644 src/auto_archiver/modules/metadata_enricher/__init__.py create mode 100644 src/auto_archiver/modules/metadata_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/metadata_enricher}/metadata_enricher.py (92%) create mode 100644 src/auto_archiver/modules/pdq_hash_enricher/__init__.py create mode 100644 src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py rename src/auto_archiver/{enrichers => 
modules/pdq_hash_enricher}/pdq_hash_enricher.py (95%) create mode 100644 src/auto_archiver/modules/screenshot_enricher/__init__.py create mode 100644 src/auto_archiver/modules/screenshot_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/screenshot_enricher}/screenshot_enricher.py (59%) create mode 100644 src/auto_archiver/modules/ssl_enricher/__init__.py create mode 100644 src/auto_archiver/modules/ssl_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/ssl_enricher}/ssl_enricher.py (73%) create mode 100644 src/auto_archiver/modules/thumbnail_enricher/__init__.py create mode 100644 src/auto_archiver/modules/thumbnail_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/thumbnail_enricher}/thumbnail_enricher.py (86%) create mode 100644 src/auto_archiver/modules/timestamping_enricher/__init__.py create mode 100644 src/auto_archiver/modules/timestamping_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/timestamping_enricher}/timestamping_enricher.py (72%) create mode 100644 src/auto_archiver/modules/wacz_enricher/__init__.py create mode 100644 src/auto_archiver/modules/wacz_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/wacz_enricher}/wacz_enricher.py (87%) create mode 100644 src/auto_archiver/modules/wayback_enricher/__init__.py create mode 100644 src/auto_archiver/modules/wayback_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/wayback_enricher}/wayback_enricher.py (78%) create mode 100644 src/auto_archiver/modules/whisper_enricher/__init__.py create mode 100644 src/auto_archiver/modules/whisper_enricher/__manifest__.py rename src/auto_archiver/{enrichers => modules/whisper_enricher}/whisper_enricher.py (87%) diff --git a/poetry.lock b/poetry.lock index bbfb975..a8d43e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -152,34 +152,34 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.35.99" +version = "1.36.3" description = "The 
AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.35.99-py3-none-any.whl", hash = "sha256:83e560faaec38a956dfb3d62e05e1703ee50432b45b788c09e25107c5058bd71"}, - {file = "boto3-1.35.99.tar.gz", hash = "sha256:e0abd794a7a591d90558e92e29a9f8837d25ece8e3c120e530526fe27eba5fca"}, + {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"}, + {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"}, ] [package.dependencies] -botocore = ">=1.35.99,<1.36.0" +botocore = ">=1.36.3,<1.37.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.10.0,<0.11.0" +s3transfer = ">=0.11.0,<0.12.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.99" +version = "1.36.3" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.35.99-py3-none-any.whl", hash = "sha256:b22d27b6b617fc2d7342090d6129000af2efd20174215948c0d7ae2da0fab445"}, - {file = "botocore-1.35.99.tar.gz", hash = "sha256:1eab44e969c39c5f3d9a3104a0836c24715579a455f12b3979a31d7cde51b3c3"}, + {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"}, + {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"}, ] [package.dependencies] @@ -188,7 +188,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} [package.extras] -crt = ["awscrt (==0.22.0)"] +crt = ["awscrt (==0.23.4)"] [[package]] name = "brotli" @@ -343,14 +343,14 @@ beautifulsoup4 = "*" [[package]] name = "cachetools" -version = "5.5.0" +version = "5.5.1" description = "Extensible memoizing collections and decorators" optional = false python-versions = 
">=3.7" groups = ["main"] files = [ - {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, - {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, + {file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"}, + {file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"}, ] [[package]] @@ -2083,32 +2083,32 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.10.4" +version = "0.11.1" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"}, - {file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"}, + {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"}, + {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"}, ] [package.dependencies] -botocore = ">=1.33.2,<2.0a.0" +botocore = ">=1.36.0,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] +crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] [[package]] name = "selenium" -version = "4.27.1" +version = "4.28.0" description = "Official Python bindings for Selenium WebDriver" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "selenium-4.27.1-py3-none-any.whl", hash = "sha256:b89b1f62b5cfe8025868556fe82360d6b649d464f75d2655cb966c8f8447ea18"}, - {file = "selenium-4.27.1.tar.gz", hash = "sha256:5296c425a75ff1b44d0d5199042b36a6d1ef76c04fb775b97b40be739a9caae2"}, + {file = 
"selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"}, + {file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"}, ] [package.dependencies] @@ -2617,15 +2617,15 @@ typing-extensions = ">=3.7.4" [[package]] name = "tzdata" -version = "2024.2" +version = "2025.1" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" groups = ["main"] markers = "platform_system == \"Windows\"" files = [ - {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, - {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, + {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, + {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, ] [[package]] @@ -2868,81 +2868,81 @@ test = ["websockets"] [[package]] name = "websockets" -version = "14.1" +version = "14.2" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.9" groups = ["main", "docs"] files = [ - {file = "websockets-14.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a0adf84bc2e7c86e8a202537b4fd50e6f7f0e4a6b6bf64d7ccb96c4cd3330b29"}, - {file = "websockets-14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90b5d9dfbb6d07a84ed3e696012610b6da074d97453bd01e0e30744b472c8179"}, - {file = "websockets-14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2177ee3901075167f01c5e335a6685e71b162a54a89a56001f1c3e9e3d2ad250"}, - {file = "websockets-14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f14a96a0034a27f9d47fd9788913924c89612225878f8078bb9d55f859272b0"}, - {file = 
"websockets-14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f874ba705deea77bcf64a9da42c1f5fc2466d8f14daf410bc7d4ceae0a9fcb0"}, - {file = "websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9607b9a442392e690a57909c362811184ea429585a71061cd5d3c2b98065c199"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bea45f19b7ca000380fbd4e02552be86343080120d074b87f25593ce1700ad58"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:219c8187b3ceeadbf2afcf0f25a4918d02da7b944d703b97d12fb01510869078"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ad2ab2547761d79926effe63de21479dfaf29834c50f98c4bf5b5480b5838434"}, - {file = "websockets-14.1-cp310-cp310-win32.whl", hash = "sha256:1288369a6a84e81b90da5dbed48610cd7e5d60af62df9851ed1d1d23a9069f10"}, - {file = "websockets-14.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0744623852f1497d825a49a99bfbec9bea4f3f946df6eb9d8a2f0c37a2fec2e"}, - {file = "websockets-14.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:449d77d636f8d9c17952628cc7e3b8faf6e92a17ec581ec0c0256300717e1512"}, - {file = "websockets-14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a35f704be14768cea9790d921c2c1cc4fc52700410b1c10948511039be824aac"}, - {file = "websockets-14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b1f3628a0510bd58968c0f60447e7a692933589b791a6b572fcef374053ca280"}, - {file = "websockets-14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c3deac3748ec73ef24fc7be0b68220d14d47d6647d2f85b2771cb35ea847aa1"}, - {file = "websockets-14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7048eb4415d46368ef29d32133134c513f507fff7d953c18c91104738a68c3b3"}, - {file = 
"websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cf0ad281c979306a6a34242b371e90e891bce504509fb6bb5246bbbf31e7b6"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc1fc87428c1d18b643479caa7b15db7d544652e5bf610513d4a3478dbe823d0"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f95ba34d71e2fa0c5d225bde3b3bdb152e957150100e75c86bc7f3964c450d89"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9481a6de29105d73cf4515f2bef8eb71e17ac184c19d0b9918a3701c6c9c4f23"}, - {file = "websockets-14.1-cp311-cp311-win32.whl", hash = "sha256:368a05465f49c5949e27afd6fbe0a77ce53082185bbb2ac096a3a8afaf4de52e"}, - {file = "websockets-14.1-cp311-cp311-win_amd64.whl", hash = "sha256:6d24fc337fc055c9e83414c94e1ee0dee902a486d19d2a7f0929e49d7d604b09"}, - {file = "websockets-14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed907449fe5e021933e46a3e65d651f641975a768d0649fee59f10c2985529ed"}, - {file = "websockets-14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:87e31011b5c14a33b29f17eb48932e63e1dcd3fa31d72209848652310d3d1f0d"}, - {file = "websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bc6ccf7d54c02ae47a48ddf9414c54d48af9c01076a2e1023e3b486b6e72c707"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9777564c0a72a1d457f0848977a1cbe15cfa75fa2f67ce267441e465717dcf1a"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a655bde548ca98f55b43711b0ceefd2a88a71af6350b0c168aa77562104f3f45"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3dfff83ca578cada2d19e665e9c8368e1598d4e787422a460ec70e531dbdd58"}, - {file = 
"websockets-14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6a6c9bcf7cdc0fd41cc7b7944447982e8acfd9f0d560ea6d6845428ed0562058"}, - {file = "websockets-14.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4b6caec8576e760f2c7dd878ba817653144d5f369200b6ddf9771d64385b84d4"}, - {file = "websockets-14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb6d38971c800ff02e4a6afd791bbe3b923a9a57ca9aeab7314c21c84bf9ff05"}, - {file = "websockets-14.1-cp312-cp312-win32.whl", hash = "sha256:1d045cbe1358d76b24d5e20e7b1878efe578d9897a25c24e6006eef788c0fdf0"}, - {file = "websockets-14.1-cp312-cp312-win_amd64.whl", hash = "sha256:90f4c7a069c733d95c308380aae314f2cb45bd8a904fb03eb36d1a4983a4993f"}, - {file = "websockets-14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3630b670d5057cd9e08b9c4dab6493670e8e762a24c2c94ef312783870736ab9"}, - {file = "websockets-14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36ebd71db3b89e1f7b1a5deaa341a654852c3518ea7a8ddfdf69cc66acc2db1b"}, - {file = "websockets-14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5b918d288958dc3fa1c5a0b9aa3256cb2b2b84c54407f4813c45d52267600cd3"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00fe5da3f037041da1ee0cf8e308374e236883f9842c7c465aa65098b1c9af59"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8149a0f5a72ca36720981418eeffeb5c2729ea55fa179091c81a0910a114a5d2"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77569d19a13015e840b81550922056acabc25e3f52782625bc6843cfa034e1da"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cf5201a04550136ef870aa60ad3d29d2a59e452a7f96b94193bee6d73b8ad9a9"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:88cf9163ef674b5be5736a584c999e98daf3aabac6e536e43286eb74c126b9c7"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:836bef7ae338a072e9d1863502026f01b14027250a4545672673057997d5c05a"}, - {file = "websockets-14.1-cp313-cp313-win32.whl", hash = "sha256:0d4290d559d68288da9f444089fd82490c8d2744309113fc26e2da6e48b65da6"}, - {file = "websockets-14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8621a07991add373c3c5c2cf89e1d277e49dc82ed72c75e3afc74bd0acc446f0"}, - {file = "websockets-14.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01bb2d4f0a6d04538d3c5dfd27c0643269656c28045a53439cbf1c004f90897a"}, - {file = "websockets-14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:414ffe86f4d6f434a8c3b7913655a1a5383b617f9bf38720e7c0799fac3ab1c6"}, - {file = "websockets-14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8fda642151d5affdee8a430bd85496f2e2517be3a2b9d2484d633d5712b15c56"}, - {file = "websockets-14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd7c11968bc3860d5c78577f0dbc535257ccec41750675d58d8dc66aa47fe52c"}, - {file = "websockets-14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a032855dc7db987dff813583d04f4950d14326665d7e714d584560b140ae6b8b"}, - {file = "websockets-14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7e7ea2f782408c32d86b87a0d2c1fd8871b0399dd762364c731d86c86069a78"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:39450e6215f7d9f6f7bc2a6da21d79374729f5d052333da4d5825af8a97e6735"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ceada5be22fa5a5a4cdeec74e761c2ee7db287208f54c718f2df4b7e200b8d4a"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3fc753451d471cff90b8f467a1fc0ae64031cf2d81b7b34e1811b7e2691bc4bc"}, - {file = 
"websockets-14.1-cp39-cp39-win32.whl", hash = "sha256:14839f54786987ccd9d03ed7f334baec0f02272e7ec4f6e9d427ff584aeea8b4"}, - {file = "websockets-14.1-cp39-cp39-win_amd64.whl", hash = "sha256:d9fd19ecc3a4d5ae82ddbfb30962cf6d874ff943e56e0c81f5169be2fda62979"}, - {file = "websockets-14.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5dc25a9dbd1a7f61eca4b7cb04e74ae4b963d658f9e4f9aad9cd00b688692c8"}, - {file = "websockets-14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:04a97aca96ca2acedf0d1f332c861c5a4486fdcba7bcef35873820f940c4231e"}, - {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df174ece723b228d3e8734a6f2a6febbd413ddec39b3dc592f5a4aa0aff28098"}, - {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:034feb9f4286476f273b9a245fb15f02c34d9586a5bc936aff108c3ba1b21beb"}, - {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c308dabd2b380807ab64b62985eaccf923a78ebc572bd485375b9ca2b7dc7"}, - {file = "websockets-14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a42d3ecbb2db5080fc578314439b1d79eef71d323dc661aa616fb492436af5d"}, - {file = "websockets-14.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ddaa4a390af911da6f680be8be4ff5aaf31c4c834c1a9147bc21cbcbca2d4370"}, - {file = "websockets-14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4c805c6034206143fbabd2d259ec5e757f8b29d0a2f0bf3d2fe5d1f60147a4a"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:205f672a6c2c671a86d33f6d47c9b35781a998728d2c7c2a3e1cf3333fcb62b7"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:5ef440054124728cc49b01c33469de06755e5a7a4e83ef61934ad95fc327fbb0"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7591d6f440af7f73c4bd9404f3772bfee064e639d2b6cc8c94076e71b2471c1"}, - {file = "websockets-14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:25225cc79cfebc95ba1d24cd3ab86aaa35bcd315d12fa4358939bd55e9bd74a5"}, - {file = "websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e"}, - {file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"}, + {file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"}, + {file = "websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"}, + {file = "websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"}, + {file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"}, + {file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"}, + {file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"}, + {file = 
"websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"}, + {file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"}, + {file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"}, + {file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"}, + {file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"}, + {file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"}, + {file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"}, + {file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"}, + {file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"}, + {file = 
"websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"}, + {file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"}, + {file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"}, + {file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"}, + {file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"}, + {file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"}, + {file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"}, + {file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"}, ] [[package]] diff --git 
a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 779d3ac..cf95dc0 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -4,4 +4,9 @@ # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config \ No newline at end of file +# from .config import Config + +from .media import Media +from .step import Step +from .context import ArchivingContext +from .metadata import Metadata diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ee3a190..f6c411e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -190,7 +190,6 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - breakpoint() self.setup_complete_parser(basic_config, yaml_config, unused_args) self.install_modules() diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index d6de470..3a8d787 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -3,8 +3,3 @@ """ from .database import Database -from .gsheet_db.gsheet_db import GsheetsDb -from .console_db.console_db import ConsoleDb -from .csv_db.csv_db import CSVDb -from .api_db.api_db import AAApiDb -from .atlos_db.atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/databases/api_db/api_db.py b/src/auto_archiver/databases/api_db/api_db.py deleted file mode 100644 index 84bdfcb..0000000 --- a/src/auto_archiver/databases/api_db/api_db.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Union -import requests, os -from loguru import logger - -from .. 
import Database -from ...core import Metadata - - -class AAApiDb(Database): - """ - Connects to auto-archiver-api instance - """ - name = "auto_archiver_api_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.allow_rearchive = bool(self.allow_rearchive) - self.store_results = bool(self.store_results) - self.assert_valid_string("api_endpoint") - - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, - "api_token": {"default": None, "help": "API Bearer token."}, - "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, - "author_id": {"default": None, "help": "which email to assign as author"}, - "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, - } - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """ query the database for the existence of this item. - Helps avoid re-archiving the same URL multiple times. 
- """ - if not self.allow_rearchive: return - - params = {"url": item.get_url(), "limit": 15} - headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} - response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) - - if response.status_code == 200: - if len(response.json()): - logger.success(f"API returned {len(response.json())} previously archived instance(s)") - fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()] - return Metadata.choose_most_complete(fetched_metadata) - else: - logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - return False - - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - if not self.store_results: return - if cached: - logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") - return - logger.debug(f"saving archive of {item.get_url()} to the AA API.") - - payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} - headers = {"Authorization": f"Bearer {self.api_token}"} - response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers) - - if response.status_code == 200: - logger.success(f"AA API: {response.json()}") - else: - logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - diff --git a/src/auto_archiver/databases/atlos_db/atlos_db.py b/src/auto_archiver/databases/atlos_db/atlos_db.py deleted file mode 100644 index 4a00b9d..0000000 --- a/src/auto_archiver/databases/atlos_db/atlos_db.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from typing import Union -from loguru import logger -from csv import DictWriter -from dataclasses import asdict -import requests - -from .. 
import Database -from ...core import Metadata -from ...utils import get_atlos_config_options - - -class AtlosDb(Database): - """ - Outputs results to Atlos - """ - - name = "atlos_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return get_atlos_config_options() - - def failed(self, item: Metadata, reason: str) -> None: - """Update DB accordingly for failure""" - # If the item has no Atlos ID, there's nothing for us to do - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={"metadata": {"processed": True, "status": "error", "error": reason}}, - ).raise_for_status() - logger.info( - f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" - ) - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check and fetch if the given item has been archived already, each - database should handle its own caching, and configuration mechanisms""" - return False - - def _process_metadata(self, item: Metadata) -> dict: - """Process metadata for storage on Atlos. 
Will convert any datetime - objects to ISO format.""" - - return { - k: v.isoformat() if hasattr(v, "isoformat") else v - for k, v in item.metadata.items() - } - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={ - "metadata": dict( - processed=True, - status="success", - results=self._process_metadata(item), - ) - }, - ).raise_for_status() - - logger.info( - f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" - ) diff --git a/src/auto_archiver/databases/console_db/console_db.py b/src/auto_archiver/databases/console_db/console_db.py deleted file mode 100644 index a5e648b..0000000 --- a/src/auto_archiver/databases/console_db/console_db.py +++ /dev/null @@ -1,32 +0,0 @@ -from loguru import logger - -from .. 
import Database -from ...core import Metadata - - -class ConsoleDb(Database): - """ - Outputs results to the console - """ - name = "console_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return {} - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - - def failed(self, item: Metadata, reason:str) -> None: - logger.error(f"FAILED {item}: {reason}") - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item}") \ No newline at end of file diff --git a/src/auto_archiver/databases/csv_db/csv_db.py b/src/auto_archiver/databases/csv_db/csv_db.py deleted file mode 100644 index e24306f..0000000 --- a/src/auto_archiver/databases/csv_db/csv_db.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from loguru import logger -from csv import DictWriter -from dataclasses import asdict - -from .. 
import Database -from ...core import Metadata - - -class CSVDb(Database): - """ - Outputs results to a CSV file - """ - name = "csv_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.assert_valid_string("csv_file") - - @staticmethod - def configs() -> dict: - return { - "csv_file": {"default": "db.csv", "help": "CSV file name"} - } - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item}") - is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 - with open(self.csv_file, "a", encoding="utf-8") as outf: - writer = DictWriter(outf, fieldnames=asdict(Metadata())) - if is_empty: writer.writeheader() - writer.writerow(asdict(item)) diff --git a/src/auto_archiver/databases/gsheet_db/gsheet_db.py b/src/auto_archiver/databases/gsheet_db/gsheet_db.py deleted file mode 100644 index 631a554..0000000 --- a/src/auto_archiver/databases/gsheet_db/gsheet_db.py +++ /dev/null @@ -1,112 +0,0 @@ -from typing import Union, Tuple -import datetime -from urllib.parse import quote - -from loguru import logger - -from .. import Database -from ...core import Metadata, Media, ArchivingContext -from ...utils import GWorksheet - - -class GsheetsDb(Database): - """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata - """ - name = "gsheet_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return {} - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') - - def failed(self, item: Metadata, reason:str) -> None: - logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check if the given item has been archived already""" - return False - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") - gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') - - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': - cell_updates.append((row, col, final_value)) - except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") - status_message = item.status - if cached: - status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) - - media: Media = item.get_final_media() - if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if 
media: batch_if_valid('hash', media.get("hash", "not-calculated")) - - # merge all pdq hashes into a single string, if present - pdq_hashes = [] - all_media = item.get_all_media() - for m in all_media: - if pdq := m.get("pdq_hash"): - pdq_hashes.append(pdq) - if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) - - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) - - if (thumbnail := item.get_first_image("thumbnail")): - if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') - - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) - - gw.batch_set_cell(cell_updates) - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: - try: - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) - except Exception as e: - logger.debug(f"Unable to update sheet: {e}") - - def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now - if gsheet := ArchivingContext.get("gsheet"): - gw: GWorksheet = gsheet.get("worksheet") - row: int = gsheet.get("row") - elif self.sheet_id: - print(self.sheet_id) - - - return gw, row diff --git a/src/auto_archiver/databases/api_db/__init__.py b/src/auto_archiver/modules/api_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/api_db/__init__.py rename to src/auto_archiver/modules/api_db/__init__.py diff --git a/src/auto_archiver/databases/api_db/__manifest__.py 
b/src/auto_archiver/modules/api_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/api_db/__manifest__.py rename to src/auto_archiver/modules/api_db/__manifest__.py diff --git a/src/auto_archiver/databases/api_db.py b/src/auto_archiver/modules/api_db/api_db.py similarity index 69% rename from src/auto_archiver/databases/api_db.py rename to src/auto_archiver/modules/api_db/api_db.py index 4304855..fa1ae75 100644 --- a/src/auto_archiver/databases/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,8 +2,8 @@ from typing import Union import requests, os from loguru import logger -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class AAApiDb(Database): @@ -19,18 +19,7 @@ class AAApiDb(Database): self.store_results = bool(self.store_results) self.assert_valid_string("api_endpoint") - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, - "api_token": {"default": None, "help": "API Bearer token."}, - "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, - "author_id": {"default": None, "help": "which email to assign as author"}, - "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, - } + def fetch(self, item: Metadata) -> Union[Metadata, bool]: """ query the database for the existence of this item. 
Helps avoid re-archiving the same URL multiple times. diff --git a/src/auto_archiver/databases/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/atlos_db/__init__.py rename to src/auto_archiver/modules/atlos_db/__init__.py diff --git a/src/auto_archiver/databases/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py similarity index 59% rename from src/auto_archiver/databases/atlos_db/__manifest__.py rename to src/auto_archiver/modules/atlos_db/__manifest__.py index 1e2b676..470d07d 100644 --- a/src/auto_archiver/databases/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -7,7 +7,18 @@ {"python": ["loguru", ""], "bin": [""]}, - "configs": {}, + "configs": { + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, "description": """ Handles integration with the Atlos platform for managing archival results. diff --git a/src/auto_archiver/databases/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py similarity index 94% rename from src/auto_archiver/databases/atlos_db.py rename to src/auto_archiver/modules/atlos_db/atlos_db.py index 16c4910..376ba32 100644 --- a/src/auto_archiver/databases/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -5,9 +5,9 @@ from csv import DictWriter from dataclasses import asdict import requests -from . 
import Database -from ..core import Metadata -from ..utils import get_atlos_config_options +from auto_archiver.databases import Database +from auto_archiver.core import Metadata +from auto_archiver.utils import get_atlos_config_options class AtlosDb(Database): @@ -21,6 +21,7 @@ class AtlosDb(Database): # without this STEP.__init__ is not called super().__init__(config) + # TODO @staticmethod def configs() -> dict: return get_atlos_config_options() diff --git a/src/auto_archiver/databases/console_db/__init__.py b/src/auto_archiver/modules/atlos_feeder/__init__.py similarity index 100% rename from src/auto_archiver/databases/console_db/__init__.py rename to src/auto_archiver/modules/atlos_feeder/__init__.py diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py new file mode 100644 index 0000000..f0b216b --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -0,0 +1,34 @@ +{ + "name": "Atlos Feeder", + "type": ["feeder"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, + "description": """ + AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. + + ### Features + - Connects to the Atlos API to retrieve a list of source material URLs. + - Filters source materials based on visibility, processing status, and metadata. + - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL. 
+ - Iterates through paginated results using a cursor for efficient API interaction. + + ### Notes + - Requires an Atlos API endpoint and a valid API token for authentication. + - Ensures only unprocessed, visible, and ready-to-archive URLs are returned. + - Handles pagination transparently when retrieving data from the Atlos API. + """ +} diff --git a/src/auto_archiver/feeders/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py similarity index 91% rename from src/auto_archiver/feeders/atlos_feeder.py rename to src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index d3acc00..d344139 100644 --- a/src/auto_archiver/feeders/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,9 +1,9 @@ from loguru import logger import requests -from . import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import get_atlos_config_options +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import get_atlos_config_options class AtlosFeeder(Feeder): @@ -15,6 +15,7 @@ class AtlosFeeder(Feeder): if type(self.api_token) != str: raise Exception("Atlos Feeder did not receive an Atlos API token") + # TODO @staticmethod def configs() -> dict: return get_atlos_config_options() diff --git a/src/auto_archiver/databases/csv_db/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py similarity index 100% rename from src/auto_archiver/databases/csv_db/__init__.py rename to src/auto_archiver/modules/cli_feeder/__init__.py diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py new file mode 100644 index 0000000..fcb9099 --- /dev/null +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -0,0 +1,24 @@ +{ + "name": "CLI Feeder", + "type": ["feeder"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + "urls": { + "default": 
None, + "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + }, + "description": """ + Processes URLs to archive passed via the command line and feeds them into the archiving pipeline. + + ### Features + - Takes a single URL or a list of URLs provided via the command line. + - Converts each URL into a `Metadata` object and yields it for processing. + - Ensures URLs are processed only if they are explicitly provided. + + """ +} diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py similarity index 57% rename from src/auto_archiver/feeders/cli_feeder.py rename to src/auto_archiver/modules/cli_feeder/cli_feeder.py index b2f0add..1376379 100644 --- a/src/auto_archiver/feeders/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,7 +1,7 @@ from loguru import logger -from . import Feeder -from ..core import Metadata, ArchivingContext +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext class CLIFeeder(Feeder): @@ -13,15 +13,15 @@ class CLIFeeder(Feeder): if type(self.urls) != list or len(self.urls) == 0: raise Exception("CLI Feeder did not receive any URL to process") - @staticmethod - def configs() -> dict: - return { - "urls": { - "default": None, - "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) - }, - } + # @staticmethod + # def configs() -> dict: + # return { + # "urls": { + # "default": None, + # "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + # "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + # }, + # } def __iter__(self) -> Metadata: for url in self.urls: diff --git a/src/auto_archiver/databases/gsheet_db/__init__.py 
b/src/auto_archiver/modules/console_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/gsheet_db/__init__.py rename to src/auto_archiver/modules/console_db/__init__.py diff --git a/src/auto_archiver/databases/console_db/__manifest__.py b/src/auto_archiver/modules/console_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/console_db/__manifest__.py rename to src/auto_archiver/modules/console_db/__manifest__.py diff --git a/src/auto_archiver/databases/console_db.py b/src/auto_archiver/modules/console_db/console_db.py similarity index 86% rename from src/auto_archiver/databases/console_db.py rename to src/auto_archiver/modules/console_db/console_db.py index bd45f95..357c696 100644 --- a/src/auto_archiver/databases/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,7 +1,7 @@ from loguru import logger -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class ConsoleDb(Database): @@ -14,10 +14,6 @@ class ConsoleDb(Database): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") diff --git a/src/auto_archiver/modules/csv_db/__init__.py b/src/auto_archiver/modules/csv_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/csv_db/__manifest__.py rename to src/auto_archiver/modules/csv_db/__manifest__.py diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py similarity index 81% rename from src/auto_archiver/databases/csv_db.py rename to src/auto_archiver/modules/csv_db/csv_db.py index f0d7153..642e889 100644 --- 
a/src/auto_archiver/databases/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,8 +3,8 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class CSVDb(Database): @@ -18,11 +18,6 @@ class CSVDb(Database): super().__init__(config) self.assert_valid_string("csv_file") - @staticmethod - def configs() -> dict: - return { - "csv_file": {"default": "db.csv", "help": "CSV file name"} - } def done(self, item: Metadata, cached: bool=False) -> None: """archival result ready - should be saved to DB""" diff --git a/src/auto_archiver/modules/csv_feeder/__init__.py b/src/auto_archiver/modules/csv_feeder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py new file mode 100644 index 0000000..ad5d40b --- /dev/null +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -0,0 +1,33 @@ +{ + "name": "CSV Feeder", + "type": ["feeder"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + "bin": [""] + }, + "configs": { + "files": { + "default": None, + "help": "Path to the input file(s) to read the URLs from, comma separated. \ + Input files should be formatted with one URL per line", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + "column": { + "default": None, + "help": "Column number or name to read the URLs from, 0-indexed", + } + }, + "description": """ + Reads URLs from CSV files and feeds them into the archiving process. + + ### Features + - Supports reading URLs from multiple input files, specified as a comma-separated list. + - Allows specifying the column number or name to extract URLs from. + - Skips header rows if the first value is not a valid URL. 
+ - Integrates with the `ArchivingContext` to manage URL feeding. + + ### Setup Notes + - Input files should be formatted with one URL per line. + """ +} diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py similarity index 88% rename from src/auto_archiver/feeders/csv_feeder.py rename to src/auto_archiver/modules/csv_feeder/csv_feeder.py index 00bf7d7..b665bd9 100644 --- a/src/auto_archiver/feeders/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,12 +1,15 @@ from loguru import logger import csv -from . import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import url_or_none +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): + name = "csv_feeder" + + @staticmethod def configs() -> dict: return { diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/gsheet_db/__manifest__.py rename to src/auto_archiver/modules/gsheet_db/__manifest__.py diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py similarity index 96% rename from src/auto_archiver/databases/gsheet_db.py rename to src/auto_archiver/modules/gsheet_db/gsheet_db.py index 98e72dc..8e17966 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -4,9 +4,9 @@ from urllib.parse import quote from loguru import logger -from . 
import Database -from ..core import Metadata, Media, ArchivingContext -from ..utils import GWorksheet +from auto_archiver.databases import Database +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import GWorksheet class GsheetsDb(Database): @@ -20,10 +20,6 @@ class GsheetsDb(Database): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py new file mode 100644 index 0000000..2af090c --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -0,0 +1,40 @@ +{ + "name": "Google Sheets Feeder", + "type": ["feeder"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "gspread", "python-slugify"], + }, + "configs": { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } + }, + "description": """ + GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + + This reads data from Google Sheets and filters rows based on user-defined rules. 
+ The filtered rows are processed into `Metadata` objects. + + ### Features + - Validates the sheet structure and filters rows based on input configurations. + - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations. + - Ensures only rows with valid URLs and unprocessed statuses are included for archival. + - Supports organizing stored files into folder paths based on sheet and worksheet names. + + ### Notes + - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. + - Create the sheet using the template provided in the docs. + """ +} diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py similarity index 74% rename from src/auto_archiver/feeders/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 1c4fc32..5c73bf6 100644 --- a/src/auto_archiver/feeders/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -14,9 +14,9 @@ from loguru import logger from slugify import slugify # from . import Enricher -from . 
import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import Gsheets, GWorksheet +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import Gsheets, GWorksheet class GsheetsFeeder(Gsheets, Feeder): @@ -27,26 +27,26 @@ class GsheetsFeeder(Gsheets, Feeder): super().__init__(config) self.gsheets_client = gspread.service_account(filename=self.service_account) - @staticmethod - def configs() -> dict: - return dict( - Gsheets.configs(), - ** { - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "use_sheet_names_in_stored_paths": { - "default": True, - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - } - }) + # @staticmethod + # def configs() -> dict: + # return dict( + # Gsheets.configs(), + # ** { + # "allow_worksheets": { + # "default": set(), + # "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # }, + # "block_worksheets": { + # "default": set(), + # "help": "(CSV) explicitly block some worksheets from being processed", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # }, + # "use_sheet_names_in_stored_paths": { + # "default": True, + # "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + # } + # }) def __iter__(self) -> Metadata: sh = self.open_sheet() diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py 
b/src/auto_archiver/modules/hash_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py new file mode 100644 index 0000000..311ed6f --- /dev/null +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -0,0 +1,27 @@ +{ + "name": "Hash Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, + "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, + }, + "description": """ +Generates cryptographic hashes for media files to ensure data integrity and authenticity. + +### Features +- Calculates cryptographic hashes (SHA-256 or SHA3-512) for media files stored in `Metadata` objects. +- Ensures content authenticity, integrity validation, and duplicate identification. +- Efficiently processes large files by reading file bytes in configurable chunk sizes. +- Supports dynamic configuration of hash algorithms and chunk sizes. +- Updates media metadata with the computed hash value in the format `<algorithm>:<hash>`. + +### Notes +- Default hash algorithm is SHA-256, but SHA3-512 is also supported. +- Chunk size defaults to 16 MB but can be adjusted based on memory requirements. +- Useful for workflows requiring hash-based content validation or deduplication. 
+""", +} diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py similarity index 85% rename from src/auto_archiver/enrichers/hash_enricher.py rename to src/auto_archiver/modules/hash_enricher/hash_enricher.py index 69973b7..355413a 100644 --- a/src/auto_archiver/enrichers/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,8 +10,8 @@ making it suitable for handling large files efficiently. import hashlib from loguru import logger -from . import Enricher -from ..core import Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext class HashEnricher(Enricher): @@ -45,13 +45,6 @@ class HashEnricher(Enricher): ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) - @staticmethod - def configs() -> dict: - return { - "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, - "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, - } - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py index fb12dc2..b2225fa 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py @@ -8,7 +8,7 @@ "retrying", "tqdm",], }, - "no_setup_required": False, + "requires_setup": True, "configs": { "access_token": {"default": None, "help": "a valid instagrapi-api token"}, "api_endpoint": {"default": None, "help": "API endpoint to use"}, @@ -25,5 +25,22 @@ "help": "if true, will remove empty values from the 
json output", }, }, - "description": "", + "description": """ +Archives various types of Instagram content using the Instagrapi API. + +### Features +- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content. +- Supports advanced configuration options, including: + - Full profile download (all posts, stories, highlights, and tagged content). + - Limiting the number of posts to fetch for large profiles. + - Minimising JSON output to remove empty fields and redundant data. +- Provides robust error handling and retries for API calls. +- Ensures efficient media scraping, including handling nested or carousel media items. +- Adds downloaded media and metadata to the result for further processing. + +### Notes +- Requires a valid Instagrapi API token (`access_token`) and API endpoint (`api_endpoint`). +- Full-profile downloads can be limited by setting `full_profile_max_posts`. +- Designed to fetch content in batches for large profiles, minimising API load. 
+""", } diff --git a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py index cc6e074..dc3f1ec 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py @@ -45,25 +45,6 @@ class InstagramAPIArchiver(Archiver): self.full_profile = bool(self.full_profile) self.minimize_json_output = bool(self.minimize_json_output) - @staticmethod - def configs() -> dict: - return { - "access_token": {"default": None, "help": "a valid instagrapi-api token"}, - "api_endpoint": {"default": None, "help": "API endpoint to use"}, - "full_profile": { - "default": False, - "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", - }, - "full_profile_max_posts": { - "default": 0, - "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", - }, - "minimize_json_output": { - "default": True, - "help": "if true, will remove empty values from the json output", - }, - } - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py index bb143b3..44cd7bb 100644 --- a/src/auto_archiver/modules/instagram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py @@ -3,10 +3,12 @@ "type": ["extractor"], "entry_point": "instagram_archiver:InstagramArchiver", "external_dependencies": { - "python": ["instaloader", - "loguru",], + "python": [ + "instaloader", + "loguru", + ], }, - "no_setup_required": False, + "requires_setup": True, "configs": { "username": {"default": None, "help": "a valid Instagram username"}, "password": { diff --git a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py index 4cf001d..7daf291 100644 --- a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py @@ -45,16 +45,7 @@ class InstagramArchiver(Archiver): except Exception as e2: logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - @staticmethod - def configs() -> dict: - return { - "username": {"default": None, "help": "a valid Instagram username"}, - "password": {"default": None, "help": "the corresponding Instagram account password"}, - "download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"}, - "session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"}, - #TODO: fine-grain - # "download_stories": {"default": True, "help": 
"if the link is to a user profile: whether to get stories information"}, - } + def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py index 9fdc208..3423010 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py @@ -34,15 +34,6 @@ class InstagramTbotArchiver(Archiver): self.assert_valid_string("api_hash") self.timeout = int(self.timeout) - @staticmethod - def configs() -> dict: - return { - "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, - "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, - "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, - "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, - } - def setup(self) -> None: """ 1. 
makes a copy of session_file that is removed in cleanup diff --git a/src/auto_archiver/modules/meta_enricher/__init__.py b/src/auto_archiver/modules/meta_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/meta_enricher/__manifest__.py b/src/auto_archiver/modules/meta_enricher/__manifest__.py new file mode 100644 index 0000000..10acf71 --- /dev/null +++ b/src/auto_archiver/modules/meta_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "Archive Metadata Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "description": """ + Adds metadata about archive operations, including file sizes and archive duration. + To be included at the end of all enrichments. + + ### Features +- Calculates the total size of all archived media files, storing the result in human-readable and byte formats. +- Computes the duration of the archival process, storing the elapsed time in seconds. +- Ensures all enrichments are performed only if the `Metadata` object contains valid data. +- Adds detailed metadata to provide insights into file sizes and archival performance. + +### Notes +- Skips enrichment if no media or metadata is available in the `Metadata` object. +- File sizes are calculated using `os.stat`, ensuring accurate byte-level reporting. +""", +} diff --git a/src/auto_archiver/enrichers/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py similarity index 93% rename from src/auto_archiver/enrichers/meta_enricher.py rename to src/auto_archiver/modules/meta_enricher/meta_enricher.py index b721bb5..ab0e73d 100644 --- a/src/auto_archiver/enrichers/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,8 +2,8 @@ import datetime import os from loguru import logger -from . 
import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class MetaEnricher(Enricher): @@ -17,10 +17,6 @@ class MetaEnricher(Enricher): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() if to_enrich.is_empty(): @@ -28,7 +24,7 @@ class MetaEnricher(Enricher): return logger.debug(f"calculating archive metadata information for {url=}") - + self.enrich_file_sizes(to_enrich) self.enrich_archive_duration(to_enrich) @@ -40,10 +36,10 @@ class MetaEnricher(Enricher): media.set("bytes", file_stats.st_size) media.set("size", self.human_readable_bytes(file_stats.st_size)) total_size += file_stats.st_size - + to_enrich.set("total_bytes", total_size) to_enrich.set("total_size", self.human_readable_bytes(total_size)) - + def human_readable_bytes(self, size: int) -> str: # receives number of bytes and returns human readble size diff --git a/src/auto_archiver/modules/metadata_enricher/__init__.py b/src/auto_archiver/modules/metadata_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/metadata_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_enricher/__manifest__.py new file mode 100644 index 0000000..bfc9b75 --- /dev/null +++ b/src/auto_archiver/modules/metadata_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "Media Metadata Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + "bin": ["exiftool"] + + }, + "description": """ + Extracts metadata information from files using ExifTool. + + ### Features + - Uses ExifTool to extract detailed metadata from media files. + - Processes file-specific data like camera settings, geolocation, timestamps, and other embedded metadata. 
+ - Adds extracted metadata to the corresponding `Media` object within the `Metadata`. + + ### Notes + - Requires ExifTool to be installed and accessible via the system's PATH. + - Skips enrichment for files where metadata extraction fails. + """ +} diff --git a/src/auto_archiver/enrichers/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py similarity index 92% rename from src/auto_archiver/enrichers/metadata_enricher.py rename to src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 9fe257e..5887d16 100644 --- a/src/auto_archiver/enrichers/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,8 +2,8 @@ import subprocess import traceback from loguru import logger -from . import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class MetadataEnricher(Enricher): @@ -16,9 +16,6 @@ class MetadataEnricher(Enricher): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py new file mode 100644 index 0000000..7b418b1 --- /dev/null +++ b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py @@ -0,0 +1,21 @@ +{ + "name": "PDQ Hash Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "pdqhash", "numpy", "Pillow"], + }, + "description": """ + PDQ Hash Enricher for generating perceptual hashes of media files. + + ### Features + - Calculates perceptual hashes for image files using the PDQ hashing algorithm. 
+ - Enables detection of duplicate or near-duplicate visual content. + - Processes images stored in `Metadata` objects, adding computed hashes to the corresponding `Media` entries. + - Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats). + + ### Notes + - Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available. + - Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings. + """ +} diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py similarity index 95% rename from src/auto_archiver/enrichers/pdq_hash_enricher.py rename to src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 36f793d..e3e9d10 100644 --- a/src/auto_archiver/enrichers/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,8 +16,8 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from . 
import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class PdqHashEnricher(Enricher): @@ -31,10 +31,6 @@ class PdqHashEnricher(Enricher): # Without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating perceptual hashes for {url=}") diff --git a/src/auto_archiver/modules/screenshot_enricher/__init__.py b/src/auto_archiver/modules/screenshot_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py new file mode 100644 index 0000000..c1a30e7 --- /dev/null +++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Screenshot Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "selenium"], + "bin": ["chromedriver"] + }, + "configs": { + "width": {"default": 1280, "help": "width of the screenshots"}, + "height": {"default": 720, "help": "height of the screenshots"}, + "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, + "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, + "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + "print_options": {"default": {}, "help": "options to pass to the pdf printer"} + }, + "description": """ + Captures screenshots and optionally saves web pages as PDFs using a WebDriver. 
+ + ### Features + - Takes screenshots of web pages, with configurable width, height, and timeout settings. + - Optionally saves pages as PDFs, with additional configuration for PDF printing options. + - Bypasses URLs detected as authentication walls. + - Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media. + + ### Notes + - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH. + """ +} diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py similarity index 59% rename from src/auto_archiver/enrichers/screenshot_enricher.py rename to src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index b2ef096..dd1d38a 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,24 +5,30 @@ import base64 from selenium.common.exceptions import TimeoutException -from . import Enricher -from ..utils import Webdriver, UrlUtil, random_str -from ..core import Media, Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.utils import Webdriver, UrlUtil, random_str +from auto_archiver.core import Media, Metadata, ArchivingContext class ScreenshotEnricher(Enricher): name = "screenshot_enricher" - @staticmethod - def configs() -> dict: - return { - "width": {"default": 1280, "help": "width of the screenshots"}, - "height": {"default": 720, "help": "height of the screenshots"}, - "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. 
PDF saving options can be adjusted with the 'print_options' parameter"}, - "print_options": {"default": {}, "help": "options to pass to the pdf printer"} - } + def __init__(self, config: dict) -> None: + super().__init__(config) + # TODO? + + + + # @staticmethod + # def configs() -> dict: + # return { + # "width": {"default": 1280, "help": "width of the screenshots"}, + # "height": {"default": 720, "help": "height of the screenshots"}, + # "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, + # "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, + # "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + # "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + # "print_options": {"default": {}, "help": "options to pass to the pdf printer"} + # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/ssl_enricher/__init__.py b/src/auto_archiver/modules/ssl_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/ssl_enricher/__manifest__.py b/src/auto_archiver/modules/ssl_enricher/__manifest__.py new file mode 100644 index 0000000..f44fc94 --- /dev/null +++ b/src/auto_archiver/modules/ssl_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "SSL Certificate Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "python-slugify"], + }, + "configs": { + "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, + }, + "description": """ + Retrieves SSL certificate information for a domain and stores it as a file. 
+ + ### Features + - Fetches SSL certificates for domains using the HTTPS protocol. + - Stores certificates in PEM format and adds them as media to the metadata. + - Skips enrichment if no media has been archived, based on the `skip_when_nothing_archived` configuration. + + ### Notes + - Requires the target URL to use the HTTPS scheme; other schemes are not supported. + """ +} diff --git a/src/auto_archiver/enrichers/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py similarity index 73% rename from src/auto_archiver/enrichers/ssl_enricher.py rename to src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 396df2e..0474d8f 100644 --- a/src/auto_archiver/enrichers/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,8 +3,8 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from . import Enricher -from ..core import Metadata, ArchivingContext, Media +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext, Media class SSLEnricher(Enricher): @@ -15,13 +15,7 @@ class SSLEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - self. 
skip_when_nothing_archived = bool(self.skip_when_nothing_archived) - - @staticmethod - def configs() -> dict: - return { - "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, - } + self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived) def enrich(self, to_enrich: Metadata) -> None: if not to_enrich.media and self.skip_when_nothing_archived: return diff --git a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py index c793095..c5e5ef0 100644 --- a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py @@ -16,9 +16,6 @@ class TelegramArchiver(Archiver): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return {} def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py index e7359d7..d44acf3 100644 --- a/src/auto_archiver/modules/telethon_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telethon_archiver/__manifest__.py @@ -21,7 +21,7 @@ "default": {}, "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", # TODO - #"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) } }, "description": """ diff --git a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py index 89668f3..fc89c9e 100644 --- 
a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py @@ -23,20 +23,6 @@ class TelethonArchiver(Archiver): self.assert_valid_string("api_id") self.assert_valid_string("api_hash") - @staticmethod - def configs() -> dict: - return { - "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, - "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, - "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"}, - "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, - "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"}, - "channel_invites": { - "default": {}, - "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - } - } def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/thumbnail_enricher/__init__.py b/src/auto_archiver/modules/thumbnail_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py new file mode 100644 index 0000000..2b0f167 --- /dev/null +++ b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py @@ -0,0 +1,27 @@ +{ + "name": "Thumbnail Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + 
"python": ["loguru", "ffmpeg-python"], + "bin": ["ffmpeg"] + }, + "configs": { + "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, + "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, + }, + "description": """ + Generates thumbnails for video files to provide visual previews. + + ### Features + - Processes video files and generates evenly distributed thumbnails. + - Calculates the number of thumbnails based on video duration, `thumbnails_per_minute`, and `max_thumbnails`. + - Distributes thumbnails equally across the video's duration and stores them as media objects. + - Adds metadata for each thumbnail, including timestamps and IDs. + + ### Notes + - Requires `ffmpeg` to be installed and accessible via the system's PATH. + - Handles videos without pre-existing duration metadata by probing with `ffmpeg`. + - Skips enrichment for non-video media files. + """ +} diff --git a/src/auto_archiver/enrichers/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py similarity index 86% rename from src/auto_archiver/enrichers/thumbnail_enricher.py rename to src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 5d8bee2..3edd40c 100644 --- a/src/auto_archiver/enrichers/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,9 +9,9 @@ and identify important moments without watching the entire video. import ffmpeg, os from loguru import logger -from . 
import Enricher -from ..core import Media, Metadata, ArchivingContext -from ..utils.misc import random_str +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.utils.misc import random_str class ThumbnailEnricher(Enricher): @@ -25,13 +25,6 @@ class ThumbnailEnricher(Enricher): super().__init__(config) self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60 self.max_thumbnails = int(self.max_thumbnails) - - @staticmethod - def configs() -> dict: - return { - "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, - "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, - } def enrich(self, to_enrich: Metadata) -> None: """ diff --git a/src/auto_archiver/modules/timestamping_enricher/__init__.py b/src/auto_archiver/modules/timestamping_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py new file mode 100644 index 0000000..a66cc31 --- /dev/null +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -0,0 +1,40 @@ +{ + "name": "Timestamping Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "slugify", + "tsp_client", + "asn1crypto", + "certvalidator", + "certifi" + ], + }, + "configs": { + "tsa_urls": { + "default": [ + "http://timestamp.digicert.com", + "http://timestamp.identrust.com", + "http://timestamp.globalsign.com/tsa/r6advanced1", + "http://tss.accv.es:8318/tsa" + ], + "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + } + }, + "description": """ + Generates 
RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files. + + ### Features + - Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes. + - Aggregates file hashes into a text file and timestamps the concatenated data. + - Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy. + - Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store. + + ### Notes + - Should be run after the `hash_enricher` to ensure file hashes are available. + - Requires internet access to interact with the configured TSAs. + """ +} diff --git a/src/auto_archiver/enrichers/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py similarity index 72% rename from src/auto_archiver/enrichers/timestamping_enricher.py rename to src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index dffa1a3..a9cf753 100644 --- a/src/auto_archiver/enrichers/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from . 
import Enricher -from ..core import Metadata, ArchivingContext, Media -from ..archivers import Archiver +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext, Media +from auto_archiver.archivers import Archiver class TimestampingEnricher(Enricher): @@ -26,36 +26,36 @@ class TimestampingEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return { - "tsa_urls": { - "default": [ - # [Adobe Approved Trust List] and [Windows Cert Store] - "http://timestamp.digicert.com", - "http://timestamp.identrust.com", - # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping - # "https://timestamp.sectigo.com", # wait 15 seconds between each request. - - # [Adobe: European Union Trusted Lists]. - # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. - - # [Windows Cert Store] - "http://timestamp.globalsign.com/tsa/r6advanced1", - - # [Adobe: European Union Trusted Lists] and [Windows Cert Store] - # "http://ts.quovadisglobal.com/eu", # not valid for timestamping - # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain - # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain - # "http://tsa.sep.bg", # self-signed certificate in certificate chain - # "http://tsa.izenpe.com", #unable to get local issuer certificate - # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate - "http://tss.accv.es:8318/tsa", - ], - "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - } - } + # @staticmethod + # def configs() -> dict: + # return { + # "tsa_urls": { + # "default": [ + # # [Adobe Approved Trust List] and [Windows Cert Store] + # "http://timestamp.digicert.com", + # "http://timestamp.identrust.com", + # # 
"https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping + # # "https://timestamp.sectigo.com", # wait 15 seconds between each request. + # + # # [Adobe: European Union Trusted Lists]. + # # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. + # + # # [Windows Cert Store] + # "http://timestamp.globalsign.com/tsa/r6advanced1", + # + # # [Adobe: European Union Trusted Lists] and [Windows Cert Store] + # # "http://ts.quovadisglobal.com/eu", # not valid for timestamping + # # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain + # # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain + # # "http://tsa.sep.bg", # self-signed certificate in certificate chain + # # "http://tsa.izenpe.com", #unable to get local issuer certificate + # # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate + # "http://tss.accv.es:8318/tsa", + # ], + # "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # } + # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py index 203eee9..5dc7364 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -12,7 +12,8 @@ }, "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, 
access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"}, + "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py index eb607cc..9c931ef 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py @@ -34,17 +34,6 @@ class TwitterApiArchiver(Archiver): access_token=self.access_token, access_secret=self.access_secret)) assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results." - @staticmethod - def configs() -> dict: - return { - "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. 
CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, - "consumer_key": {"default": None, "help": "twitter API consumer_key"}, - "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, - "access_token": {"default": None, "help": "twitter API access_token"}, - "access_secret": {"default": None, "help": "twitter API access_secret"}, - } - @property # getter .mimetype def api_client(self) -> str: return self.apis[self.api_index] diff --git a/src/auto_archiver/modules/vk_archiver/vk_archiver.py b/src/auto_archiver/modules/vk_archiver/vk_archiver.py index 3cfb446..7ba7a68 100644 --- a/src/auto_archiver/modules/vk_archiver/vk_archiver.py +++ b/src/auto_archiver/modules/vk_archiver/vk_archiver.py @@ -19,14 +19,6 @@ class VkArchiver(Archiver): self.assert_valid_string("password") self.vks = VkScraper(self.username, self.password, session_file=self.session_file) - @staticmethod - def configs() -> dict: - return { - "username": {"default": None, "help": "valid VKontakte username"}, - "password": {"default": None, "help": "valid VKontakte password"}, - "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, - } - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/wacz_enricher/__init__.py b/src/auto_archiver/modules/wacz_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_enricher/__manifest__.py new file mode 100644 index 0000000..07983d9 --- /dev/null +++ b/src/auto_archiver/modules/wacz_enricher/__manifest__.py @@ -0,0 +1,39 @@ +{ + "name": "WACZ Enricher", + "type": ["enricher", "archiver"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "jsonlines", + "warcio" + ], + # TODO? 
+ "bin": [ + "docker" + ] + }, + "configs": { + "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, + "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"}, + "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, + "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, + "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, + "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"}, + "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"}, + "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"}, + }, + "description": """ + Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. + + ### Features + - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. + - Supports custom profiles for archiving private or dynamic content. + - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline. + - Generates metadata from the archived page's content and structure (e.g., titles, text). + + ### Notes + - Requires Docker for running `browsertrix-crawler` unless explicitly disabled. + - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings. 
+ """ +} diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py similarity index 87% rename from src/auto_archiver/enrichers/wacz_enricher.py rename to src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index dc38488..124382b 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -5,10 +5,10 @@ from zipfile import ZipFile from loguru import logger from warcio.archiveiterator import ArchiveIterator -from ..core import Media, Metadata, ArchivingContext -from . import Enricher -from ..archivers import Archiver -from ..utils import UrlUtil, random_str +from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.archivers import Archiver +from auto_archiver.utils import UrlUtil, random_str class WaczArchiverEnricher(Enricher, Archiver): @@ -24,19 +24,6 @@ class WaczArchiverEnricher(Enricher, Archiver): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return { - "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, - "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"}, - "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, - "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, - "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. 
The .wacz file will be kept untouched."}, - "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"}, - "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"}, - "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"}, - } - def setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/wayback_enricher/__init__.py b/src/auto_archiver/modules/wayback_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/wayback_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_enricher/__manifest__.py new file mode 100644 index 0000000..b3af284 --- /dev/null +++ b/src/auto_archiver/modules/wayback_enricher/__manifest__.py @@ -0,0 +1,29 @@ +{ + "name": "Wayback Machine Enricher", + "type": ["enricher", "archiver"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, + "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, + "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, + "secret": {"default": None, "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php"}, + "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, + "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, + }, + "description": """ + Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL. + + ### Features + - Archives URLs using the Internet Archive's Wayback Machine API. + - Supports conditional archiving based on the existence of prior archives within a specified time range. + - Provides proxies for HTTP and HTTPS requests. + - Fetches and confirms the archive URL or provides a job ID for later status checks. + + ### Notes + - Requires a valid Wayback Machine API key and secret. + - Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff. + """ +} diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py similarity index 78% rename from src/auto_archiver/enrichers/wayback_enricher.py rename to src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 305bfcf..8ddec82 100644 --- a/src/auto_archiver/enrichers/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,10 +2,10 @@ import json from loguru import logger import time, requests -from . 
import Enricher -from ..archivers import Archiver -from ..utils import UrlUtil -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.archivers import Archiver +from auto_archiver.utils import UrlUtil +from auto_archiver.core import Metadata class WaybackArchiverEnricher(Enricher, Archiver): """ @@ -21,17 +21,6 @@ class WaybackArchiverEnricher(Enricher, Archiver): assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" - @staticmethod - def configs() -> dict: - return { - "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, - "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, - "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, - "secret": {"default": None, "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php"}, - "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, - "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, - } - def download(self, item: Metadata) -> Metadata: # this new Metadata object is required to avoid duplication result = Metadata() diff --git a/src/auto_archiver/modules/whisper_enricher/__init__.py b/src/auto_archiver/modules/whisper_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py new file mode 100644 index 0000000..25eae25 --- /dev/null +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Whisper Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, + "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, + "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, + "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, + "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + }, + "description": """ + Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files. + + ### Features + - Submits audio or video files to a Whisper API deployment for processing. 
+ - Supports operations such as transcription, translation, and language detection. + - Optionally generates SRT subtitle files for video content. + - Integrates with S3-compatible storage systems to make files publicly accessible for processing. + - Handles job submission, status checking, artifact retrieval, and cleanup. + + ### Notes + - Requires a Whisper API endpoint and API key for authentication. + - Only compatible with S3-compatible storage systems for media file accessibility. + - Handles multiple jobs and retries for failed or incomplete processing. + """ +} diff --git a/src/auto_archiver/enrichers/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py similarity index 87% rename from src/auto_archiver/enrichers/whisper_enricher.py rename to src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index c0089a4..f6294f3 100644 --- a/src/auto_archiver/enrichers/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from . 
import Enricher -from ..core import Metadata, Media, ArchivingContext -from ..storages import S3Storage +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.storages import S3Storage class WhisperEnricher(Enricher): @@ -22,17 +22,6 @@ class WhisperEnricher(Enricher): assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key" self.timeout = int(self.timeout) - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, - "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, - "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, - "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, - - } - def enrich(self, to_enrich: Metadata) -> None: if not self._get_s3_storage(): logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") diff --git a/tests/databases/test_csv_db.py b/tests/databases/test_csv_db.py index 4395ef0..989f1e9 100644 --- a/tests/databases/test_csv_db.py +++ b/tests/databases/test_csv_db.py @@ -1,5 +1,5 @@ -from auto_archiver.databases.csv_db import CSVDb +from auto_archiver.modules.csv_db import CSVDb from auto_archiver.core import Metadata diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 99f8117..1477cde 100644 --- a/tests/enrichers/test_hash_enricher.py +++ 
b/tests/enrichers/test_hash_enricher.py @@ -1,6 +1,6 @@ import pytest -from auto_archiver.enrichers.hash_enricher import HashEnricher +from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media @pytest.mark.parametrize("algorithm, filename, expected_hash", [ From 1274a1b23168a3802c19d8972542f7f6628b2b85 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 23 Jan 2025 16:40:48 +0000 Subject: [PATCH 014/110] More manifests, base modules and rename from archiver to extractor. --- Dockerfile | 15 +++---- src/auto_archiver/archivers/__init__.py | 8 ---- src/auto_archiver/base_modules/__init__.py | 6 +++ .../{databases => base_modules}/database.py | 4 +- src/auto_archiver/base_modules/enricher.py | 31 ++++++++++++++ .../archiver.py => base_modules/extractor.py} | 26 ++++++------ .../{feeders => base_modules}/feeder.py | 4 +- .../{formatters => base_modules}/formatter.py | 2 +- .../{storages => base_modules}/storage.py | 6 +-- src/auto_archiver/core/__init__.py | 7 +--- src/auto_archiver/core/config.py | 4 +- src/auto_archiver/core/orchestrator.py | 11 ++--- src/auto_archiver/databases/__init__.py | 5 --- src/auto_archiver/enrichers/__init__.py | 12 ------ src/auto_archiver/enrichers/enricher.py | 22 ---------- src/auto_archiver/feeders/__init__.py | 3 -- src/auto_archiver/formatters/__init__.py | 1 - src/auto_archiver/modules/api_db/api_db.py | 2 +- .../templates => modules/atlos}/__init__.py | 0 .../modules/atlos/__manifest__.py | 38 ++++++++++++++++++ .../{storages => modules/atlos}/atlos.py | 6 +-- .../modules/atlos_db/atlos_db.py | 3 +- .../modules/atlos_db/base_configs.py | 13 ++++++ .../modules/atlos_feeder/atlos_feeder.py | 2 +- .../modules/cli_feeder/cli_feeder.py | 2 +- .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 2 +- .../{gsheet_db => gdrive_storage}/__init__.py | 0 .../modules/gdrive_storage/__manifest__.py | 34 
++++++++++++++++ .../gdrive_storage/gdrive_storage.py} | 4 +- .../modules/generic_extractor/bluesky.py | 11 ++--- .../modules/generic_extractor/dropin.py | 4 +- .../generic_extractor/generic_extractor.py | 4 +- .../modules/generic_extractor/truth.py | 4 +- .../modules/generic_extractor/twitter.py | 4 +- .../modules/gsheet_db/__manifest__.py | 21 ---------- .../__init__.py | 0 .../__manifest__.py | 22 +++++++++- .../gsheet_db.py | 3 +- .../gsheet_feeder.py | 3 +- .../modules/hash_enricher/__init__.py | 1 + .../modules/hash_enricher/__manifest__.py | 2 +- .../modules/hash_enricher/hash_enricher.py | 8 +++- .../__init__.py | 0 .../modules/html_formatter/__manifest__.py | 13 ++++++ .../html_formatter}/html_formatter.py | 20 +++++----- .../templates}/__init__.py | 0 .../templates/html_template.html | 0 .../html_formatter}/templates/macros.html | 0 .../__init__.py | 0 .../__manifest__.py | 3 +- .../instagram_api_archiver.py | 8 ++-- .../__init__.py | 0 .../__manifest__.py | 3 +- .../instagram_archiver.py | 8 ++-- .../__init__.py | 0 .../__manifest__.py | 7 ++-- .../instagram_tbot_archiver.py | 10 ++--- .../__init__.py | 0 .../modules/local/__manifest__.py | 26 ++++++++++++ .../{storages => modules/local}/local.py | 4 +- .../modules/meta_enricher/meta_enricher.py | 2 +- .../metadata_enricher/metadata_enricher.py | 2 +- .../__init__.py | 0 .../modules/mute_formatter/__manifest__.py | 9 +++++ .../mute_formatter}/mute_formatter.py | 0 .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- src/auto_archiver/modules/s3/__init__.py | 0 src/auto_archiver/modules/s3/__manifest__.py | 40 +++++++++++++++++++ .../{storages => modules/s3}/s3.py | 9 +++-- .../screenshot_enricher.py | 2 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../modules/telegram_extractor/__init__.py | 0 .../__manifest__.py | 5 +-- .../telegram_extractor.py} | 8 ++-- .../modules/telethon_extractor/__init__.py | 0 .../__manifest__.py | 7 ++-- .../telethon_archiver.py | 6 +-- 
.../thumbnail_enricher/thumbnail_enricher.py | 2 +- .../timestamping_enricher.py | 4 +- .../modules/twitter_api_extractor/__init__.py | 0 .../__manifest__.py | 7 ++-- .../twitter_api_archiver.py | 6 +-- .../modules/vk_extractor/__init__.py | 0 .../__manifest__.py | 5 +-- .../vk_archiver.py | 6 +-- .../modules/wacz_enricher/wacz_enricher.py | 5 +-- .../wayback_enricher/wayback_enricher.py | 5 +-- .../whisper_enricher/whisper_enricher.py | 4 +- src/auto_archiver/storages/__init__.py | 3 -- tests/archivers/test_archiver_base.py | 6 +-- tests/formatters/test_html_formatter.py | 3 +- 93 files changed, 378 insertions(+), 238 deletions(-) delete mode 100644 src/auto_archiver/archivers/__init__.py create mode 100644 src/auto_archiver/base_modules/__init__.py rename src/auto_archiver/{databases => base_modules}/database.py (96%) create mode 100644 src/auto_archiver/base_modules/enricher.py rename src/auto_archiver/{archivers/archiver.py => base_modules/extractor.py} (81%) rename src/auto_archiver/{feeders => base_modules}/feeder.py (86%) rename src/auto_archiver/{formatters => base_modules}/formatter.py (90%) rename src/auto_archiver/{storages => base_modules}/storage.py (94%) delete mode 100644 src/auto_archiver/databases/__init__.py delete mode 100644 src/auto_archiver/enrichers/__init__.py delete mode 100644 src/auto_archiver/enrichers/enricher.py delete mode 100644 src/auto_archiver/feeders/__init__.py delete mode 100644 src/auto_archiver/formatters/__init__.py rename src/auto_archiver/{formatters/templates => modules/atlos}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/atlos/__manifest__.py rename src/auto_archiver/{storages => modules/atlos}/atlos.py (94%) create mode 100644 src/auto_archiver/modules/atlos_db/base_configs.py rename src/auto_archiver/modules/{gsheet_db => gdrive_storage}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/gdrive_storage/__manifest__.py rename src/auto_archiver/{storages/gd.py => 
modules/gdrive_storage/gdrive_storage.py} (99%) delete mode 100644 src/auto_archiver/modules/gsheet_db/__manifest__.py rename src/auto_archiver/modules/{gsheet_feeder => gsheet_processor}/__init__.py (100%) rename src/auto_archiver/modules/{gsheet_feeder => gsheet_processor}/__manifest__.py (65%) rename src/auto_archiver/modules/{gsheet_db => gsheet_processor}/gsheet_db.py (98%) rename src/auto_archiver/modules/{gsheet_feeder => gsheet_processor}/gsheet_feeder.py (98%) rename src/auto_archiver/modules/{instagram_api_archiver => html_formatter}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/html_formatter/__manifest__.py rename src/auto_archiver/{formatters => modules/html_formatter}/html_formatter.py (84%) rename src/auto_archiver/modules/{instagram_archiver => html_formatter/templates}/__init__.py (100%) rename src/auto_archiver/{formatters => modules/html_formatter}/templates/html_template.html (100%) rename src/auto_archiver/{formatters => modules/html_formatter}/templates/macros.html (100%) rename src/auto_archiver/modules/{instagram_tbot_archiver => instagram_api_extractor}/__init__.py (100%) rename src/auto_archiver/modules/{instagram_api_archiver => instagram_api_extractor}/__manifest__.py (95%) rename src/auto_archiver/modules/{instagram_api_archiver => instagram_api_extractor}/instagram_api_archiver.py (98%) rename src/auto_archiver/modules/{telegram_archiver => instagram_extractor}/__init__.py (100%) rename src/auto_archiver/modules/{instagram_archiver => instagram_extractor}/__manifest__.py (93%) rename src/auto_archiver/modules/{instagram_archiver => instagram_extractor}/instagram_archiver.py (96%) rename src/auto_archiver/modules/{telethon_archiver => instagram_tbot_extractor}/__init__.py (100%) rename src/auto_archiver/modules/{instagram_tbot_archiver => instagram_tbot_extractor}/__manifest__.py (82%) rename src/auto_archiver/modules/{instagram_tbot_archiver => instagram_tbot_extractor}/instagram_tbot_archiver.py (92%) rename 
src/auto_archiver/modules/{twitter_api_archiver => local}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/local/__manifest__.py rename src/auto_archiver/{storages => modules/local}/local.py (94%) rename src/auto_archiver/modules/{vk_archiver => mute_formatter}/__init__.py (100%) create mode 100644 src/auto_archiver/modules/mute_formatter/__manifest__.py rename src/auto_archiver/{formatters => modules/mute_formatter}/mute_formatter.py (100%) create mode 100644 src/auto_archiver/modules/s3/__init__.py create mode 100644 src/auto_archiver/modules/s3/__manifest__.py rename src/auto_archiver/{storages => modules/s3}/s3.py (95%) create mode 100644 src/auto_archiver/modules/telegram_extractor/__init__.py rename src/auto_archiver/modules/{telegram_archiver => telegram_extractor}/__manifest__.py (78%) rename src/auto_archiver/modules/{telegram_archiver/telegram_archiver.py => telegram_extractor/telegram_extractor.py} (91%) create mode 100644 src/auto_archiver/modules/telethon_extractor/__init__.py rename src/auto_archiver/modules/{telethon_archiver => telethon_extractor}/__manifest__.py (90%) rename src/auto_archiver/modules/{telethon_archiver => telethon_extractor}/telethon_archiver.py (98%) create mode 100644 src/auto_archiver/modules/twitter_api_extractor/__init__.py rename src/auto_archiver/modules/{twitter_api_archiver => twitter_api_extractor}/__manifest__.py (89%) rename src/auto_archiver/modules/{twitter_api_archiver => twitter_api_extractor}/twitter_api_archiver.py (97%) create mode 100644 src/auto_archiver/modules/vk_extractor/__init__.py rename src/auto_archiver/modules/{vk_archiver => vk_extractor}/__manifest__.py (90%) rename src/auto_archiver/modules/{vk_archiver => vk_extractor}/vk_archiver.py (93%) delete mode 100644 src/auto_archiver/storages/__init__.py diff --git a/Dockerfile b/Dockerfile index 0ecc7f3..8272c73 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.0.4 AS base +FROM 
webrecorder/browsertrix-crawler:1.4.2 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ @@ -22,28 +22,30 @@ RUN add-apt-repository ppa:mozillateam/ppa && \ # Poetry and runtime -FROM base AS runtime +FROM base AS poetry-env ENV POETRY_NO_INTERACTION=1 \ POETRY_VIRTUALENVS_IN_PROJECT=1 \ POETRY_VIRTUALENVS_CREATE=1 -RUN pip install --upgrade pip && \ - pip install "poetry>=2.0.0,<3.0.0" +# Create a virtual environment for poetry and install it +RUN python3 -m venv /poetry-venv && \ + /poetry-venv/bin/python -m pip install --upgrade pip && \ + /poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0" WORKDIR /app COPY pyproject.toml poetry.lock README.md ./ # Copy dependency files and install dependencies (excluding the package itself) -RUN poetry install --only main --no-root --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache # Copy code: This is needed for poetry to install the package itself, # but the environment should be cached from the previous step if toml and lock files haven't changed COPY ./src/ . -RUN poetry install --only main --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-cache # Update PATH to include virtual environment binaries @@ -55,4 +57,3 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"] # should be executed with 2 volumes (3 if local_storage is used) # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml - diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py deleted file mode 100644 index 54515ec..0000000 --- a/src/auto_archiver/archivers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Archivers are responsible for retrieving the content from various external platforms. -They act as specialized modules, each tailored to interact with a specific platform, -service, or data source. 
The archivers collectively enable the tool to comprehensively -collect and preserve a variety of content types, such as posts, images, videos and metadata. - -""" -from .archiver import Archiver diff --git a/src/auto_archiver/base_modules/__init__.py b/src/auto_archiver/base_modules/__init__.py new file mode 100644 index 0000000..4995457 --- /dev/null +++ b/src/auto_archiver/base_modules/__init__.py @@ -0,0 +1,6 @@ +from .database import Database +from .enricher import Enricher +from .feeder import Feeder +from .storage import Storage +from .extractor import Extractor +from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/base_modules/database.py similarity index 96% rename from src/auto_archiver/databases/database.py rename to src/auto_archiver/base_modules/database.py index 30cba7e..28f0061 100644 --- a/src/auto_archiver/databases/database.py +++ b/src/auto_archiver/base_modules/database.py @@ -3,13 +3,13 @@ from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from ..core import Metadata, Step +from auto_archiver.core import Metadata, Step @dataclass class Database(Step, ABC): - name = "database" + name = "database" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called super().__init__(config) diff --git a/src/auto_archiver/base_modules/enricher.py b/src/auto_archiver/base_modules/enricher.py new file mode 100644 index 0000000..d26eedf --- /dev/null +++ b/src/auto_archiver/base_modules/enricher.py @@ -0,0 +1,31 @@ +""" +Enrichers are modular components that enhance archived content by adding +context, metadata, or additional processing. + +These add additional information to the context, such as screenshots, hashes, and metadata. +They are designed to work within the archiving pipeline, operating on `Metadata` objects after +the archiving step and before storage or formatting. 
+ +Enrichers are optional but highly useful for making the archived data more powerful. +""" +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from auto_archiver.core import Metadata, Step + +@dataclass +class Enricher(Step, ABC): + """Base classes and utilities for enrichers in the Auto-Archiver system.""" + name = "enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + + # only for typing... + def init(name: str, config: dict) -> Enricher: + return Step.init(name, config, Enricher) + + @abstractmethod + def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/base_modules/extractor.py similarity index 81% rename from src/auto_archiver/archivers/archiver.py rename to src/auto_archiver/base_modules/extractor.py index b5f3f40..c772325 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/base_modules/extractor.py @@ -1,7 +1,7 @@ -""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework. - This class provides common utility methods and a standard interface for archivers. +""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework. + This class provides common utility methods and a standard interface for extractors. - Factory method to initialize an archiver instance based on its name. + Factory method to initialize an extractor instance based on its name. """ @@ -15,32 +15,32 @@ import mimetypes, requests from loguru import logger from retrying import retry -from ..core import Metadata, Step, ArchivingContext +from ..core import Metadata, ArchivingContext @dataclass -class Archiver: +class Extractor: """ - Base class for implementing archivers in the media archiving framework. 
+ Base class for implementing extractors in the media archiving framework. Subclasses must implement the `download` method to define platform-specific behavior. """ def setup(self) -> None: - # used when archivers need to login or do other one-time setup + # used when extractors need to login or do other one-time setup pass def cleanup(self) -> None: - # called when archivers are done, or upon errors, cleanup any resources + # called when extractors are done, or upon errors, cleanup any resources pass def sanitize_url(self, url: str) -> str: # used to clean unnecessary URL parameters OR unfurl redirect links return url - + def suitable(self, url: str) -> bool: """ - Returns True if this archiver can handle the given URL - + Returns True if this extractor can handle the given URL + Should be overridden by subclasses """ return True @@ -84,10 +84,10 @@ class Archiver: for chunk in d.iter_content(chunk_size=8192): f.write(chunk) return to_filename - + except requests.RequestException as e: logger.warning(f"Failed to fetch the Media URL: {e}") @abstractmethod def download(self, item: Metadata) -> Metadata: - pass + pass \ No newline at end of file diff --git a/src/auto_archiver/feeders/feeder.py b/src/auto_archiver/base_modules/feeder.py similarity index 86% rename from src/auto_archiver/feeders/feeder.py rename to src/auto_archiver/base_modules/feeder.py index 4aa263f..7fbd6b1 100644 --- a/src/auto_archiver/feeders/feeder.py +++ b/src/auto_archiver/base_modules/feeder.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from ..core import Metadata -from ..core import Step +from auto_archiver.core import Metadata +from auto_archiver.core import Step @dataclass diff --git a/src/auto_archiver/formatters/formatter.py b/src/auto_archiver/base_modules/formatter.py similarity index 90% rename from src/auto_archiver/formatters/formatter.py rename to src/auto_archiver/base_modules/formatter.py index 
b10477e..4c59af8 100644 --- a/src/auto_archiver/formatters/formatter.py +++ b/src/auto_archiver/base_modules/formatter.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from ..core import Metadata, Media, Step +from auto_archiver.core import Metadata, Media, Step @dataclass diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/base_modules/storage.py similarity index 94% rename from src/auto_archiver/storages/storage.py rename to src/auto_archiver/base_modules/storage.py index c9b55e0..147da1f 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/base_modules/storage.py @@ -4,10 +4,10 @@ from dataclasses import dataclass from typing import IO, Optional import os -from ..utils.misc import random_str +from auto_archiver.utils.misc import random_str -from ..core import Media, Step, ArchivingContext, Metadata -from ..enrichers import HashEnricher +from auto_archiver.core import Media, Step, ArchivingContext, Metadata +from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from loguru import logger from slugify import slugify diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index ad3f989..b78df83 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -8,9 +8,4 @@ from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config - -from .media import Media -from .step import Step -from .context import ArchivingContext -from .metadata import Metadata +# from .config import Config \ No newline at end of file diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f5d9fae..3ec0b38 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -15,7 +15,7 @@ from .loader import MODULE_TYPES # 
configurable_parents = [ # Feeder, # Enricher, -# Archiver, +# Extractor, # Database, # Storage, # Formatter @@ -23,7 +23,7 @@ from .loader import MODULE_TYPES # ] # feeder: Feeder # formatter: Formatter -# archivers: List[Archiver] = field(default_factory=[]) +# extractors: List[Extractor] = field(default_factory=[]) # enrichers: List[Enricher] = field(default_factory=[]) # storages: List[Storage] = field(default_factory=[]) # databases: List[Database] = field(default_factory=[]) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 1b4fee0..38edafe 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -33,7 +33,7 @@ class ArchivingOrchestrator: # self.feeder: Feeder = config.feeder # self.formatter: Formatter = config.formatter # self.enrichers: List[Enricher] = config.enrichers - # self.archivers: List[Archiver] = config.archivers + # self.extractors: List[Extractor] = config.extractors # self.databases: List[Database] = config.databases # self.storages: List[Storage] = config.storages # ArchivingContext.set("storages", self.storages, keep_on_reset=True) @@ -80,7 +80,7 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) - # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' + # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter' for module_type in MODULE_TYPES: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) @@ -98,7 +98,7 @@ class ArchivingOrchestrator: self.add_module_args(available_modules(with_manifest=True), parser) - breakpoint() + # breakpoint() parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we 
have them @@ -165,7 +165,8 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: if module_type == 'enricher': - breakpoint() + pass + # breakpoint() step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] @@ -228,7 +229,7 @@ class ArchivingOrchestrator: def cleanup(self)->None: logger.info("Cleaning up") for e in self.config['steps']['extractors']: - breakpoint() + # breakpoint() e.cleanup() def feed(self) -> Generator[Metadata]: diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py deleted file mode 100644 index 3a8d787..0000000 --- a/src/auto_archiver/databases/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" Databases are used to store the outputs from running the Autp Archiver. - - -""" -from .database import Database diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py deleted file mode 100644 index 67cb0e5..0000000 --- a/src/auto_archiver/enrichers/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Enrichers are modular components that enhance archived content by adding -context, metadata, or additional processing. - -These add additional information to the context, such as screenshots, hashes, and metadata. -They are designed to work within the archiving pipeline, operating on `Metadata` objects after -the archiving step and before storage or formatting. - -Enrichers are optional but highly useful for making the archived data more powerful. - - -""" diff --git a/src/auto_archiver/enrichers/enricher.py b/src/auto_archiver/enrichers/enricher.py deleted file mode 100644 index f195f23..0000000 --- a/src/auto_archiver/enrichers/enricher.py +++ /dev/null @@ -1,22 +0,0 @@ -""" Base classes and utilities for enrichers in the Auto-Archiver system. 
-""" -from __future__ import annotations -from dataclasses import dataclass -from abc import abstractmethod, ABC -from ..core import Metadata, Step - -@dataclass -class Enricher(Step, ABC): - name = "enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - - # only for typing... - def init(name: str, config: dict) -> Enricher: - return Step.init(name, config, Enricher) - - @abstractmethod - def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py deleted file mode 100644 index 3eb33d7..0000000 --- a/src/auto_archiver/feeders/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" Feeders handle the input of media into the Auto Archiver. - -""" diff --git a/src/auto_archiver/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py deleted file mode 100644 index 1a9dcd0..0000000 --- a/src/auto_archiver/formatters/__init__.py +++ /dev/null @@ -1 +0,0 @@ -""" Formatters for the output of the content. 
""" diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index fa1ae75..44373c6 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/formatters/templates/__init__.py b/src/auto_archiver/modules/atlos/__init__.py similarity index 100% rename from src/auto_archiver/formatters/templates/__init__.py rename to src/auto_archiver/modules/atlos/__init__.py diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py new file mode 100644 index 0000000..cc357e3 --- /dev/null +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -0,0 +1,38 @@ +{ + "name": "atlos_storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + "bin": [""] + }, + "configs": { + # TODO: get base storage configs + # TODO also? get_atlos_config_options() + + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, + "description": """ + AtlosStorage: A storage module for saving media files to the Atlos platform. + + ### Features + - Uploads media files to Atlos using Atlos-specific APIs. + - Automatically calculates SHA-256 hashes of media files for integrity verification. + - Skips uploads for files that already exist on Atlos with the same hash. 
+ - Supports attaching metadata, such as `atlos_id`, to the uploaded files. + - Provides CDN-like URLs for accessing uploaded media. + + ### Notes + - Requires Atlos API configuration, including `atlos_url` and `api_token`. + - Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials. + """ +} diff --git a/src/auto_archiver/storages/atlos.py b/src/auto_archiver/modules/atlos/atlos.py similarity index 94% rename from src/auto_archiver/storages/atlos.py rename to src/auto_archiver/modules/atlos/atlos.py index 3b13aa0..28b7cb1 100644 --- a/src/auto_archiver/storages/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -4,9 +4,9 @@ from loguru import logger import requests import hashlib -from ..core import Media, Metadata -from ..storages import Storage -from ..utils import get_atlos_config_options +from auto_archiver.core import Media, Metadata +from auto_archiver.base_modules import Storage +from auto_archiver.utils import get_atlos_config_options class AtlosStorage(Storage): diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index 376ba32..cbf1c89 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -1,11 +1,12 @@ import os + from typing import Union from loguru import logger from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/base_configs.py b/src/auto_archiver/modules/atlos_db/base_configs.py new file mode 100644 index 0000000..c47c711 --- /dev/null +++ b/src/auto_archiver/modules/atlos_db/base_configs.py @@ -0,0 +1,13 @@ +def get_atlos_config_options(): + return { + "api_token": { + "default": None, + "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + } \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index d344139..0810b73 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import requests -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 1376379..e826533 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index 357c696..a0d43b7 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 642e889..6e5d873 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ 
b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index b665bd9..4cf2f11 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import csv -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gdrive_storage/__init__.py similarity index 100% rename from src/auto_archiver/modules/gsheet_db/__init__.py rename to src/auto_archiver/modules/gdrive_storage/__init__.py diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py new file mode 100644 index 0000000..cc598e2 --- /dev/null +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -0,0 +1,34 @@ +m = { + "name": "Google Drive Storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "google-api-python-client", + "google-auth", + "google-auth-oauthlib", + "google-auth-httplib2" + ], + }, + "configs": { + # TODO: get base storage configs + "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, + "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. 
NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."}, + }, + "description": """ + GDriveStorage: A storage module for saving archived content to Google Drive. + + ### Features + - Saves media files to Google Drive, organizing them into folders based on the provided path structure. + - Supports OAuth token-based authentication or service account credentials for API access. + - Automatically creates folders in Google Drive if they don't exist. + - Retrieves CDN URLs for stored files, enabling easy sharing and access. + + ### Notes + - Requires setup with either a Google OAuth token or a service account JSON file. + - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure. + - Automatically handles Google Drive API token refreshes for long-running jobs. + """ +} diff --git a/src/auto_archiver/storages/gd.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py similarity index 99% rename from src/auto_archiver/storages/gd.py rename to src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 61c5b21..2e4ca48 100644 --- a/src/auto_archiver/storages/gd.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -9,8 +9,8 @@ from google.oauth2 import service_account from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request -from ..core import Media -from . 
import Storage +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage class GDriveStorage(Storage): diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 7aa9c39..d4051aa 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,17 +1,12 @@ -import os -import mimetypes - -import requests from loguru import logger -from auto_archiver.core.context import ArchivingContext -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor class Bluesky(GenericDropin): - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() result.set_url(url) result.set_title(post["record"]["text"]) @@ -42,7 +37,7 @@ class Bluesky(GenericDropin): - def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]: + def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]: """ Iterates over image(s) or video in a Bluesky post and downloads them """ diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 37f3faf..9de63d2 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. 
@@ -30,7 +30,7 @@ class GenericDropin: raise NotImplementedError("This method should be implemented in the subclass") - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: """ This method should create a Metadata object from the post data. """ diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 27fe157..8e4b2c4 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,10 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from ...core import Metadata, Media, ArchivingContext -class GenericExtractor(Archiver): +class GenericExtractor(Extractor): name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index bf19dce..e713c90 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt @@ -19,7 +19,7 @@ class Truth(GenericDropin): def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool: return True - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> 
Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: """ Creates metadata from a truth social post diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index ce6c28d..6cd22b1 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from .dropin import GenericDropin, InfoExtractor @@ -32,7 +32,7 @@ class Twitter(GenericDropin): twid = ie_instance._match_valid_url(url).group('id') return ie_instance._extract_status(twid=twid) - def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() try: if not tweet.get("user") or not tweet.get("created_at"): diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py deleted file mode 100644 index f4db93b..0000000 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ /dev/null @@ -1,21 +0,0 @@ -# TODO merge with feeder manifest? -{ - "name": "gsheet_db", - "type": ["database"], - "requires_setup": True, - "external_dependencies": {"python": [" loguru"], - }, - "description": """ -Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. 
-- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. -""", -} diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_processor/__init__.py similarity index 100% rename from src/auto_archiver/modules/gsheet_feeder/__init__.py rename to src/auto_archiver/modules/gsheet_processor/__init__.py diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_processor/__manifest__.py similarity index 65% rename from src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename to src/auto_archiver/modules/gsheet_processor/__manifest__.py index 2af090c..8a554fe 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_processor/__manifest__.py @@ -1,5 +1,5 @@ { - "name": "Google Sheets Feeder", + "name": "Google Sheets Procesor", "type": ["feeder"], "requires_setup": True, "external_dependencies": { @@ -22,7 +22,12 @@ } }, "description": """ - GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + Google Sheets Module. + + Handles feeding from a google sheet as well as an optional write back to the sheet. + + ## GsheetsFeeder + A Google Sheets-based feeder for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. The filtered rows are processed into `Metadata` objects. @@ -36,5 +41,18 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. + + ## GsheetsDatabase: + Handles integration with Google Sheets for tracking archival tasks. 
+ +### Features +- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. +- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. +- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. +- Skips redundant updates for empty or invalid data fields. + +### Notes +- Currently works only with metadata provided by GsheetFeeder. +- Requires configuration of a linked Google Sheet and appropriate API credentials. """ } diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_processor/gsheet_db.py similarity index 98% rename from src/auto_archiver/modules/gsheet_db/gsheet_db.py rename to src/auto_archiver/modules/gsheet_processor/gsheet_db.py index 8e17966..cf46473 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_processor/gsheet_db.py @@ -1,10 +1,11 @@ from typing import Union, Tuple + import datetime from urllib.parse import quote from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import GWorksheet diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py similarity index 98% rename from src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py index 5c73bf6..4df9042 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py @@ -13,8 +13,7 @@ import gspread, os from loguru import logger from slugify import slugify -# from . 
import Enricher -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import Gsheets, GWorksheet diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py b/src/auto_archiver/modules/hash_enricher/__init__.py index e69de29..e7faff7 100644 --- a/src/auto_archiver/modules/hash_enricher/__init__.py +++ b/src/auto_archiver/modules/hash_enricher/__init__.py @@ -0,0 +1 @@ +from hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index 311ed6f..eef1963 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -7,7 +7,7 @@ }, "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, - "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, + "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, }, "description": """ Generates cryptographic hashes for media files to ensure data integrity and authenticity. diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 355413a..c8eacb1 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. 
import hashlib from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext @@ -40,7 +40,11 @@ class HashEnricher(Enricher): else: self.chunksize = self.configs()["chunksize"]["default"] - self.chunksize = int(self.chunksize) + try: + self.chunksize = int(self.chunksize) + except ValueError: + raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.") + assert self.chunksize >= -1, "read length must be non-negative or -1" ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) diff --git a/src/auto_archiver/modules/instagram_api_archiver/__init__.py b/src/auto_archiver/modules/html_formatter/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_api_archiver/__init__.py rename to src/auto_archiver/modules/html_formatter/__init__.py diff --git a/src/auto_archiver/modules/html_formatter/__manifest__.py b/src/auto_archiver/modules/html_formatter/__manifest__.py new file mode 100644 index 0000000..55ca5da --- /dev/null +++ b/src/auto_archiver/modules/html_formatter/__manifest__.py @@ -0,0 +1,13 @@ +m = { + "name": "HTML Formatter", + "type": ["formatter"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "jinja2"], + "bin": [""] + }, + "configs": { + "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} + }, + "description": """ """, +} diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py similarity index 84% rename from src/auto_archiver/formatters/html_formatter.py rename to src/auto_archiver/modules/html_formatter/html_formatter.py index 5d95474..cc8a4da 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -7,11 +7,11 @@ 
from loguru import logger import json import base64 -from ..version import __version__ -from ..core import Metadata, Media, ArchivingContext -from . import Formatter -from ..enrichers import HashEnricher -from ..utils.misc import random_str +from auto_archiver.version import __version__ +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.base_modules import Formatter +from auto_archiver.modules.hash_enricher import HashEnricher +from auto_archiver.utils.misc import random_str @dataclass @@ -28,11 +28,11 @@ class HtmlFormatter(Formatter): }) self.template = self.environment.get_template("html_template.html") - @staticmethod - def configs() -> dict: - return { - "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} - } + # @staticmethod + # def configs() -> dict: + # return { + # "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} + # } def format(self, item: Metadata) -> Media: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_archiver/__init__.py b/src/auto_archiver/modules/html_formatter/templates/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_archiver/__init__.py rename to src/auto_archiver/modules/html_formatter/templates/__init__.py diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/modules/html_formatter/templates/html_template.html similarity index 100% rename from src/auto_archiver/formatters/templates/html_template.html rename to src/auto_archiver/modules/html_formatter/templates/html_template.html diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/modules/html_formatter/templates/macros.html similarity index 100% rename from src/auto_archiver/formatters/templates/macros.html rename to 
src/auto_archiver/modules/html_formatter/templates/macros.html diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py b/src/auto_archiver/modules/instagram_api_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_tbot_archiver/__init__.py rename to src/auto_archiver/modules/instagram_api_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py similarity index 95% rename from src/auto_archiver/modules/instagram_api_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index b2225fa..cdaf635 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram API Archiver", + "name": "Instagram API Extractor", "type": ["extractor"], - "entry_point": "instagram_api_archiver:InstagramApiArchiver", "external_dependencies": {"python": ["requests", "loguru", diff --git a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py similarity index 98% rename from src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py rename to src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py index dc3f1ec..5206b41 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py @@ -1,5 +1,5 @@ """ -The `instagram_api_archiver` module provides tools for archiving various types of Instagram content +The `instagram_api_extractor` module provides tools for archiving various types of Instagram content using the [Instagrapi API](https://github.com/subzeroid/instagrapi). 
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles, @@ -16,19 +16,19 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata -class InstagramAPIArchiver(Archiver): +class InstagramAPIExtractor(Extractor): """ Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ - name = "instagram_api_archiver" + name = "instagram_api_extractor" global_pattern = re.compile( r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" diff --git a/src/auto_archiver/modules/telegram_archiver/__init__.py b/src/auto_archiver/modules/instagram_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/telegram_archiver/__init__.py rename to src/auto_archiver/modules/instagram_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py similarity index 93% rename from src/auto_archiver/modules/instagram_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_extractor/__manifest__.py index 44cd7bb..f1857c2 100644 --- a/src/auto_archiver/modules/instagram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram Archiver", + "name": "Instagram Extractor", "type": ["extractor"], - "entry_point": "instagram_archiver:InstagramArchiver", "external_dependencies": { "python": [ "instaloader", diff --git a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py similarity index 96% rename from 
src/auto_archiver/modules/instagram_archiver/instagram_archiver.py rename to src/auto_archiver/modules/instagram_extractor/instagram_archiver.py index 7daf291..c6bde62 100644 --- a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py @@ -7,15 +7,15 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media -class InstagramArchiver(Archiver): +class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ - name = "instagram_archiver" + name = "instagram_extractor" # NB: post regex should be tested before profile # https://regex101.com/r/MGPquX/1 @@ -67,7 +67,7 @@ class InstagramArchiver(Archiver): elif len(profile_matches): result = self.download_profile(url, profile_matches[0]) except Exception as e: - logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.") + logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.") finally: shutil.rmtree(self.download_folder, ignore_errors=True) return result diff --git a/src/auto_archiver/modules/telethon_archiver/__init__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/telethon_archiver/__init__.py rename to src/auto_archiver/modules/instagram_tbot_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py similarity index 82% rename from 
src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py index 6e934b0..95d6808 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram Telegram Bot Archiver", + "name": "Instagram Telegram Bot Extractor", "type": ["extractor"], - "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver", "external_dependencies": {"python": ["loguru", "telethon",], }, @@ -13,7 +12,7 @@ "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, }, "description": """ -The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, +The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and returned as part of a `Metadata` object. @@ -26,7 +25,7 @@ returned as part of a `Metadata` object. ### Setup -To use the `InstagramTbotArchiver`, you need to provide the following configuration settings: +To use the `InstagramTbotExtractor`, you need to provide the following configuration settings: - **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps). - **Session File**: Optional path to store the Telegram session file for future use. 
diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py similarity index 92% rename from src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py rename to src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py index 3423010..5c3ad24 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py @@ -1,5 +1,5 @@ """ -InstagramTbotArchiver Module +InstagramTbotExtractor Module This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`). It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the @@ -15,18 +15,18 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str -class InstagramTbotArchiver(Archiver): +class InstagramTbotExtractor(Extractor): """ calls a telegram bot to fetch instagram posts/stories... and gets available media from it https://github.com/adw0rd/instagrapi https://t.me/instagram_load_bot """ - name = "instagram_tbot_archiver" + name = "instagram_tbot_extractor" def __init__(self, config: dict) -> None: super().__init__(config) @@ -49,7 +49,7 @@ class InstagramTbotArchiver(Archiver): try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: - logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. 
if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") + logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") with self.client.start(): logger.success(f"SETUP {self.name} login works.") diff --git a/src/auto_archiver/modules/twitter_api_archiver/__init__.py b/src/auto_archiver/modules/local/__init__.py similarity index 100% rename from src/auto_archiver/modules/twitter_api_archiver/__init__.py rename to src/auto_archiver/modules/local/__init__.py diff --git a/src/auto_archiver/modules/local/__manifest__.py b/src/auto_archiver/modules/local/__manifest__.py new file mode 100644 index 0000000..5220555 --- /dev/null +++ b/src/auto_archiver/modules/local/__manifest__.py @@ -0,0 +1,26 @@ +m = { + "name": "Local Storage", + "type": ["storage"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + # TODO: get base storage configs + "save_to": {"default": "./archived", "help": "folder where to save archived content"}, + "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, + }, + "description": """ + LocalStorage: A storage module for saving archived content locally on the filesystem. + + ### Features + - Saves archived media files to a specified folder on the local filesystem. + - Maintains file metadata during storage using `shutil.copy2`. + - Supports both absolute and relative paths for stored files, configurable via `save_absolute`. + - Automatically creates directories as needed for storing files. + + ### Notes + - Default storage folder is `./archived`, but this can be changed via the `save_to` configuration. 
+ - The `save_absolute` option can reveal the file structure in output formats; use with caution. + """ +} diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/modules/local/local.py similarity index 94% rename from src/auto_archiver/storages/local.py rename to src/auto_archiver/modules/local/local.py index aa08e49..ef0966d 100644 --- a/src/auto_archiver/storages/local.py +++ b/src/auto_archiver/modules/local/local.py @@ -4,8 +4,8 @@ from typing import IO import os from loguru import logger -from ..core import Media -from ..storages import Storage +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage class LocalStorage(Storage): diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index ab0e73d..52d8eb2 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 5887d16..b729d36 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/vk_archiver/__init__.py b/src/auto_archiver/modules/mute_formatter/__init__.py similarity index 100% rename from src/auto_archiver/modules/vk_archiver/__init__.py rename to src/auto_archiver/modules/mute_formatter/__init__.py diff --git 
a/src/auto_archiver/modules/mute_formatter/__manifest__.py b/src/auto_archiver/modules/mute_formatter/__manifest__.py new file mode 100644 index 0000000..af3f83a --- /dev/null +++ b/src/auto_archiver/modules/mute_formatter/__manifest__.py @@ -0,0 +1,9 @@ +m = { + "name": "Mute Formatter", + "type": ["formatter"], + "requires_setup": False, + "external_dependencies": { + }, + "description": """ Default formatter. + """, +} diff --git a/src/auto_archiver/formatters/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py similarity index 100% rename from src/auto_archiver/formatters/mute_formatter.py rename to src/auto_archiver/modules/mute_formatter/mute_formatter.py diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index e3e9d10..dc70465 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3/__init__.py b/src/auto_archiver/modules/s3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/s3/__manifest__.py b/src/auto_archiver/modules/s3/__manifest__.py new file mode 100644 index 0000000..239e0fe --- /dev/null +++ b/src/auto_archiver/modules/s3/__manifest__.py @@ -0,0 +1,40 @@ +m = { + "name": "S3 Storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": ["boto3", "loguru"], + }, + "configs": { + # TODO: get base storage configs + "bucket": {"default": None, "help": "S3 bucket name"}, + "region": {"default": None, "help": "S3 region name"}, + "key": {"default": None, "help": "S3 API key"}, + 
"secret": {"default": None, "help": "S3 API secret"}, + "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, + "endpoint_url": { + "default": 'https://{region}.digitaloceanspaces.com', + "help": "S3 bucket endpoint, {region} are inserted at runtime" + }, + "cdn_url": { + "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', + "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" + }, + "private": {"default": False, "help": "if true S3 files will not be readable online"}, + }, + "description": """ + S3Storage: A storage module for saving media files to an S3-compatible object storage. + + ### Features + - Uploads media files to an S3 bucket with customizable configurations. + - Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes. + - Automatically generates unique paths for files when duplicates are found. + - Configurable endpoint and CDN URL for different S3-compatible providers. + - Supports both private and public file storage, with public files being readable online. + + ### Notes + - Requires S3 credentials (API key and secret) and a bucket name to function. + - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures. + - Uses `boto3` for interaction with the S3 API. 
+ """ +} diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/modules/s3/s3.py similarity index 95% rename from src/auto_archiver/storages/s3.py rename to src/auto_archiver/modules/s3/s3.py index 5139068..02b0613 100644 --- a/src/auto_archiver/storages/s3.py +++ b/src/auto_archiver/modules/s3/s3.py @@ -2,10 +2,11 @@ from typing import IO import boto3, os -from ..utils.misc import random_str -from ..core import Media -from ..storages import Storage -from ..enrichers import HashEnricher +from auto_archiver.utils.misc import random_str +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage +# TODO +from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger NO_DUPLICATES_FOLDER = "no-dups/" diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index dd1d38a..f99c100 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,7 +5,7 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 0474d8f..aba1d33 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git 
a/src/auto_archiver/modules/telegram_extractor/__init__.py b/src/auto_archiver/modules/telegram_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py similarity index 78% rename from src/auto_archiver/modules/telegram_archiver/__manifest__.py rename to src/auto_archiver/modules/telegram_extractor/__manifest__.py index f3950b5..86b5e0f 100644 --- a/src/auto_archiver/modules/telegram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Telegram Archiver", + "name": "Telegram Extractor", "type": ["extractor"], - "entry_point": "telegram_archiver:TelegramArchiver", "requires_setup": False, "external_dependencies": { "python": [ @@ -11,7 +10,7 @@ ], }, "description": """ - The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials. + The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` is advised for more comprehensive functionality. 
diff --git a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py similarity index 91% rename from src/auto_archiver/modules/telegram_archiver/telegram_archiver.py rename to src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index c5e5ef0..047d424 100644 --- a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,16 +2,16 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media -class TelegramArchiver(Archiver): +class TelegramExtractor(Extractor): """ - Archiver for telegram that does not require login, but the telethon_archiver is much more advised, + Extractor for telegram that does not require login, but the telethon_extractor is much more advised, will only return if at least one image or one video is found """ - name = "telegram_archiver" + name = "telegram_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py similarity index 90% rename from src/auto_archiver/modules/telethon_archiver/__manifest__.py rename to src/auto_archiver/modules/telethon_extractor/__manifest__.py index d44acf3..6f09ea6 100644 --- a/src/auto_archiver/modules/telethon_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -1,8 +1,7 @@ # TODO rm dependency on json { - "name": "telethon_archiver", + "name": "telethon_extractor", "type": ["extractor"], - "entry_point": 
"telethon_archiver:TelethonArchiver", "requires_setup": True, "external_dependencies": { "python": ["telethon", @@ -25,7 +24,7 @@ } }, "description": """ -The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups. +The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups. It supports private and public channels, downloading grouped posts with media, and can join channels using invite links if provided in the configuration. @@ -37,7 +36,7 @@ if provided in the configuration. - Outputs structured metadata and media using `Metadata` and `Media` objects. ### Setup -To use the `TelethonArchiver`, you must configure the following: +To use the `TelethonExtractor`, you must configure the following: - **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps). - **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`). - **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving. 
diff --git a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py similarity index 98% rename from src/auto_archiver/modules/telethon_archiver/telethon_archiver.py rename to src/auto_archiver/modules/telethon_extractor/telethon_archiver.py index fc89c9e..811a280 100644 --- a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py @@ -8,13 +8,13 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str -class TelethonArchiver(Archiver): - name = "telethon_archiver" +class TelethonArchiver(Extractor): + name = "telethon_extractor" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 3edd40c..a16d84a 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. 
import ffmpeg, os from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index a9cf753..473f880 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor class TimestampingEnricher(Enricher): diff --git a/src/auto_archiver/modules/twitter_api_extractor/__init__.py b/src/auto_archiver/modules/twitter_api_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py similarity index 89% rename from src/auto_archiver/modules/twitter_api_archiver/__manifest__.py rename to src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 5dc7364..ae1b0ff 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Twitter API Archiver", + "name": "Twitter API Extractor", "type": ["extractor"], - "entry_point": "twitter_api_archiver:TwitterApiArchiver", "requires_setup": True, "external_dependencies": { "python": ["requests", @@ -20,7 +19,7 @@ "access_secret": {"default": None, "help": "twitter API 
access_secret"}, }, "description": """ - The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API. + The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API. It supports multiple API configurations for extended rate limits and reliable access. Features include URL expansion, media downloads (e.g., images, videos), and structured output via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens @@ -34,7 +33,7 @@ - Outputs structured metadata and media using `Metadata` and `Media` objects. ### Setup - To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration: + To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration: - **Bearer Token(s)**: A single token or a list for rate-limited API access. - **Consumer Key and Secret**: Required for user-authenticated API access. - **Access Token and Secret**: Complements the consumer key for enhanced API capabilities. 
diff --git a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py similarity index 97% rename from src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py rename to src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py index 9c931ef..c5d03e0 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py @@ -8,11 +8,11 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata,Media -class TwitterApiArchiver(Archiver): - name = "twitter_api_archiver" +class TwitterApiExtractor(Extractor): + name = "twitter_api_extractor" link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/modules/vk_extractor/__init__.py b/src/auto_archiver/modules/vk_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/vk_archiver/__manifest__.py b/src/auto_archiver/modules/vk_extractor/__manifest__.py similarity index 90% rename from src/auto_archiver/modules/vk_archiver/__manifest__.py rename to src/auto_archiver/modules/vk_extractor/__manifest__.py index 69bf162..bdcaf99 100644 --- a/src/auto_archiver/modules/vk_archiver/__manifest__.py +++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "VKontakte Archiver", + "name": "VKontakte Extractor", "type": ["extractor"], - "entry_point": "vk_archiver:VKArchiver", "requires_setup": True, "depends": ["core", "utils"], "external_dependencies": { @@ -14,7 +13,7 @@ "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, }, "description": """ -The 
`VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages. +The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract and download content. Note that VK videos are handled separately by the `YTDownloader`. diff --git a/src/auto_archiver/modules/vk_archiver/vk_archiver.py b/src/auto_archiver/modules/vk_extractor/vk_archiver.py similarity index 93% rename from src/auto_archiver/modules/vk_archiver/vk_archiver.py rename to src/auto_archiver/modules/vk_extractor/vk_archiver.py index 7ba7a68..2474769 100644 --- a/src/auto_archiver/modules/vk_archiver/vk_archiver.py +++ b/src/auto_archiver/modules/vk_extractor/vk_archiver.py @@ -2,16 +2,16 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext -class VkArchiver(Archiver): +class VkExtractor(Extractor): """" VK videos are handled by YTDownloader, this archiver gets posts text and images. 
Currently only works for /wall posts """ - name = "vk_archiver" + name = "vk_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 124382b..3eb2b17 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,12 +6,11 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.enrichers import Enricher -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str -class WaczArchiverEnricher(Enricher, Archiver): +class WaczExtractorEnricher(Enricher, Extractor): """ Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 8ddec82..bcd2450 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,12 +2,11 @@ import json from loguru import logger import time, requests -from auto_archiver.enrichers import Enricher -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata -class WaybackArchiverEnricher(Enricher, Archiver): +class WaybackExtractorEnricher(Enricher, Extractor): """ Submits the current URL to the webarchive and returns a job_id or completed archive. 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index f6294f3..a00ba25 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.storages import S3Storage +from auto_archiver.modules import S3Storage class WhisperEnricher(Enricher): diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py deleted file mode 100644 index 0765833..0000000 --- a/src/auto_archiver/storages/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" This module contains the storage classes for the auto-archiver. - -""" \ No newline at end of file diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py index d793706..721812a 100644 --- a/tests/archivers/test_archiver_base.py +++ b/tests/archivers/test_archiver_base.py @@ -1,9 +1,7 @@ import pytest -from auto_archiver.core import Metadata -from auto_archiver.core import Step from auto_archiver.core.metadata import Metadata -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor class TestArchiverBase(object): archiver_class: str = None @@ -13,7 +11,7 @@ class TestArchiverBase(object): def setup_archiver(self): assert self.archiver_class is not None, "self.archiver_class must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config}) + self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config}) def assertValidResponseMetadata(self, 
test_response: Metadata, title: str, timestamp: str, status: str = ""): assert test_response is not False diff --git a/tests/formatters/test_html_formatter.py b/tests/formatters/test_html_formatter.py index 3540062..2719033 100644 --- a/tests/formatters/test_html_formatter.py +++ b/tests/formatters/test_html_formatter.py @@ -1,5 +1,4 @@ -from auto_archiver.core.context import ArchivingContext -from auto_archiver.formatters.html_formatter import HtmlFormatter +from auto_archiver.modules.html_formatter import HtmlFormatter from auto_archiver.core import Metadata, Media From c3403ced264dee3adedc9f595f3520be1d1518b5 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 23 Jan 2025 16:51:17 +0000 Subject: [PATCH 015/110] Rename storages for clarity --- src/auto_archiver/modules/{local => local_storage}/__init__.py | 0 .../modules/{local => local_storage}/__manifest__.py | 0 src/auto_archiver/modules/{local => local_storage}/local.py | 0 src/auto_archiver/modules/{s3 => s3_storage}/__init__.py | 0 src/auto_archiver/modules/{s3 => s3_storage}/__manifest__.py | 0 src/auto_archiver/modules/{s3 => s3_storage}/s3.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename src/auto_archiver/modules/{local => local_storage}/__init__.py (100%) rename src/auto_archiver/modules/{local => local_storage}/__manifest__.py (100%) rename src/auto_archiver/modules/{local => local_storage}/local.py (100%) rename src/auto_archiver/modules/{s3 => s3_storage}/__init__.py (100%) rename src/auto_archiver/modules/{s3 => s3_storage}/__manifest__.py (100%) rename src/auto_archiver/modules/{s3 => s3_storage}/s3.py (100%) diff --git a/src/auto_archiver/modules/local/__init__.py b/src/auto_archiver/modules/local_storage/__init__.py similarity index 100% rename from src/auto_archiver/modules/local/__init__.py rename to src/auto_archiver/modules/local_storage/__init__.py diff --git a/src/auto_archiver/modules/local/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py similarity 
index 100% rename from src/auto_archiver/modules/local/__manifest__.py rename to src/auto_archiver/modules/local_storage/__manifest__.py diff --git a/src/auto_archiver/modules/local/local.py b/src/auto_archiver/modules/local_storage/local.py similarity index 100% rename from src/auto_archiver/modules/local/local.py rename to src/auto_archiver/modules/local_storage/local.py diff --git a/src/auto_archiver/modules/s3/__init__.py b/src/auto_archiver/modules/s3_storage/__init__.py similarity index 100% rename from src/auto_archiver/modules/s3/__init__.py rename to src/auto_archiver/modules/s3_storage/__init__.py diff --git a/src/auto_archiver/modules/s3/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py similarity index 100% rename from src/auto_archiver/modules/s3/__manifest__.py rename to src/auto_archiver/modules/s3_storage/__manifest__.py diff --git a/src/auto_archiver/modules/s3/s3.py b/src/auto_archiver/modules/s3_storage/s3.py similarity index 100% rename from src/auto_archiver/modules/s3/s3.py rename to src/auto_archiver/modules/s3_storage/s3.py From 50f4ebcdc3ab582190c2dc3c4bda750342304b69 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 23 Jan 2025 17:01:30 +0000 Subject: [PATCH 016/110] Move storage configs into individual manifests, assert format on useage. 
--- src/auto_archiver/base_modules/storage.py | 47 +++++++------------ .../modules/atlos/__manifest__.py | 22 +++++---- .../modules/gdrive_storage/__manifest__.py | 8 ++++ .../modules/local_storage/__manifest__.py | 9 +++- .../modules/s3_storage/__manifest__.py | 9 +++- 5 files changed, 52 insertions(+), 43 deletions(-) diff --git a/src/auto_archiver/base_modules/storage.py b/src/auto_archiver/base_modules/storage.py index 147da1f..da6b2ef 100644 --- a/src/auto_archiver/base_modules/storage.py +++ b/src/auto_archiver/base_modules/storage.py @@ -15,29 +15,6 @@ from slugify import slugify @dataclass class Storage(Step): name = "storage" - PATH_GENERATOR_OPTIONS = ["flat", "url", "random"] - FILENAME_GENERATOR_CHOICES = ["random", "static"] - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}" - assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}" - - @staticmethod - def configs() -> dict: - return { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - "choices": Storage.PATH_GENERATOR_OPTIONS - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - "choices": Storage.FILENAME_GENERATOR_CHOICES - } - } def init(name: str, config: dict) -> Storage: # only for typing... 
@@ -68,19 +45,27 @@ class Storage(Step): folder = ArchivingContext.get("folder", "") filename, ext = os.path.splitext(media.filename) - # path_generator logic - if self.path_generator == "flat": + # Handle path_generator logic + path_generator = ArchivingContext.get("path_generator", "url") + if path_generator == "flat": path = "" - filename = slugify(filename) # in case it comes with os.sep - elif self.path_generator == "url": path = slugify(url) - elif self.path_generator == "random": + filename = slugify(filename) # Ensure filename is slugified + elif path_generator == "url": + path = slugify(url) + elif path_generator == "random": path = ArchivingContext.get("random_path", random_str(24), True) + else: + raise ValueError(f"Invalid path_generator: {path_generator}") - # filename_generator logic - if self.filename_generator == "random": filename = random_str(24) - elif self.filename_generator == "static": + # Handle filename_generator logic + filename_generator = ArchivingContext.get("filename_generator", "random") + if filename_generator == "random": + filename = random_str(24) + elif filename_generator == "static": he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) hd = he.calculate_hash(media.filename) filename = hd[:24] + else: + raise ValueError(f"Invalid filename_generator: {filename_generator}") media.key = os.path.join(folder, path, f"{filename}{ext}") diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py index cc357e3..c600e43 100644 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -2,23 +2,25 @@ "name": "atlos_storage", "type": ["storage"], "requires_setup": True, - "external_dependencies": { - "python": ["loguru", "requests"], - "bin": [""] - }, + "external_dependencies": {"python": ["loguru", "requests"], "bin": [""]}, "configs": { - # TODO: get base storage configs 
- # TODO also? get_atlos_config_options() - + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + }, "api_token": { "default": None, "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val + "cli_set": lambda cli_val, _: cli_val, }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "cli_set": lambda cli_val, _: cli_val, }, }, "description": """ @@ -34,5 +36,5 @@ ### Notes - Requires Atlos API configuration, including `atlos_url` and `api_token`. - Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials. 
- """ + """, } diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index cc598e2..e7e4650 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -12,6 +12,14 @@ m = { ], }, "configs": { + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + }, # TODO: get base storage configs "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. 
NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index 5220555..7247885 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -6,7 +6,14 @@ m = { "python": ["loguru"], }, "configs": { - # TODO: get base storage configs + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + }, "save_to": {"default": "./archived", "help": "folder where to save archived content"}, "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. 
formatters (WARN: leaks the file structure)"}, }, diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index 239e0fe..210eefa 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -6,7 +6,14 @@ m = { "python": ["boto3", "loguru"], }, "configs": { - # TODO: get base storage configs + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + }, "bucket": {"default": None, "help": "S3 bucket name"}, "region": {"default": None, "help": "S3 region name"}, "key": {"default": None, "help": "S3 API key"}, From b27bf8ffebe931755211e10eeda8e5f21c1cac3d Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 23 Jan 2025 20:32:19 +0100 Subject: [PATCH 017/110] Fix up loading/storing configs + unit tests --- src/auto_archiver/core/config.py | 89 +++++++++++++++-------- src/auto_archiver/core/loader.py | 10 ++- src/auto_archiver/core/orchestrator.py | 86 +++++++++++++++------- src/auto_archiver/core/step.py | 4 +- tests/test_config.py | 99 ++++++++++++++++++++++++++ 5 files changed, 228 insertions(+), 60 deletions(-) create mode 100644 tests/test_config.py diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f5d9fae..3811e2b 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -6,12 +6,17 @@ flexible setup in various environments. 
""" import argparse -import yaml +from ruamel.yaml import YAML, CommentedMap +from ruamel.yaml.comments import CommentedMap + from dataclasses import dataclass, field from collections import OrderedDict - +from collections.abc import Iterable +from copy import deepcopy from .loader import MODULE_TYPES +from typing import Any, List + # configurable_parents = [ # Feeder, # Enricher, @@ -50,21 +55,16 @@ from .loader import MODULE_TYPES # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') # parser.add_argument('--version', action='version', version=__version__) -EMPTY_CONFIG = { +EMPTY_CONFIG = CommentedMap(**{ "steps": dict((f"{module_type}s", []) for module_type in MODULE_TYPES) -} -class LoadFromFile (argparse.Action): - def __call__ (self, parser, namespace, values, option_string = None): - with values as f: - # parse arguments in the file and store them in the target namespace - parser.parse_args(f.read().split(), namespace) +}) -def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: +def to_dot_notation(yaml_conf: CommentedMap | dict) -> argparse.ArgumentParser: dotdict = {} def process_subdict(subdict, prefix=""): for key, value in subdict.items(): - if type(value) == dict: + if is_dict_type(value): process_subdict(value, f"{prefix}{key}.") else: dotdict[f"{prefix}{key}"] = value @@ -72,31 +72,64 @@ def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: process_subdict(yaml_conf) return dotdict -def merge_dicts(dotdict, yaml_dict): - def process_subdict(subdict, prefix=""): - for key, value in subdict.items(): - if "." in key: - keys = key.split(".") - subdict = yaml_dict - for k in keys[:-1]: - subdict = subdict.setdefault(k, {}) - subdict[keys[-1]] = value - else: - yaml_dict[key] = value +def from_dot_notation(dotdict: dict) -> dict: + normal_dict = {} + + def add_part(key, value, current_dict): + if "." 
in key: + key_parts = key.split(".") + current_dict.setdefault(key_parts[0], {}) + add_part(".".join(key_parts[1:]), value, current_dict[key_parts[0]]) + else: + current_dict[key] = value + + for key, value in dotdict.items(): + add_part(key, value, normal_dict) + + return normal_dict + + +def is_list_type(value): + return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set) + +def is_dict_type(value): + return isinstance(value, dict) or isinstance(value, CommentedMap) + +def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap: + yaml_dict: CommentedMap = deepcopy(yaml_dict) + + # first deal with lists, since 'update' replaces lists from a in b, but we want to extend + def update_dict(subdict, yaml_subdict): + for key, value in yaml_subdict.items(): + if not subdict.get(key): + continue + + if is_dict_type(value): + update_dict(subdict[key], value) + elif is_list_type(value): + yaml_subdict[key].extend(s for s in subdict[key] if s not in yaml_subdict[key]) + else: + yaml_subdict[key] = subdict[key] + + update_dict(from_dot_notation(dotdict), yaml_dict) - process_subdict(dotdict) return yaml_dict -def read_yaml(yaml_filename: str) -> dict: +yaml = YAML() +def read_yaml(yaml_filename: str) -> CommentedMap: + config = None try: with open(yaml_filename, "r", encoding="utf-8") as inf: - config = yaml.safe_load(inf) + config = yaml.load(inf) except FileNotFoundError: - config = EMPTY_CONFIG + pass + if not config: + config = EMPTY_CONFIG + return config -def store_yaml(config: dict, yaml_filename: str): +def store_yaml(config: CommentedMap, yaml_filename: str): with open(yaml_filename, "w", encoding="utf-8") as outf: - yaml.dump(config, outf, default_flow_style=False) \ No newline at end of file + yaml.dump(config, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 1ae9810..310e0e6 100644 --- a/src/auto_archiver/core/loader.py +++ 
b/src/auto_archiver/core/loader.py @@ -91,7 +91,11 @@ def load_module(module: str) -> object: # TODO: change return type to Step logger.info(f"Loading module '{module.display_name}'...") loaded_module = __import__(qualname) - _LOADED_MODULES[module.name] = getattr(sys.modules[qualname], module.entry_point)() + instance = getattr(sys.modules[qualname], module.entry_point)() + if not getattr(instance, 'name', None): + instance.name = module.name + + _LOADED_MODULES[module.name] = instance return _LOADED_MODULES[module.name] @@ -109,7 +113,7 @@ def load_manifest(module_path): def get_module(module_name): # get a module by name try: - return available_modules(limit_to_modules=[module_name], with_manifest=True, suppress_warnings=True)[0] + return available_modules(limit_to_modules=[module_name], with_manifest=True)[0] except IndexError: return None @@ -149,6 +153,6 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] if not suppress_warnings: for module in limit_to_modules: if not any(module == m.name for m in all_modules): - logger.warning(f"Module '{module}' not found in available modules. Are you sure it's installed?") + logger.warning(f"Module '{module}' not found. 
Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 1b4fee0..b17dcec 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -10,7 +10,7 @@ from urllib.parse import urlparse from ipaddress import ip_address import argparse import os -from os.path import join, dirname +import sys from rich_argparse import RichHelpFormatter @@ -27,6 +27,14 @@ from loguru import logger DEFAULT_CONFIG_FILE = "orchestration.yaml" +class UniqueAppendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + if not hasattr(namespace, self.dest): + setattr(namespace, self.dest, []) + for value in values: + if value not in getattr(namespace, self.dest): + getattr(namespace, self.dest).append(value) + class ArchivingOrchestrator: # def __init__(self, config: Config) -> None: @@ -59,20 +67,22 @@ class ArchivingOrchestrator: parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') # override the default 'help' so we can inject all the configs and show those parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') - parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file') + parser.add_argument('-s', '--store', dest='store', default=True, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) + self.basic_parser = parser def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: parser = argparse.ArgumentParser( - parents = [self.basic_parser], add_help=False, ) - self.add_steps_args(parser) + self.add_additional_args(parser) # check what mode we're in # if we have a config file, use that to decide 
which modules to load # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules + # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' + # but should we add them? Or should we just add them to the 'complete' parser? if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? @@ -85,7 +95,7 @@ class ArchivingOrchestrator: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules)), parser) + self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -97,36 +107,45 @@ class ArchivingOrchestrator: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - - breakpoint() parser.set_defaults(**to_dot_notation(yaml_config)) + breakpoint() # reload the parser with the new arguments, now that we have them parsed, unknown = parser.parse_known_args(unused_args) + + # merge the new config with the old one + self.config = merge_dicts(vars(parsed), yaml_config) + # clean out args from the base_parser that we don't want in the config + for key in vars(basic_config): + self.config.pop(key, None) + + # setup the logging + self.setup_logging() + if unknown: logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") - # merge the new config with the old one - merged_yaml_config = merge_dicts(vars(parsed), yaml_config) - - if (merged_yaml_config 
!= yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): + if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_yaml(yaml_config, basic_config.config_file) + store_yaml(self.config, basic_config.config_file) - self.config = merged_yaml_config - return self.config - def add_steps_args(self, parser: argparse.ArgumentParser = None): + def add_additional_args(self, parser: argparse.ArgumentParser = None): if not parser: parser = self.parser - parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', help='the feeders to use') - parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', help='the enrichers to use') - parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', help='the extractors to use') - parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', help='the databases to use') - parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', help='the storages to use') - parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', help='the formatter to use') + parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction) + parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction) + parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction) + parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction) + parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction) + parser.add_argument('--formatters', dest='steps.formatters', 
nargs='+', help='the formatter to use', action=UniqueAppendAction) + + # logging arguments + parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO') + parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) + parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): @@ -152,20 +171,29 @@ class ArchivingOrchestrator: # for the help message, we want to load *all* possible modules and show the help # add configs as arg parser arguments - self.add_steps_args(self.basic_parser) + self.add_additional_args(self.basic_parser) self.add_module_args(parser=self.basic_parser) self.basic_parser.print_help() exit() + def setup_logging(self): + # setup loguru logging + logger.remove() # remove the default logger + + logging_config = self.config['logging'] + logger.add(sys.stderr, level=logging_config['level']) + if log_file := logging_config['file']: + logger.add(log_file, rotation=logging_config['logging.rotation']) + + def install_modules(self): """ Swaps out the previous 'strings' in the config with the actual modules """ - + + invalid_modules = [] for module_type in MODULE_TYPES: - if module_type == 'enricher': - breakpoint() step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] @@ -179,7 +207,12 @@ class ArchivingOrchestrator: exit() for i, module in enumerate(modules_to_load): + if module in invalid_modules: + continue loaded_module = load_module(module) + if not loaded_module: + invalid_modules.append(module) + continue if loaded_module: step_items.append(loaded_module) check_steps_ok() @@ -212,7 +245,7 @@ class ArchivingOrchestrator: self.setup_complete_parser(basic_config, yaml_config, unused_args) 
- + self.install_modules() logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders'])) @@ -228,7 +261,6 @@ class ArchivingOrchestrator: def cleanup(self)->None: logger.info("Cleaning up") for e in self.config['steps']['extractors']: - breakpoint() e.cleanup() def feed(self) -> Generator[Metadata]: diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py index 0c14381..2be99c1 100644 --- a/src/auto_archiver/core/step.py +++ b/src/auto_archiver/core/step.py @@ -7,5 +7,5 @@ by handling user configuration, validating the steps properties, and implementin from __future__ import annotations class Step: - # TODO: try and get this name from the manifest, so we don't have to set it twice - name: str \ No newline at end of file + # Nothing to see here :) + pass \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..97793a0 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,99 @@ +import pytest +from auto_archiver.core import config +from ruamel.yaml.scanner import ScannerError +from ruamel.yaml.comments import CommentedMap + +def test_return_default_config_for_nonexistent_file(): + assert config.read_yaml("nonexistent_file.yaml") == config.EMPTY_CONFIG + +def test_return_default_config_for_empty_file(tmp_path): + empty_file = tmp_path / "empty_file.yaml" + empty_file.write_text("") + assert config.read_yaml(empty_file) == config.EMPTY_CONFIG + +def test_raise_error_on_invalid_yaml(tmp_path): + invalid_yaml = tmp_path / "invalid_yaml.yaml" + invalid_yaml.write_text("key: \"value_without_end_quote") + # make sure it raises ScannerError + with pytest.raises(ScannerError): + config.read_yaml(invalid_yaml) + +def test_write_yaml(tmp_path): + yaml_file = tmp_path / "write_yaml.yaml" + config.store_yaml(config.EMPTY_CONFIG, yaml_file.as_posix()) + assert "steps:\n" in yaml_file.read_text() + +def test_round_trip_comments(tmp_path): + yaml_file = tmp_path / 
"round_trip_comments.yaml" + + with open(yaml_file, "w") as f: + f.write("generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2") + + loaded = config.read_yaml(yaml_file) + # check the comments are preserved + assert loaded['generic_extractor']['facebook_cookie'] == "abc" + assert loaded['generic_extractor'].ca.items['facebook_cookie'][2].value == "# end of line comment\n" + + # add some more items to my_settings + loaded['generic_extractor']['list_type'].append("bellingcat") + config.store_yaml(loaded, yaml_file.as_posix()) + + assert "# comments: false" in yaml_file.read_text() + assert "facebook_cookie: abc # end of line comment" in yaml_file.read_text() + assert "abc # end of line comment" in yaml_file.read_text() + assert "- value2\n - bellingcat" in yaml_file.read_text() + +def test_merge_dicts(): + yaml_dict = config.EMPTY_CONFIG + yaml_dict['settings'] = CommentedMap(**{ + "key1": ["a"], + "key2": "old_value", + "key3": ["a", "b", "c"], + }) + + dotdict = { + "settings.key1": ["b", "c"], + "settings.key2": "new_value", + "settings.key3": ["b", "c", "d"], + } + merged = config.merge_dicts(dotdict, yaml_dict) + assert merged["settings"]["key1"] == ["a", "b", "c"] + assert merged["settings"]["key2"] == "new_value" + assert merged["settings"]["key3"] == ["a", "b", "c", "d"] + + +def test_check_types(): + assert config.is_list_type([]) == True + assert config.is_list_type(()) == True + assert config.is_list_type(set()) == True + assert config.is_list_type({}) == False + assert config.is_list_type("") == False + assert config.is_dict_type({}) == True + assert config.is_dict_type(CommentedMap()) == True + assert config.is_dict_type([]) == False + assert config.is_dict_type("") == False + +def test_from_dot_notation(): + dotdict = { + "settings.key1": ["a", "b", "c"], + "settings.key2": "new_value", + "settings.key3.key4": "value", + } + normal_dict = 
config.from_dot_notation(dotdict) + assert normal_dict["settings"]["key1"] == ["a", "b", "c"] + assert normal_dict["settings"]["key2"] == "new_value" + assert normal_dict["settings"]["key3"]["key4"] == "value" + +def test_to_dot_notation(): + yaml_dict = config.EMPTY_CONFIG + yaml_dict['settings'] = { + "key1": ["a", "b", "c"], + "key2": "new_value", + "key3": { + "key4": "value", + } + } + dotdict = config.to_dot_notation(yaml_dict) + assert dotdict["settings.key1"] == ["a", "b", "c"] + assert dotdict["settings.key2"] == "new_value" + assert dotdict["settings.key3.key4"] == "value" \ No newline at end of file From 06f6e34d9dd0215a97f31d20828a397964854bd5 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 23 Jan 2025 20:38:36 +0100 Subject: [PATCH 018/110] Revert changes to orchestrator to avoid merge conflicts --- src/auto_archiver/core/orchestrator.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 38edafe..1b4fee0 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -33,7 +33,7 @@ class ArchivingOrchestrator: # self.feeder: Feeder = config.feeder # self.formatter: Formatter = config.formatter # self.enrichers: List[Enricher] = config.enrichers - # self.extractors: List[Extractor] = config.extractors + # self.archivers: List[Archiver] = config.archivers # self.databases: List[Database] = config.databases # self.storages: List[Storage] = config.storages # ArchivingContext.set("storages", self.storages, keep_on_reset=True) @@ -80,7 +80,7 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) - # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter' + # add in any extra modules that have been passed on the command line for 
'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' for module_type in MODULE_TYPES: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) @@ -98,7 +98,7 @@ class ArchivingOrchestrator: self.add_module_args(available_modules(with_manifest=True), parser) - # breakpoint() + breakpoint() parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -165,8 +165,7 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: if module_type == 'enricher': - pass - # breakpoint() + breakpoint() step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] @@ -229,7 +228,7 @@ class ArchivingOrchestrator: def cleanup(self)->None: logger.info("Cleaning up") for e in self.config['steps']['extractors']: - # breakpoint() + breakpoint() e.cleanup() def feed(self) -> Generator[Metadata]: From 9befb9776c3fb311d570d03140de3ab4e22878ef Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 23 Jan 2025 21:08:54 +0100 Subject: [PATCH 019/110] Fix loading modules when entry_point isn't set --- src/auto_archiver/core/config.py | 11 +++++----- src/auto_archiver/core/loader.py | 21 ++++++++++++++++--- src/auto_archiver/core/orchestrator.py | 2 -- .../modules/generic_extractor/__manifest__.py | 1 - tests/test_config.py | 4 ++++ 5 files changed, 28 insertions(+), 11 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 3811e2b..1c19ae2 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -100,16 +100,17 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap: # first deal with lists, since 'update' replaces lists from a in b, but we want to extend def update_dict(subdict, yaml_subdict): - for key, value in yaml_subdict.items(): - if not subdict.get(key): + for key, value in subdict.items(): + if not yaml_subdict.get(key): + yaml_subdict[key] = value 
continue if is_dict_type(value): - update_dict(subdict[key], value) + update_dict(value, yaml_subdict[key]) elif is_list_type(value): - yaml_subdict[key].extend(s for s in subdict[key] if s not in yaml_subdict[key]) + yaml_subdict[key].extend(s for s in value if s not in yaml_subdict[key]) else: - yaml_subdict[key] = subdict[key] + yaml_subdict[key] = value update_dict(from_dot_notation(dotdict), yaml_dict) diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 310e0e6..bbd686e 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -25,6 +25,7 @@ MANIFEST_FILE = "__manifest__.py" _DEFAULT_MANIFEST = { 'name': '', 'author': 'Bellingcat', + 'type': [], 'requires_setup': True, 'description': '', 'dependencies': {}, @@ -90,8 +91,18 @@ def load_module(module: str) -> object: # TODO: change return type to Step qualname = f'auto_archiver.modules.{module.name}' logger.info(f"Loading module '{module.display_name}'...") - loaded_module = __import__(qualname) - instance = getattr(sys.modules[qualname], module.entry_point)() + # first import the whole module, to make sure it's working properly + __import__(qualname) + + + # then import the file for the entry point + file_name, class_name = module.entry_point.split('::') + sub_qualname = f'{qualname}.{file_name}' + + __import__(f'{qualname}.{file_name}', fromlist=[module.entry_point]) + + # finally, get the class instance + instance = getattr(sys.modules[sub_qualname], class_name)() if not getattr(instance, 'name', None): instance.name = module.name @@ -107,7 +118,11 @@ def load_manifest(module_path): manifest = copy.deepcopy(_DEFAULT_MANIFEST) with open(join(module_path, MANIFEST_FILE)) as f: - manifest.update(ast.literal_eval(f.read())) + try: + manifest.update(ast.literal_eval(f.read())) + except ( ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: + logger.error(f"Error loading manifest from file {module_path}/{MANIFEST_FILE}: {e}") + 
return manifest return manifest def get_module(module_name): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b17dcec..2c9841e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -109,7 +109,6 @@ class ArchivingOrchestrator: parser.set_defaults(**to_dot_notation(yaml_config)) - breakpoint() # reload the parser with the new arguments, now that we have them parsed, unknown = parser.parse_known_args(unused_args) @@ -180,7 +179,6 @@ class ArchivingOrchestrator: def setup_logging(self): # setup loguru logging logger.remove() # remove the default logger - logging_config = self.config['logging'] logger.add(sys.stderr, level=logging_config['level']) if log_file := logging_config['file']: diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 6f469c9..f46c13c 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -3,7 +3,6 @@ 'version': '0.1.0', 'author': 'Bellingcat', 'type': ['extractor', 'feeder', 'enricher'], - 'entry_point': 'GenericExtractor', # this class should be present in the __init__.py 'requires_setup': False, 'dependencies': { 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], diff --git a/tests/test_config.py b/tests/test_config.py index 97793a0..75fe515 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -49,17 +49,21 @@ def test_merge_dicts(): "key1": ["a"], "key2": "old_value", "key3": ["a", "b", "c"], + "key5": "value5", }) dotdict = { "settings.key1": ["b", "c"], "settings.key2": "new_value", "settings.key3": ["b", "c", "d"], + "settings.key4": "value4", } merged = config.merge_dicts(dotdict, yaml_dict) assert merged["settings"]["key1"] == ["a", "b", "c"] assert merged["settings"]["key2"] == "new_value" assert merged["settings"]["key3"] == ["a", "b", "c", "d"] + assert 
merged["settings"]["key4"] == "value4" + assert merged["settings"]["key5"] == "value5" def test_check_types(): From cbafbfab3fbf2bee6ff4ea70efb6dfe0740c1caa Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 08:04:09 +0000 Subject: [PATCH 020/110] Revert Dockerfile changes --- Dockerfile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8272c73..0ecc7f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.4.2 AS base +FROM webrecorder/browsertrix-crawler:1.0.4 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ @@ -22,30 +22,28 @@ RUN add-apt-repository ppa:mozillateam/ppa && \ # Poetry and runtime -FROM base AS poetry-env +FROM base AS runtime ENV POETRY_NO_INTERACTION=1 \ POETRY_VIRTUALENVS_IN_PROJECT=1 \ POETRY_VIRTUALENVS_CREATE=1 -# Create a virtual environment for poetry and install it -RUN python3 -m venv /poetry-venv && \ - /poetry-venv/bin/python -m pip install --upgrade pip && \ - /poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0" +RUN pip install --upgrade pip && \ + pip install "poetry>=2.0.0,<3.0.0" WORKDIR /app COPY pyproject.toml poetry.lock README.md ./ # Copy dependency files and install dependencies (excluding the package itself) -RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache +RUN poetry install --only main --no-root --no-cache # Copy code: This is needed for poetry to install the package itself, # but the environment should be cached from the previous step if toml and lock files haven't changed COPY ./src/ . 
-RUN /poetry-venv/bin/poetry install --only main --no-cache +RUN poetry install --only main --no-cache # Update PATH to include virtual environment binaries @@ -57,3 +55,4 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"] # should be executed with 2 volumes (3 if local_storage is used) # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml + From aa7ca93a43c6c224da43fb3c1f04e70e4e4dda47 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 12:58:16 +0000 Subject: [PATCH 021/110] Update manifests and modules --- .../__init__.py | 0 .../database.py | 0 .../enricher.py | 0 .../extractor.py | 0 .../feeder.py | 0 .../formatter.py | 0 .../storage.py | 0 src/auto_archiver/modules/api_db/__init__.py | 1 + .../modules/api_db/__manifest__.py | 4 +- src/auto_archiver/modules/api_db/api_db.py | 2 +- src/auto_archiver/modules/atlos/__init__.py | 1 + .../modules/atlos/__manifest__.py | 4 +- src/auto_archiver/modules/atlos/atlos.py | 2 +- .../modules/atlos_db/__init__.py | 1 + .../modules/atlos_db/__manifest__.py | 3 +- .../modules/atlos_db/atlos_db.py | 2 +- .../modules/atlos_db/base_configs.py | 4 +- .../modules/atlos_feeder/__init__.py | 1 + .../modules/atlos_feeder/__manifest__.py | 4 +- .../modules/atlos_feeder/atlos_feeder.py | 2 +- .../modules/cli_feeder/__init__.py | 1 + .../modules/cli_feeder/__manifest__.py | 2 +- .../modules/cli_feeder/cli_feeder.py | 2 +- .../modules/console_db/__init__.py | 1 + .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/__init__.py | 1 + src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/__init__.py | 1 + .../modules/csv_feeder/__manifest__.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 4 +- .../modules/gdrive_storage/__init__.py | 1 + .../modules/gdrive_storage/__manifest__.py | 21 ++++---- .../modules/gdrive_storage/gdrive_storage.py | 2 +- 
.../modules/generic_extractor/bluesky.py | 2 +- .../modules/generic_extractor/dropin.py | 2 +- .../generic_extractor/generic_extractor.py | 2 +- .../modules/generic_extractor/truth.py | 2 +- .../modules/generic_extractor/twitter.py | 2 +- .../modules/gsheet_db/__init__.py | 1 + .../modules/gsheet_db/__manifest__.py | 38 ++++++++++++++ .../gsheet_db.py | 3 +- .../modules/gsheet_feeder/__init__.py | 1 + .../__manifest__.py | 23 ++------- .../gsheet_feeder.py | 2 +- .../modules/gsheet_processor/__init__.py | 0 .../modules/hash_enricher/__init__.py | 2 +- .../modules/hash_enricher/hash_enricher.py | 2 +- .../modules/html_formatter/__init__.py | 1 + .../modules/html_formatter/__manifest__.py | 2 +- .../modules/html_formatter/html_formatter.py | 2 +- .../instagram_api_extractor/__init__.py | 1 + .../instagram_api_archiver.py | 2 +- .../modules/instagram_extractor/__init__.py | 1 + .../instagram_extractor/instagram_archiver.py | 2 +- .../instagram_tbot_extractor/__init__.py | 1 + .../instagram_tbot_archiver.py | 2 +- .../modules/local_storage/__init__.py | 1 + .../modules/local_storage/__manifest__.py | 4 +- .../modules/local_storage/local.py | 2 +- .../modules/meta_enricher/__init__.py | 1 + .../modules/meta_enricher/meta_enricher.py | 2 +- .../modules/metadata_enricher/__init__.py | 1 + .../metadata_enricher/metadata_enricher.py | 2 +- .../modules/mute_formatter/__init__.py | 1 + .../modules/pdq_hash_enricher/__init__.py | 1 + .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- .../modules/s3_storage/__init__.py | 1 + .../modules/s3_storage/__manifest__.py | 50 ++++++++++--------- src/auto_archiver/modules/s3_storage/s3.py | 2 +- .../modules/screenshot_enricher/__init__.py | 1 + .../screenshot_enricher.py | 2 +- .../modules/ssl_enricher/__init__.py | 1 + .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../modules/telegram_extractor/__init__.py | 1 + .../telegram_extractor/telegram_extractor.py | 2 +- .../modules/telethon_extractor/__init__.py | 1 + 
.../telethon_extractor/__manifest__.py | 5 +- .../telethon_extractor/telethon_archiver.py | 2 +- .../modules/thumbnail_enricher/__init__.py | 1 + .../thumbnail_enricher/thumbnail_enricher.py | 2 +- .../modules/timestamping_enricher/__init__.py | 1 + .../timestamping_enricher/__manifest__.py | 2 +- .../timestamping_enricher.py | 4 +- .../modules/twitter_api_extractor/__init__.py | 1 + .../twitter_api_extractor/__manifest__.py | 2 +- .../twitter_api_archiver.py | 2 +- .../modules/vk_extractor/__init__.py | 1 + .../modules/vk_extractor/vk_archiver.py | 2 +- .../modules/wacz_enricher/__init__.py | 1 + .../modules/wacz_enricher/wacz_enricher.py | 2 +- .../modules/wayback_enricher/__init__.py | 1 + .../wayback_enricher/wayback_enricher.py | 2 +- .../modules/whisper_enricher/__init__.py | 1 + .../whisper_enricher/whisper_enricher.py | 4 +- tests/archivers/test_archiver_base.py | 2 +- 95 files changed, 172 insertions(+), 115 deletions(-) rename src/auto_archiver/{base_modules => base_processors}/__init__.py (100%) rename src/auto_archiver/{base_modules => base_processors}/database.py (100%) rename src/auto_archiver/{base_modules => base_processors}/enricher.py (100%) rename src/auto_archiver/{base_modules => base_processors}/extractor.py (100%) rename src/auto_archiver/{base_modules => base_processors}/feeder.py (100%) rename src/auto_archiver/{base_modules => base_processors}/formatter.py (100%) rename src/auto_archiver/{base_modules => base_processors}/storage.py (100%) create mode 100644 src/auto_archiver/modules/gsheet_db/__init__.py create mode 100644 src/auto_archiver/modules/gsheet_db/__manifest__.py rename src/auto_archiver/modules/{gsheet_processor => gsheet_db}/gsheet_db.py (98%) create mode 100644 src/auto_archiver/modules/gsheet_feeder/__init__.py rename src/auto_archiver/modules/{gsheet_processor => gsheet_feeder}/__manifest__.py (63%) rename src/auto_archiver/modules/{gsheet_processor => gsheet_feeder}/gsheet_feeder.py (98%) delete mode 100644 
src/auto_archiver/modules/gsheet_processor/__init__.py diff --git a/src/auto_archiver/base_modules/__init__.py b/src/auto_archiver/base_processors/__init__.py similarity index 100% rename from src/auto_archiver/base_modules/__init__.py rename to src/auto_archiver/base_processors/__init__.py diff --git a/src/auto_archiver/base_modules/database.py b/src/auto_archiver/base_processors/database.py similarity index 100% rename from src/auto_archiver/base_modules/database.py rename to src/auto_archiver/base_processors/database.py diff --git a/src/auto_archiver/base_modules/enricher.py b/src/auto_archiver/base_processors/enricher.py similarity index 100% rename from src/auto_archiver/base_modules/enricher.py rename to src/auto_archiver/base_processors/enricher.py diff --git a/src/auto_archiver/base_modules/extractor.py b/src/auto_archiver/base_processors/extractor.py similarity index 100% rename from src/auto_archiver/base_modules/extractor.py rename to src/auto_archiver/base_processors/extractor.py diff --git a/src/auto_archiver/base_modules/feeder.py b/src/auto_archiver/base_processors/feeder.py similarity index 100% rename from src/auto_archiver/base_modules/feeder.py rename to src/auto_archiver/base_processors/feeder.py diff --git a/src/auto_archiver/base_modules/formatter.py b/src/auto_archiver/base_processors/formatter.py similarity index 100% rename from src/auto_archiver/base_modules/formatter.py rename to src/auto_archiver/base_processors/formatter.py diff --git a/src/auto_archiver/base_modules/storage.py b/src/auto_archiver/base_processors/storage.py similarity index 100% rename from src/auto_archiver/base_modules/storage.py rename to src/auto_archiver/base_processors/storage.py diff --git a/src/auto_archiver/modules/api_db/__init__.py b/src/auto_archiver/modules/api_db/__init__.py index e69de29..2070b06 100644 --- a/src/auto_archiver/modules/api_db/__init__.py +++ b/src/auto_archiver/modules/api_db/__init__.py @@ -0,0 +1 @@ +from api_db import AAApiDb \ No 
newline at end of file diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index a55f26c..c422b49 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -15,7 +15,9 @@ "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, + "tags": {"default": [], "help": "what tags to add to the archived URL", + "type": lambda val: set(val.split(",")), + } }, "description": """ Provides integration with the Auto-Archiver API for querying and storing archival data. 
diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index 44373c6..d2b43b7 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/atlos/__init__.py b/src/auto_archiver/modules/atlos/__init__.py index e69de29..de7fead 100644 --- a/src/auto_archiver/modules/atlos/__init__.py +++ b/src/auto_archiver/modules/atlos/__init__.py @@ -0,0 +1 @@ +from .atlos import AtlosStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py index c600e43..ec356a5 100644 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -15,12 +15,12 @@ "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val, + "type": str, }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val, + "type": str, }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 28b7cb1..0b16714 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -5,7 +5,7 @@ import requests import hashlib from auto_archiver.core import Media, Metadata -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py index e69de29..1552e39 100644 --- a/src/auto_archiver/modules/atlos_db/__init__.py +++ b/src/auto_archiver/modules/atlos_db/__init__.py @@ -0,0 +1 @@ +from atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 470d07d..941206f 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -11,12 +11,11 @@ "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index cbf1c89..c1d20a1 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -6,7 +6,7 @@ from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/base_configs.py b/src/auto_archiver/modules/atlos_db/base_configs.py index c47c711..f672f82 100644 --- a/src/auto_archiver/modules/atlos_db/base_configs.py +++ b/src/auto_archiver/modules/atlos_db/base_configs.py @@ -3,11 +3,11 @@ def get_atlos_config_options(): "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val + "type": str }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, } \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/__init__.py b/src/auto_archiver/modules/atlos_feeder/__init__.py index e69de29..67b243a 100644 --- a/src/auto_archiver/modules/atlos_feeder/__init__.py +++ b/src/auto_archiver/modules/atlos_feeder/__init__.py @@ -0,0 +1 @@ +from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index f0b216b..91fed32 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -9,12 +9,12 @@ "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val + "type": str }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 0810b73..8a4a31a 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import requests -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/cli_feeder/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py index e69de29..9c85787 100644 --- a/src/auto_archiver/modules/cli_feeder/__init__.py +++ b/src/auto_archiver/modules/cli_feeder/__init__.py @@ -0,0 +1 @@ +from .cli_feeder import CLIFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index fcb9099..2e2c53e 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -9,7 +9,7 @@ "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index e826533..3380f90 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ 
b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/__init__.py b/src/auto_archiver/modules/console_db/__init__.py index e69de29..343f09c 100644 --- a/src/auto_archiver/modules/console_db/__init__.py +++ b/src/auto_archiver/modules/console_db/__init__.py @@ -0,0 +1 @@ +from .console_db import ConsoleDb \ No newline at end of file diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index a0d43b7..9dfeb2c 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/__init__.py b/src/auto_archiver/modules/csv_db/__init__.py index e69de29..1092cb2 100644 --- a/src/auto_archiver/modules/csv_db/__init__.py +++ b/src/auto_archiver/modules/csv_db/__init__.py @@ -0,0 +1 @@ +from .csv_db import CSVDb \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 6e5d873..eec4ec6 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/__init__.py b/src/auto_archiver/modules/csv_feeder/__init__.py index e69de29..161b78d 100644 --- a/src/auto_archiver/modules/csv_feeder/__init__.py 
+++ b/src/auto_archiver/modules/csv_feeder/__init__.py @@ -0,0 +1 @@ +from .csv_feeder import CSVFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index ad5d40b..fb644ec 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -11,7 +11,7 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. \ Input files should be formatted with one URL per line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, "column": { "default": None, diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 4cf2f11..a830791 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import csv -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none @@ -17,7 +17,7 @@ class CSVFeeder(Feeder): "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ Input files should be formatted with one URL per line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, "column": { "default": None, diff --git a/src/auto_archiver/modules/gdrive_storage/__init__.py b/src/auto_archiver/modules/gdrive_storage/__init__.py index e69de29..2765e4b 100644 --- a/src/auto_archiver/modules/gdrive_storage/__init__.py +++ b/src/auto_archiver/modules/gdrive_storage/__init__.py @@ -0,0 +1 @@ +from .gdrive_storage import GDriveStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index e7e4650..b81b717 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "Google Drive Storage", "type": ["storage"], "requires_setup": True, @@ -12,15 +12,16 @@ m = { ], }, "configs": { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - }, - # TODO: get base storage configs + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], + }, "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, 
found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."}, diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 2e4ca48..652ff91 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -10,7 +10,7 @@ from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage class GDriveStorage(Storage): diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index d4051aa..c75c373 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 9de63d2..99cd71b 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from 
yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 8e4b2c4..ff9f8b4 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,7 +5,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from ...core import Metadata, Media, ArchivingContext class GenericExtractor(Extractor): diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index e713c90..f52a748 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 6cd22b1..11399d4 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from 
auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py new file mode 100644 index 0000000..01fdee6 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_db/__init__.py @@ -0,0 +1 @@ +from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py new file mode 100644 index 0000000..df7fb6a --- /dev/null +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -0,0 +1,38 @@ +{ + "name": "Google Sheets Database", + "type": ["database"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "gspread", "python-slugify"], + }, + "configs": { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "type": lambda val: set(val.split(",")), + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + "type": lambda val: set(val.split(",")), + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } + }, + "description": """ + GsheetsDatabase: + Handles integration with Google Sheets for tracking archival tasks. + +### Features +- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. +- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. +- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. +- Skips redundant updates for empty or invalid data fields. 
+ +### Notes +- Currently works only with metadata provided by GsheetFeeder. +- Requires configuration of a linked Google Sheet and appropriate API credentials. + """ +} diff --git a/src/auto_archiver/modules/gsheet_processor/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py similarity index 98% rename from src/auto_archiver/modules/gsheet_processor/gsheet_db.py rename to src/auto_archiver/modules/gsheet_db/gsheet_db.py index cf46473..9ed3642 100644 --- a/src/auto_archiver/modules/gsheet_processor/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -5,7 +5,7 @@ from urllib.parse import quote from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import GWorksheet @@ -105,5 +105,4 @@ class GsheetsDb(Database): elif self.sheet_id: print(self.sheet_id) - return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py new file mode 100644 index 0000000..f122bb2 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py @@ -0,0 +1 @@ +from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_processor/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py similarity index 63% rename from src/auto_archiver/modules/gsheet_processor/__manifest__.py rename to src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 8a554fe..c6790ca 100644 --- a/src/auto_archiver/modules/gsheet_processor/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -9,12 +9,12 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + 
"type": lambda val: set(val.split(",")), }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + "type": lambda val: set(val.split(",")), }, "use_sheet_names_in_stored_paths": { "default": True, @@ -22,11 +22,7 @@ } }, "description": """ - Google Sheets Module. - - Handles feeding from a google sheet as well as an optional write back to the sheet. - - ## GsheetsFeeder + GsheetsFeeder A Google Sheets-based feeder for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. @@ -41,18 +37,5 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. - - ## GsheetsDatabase: - Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. -- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. 
""" } diff --git a/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py similarity index 98% rename from src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 4df9042..a417615 100644 --- a/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -13,7 +13,7 @@ import gspread, os from loguru import logger from slugify import slugify -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import Gsheets, GWorksheet diff --git a/src/auto_archiver/modules/gsheet_processor/__init__.py b/src/auto_archiver/modules/gsheet_processor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py b/src/auto_archiver/modules/hash_enricher/__init__.py index e7faff7..18ec885 100644 --- a/src/auto_archiver/modules/hash_enricher/__init__.py +++ b/src/auto_archiver/modules/hash_enricher/__init__.py @@ -1 +1 @@ -from hash_enricher import HashEnricher \ No newline at end of file +from .hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index c8eacb1..8731b06 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. 
import hashlib from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/html_formatter/__init__.py b/src/auto_archiver/modules/html_formatter/__init__.py index e69de29..432ef33 100644 --- a/src/auto_archiver/modules/html_formatter/__init__.py +++ b/src/auto_archiver/modules/html_formatter/__init__.py @@ -0,0 +1 @@ +from .html_formatter import HtmlFormatter \ No newline at end of file diff --git a/src/auto_archiver/modules/html_formatter/__manifest__.py b/src/auto_archiver/modules/html_formatter/__manifest__.py index 55ca5da..259a3d1 100644 --- a/src/auto_archiver/modules/html_formatter/__manifest__.py +++ b/src/auto_archiver/modules/html_formatter/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "HTML Formatter", "type": ["formatter"], "requires_setup": False, diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index cc8a4da..a1951f3 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -9,7 +9,7 @@ import base64 from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.base_modules import Formatter +from auto_archiver.base_processors import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/instagram_api_extractor/__init__.py b/src/auto_archiver/modules/instagram_api_extractor/__init__.py index e69de29..068b8c6 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__init__.py @@ -0,0 +1 @@ +from .instagram_api_archiver import InstagramAPIExtractor diff --git 
a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py index 5206b41..c1271fc 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py @@ -16,7 +16,7 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/instagram_extractor/__init__.py b/src/auto_archiver/modules/instagram_extractor/__init__.py index e69de29..37ec56c 100644 --- a/src/auto_archiver/modules/instagram_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_extractor/__init__.py @@ -0,0 +1 @@ +from .instagram_archiver import InstagramExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py index c6bde62..2b9bece 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py @@ -7,7 +7,7 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py index e69de29..1b4dbc3 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py @@ -0,0 +1 @@ +from 
.instagram_tbot_archiver import InstagramTbotExtractor diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py index 5c3ad24..36c8a06 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py @@ -15,7 +15,7 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str diff --git a/src/auto_archiver/modules/local_storage/__init__.py b/src/auto_archiver/modules/local_storage/__init__.py index e69de29..6746373 100644 --- a/src/auto_archiver/modules/local_storage/__init__.py +++ b/src/auto_archiver/modules/local_storage/__init__.py @@ -0,0 +1 @@ +from .local import LocalStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index 7247885..c012be0 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "Local Storage", "type": ["storage"], "requires_setup": False, @@ -9,10 +9,12 @@ m = { "path_generator": { "default": "url", "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], }, "filename_generator": { "default": "random", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], }, "save_to": {"default": "./archived", 
"help": "folder where to save archived content"}, "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local.py index ef0966d..cac692e 100644 --- a/src/auto_archiver/modules/local_storage/local.py +++ b/src/auto_archiver/modules/local_storage/local.py @@ -5,7 +5,7 @@ import os from loguru import logger from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage class LocalStorage(Storage): diff --git a/src/auto_archiver/modules/meta_enricher/__init__.py b/src/auto_archiver/modules/meta_enricher/__init__.py index e69de29..4e1d330 100644 --- a/src/auto_archiver/modules/meta_enricher/__init__.py +++ b/src/auto_archiver/modules/meta_enricher/__init__.py @@ -0,0 +1 @@ +from .meta_enricher import MetaEnricher diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index 52d8eb2..f9b74f7 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/__init__.py b/src/auto_archiver/modules/metadata_enricher/__init__.py index e69de29..020bd4a 100644 --- a/src/auto_archiver/modules/metadata_enricher/__init__.py +++ b/src/auto_archiver/modules/metadata_enricher/__init__.py @@ -0,0 +1 @@ +from .metadata_enricher import MetadataEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py 
b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index b729d36..cb68b98 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/mute_formatter/__init__.py b/src/auto_archiver/modules/mute_formatter/__init__.py index e69de29..b92fce9 100644 --- a/src/auto_archiver/modules/mute_formatter/__init__.py +++ b/src/auto_archiver/modules/mute_formatter/__init__.py @@ -0,0 +1 @@ +from .mute_formatter import MuteFormatter diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py index e69de29..b444197 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py @@ -0,0 +1 @@ +from .pdq_hash_enricher import PdqHashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index dc70465..7e3f467 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/__init__.py b/src/auto_archiver/modules/s3_storage/__init__.py index e69de29..1c826fd 100644 --- a/src/auto_archiver/modules/s3_storage/__init__.py +++ b/src/auto_archiver/modules/s3_storage/__init__.py @@ -0,0 +1 @@ +from .s3 
import S3Storage \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index 210eefa..fc41eb3 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "S3 Storage", "type": ["storage"], "requires_setup": True, @@ -6,29 +6,31 @@ m = { "python": ["boto3", "loguru"], }, "configs": { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - }, - "bucket": {"default": None, "help": "S3 bucket name"}, - "region": {"default": None, "help": "S3 region name"}, - "key": {"default": None, "help": "S3 API key"}, - "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. 
Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, - "endpoint_url": { - "default": 'https://{region}.digitaloceanspaces.com', - "help": "S3 bucket endpoint, {region} are inserted at runtime" - }, - "cdn_url": { - "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', - "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" - }, - "private": {"default": False, "help": "if true S3 files will not be readable online"}, - }, + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], + }, + "bucket": {"default": None, "help": "S3 bucket name"}, + "region": {"default": None, "help": "S3 region name"}, + "key": {"default": None, "help": "S3 API key"}, + "secret": {"default": None, "help": "S3 API secret"}, + "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, + "endpoint_url": { + "default": 'https://{region}.digitaloceanspaces.com', + "help": "S3 bucket endpoint, {region} are inserted at runtime" + }, + "cdn_url": { + "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', + "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" + }, + "private": {"default": False, "help": "if true S3 files will not be readable online"}, + }, "description": """ S3Storage: A storage module for saving media files to an S3-compatible object storage. 
diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index 02b0613..fe221d0 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -4,7 +4,7 @@ import boto3, os from auto_archiver.utils.misc import random_str from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage # TODO from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger diff --git a/src/auto_archiver/modules/screenshot_enricher/__init__.py b/src/auto_archiver/modules/screenshot_enricher/__init__.py index e69de29..393f726 100644 --- a/src/auto_archiver/modules/screenshot_enricher/__init__.py +++ b/src/auto_archiver/modules/screenshot_enricher/__init__.py @@ -0,0 +1 @@ +from .screenshot_enricher import ScreenshotEnricher diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index f99c100..626cd1f 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,7 +5,7 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/ssl_enricher/__init__.py b/src/auto_archiver/modules/ssl_enricher/__init__.py index e69de29..23d2bee 100644 --- a/src/auto_archiver/modules/ssl_enricher/__init__.py +++ b/src/auto_archiver/modules/ssl_enricher/__init__.py @@ -0,0 +1 @@ +from .ssl_enricher import SSLEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py 
b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index aba1d33..965f699 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git a/src/auto_archiver/modules/telegram_extractor/__init__.py b/src/auto_archiver/modules/telegram_extractor/__init__.py index e69de29..1fd80c2 100644 --- a/src/auto_archiver/modules/telegram_extractor/__init__.py +++ b/src/auto_archiver/modules/telegram_extractor/__init__.py @@ -0,0 +1 @@ +from .telegram_extractor import TelegramExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index 047d424..31bdaca 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index e69de29..424792f 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -0,0 +1 @@ +from .telethon_archiver import TelethonArchiver \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 6f09ea6..bb49882 100644 --- 
a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -1,4 +1,4 @@ -# TODO rm dependency on json +import json { "name": "telethon_extractor", "type": ["extractor"], @@ -19,8 +19,7 @@ "channel_invites": { "default": {}, "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", - # TODO - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + "type": lambda x: json.loads(x), } }, "description": """ diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py index 811a280..8b49a10 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py @@ -8,7 +8,7 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str diff --git a/src/auto_archiver/modules/thumbnail_enricher/__init__.py b/src/auto_archiver/modules/thumbnail_enricher/__init__.py index e69de29..fe20719 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/__init__.py +++ b/src/auto_archiver/modules/thumbnail_enricher/__init__.py @@ -0,0 +1 @@ +from .thumbnail_enricher import ThumbnailEnricher diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index a16d84a..8c34502 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ 
b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. import ffmpeg, os from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/timestamping_enricher/__init__.py b/src/auto_archiver/modules/timestamping_enricher/__init__.py index e69de29..62d358a 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__init__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__init__.py @@ -0,0 +1 @@ +from .timestamping_enricher import TimestampingEnricher diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index a66cc31..b49b61b 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -21,7 +21,7 @@ "http://tss.accv.es:8318/tsa" ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + "type": lambda val: set(val.split(",")), } }, "description": """ diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 473f880..0e159fa 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, 
ArchivingContext, Media -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor class TimestampingEnricher(Enricher): diff --git a/src/auto_archiver/modules/twitter_api_extractor/__init__.py b/src/auto_archiver/modules/twitter_api_extractor/__init__.py index e69de29..cea3872 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__init__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__init__.py @@ -0,0 +1 @@ +from .twitter_api_archiver import TwitterApiExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index ae1b0ff..0a314b5 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. 
CSV of bearer tokens if provided via the command line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "type": lambda val: set(val.split(",")),}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py index c5d03e0..ea669b4 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py @@ -8,7 +8,7 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): diff --git a/src/auto_archiver/modules/vk_extractor/__init__.py b/src/auto_archiver/modules/vk_extractor/__init__.py index e69de29..29fe59d 100644 --- a/src/auto_archiver/modules/vk_extractor/__init__.py +++ b/src/auto_archiver/modules/vk_extractor/__init__.py @@ -0,0 +1 @@ +from .vk_archiver import VkExtractor diff --git a/src/auto_archiver/modules/vk_extractor/vk_archiver.py b/src/auto_archiver/modules/vk_extractor/vk_archiver.py index 2474769..eb4c171 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_archiver.py +++ b/src/auto_archiver/modules/vk_extractor/vk_archiver.py @@ -2,7 +2,7 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext diff --git a/src/auto_archiver/modules/wacz_enricher/__init__.py 
b/src/auto_archiver/modules/wacz_enricher/__init__.py index e69de29..686b8d8 100644 --- a/src/auto_archiver/modules/wacz_enricher/__init__.py +++ b/src/auto_archiver/modules/wacz_enricher/__init__.py @@ -0,0 +1 @@ +from .wacz_enricher import WaczExtractorEnricher diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 3eb2b17..cd52b67 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,7 +6,7 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.base_modules import Extractor, Enricher +from auto_archiver.base_processors import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str diff --git a/src/auto_archiver/modules/wayback_enricher/__init__.py b/src/auto_archiver/modules/wayback_enricher/__init__.py index e69de29..9782831 100644 --- a/src/auto_archiver/modules/wayback_enricher/__init__.py +++ b/src/auto_archiver/modules/wayback_enricher/__init__.py @@ -0,0 +1 @@ +from .wayback_enricher import WaybackExtractorEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index bcd2450..6942727 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,7 +2,7 @@ import json from loguru import logger import time, requests -from auto_archiver.base_modules import Extractor, Enricher +from auto_archiver.base_processors import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/whisper_enricher/__init__.py b/src/auto_archiver/modules/whisper_enricher/__init__.py index e69de29..d3d3526 100644 
--- a/src/auto_archiver/modules/whisper_enricher/__init__.py +++ b/src/auto_archiver/modules/whisper_enricher/__init__.py @@ -0,0 +1 @@ +from .whisper_enricher import WhisperEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a00ba25..d14c537 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.modules import S3Storage +from auto_archiver.modules.s3_storage import S3Storage class WhisperEnricher(Enricher): diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py index 721812a..6223879 100644 --- a/tests/archivers/test_archiver_base.py +++ b/tests/archivers/test_archiver_base.py @@ -1,7 +1,7 @@ import pytest from auto_archiver.core.metadata import Metadata -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor class TestArchiverBase(object): archiver_class: str = None From 0453d95f569639baa490f6159d739e066bd82002 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 13:24:54 +0000 Subject: [PATCH 022/110] fix config parsing in manifests --- src/auto_archiver/modules/api_db/__manifest__.py | 2 +- src/auto_archiver/modules/cli_feeder/__manifest__.py | 2 +- src/auto_archiver/modules/csv_feeder/__manifest__.py | 2 +- src/auto_archiver/modules/csv_feeder/csv_feeder.py | 2 +- src/auto_archiver/modules/gsheet_db/__manifest__.py | 4 ++-- src/auto_archiver/modules/gsheet_feeder/__manifest__.py | 5 +++-- .../modules/telethon_extractor/__manifest__.py | 2 +- 
.../modules/timestamping_enricher/__manifest__.py | 2 +- .../modules/twitter_api_extractor/__manifest__.py | 2 +- src/auto_archiver/utils/misc.py | 9 ++++++++- 10 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index c422b49..4c85541 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -16,7 +16,7 @@ "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, "store_results": {"default": True, "help": "when set, will send the results to the API database."}, "tags": {"default": [], "help": "what tags to add to the archived URL", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", } }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 2e2c53e..6f62cd2 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -9,7 +9,7 @@ "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, }, "description": """ diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index fb644ec..7e84a43 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -11,7 +11,7 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ Input files should be formatted with one URL per line", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, "column": { "default": None, diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index a830791..91a2b97 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -17,7 +17,7 @@ class CSVFeeder(Feeder): "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. \ Input files should be formatted with one URL per line", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, "column": { "default": None, diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index df7fb6a..2f4f9b4 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -9,12 +9,12 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": lambda val: set(val.split(",")), + "type": auto_archiver.utils.parse_csv_to_set, }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index c6790ca..cb58035 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -1,6 +1,7 @@ { "name": "Google Sheets Procesor", "type": ["feeder"], + "entry_point": "gsheet_feeder::GsheetsFeeder", "requires_setup": 
True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], @@ -9,12 +10,12 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": lambda val: set(val.split(",")), + "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index bb49882..5d71fdd 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -19,7 +19,7 @@ import json "channel_invites": { "default": {}, "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", - "type": lambda x: json.loads(x), + "type": "auto_archiver.utils.json_loader", } }, "description": """ diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index b49b61b..904fde6 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -21,7 +21,7 @@ "http://tss.accv.es:8318/tsa" ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "type": lambda val: set(val.split(",")), + "type": auto_archiver.utils.parse_csv_to_set, } }, 
"description": """ diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 0a314b5..239a0bb 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", - "type": lambda val: set(val.split(",")),}, + "type": auto_archiver.utils.parse_csv_to_set,}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index e312fc6..ad16401 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -53,4 +53,11 @@ def update_nested_dict(dictionary, update_dict): def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" - return str(uuid.uuid4()).replace("-", "")[:length] \ No newline at end of file + return str(uuid.uuid4()).replace("-", "")[:length] + + +def parse_csv_to_set(cli_val, cur_val): + return set(cli_val.split(",")) + +def json_loader(cli_val): + return json.loads(cli_val) From 024fe58377582ff13cc5f5ca6617bf94d97ec378 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 13:33:12 +0000 Subject: [PATCH 023/110] 
fix config parsing in manifests, remove module level configs --- src/auto_archiver/modules/atlos/atlos.py | 4 --- .../modules/atlos_db/atlos_db.py | 5 --- .../modules/atlos_feeder/atlos_feeder.py | 5 --- .../modules/cli_feeder/cli_feeder.py | 10 ------ .../modules/csv_feeder/csv_feeder.py | 17 ---------- .../modules/gdrive_storage/gdrive_storage.py | 10 ------ .../modules/gsheet_db/__manifest__.py | 2 +- .../modules/gsheet_feeder/gsheet_feeder.py | 21 ------------- .../modules/html_formatter/html_formatter.py | 6 ---- .../modules/local_storage/local.py | 9 ------ src/auto_archiver/modules/s3_storage/s3.py | 21 ------------- .../screenshot_enricher.py | 15 --------- .../timestamping_enricher/__manifest__.py | 27 ++++++++++++---- .../timestamping_enricher.py | 31 ------------------- .../twitter_api_extractor/__manifest__.py | 2 +- src/auto_archiver/utils/gsheet.py | 29 ----------------- 16 files changed, 23 insertions(+), 191 deletions(-) diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 0b16714..6a175d3 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -15,10 +15,6 @@ class AtlosStorage(Storage): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return dict(Storage.configs(), **get_atlos_config_options()) - def get_cdn_url(self, _media: Media) -> str: # It's not always possible to provide an exact URL, because it's # possible that the media once uploaded could have been copied to diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index c1d20a1..2e24491 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -22,11 +22,6 @@ class AtlosDb(Database): # without this STEP.__init__ is not called super().__init__(config) - # TODO - @staticmethod - def configs() -> dict: - return 
get_atlos_config_options() - def failed(self, item: Metadata, reason: str) -> None: """Update DB accordingly for failure""" # If the item has no Atlos ID, there's nothing for us to do diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 8a4a31a..262f21b 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -15,11 +15,6 @@ class AtlosFeeder(Feeder): if type(self.api_token) != str: raise Exception("Atlos Feeder did not receive an Atlos API token") - # TODO - @staticmethod - def configs() -> dict: - return get_atlos_config_options() - def __iter__(self) -> Metadata: # Get all the urls from the Atlos API count = 0 diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 3380f90..7d0d01f 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -13,16 +13,6 @@ class CLIFeeder(Feeder): if type(self.urls) != list or len(self.urls) == 0: raise Exception("CLI Feeder did not receive any URL to process") - # @staticmethod - # def configs() -> dict: - # return { - # "urls": { - # "default": None, - # "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - # "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) - # }, - # } - def __iter__(self) -> Metadata: for url in self.urls: logger.debug(f"Processing {url}") diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 91a2b97..7bff16e 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -9,23 +9,6 @@ class CSVFeeder(Feeder): name = "csv_feeder" - - @staticmethod - def configs() -> dict: - return { - "files": { - "default": None, - "help": "Path to the 
input file(s) to read the URLs from, comma separated. \ - Input files should be formatted with one URL per line", - "type": "auto_archiver.utils.parse_csv_to_set", - }, - "column": { - "default": None, - "help": "Column number or name to read the URLs from, 0-indexed", - } - } - - def __iter__(self) -> Metadata: url_column = self.column or 0 for file in self.files: diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 652ff91..4bcdb90 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -58,16 +58,6 @@ class GDriveStorage(Storage): self.service = build('drive', 'v3', credentials=creds) - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, - "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. 
NOTE: storage used will count towards the developer account."}, - }) - def get_cdn_url(self, media: Media) -> str: """ only support files saved in a folder for GD diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index 2f4f9b4..edc8d24 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -14,7 +14,7 @@ "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": auto_archiver.utils.parse_csv_to_set, + "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index a417615..01cd3b3 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder): super().__init__(config) self.gsheets_client = gspread.service_account(filename=self.service_account) - # @staticmethod - # def configs() -> dict: - # return dict( - # Gsheets.configs(), - # ** { - # "allow_worksheets": { - # "default": set(), - # "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # }, - # "block_worksheets": { - # "default": set(), - # "help": "(CSV) explicitly block some worksheets from being processed", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # }, - # "use_sheet_names_in_stored_paths": { - # "default": True, - # "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - # } - # }) - def __iter__(self) -> Metadata: sh = self.open_sheet() for ii, wks in enumerate(sh.worksheets()): diff --git 
a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index a1951f3..15104b2 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -28,12 +28,6 @@ class HtmlFormatter(Formatter): }) self.template = self.environment.get_template("html_template.html") - # @staticmethod - # def configs() -> dict: - # return { - # "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} - # } - def format(self, item: Metadata) -> Media: url = item.get_url() if item.is_empty(): diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local.py index cac692e..530f111 100644 --- a/src/auto_archiver/modules/local_storage/local.py +++ b/src/auto_archiver/modules/local_storage/local.py @@ -15,15 +15,6 @@ class LocalStorage(Storage): super().__init__(config) os.makedirs(self.save_to, exist_ok=True) - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "save_to": {"default": "./archived", "help": "folder where to save archived content"}, - "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, - }) - def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? 
dest = os.path.join(self.save_to, media.key) diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index fe221d0..a637259 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -26,27 +26,6 @@ class S3Storage(Storage): if self.random_no_duplicate: logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.") - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "bucket": {"default": None, "help": "S3 bucket name"}, - "region": {"default": None, "help": "S3 region name"}, - "key": {"default": None, "help": "S3 API key"}, - "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, - "endpoint_url": { - "default": 'https://{region}.digitaloceanspaces.com', - "help": "S3 bucket endpoint, {region} are inserted at runtime" - }, - "cdn_url": { - "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', - "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" - }, - "private": {"default": False, "help": "if true S3 files will not be readable online"}, - }) - def get_cdn_url(self, media: Media) -> str: return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 626cd1f..0140875 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher): def __init__(self, 
config: dict) -> None: super().__init__(config) - # TODO? - - - - # @staticmethod - # def configs() -> dict: - # return { - # "width": {"default": 1280, "help": "width of the screenshots"}, - # "height": {"default": 720, "help": "height of the screenshots"}, - # "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - # "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - # "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - # "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, - # "print_options": {"default": {}, "help": "options to pass to the pdf printer"} - # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index 904fde6..e4ac925 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -15,13 +15,28 @@ "configs": { "tsa_urls": { "default": [ - "http://timestamp.digicert.com", - "http://timestamp.identrust.com", - "http://timestamp.globalsign.com/tsa/r6advanced1", - "http://tss.accv.es:8318/tsa" - ], + # [Adobe Approved Trust List] and [Windows Cert Store] + "http://timestamp.digicert.com", + "http://timestamp.identrust.com", + # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping + # "https://timestamp.sectigo.com", # wait 15 seconds between each request. + + # [Adobe: European Union Trusted Lists]. + # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. 
+ + # [Windows Cert Store] + "http://timestamp.globalsign.com/tsa/r6advanced1", + # [Adobe: European Union Trusted Lists] and [Windows Cert Store] + # "http://ts.quovadisglobal.com/eu", # not valid for timestamping + # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain + # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain + # "http://tsa.sep.bg", # self-signed certificate in certificate chain + # "http://tsa.izenpe.com", #unable to get local issuer certificate + # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate + "http://tss.accv.es:8318/tsa", + ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "type": auto_archiver.utils.parse_csv_to_set, + "type": "auto_archiver.utils.parse_csv_to_set", } }, "description": """ diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 0e159fa..c90d42c 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - # @staticmethod - # def configs() -> dict: - # return { - # "tsa_urls": { - # "default": [ - # # [Adobe Approved Trust List] and [Windows Cert Store] - # "http://timestamp.digicert.com", - # "http://timestamp.identrust.com", - # # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping - # # "https://timestamp.sectigo.com", # wait 15 seconds between each request. - # - # # [Adobe: European Union Trusted Lists]. - # # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. 
- # - # # [Windows Cert Store] - # "http://timestamp.globalsign.com/tsa/r6advanced1", - # - # # [Adobe: European Union Trusted Lists] and [Windows Cert Store] - # # "http://ts.quovadisglobal.com/eu", # not valid for timestamping - # # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain - # # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain - # # "http://tsa.sep.bg", # self-signed certificate in certificate chain - # # "http://tsa.izenpe.com", #unable to get local issuer certificate - # # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate - # "http://tss.accv.es:8318/tsa", - # ], - # "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # } - # } - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"RFC3161 timestamping existing files for {url=}") diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 239a0bb..6e64269 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. 
CSV of bearer tokens if provided via the command line", - "type": auto_archiver.utils.parse_csv_to_set,}, + "type": "auto_archiver.utils.parse_csv_to_set",}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index f84aab2..78f01c5 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -16,35 +16,6 @@ class Gsheets(Step): assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets." - @staticmethod - def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } - def open_sheet(self): if self.sheet: return self.gsheets_client.open(self.sheet) From 
1942e8b819135128993b149e32702f459afaa0b3 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 13:34:30 +0000 Subject: [PATCH 024/110] Gsheets utility revert --- src/auto_archiver/utils/gsheet.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 78f01c5..485344f 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -16,6 +16,36 @@ class Gsheets(Step): assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets." + # TODO merge this into gsheets processors manifest + @staticmethod + def configs() -> dict: + return { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'text': 'text content', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'pdq_hash': 'perceptual hashes', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet (stringified JSON object)", + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + }, + } + def open_sheet(self): if self.sheet: return self.gsheets_client.open(self.sheet) From 3fc6ddfe85304a2104c2c63db8ef4feb3ae32966 Mon Sep 17 00:00:00 2001 From: 
Patrick Robertson Date: Fri, 24 Jan 2025 15:30:00 +0100 Subject: [PATCH 025/110] Tweaks to logging strings --- src/auto_archiver/core/orchestrator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 2c9841e..b610c24 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -197,7 +197,9 @@ class ArchivingOrchestrator: def check_steps_ok(): if not len(step_items): - logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration file and try again. Tried to load the following modules, but none were available: {modules_to_load}") + logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.") + if len(modules_to_load): + logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}") exit() if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: From dd402b456f748173e5fe83581ea3e1331f8b9183 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 18:50:11 +0000 Subject: [PATCH 026/110] Fix and add types to manifest --- README.md | 2 +- .../modules/api_db/__manifest__.py | 8 ++- .../modules/cli_feeder/__manifest__.py | 1 - .../modules/csv_feeder/__manifest__.py | 1 - .../modules/gsheet_db/__manifest__.py | 2 - .../modules/gsheet_db/gsheet_db.py | 2 +- .../modules/gsheet_feeder/__init__.py | 1 + .../modules/gsheet_feeder/__manifest__.py | 31 +++++++++-- .../modules/gsheet_feeder/gsheet_feeder.py | 53 ++++++++++++++++--- .../gsheet_feeder}/gworksheet.py | 0 .../timestamping_enricher/__manifest__.py | 1 - .../twitter_api_extractor/__manifest__.py | 2 +- src/auto_archiver/utils/misc.py | 4 -- 13 files changed, 80 insertions(+), 28 deletions(-) rename src/auto_archiver/{utils => modules/gsheet_feeder}/gworksheet.py (100%) diff --git a/README.md b/README.md index 1bd6ddd..c52c464 100644 --- a/README.md +++ 
b/README.md @@ -218,7 +218,7 @@ configurations: ## Running on Google Sheets Feeder (gsheet_feeder) The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. -This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is: +This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purpose is: Inputs: diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 4c85541..c89165f 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -13,11 +13,9 @@ "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, "author_id": {"default": None, "help": "which email to assign as author"}, "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", - "type": "auto_archiver.utils.parse_csv_to_set", - } + "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",}, + "store_results": {"default": True, "help": "when set, will send the results to the API database.", 
"type": "bool",}, + "tags": {"default": [], "help": "what tags to add to the archived URL",} }, "description": """ Provides integration with the Auto-Archiver API for querying and storing archival data. diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 6f62cd2..febebd0 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -9,7 +9,6 @@ "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "type": "auto_archiver.utils.parse_csv_to_set", }, }, "description": """ diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index 7e84a43..4d19b70 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -11,7 +11,6 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ Input files should be formatted with one URL per line", - "type": "auto_archiver.utils.parse_csv_to_set", }, "column": { "default": None, diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index edc8d24..8c54fe5 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -9,12 +9,10 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 9ed3642..239bc06 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -7,7 +7,7 @@ from loguru import logger from auto_archiver.base_processors import Database from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.utils import GWorksheet +from auto_archiver.modules.gsheet_feeder import GWorksheet class GsheetsDb(Database): diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py index f122bb2..bb4230a 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__init__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py @@ -1 +1,2 @@ +from .gworksheet import GWorksheet from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index cb58035..685a8fd 100644 --- 
a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -1,25 +1,48 @@ { - "name": "Google Sheets Procesor", + "name": "Google Sheets Feeder", "type": ["feeder"], - "entry_point": "gsheet_feeder::GsheetsFeeder", + "entry_point": "GsheetsFeeder", "requires_setup": True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], }, "configs": { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'text': 'text content', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'pdq_hash': 'perceptual hashes', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet (stringified JSON object)", + "type": "auto_archiver.utils.json_loader", + }, "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + "type": "bool", } }, "description": """ diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py 
b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 01cd3b3..321711e 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -8,23 +8,62 @@ The filtered rows are processed into `Metadata` objects. - validates the sheet's structure and filters rows based on input configurations. - Ensures only rows with valid URLs and unprocessed statuses are included. """ -import gspread, os +import os +import gspread from loguru import logger from slugify import slugify from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext -from auto_archiver.utils import Gsheets, GWorksheet +from . import GWorksheet -class GsheetsFeeder(Gsheets, Feeder): +class GsheetsFeeder(Feeder): name = "gsheet_feeder" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.gsheets_client = gspread.service_account(filename=self.service_account) + # def __init__(self, config: dict) -> None: + # """ + # Initializes the GsheetsFeeder with preloaded configurations. + # """ + # super().__init__(config) + # # Initialize the gspread client with the provided service account file + # self.gsheets_client = gspread.service_account(filename=config["service_account"]) + # + # # Set up feeder-specific configurations from the config + # self.sheet_name = config.get("sheet") + # self.sheet_id = config.get("sheet_id") + # self.header = config.get("header", 1) + # self.columns = config.get("columns", {}) + # assert self.sheet_name or self.sheet_id, ( + # "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." 
+ # ) + + + # # Configuration attributes + # self.sheet = config.get("sheet") + # self.sheet_id = config.get("sheet_id") + # self.header = config.get("header", 1) + # self.columns = config.get("columns", {}) + # self.allow_worksheets = config.get("allow_worksheets", set()) + # self.block_worksheets = config.get("block_worksheets", set()) + # self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True) + + # Ensure the header is an integer + # try: + # self.header = int(self.header) + # except ValueError: + # pass + # assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}" + # assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined." + # + + def open_sheet(self): + if self.sheet: + return self.gsheets_client.open(self.sheet) + else: # self.sheet_id + return self.gsheets_client.open_by_key(self.sheet_id) + def __iter__(self) -> Metadata: sh = self.open_sheet() diff --git a/src/auto_archiver/utils/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py similarity index 100% rename from src/auto_archiver/utils/gworksheet.py rename to src/auto_archiver/modules/gsheet_feeder/gworksheet.py diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index e4ac925..496d211 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -36,7 +36,6 @@ "http://tss.accv.es:8318/tsa", ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "type": "auto_archiver.utils.parse_csv_to_set", } }, "description": """ diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 6e64269..02d0d6c 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py 
+++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", - "type": "auto_archiver.utils.parse_csv_to_set",}, + }, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index ad16401..e985e3e 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -55,9 +55,5 @@ def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" return str(uuid.uuid4()).replace("-", "")[:length] - -def parse_csv_to_set(cli_val, cur_val): - return set(cli_val.split(",")) - def json_loader(cli_val): return json.loads(cli_val) From 96b35a272cc94f8e857cdd8947edbd7ff8d5d424 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 18:51:15 +0000 Subject: [PATCH 027/110] Rm gsheet references in utils --- src/auto_archiver/utils/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py index 788f159..d2063d0 100644 --- a/src/auto_archiver/utils/__init__.py +++ b/src/auto_archiver/utils/__init__.py @@ -1,9 +1,7 @@ """ Auto Archiver Utilities. 
""" # we need to explicitly expose the available imports here -from .gworksheet import GWorksheet from .misc import * from .webdriver import Webdriver -from .gsheet import Gsheets from .url import UrlUtil from .atlos import get_atlos_config_options From 21a7ff052008beeda81f2aa060540e5c66944419 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 08:43:18 +0000 Subject: [PATCH 028/110] Fix types in manifests --- src/auto_archiver/modules/atlos/__manifest__.py | 4 ++-- src/auto_archiver/modules/atlos_db/__manifest__.py | 2 +- .../modules/atlos_feeder/__manifest__.py | 4 ++-- .../modules/gsheet_feeder/gsheet_feeder.py | 14 +++++++------- .../modules/hash_enricher/__manifest__.py | 1 + 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py index ec356a5..459fefe 100644 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -15,12 +15,12 @@ "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "type": str, + "type": "str", }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": str, + "type": "str", }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 941206f..42ce560 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -15,7 +15,7 @@ "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": str + "type": "str" }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index 91fed32..0d90c8b 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -9,12 +9,12 @@ "api_token": { "default": None, "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "type": str + "type": "str" }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": str + "type": "str" }, }, "description": """ diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 321711e..b57174f 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -22,13 +22,13 @@ from . import GWorksheet class GsheetsFeeder(Feeder): name = "gsheet_feeder" - # def __init__(self, config: dict) -> None: - # """ - # Initializes the GsheetsFeeder with preloaded configurations. 
- # """ - # super().__init__(config) - # # Initialize the gspread client with the provided service account file - # self.gsheets_client = gspread.service_account(filename=config["service_account"]) + def __init__(self) -> None: + """ + Initializes the GsheetsFeeder with preloaded configurations. + """ + super().__init__() + # Initialize the gspread client with the provided service account file + # self.gsheets_client = gspread.service_account(filename=self.config["service_account"]) # # # Set up feeder-specific configurations from the config # self.sheet_name = config.get("sheet") diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index eef1963..a7697b9 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -7,6 +7,7 @@ }, "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, + # TODO add non-negative requirement to match previous implementation? 
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, }, "description": """ From ebebd2789793414f2b45ab74730520e6365d73ce Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 09:11:45 +0000 Subject: [PATCH 029/110] Fix archiver to extractor naming --- src/auto_archiver/modules/instagram_api_extractor/__init__.py | 2 +- .../{instagram_api_archiver.py => instagram_api_extractor.py} | 0 src/auto_archiver/modules/instagram_extractor/__init__.py | 2 +- .../{instagram_archiver.py => instagram_extractor.py} | 0 src/auto_archiver/modules/instagram_tbot_extractor/__init__.py | 2 +- .../{instagram_tbot_archiver.py => instagram_tbot_extractor.py} | 0 src/auto_archiver/modules/telethon_extractor/__init__.py | 2 +- .../{telethon_archiver.py => telethon_extractor.py} | 0 src/auto_archiver/modules/twitter_api_extractor/__init__.py | 2 +- .../{twitter_api_archiver.py => twitter_api_extractor.py} | 0 src/auto_archiver/modules/vk_extractor/__init__.py | 2 +- .../modules/vk_extractor/{vk_archiver.py => vk_extractor.py} | 0 12 files changed, 6 insertions(+), 6 deletions(-) rename src/auto_archiver/modules/instagram_api_extractor/{instagram_api_archiver.py => instagram_api_extractor.py} (100%) rename src/auto_archiver/modules/instagram_extractor/{instagram_archiver.py => instagram_extractor.py} (100%) rename src/auto_archiver/modules/instagram_tbot_extractor/{instagram_tbot_archiver.py => instagram_tbot_extractor.py} (100%) rename src/auto_archiver/modules/telethon_extractor/{telethon_archiver.py => telethon_extractor.py} (100%) rename src/auto_archiver/modules/twitter_api_extractor/{twitter_api_archiver.py => twitter_api_extractor.py} (100%) rename src/auto_archiver/modules/vk_extractor/{vk_archiver.py => vk_extractor.py} (100%) diff --git a/src/auto_archiver/modules/instagram_api_extractor/__init__.py 
b/src/auto_archiver/modules/instagram_api_extractor/__init__.py index 068b8c6..8805c07 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__init__.py @@ -1 +1 @@ -from .instagram_api_archiver import InstagramAPIExtractor +from .instagram_api_extractor import InstagramAPIExtractor diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py similarity index 100% rename from src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py rename to src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py diff --git a/src/auto_archiver/modules/instagram_extractor/__init__.py b/src/auto_archiver/modules/instagram_extractor/__init__.py index 37ec56c..6f39171 100644 --- a/src/auto_archiver/modules/instagram_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_extractor/__init__.py @@ -1 +1 @@ -from .instagram_archiver import InstagramExtractor \ No newline at end of file +from .instagram_extractor import InstagramExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py similarity index 100% rename from src/auto_archiver/modules/instagram_extractor/instagram_archiver.py rename to src/auto_archiver/modules/instagram_extractor/instagram_extractor.py diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py index 1b4dbc3..aa39e63 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py @@ -1 +1 @@ -from .instagram_tbot_archiver import InstagramTbotExtractor +from .instagram_tbot_extractor import InstagramTbotExtractor diff --git 
a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py similarity index 100% rename from src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py rename to src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index 424792f..a837fdf 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_archiver import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonArchiver \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py similarity index 100% rename from src/auto_archiver/modules/telethon_extractor/telethon_archiver.py rename to src/auto_archiver/modules/telethon_extractor/telethon_extractor.py diff --git a/src/auto_archiver/modules/twitter_api_extractor/__init__.py b/src/auto_archiver/modules/twitter_api_extractor/__init__.py index cea3872..7005965 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__init__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__init__.py @@ -1 +1 @@ -from .twitter_api_archiver import TwitterApiExtractor \ No newline at end of file +from .twitter_api_extractor import TwitterApiExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py similarity index 100% rename from src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py rename to src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py diff --git 
a/src/auto_archiver/modules/vk_extractor/__init__.py b/src/auto_archiver/modules/vk_extractor/__init__.py index 29fe59d..0f9bcad 100644 --- a/src/auto_archiver/modules/vk_extractor/__init__.py +++ b/src/auto_archiver/modules/vk_extractor/__init__.py @@ -1 +1 @@ -from .vk_archiver import VkExtractor +from .vk_extractor import VkExtractor diff --git a/src/auto_archiver/modules/vk_extractor/vk_archiver.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py similarity index 100% rename from src/auto_archiver/modules/vk_extractor/vk_archiver.py rename to src/auto_archiver/modules/vk_extractor/vk_extractor.py From 0b03f54f4e662defeaf565db05531d9718a193aa Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 27 Jan 2025 11:00:52 +0100 Subject: [PATCH 030/110] Fix up config validation, and allow for custom 'validators' --- src/auto_archiver/core/orchestrator.py | 6 +++++- src/auto_archiver/core/validators.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 src/auto_archiver/core/validators.py diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b610c24..5daa857 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -20,6 +20,7 @@ from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG from .loader import available_modules, Module, MODULE_TYPES, load_module +from . 
import validators import tempfile, traceback from loguru import logger @@ -163,7 +164,10 @@ class ArchivingOrchestrator: # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something kwargs.pop('cli_set', None) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" - kwargs['type'] = type(kwargs.get('type', 'str')) + try: + kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str')) + except AttributeError: + kwargs['type'] = getattr(validators, kwargs['type']) group.add_argument(f"--{module.name}.{name}", **kwargs) def show_help(self): diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py new file mode 100644 index 0000000..2bd662a --- /dev/null +++ b/src/auto_archiver/core/validators.py @@ -0,0 +1,5 @@ +# used as validators for config values. + +def example_validator(value): + return "example" in value + From 7fd95866a1681d80d58f55815637d2cf7d863f28 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 27 Jan 2025 11:48:04 +0100 Subject: [PATCH 031/110] Further fixes/changes to loading 'types' for config + manifest edits --- src/auto_archiver/base_processors/database.py | 7 ----- src/auto_archiver/base_processors/enricher.py | 9 ------ src/auto_archiver/base_processors/feeder.py | 8 ----- src/auto_archiver/core/config.py | 8 ++--- src/auto_archiver/core/orchestrator.py | 8 ++--- .../modules/cli_feeder/__manifest__.py | 2 ++ .../modules/cli_feeder/cli_feeder.py | 6 ---- .../modules/console_db/console_db.py | 5 ---- .../modules/csv_db/__manifest__.py | 1 + src/auto_archiver/modules/csv_db/csv_db.py | 7 ----- .../modules/csv_feeder/__manifest__.py | 2 ++ .../modules/generic_extractor/__manifest__.py | 2 +- .../modules/gsheet_feeder/__manifest__.py | 2 +- .../modules/hash_enricher/__manifest__.py | 5 +++- .../modules/hash_enricher/hash_enricher.py | 30 ------------------- .../modules/html_formatter/html_formatter.py | 20 ++++++------- .../modules/local_storage/__init__.py | 2 +- 
.../{local.py => local_storage.py} | 7 +++-- .../modules/meta_enricher/meta_enricher.py | 6 ---- .../metadata_enricher/metadata_enricher.py | 5 ---- .../modules/mute_formatter/__manifest__.py | 4 +-- .../modules/mute_formatter/mute_formatter.py | 8 ++--- .../pdq_hash_enricher/pdq_hash_enricher.py | 5 ---- .../modules/s3_storage/__manifest__.py | 2 +- .../modules/ssl_enricher/__manifest__.py | 1 + .../modules/ssl_enricher/ssl_enricher.py | 5 ---- .../telegram_extractor/telegram_extractor.py | 5 ---- .../telethon_extractor/__manifest__.py | 3 +- .../thumbnail_enricher/thumbnail_enricher.py | 7 ----- 29 files changed, 39 insertions(+), 143 deletions(-) rename src/auto_archiver/modules/local_storage/{local.py => local_storage.py} (86%) diff --git a/src/auto_archiver/base_processors/database.py b/src/auto_archiver/base_processors/database.py index 28f0061..6f13208 100644 --- a/src/auto_archiver/base_processors/database.py +++ b/src/auto_archiver/base_processors/database.py @@ -10,13 +10,6 @@ from auto_archiver.core import Metadata, Step class Database(Step, ABC): name = "database" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - def init(name: str, config: dict) -> Database: - # only for typing... - return Step.init(name, config, Database) def started(self, item: Metadata) -> None: """signals the DB that the given item archival has started""" diff --git a/src/auto_archiver/base_processors/enricher.py b/src/auto_archiver/base_processors/enricher.py index d26eedf..3cc1a29 100644 --- a/src/auto_archiver/base_processors/enricher.py +++ b/src/auto_archiver/base_processors/enricher.py @@ -18,14 +18,5 @@ class Enricher(Step, ABC): """Base classes and utilities for enrichers in the Auto-Archiver system.""" name = "enricher" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - - # only for typing... 
- def init(name: str, config: dict) -> Enricher: - return Step.init(name, config, Enricher) - @abstractmethod def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/base_processors/feeder.py b/src/auto_archiver/base_processors/feeder.py index 7fbd6b1..0ff541e 100644 --- a/src/auto_archiver/base_processors/feeder.py +++ b/src/auto_archiver/base_processors/feeder.py @@ -9,13 +9,5 @@ from auto_archiver.core import Step class Feeder(Step): name = "feeder" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - def init(name: str, config: dict) -> Feeder: - # only for code typing - return Step.init(name, config, Feeder) - @abstractmethod def __iter__(self) -> Metadata: return None \ No newline at end of file diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 24f6a61..fd5d49b 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -6,16 +6,12 @@ flexible setup in various environments. 
""" import argparse -from ruamel.yaml import YAML, CommentedMap -from ruamel.yaml.comments import CommentedMap +from ruamel.yaml import YAML, CommentedMap, add_representer -from dataclasses import dataclass, field -from collections import OrderedDict -from collections.abc import Iterable from copy import deepcopy from .loader import MODULE_TYPES -from typing import Any, List +from typing import Any, List, Type # configurable_parents = [ # Feeder, diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 5daa857..2419b03 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -152,12 +152,12 @@ class ArchivingOrchestrator: if not modules: modules = available_modules(with_manifest=True) + module: Module for module in modules: if not module.configs: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) continue - group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") for name, kwargs in module.configs.items(): # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set @@ -165,8 +165,8 @@ class ArchivingOrchestrator: kwargs.pop('cli_set', None) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" try: - kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str')) - except AttributeError: + kwargs['type'] = __builtins__.get(kwargs.get('type'), str) + except KeyError: kwargs['type'] = getattr(validators, kwargs['type']) group.add_argument(f"--{module.name}.{name}", **kwargs) @@ -207,7 +207,7 @@ class ArchivingOrchestrator: exit() if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: - logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. 
Please remove one of the following from your configuration file: {modules_to_load}") + logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") exit() for i, module in enumerate(modules_to_load): diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index febebd0..1769a60 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -5,10 +5,12 @@ "external_dependencies": { "python": ["loguru"], }, + 'entry_point': 'cli_feeder::CLIFeeder', "configs": { "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + "nargs": "+", }, }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 7d0d01f..c5f3b23 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -7,12 +7,6 @@ from auto_archiver.core import Metadata, ArchivingContext class CLIFeeder(Feeder): name = "cli_feeder" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - if type(self.urls) != list or len(self.urls) == 0: - raise Exception("CLI Feeder did not receive any URL to process") - def __iter__(self) -> Metadata: for url in self.urls: logger.debug(f"Processing {url}") diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index 9dfeb2c..c581552 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -8,11 +8,6 @@ class ConsoleDb(Database): """ Outputs results to the console """ - name = "console_db" - - def __init__(self, config: dict) -> None: - 
# without this STEP.__init__ is not called - super().__init__(config) def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") diff --git a/src/auto_archiver/modules/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py index 1fe2d7d..d97d179 100644 --- a/src/auto_archiver/modules/csv_db/__manifest__.py +++ b/src/auto_archiver/modules/csv_db/__manifest__.py @@ -4,6 +4,7 @@ "requires_setup": False, "external_dependencies": {"python": ["loguru"] }, + 'entry_point': 'csv_db::CSVDb', "configs": { "csv_file": {"default": "db.csv", "help": "CSV file name"} }, diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index eec4ec6..189b137 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -11,13 +11,6 @@ class CSVDb(Database): """ Outputs results to a CSV file """ - name = "csv_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.assert_valid_string("csv_file") - def done(self, item: Metadata, cached: bool=False) -> None: """archival result ready - should be saved to DB""" diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index 4d19b70..81c4dcd 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -6,6 +6,8 @@ "python": ["loguru"], "bin": [""] }, + 'requires_setup': True, + 'entry_point': "csv_feeder::CSVFeeder", "configs": { "files": { "default": None, diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index f46c13c..73c264d 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -2,7 +2,7 @@ 'name': 'Generic Extractor', 'version': '0.1.0', 
'author': 'Bellingcat', - 'type': ['extractor', 'feeder', 'enricher'], + 'type': ['extractor'], 'requires_setup': False, 'dependencies': { 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 685a8fd..e1a89a2 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Google Sheets Feeder", "type": ["feeder"], - "entry_point": "GsheetsFeeder", + "entry_point": "gsheet_feeder::GsheetsFeeder", "requires_setup": True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index a7697b9..6e3cde3 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -8,7 +8,10 @@ "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, # TODO add non-negative requirement to match previous implementation? - "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, + "chunksize": {"default": 1.6e7, + "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB", + 'type': 'positive_number', + }, }, "description": """ Generates cryptographic hashes for media files to ensure data integrity and authenticity. 
diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 8731b06..39ec75c 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -18,36 +18,6 @@ class HashEnricher(Enricher): """ Calculates hashes for Media instances """ - name = "hash_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - algos = self.configs()["algorithm"] - algo_choices = algos["choices"] - if not getattr(self, 'algorithm', None): - if not config.get('algorithm'): - logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}") - self.algorithm = algos["default"] - else: - self.algorithm = config["algorithm"] - - assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." - - if not getattr(self, 'chunksize', None): - if config.get('chunksize'): - self.chunksize = config["chunksize"] - else: - self.chunksize = self.configs()["chunksize"]["default"] - - try: - self.chunksize = int(self.chunksize) - except ValueError: - raise ValueError(f"Invalid chunksize value: {self.chunksize}. 
Must be an integer.") - - assert self.chunksize >= -1, "read length must be non-negative or -1" - - ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 15104b2..afa367b 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -16,17 +16,17 @@ from auto_archiver.utils.misc import random_str @dataclass class HtmlFormatter(Formatter): - name = "html_formatter" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True) - # JinjaHelper class static methods are added as filters - self.environment.filters.update({ - k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) - }) - self.template = self.environment.get_template("html_template.html") + # TODO: fix setting up template with new config method + # def __init__(self, config: dict) -> None: + # # without this STEP.__init__ is not called + # super().__init__(config) + # self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True) + # # JinjaHelper class static methods are added as filters + # self.environment.filters.update({ + # k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) + # }) + # self.template = self.environment.get_template("html_template.html") def format(self, item: Metadata) -> Media: url = item.get_url() diff --git a/src/auto_archiver/modules/local_storage/__init__.py b/src/auto_archiver/modules/local_storage/__init__.py index 
6746373..d23147d 100644 --- a/src/auto_archiver/modules/local_storage/__init__.py +++ b/src/auto_archiver/modules/local_storage/__init__.py @@ -1 +1 @@ -from .local import LocalStorage \ No newline at end of file +from .local_storage import LocalStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local_storage.py similarity index 86% rename from src/auto_archiver/modules/local_storage/local.py rename to src/auto_archiver/modules/local_storage/local_storage.py index 530f111..5d65414 100644 --- a/src/auto_archiver/modules/local_storage/local.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -11,9 +11,10 @@ from auto_archiver.base_processors import Storage class LocalStorage(Storage): name = "local_storage" - def __init__(self, config: dict) -> None: - super().__init__(config) - os.makedirs(self.save_to, exist_ok=True) + def __init__(self) -> None: + super().__init__() + # TODO: fix up passing config values to 'steps' + # os.makedirs(self.save_to, exist_ok=True) def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? 
diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index f9b74f7..fa86818 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -10,12 +10,6 @@ class MetaEnricher(Enricher): """ Adds metadata information about the archive operations, to be included at the end of all enrichments """ - name = "meta_enricher" - - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index cb68b98..20a278f 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -10,11 +10,6 @@ class MetadataEnricher(Enricher): """ Extracts metadata information from files using exiftool. """ - name = "metadata_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: diff --git a/src/auto_archiver/modules/mute_formatter/__manifest__.py b/src/auto_archiver/modules/mute_formatter/__manifest__.py index af3f83a..77f2784 100644 --- a/src/auto_archiver/modules/mute_formatter/__manifest__.py +++ b/src/auto_archiver/modules/mute_formatter/__manifest__.py @@ -1,7 +1,7 @@ -m = { +{ "name": "Mute Formatter", "type": ["formatter"], - "requires_setup": False, + "requires_setup": True, "external_dependencies": { }, "description": """ Default formatter. 
diff --git a/src/auto_archiver/modules/mute_formatter/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py index 19830b1..addb454 100644 --- a/src/auto_archiver/modules/mute_formatter/mute_formatter.py +++ b/src/auto_archiver/modules/mute_formatter/mute_formatter.py @@ -1,16 +1,12 @@ from __future__ import annotations from dataclasses import dataclass -from ..core import Metadata, Media -from . import Formatter +from auto_archiver.core import Metadata, Media +from auto_archiver.base_processors import Formatter @dataclass class MuteFormatter(Formatter): name = "mute_formatter" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - def format(self, item: Metadata) -> Media: return None diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 7e3f467..65b0e59 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -25,11 +25,6 @@ class PdqHashEnricher(Enricher): Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection. Ideally this enrichment is orchestrated to run after the thumbnail_enricher. 
""" - name = "pdq_hash_enricher" - - def __init__(self, config: dict) -> None: - # Without this STEP.__init__ is not called - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index fc41eb3..811c703 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -20,7 +20,7 @@ "region": {"default": None, "help": "S3 region name"}, "key": {"default": None, "help": "S3 API key"}, "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, + "random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. 
Creates a new root folder path `no-dups/`"}, "endpoint_url": { "default": 'https://{region}.digitaloceanspaces.com', "help": "S3 bucket endpoint, {region} are inserted at runtime" diff --git a/src/auto_archiver/modules/ssl_enricher/__manifest__.py b/src/auto_archiver/modules/ssl_enricher/__manifest__.py index f44fc94..ccde957 100644 --- a/src/auto_archiver/modules/ssl_enricher/__manifest__.py +++ b/src/auto_archiver/modules/ssl_enricher/__manifest__.py @@ -5,6 +5,7 @@ "external_dependencies": { "python": ["loguru", "python-slugify"], }, + 'entry_point': 'ssl_enricher::SSLEnricher', "configs": { "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, }, diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 965f699..d15ee95 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -11,11 +11,6 @@ class SSLEnricher(Enricher): """ Retrieves SSL certificate information for a domain, as a file """ - name = "ssl_enricher" - - def __init__(self, config: dict) -> None: - super().__init__(config) - self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived) def enrich(self, to_enrich: Metadata) -> None: if not to_enrich.media and self.skip_when_nothing_archived: return diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index 31bdaca..aa7e46f 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -11,11 +11,6 @@ class TelegramExtractor(Extractor): Extractor for telegram that does not require login, but the telethon_extractor is much more advised, will only return if at least one image or one video is found """ - name = "telegram_extractor" - - def __init__(self, 
config: dict) -> None: - super().__init__(config) - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 5d71fdd..2cf1e42 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -1,4 +1,3 @@ -import json { "name": "telethon_extractor", "type": ["extractor"], @@ -42,4 +41,4 @@ To use the `TelethonExtractor`, you must configure the following: - **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup. """ -} +} \ No newline at end of file diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 8c34502..4a5a1db 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -18,13 +18,6 @@ class ThumbnailEnricher(Enricher): """ Generates thumbnails for all the media """ - name = "thumbnail_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60 - self.max_thumbnails = int(self.max_thumbnails) def enrich(self, to_enrich: Metadata) -> None: """ From f68e2726f2a71578404cebf5658503d9051d8a2f Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 27 Jan 2025 14:01:36 +0100 Subject: [PATCH 032/110] Refactor loader + step into module, use LazyBaseModule and BaseModule --- src/auto_archiver/base_processors/database.py | 6 +- src/auto_archiver/base_processors/enricher.py | 5 +- .../base_processors/extractor.py | 2 +- src/auto_archiver/base_processors/feeder.py | 5 +- .../base_processors/formatter.py | 13 +- src/auto_archiver/base_processors/storage.py | 9 +- 
src/auto_archiver/core/__init__.py | 2 +- src/auto_archiver/core/config.py | 2 +- src/auto_archiver/core/loader.py | 173 ---------------- src/auto_archiver/core/module.py | 196 ++++++++++++++++++ src/auto_archiver/core/orchestrator.py | 30 +-- src/auto_archiver/core/step.py | 11 - src/auto_archiver/core/validators.py | 2 + .../modules/cli_feeder/__manifest__.py | 2 +- .../modules/cli_feeder/cli_feeder.py | 3 +- .../modules/csv_db/__manifest__.py | 2 +- 16 files changed, 232 insertions(+), 231 deletions(-) delete mode 100644 src/auto_archiver/core/loader.py create mode 100644 src/auto_archiver/core/module.py delete mode 100644 src/auto_archiver/core/step.py diff --git a/src/auto_archiver/base_processors/database.py b/src/auto_archiver/base_processors/database.py index 6f13208..f7deaef 100644 --- a/src/auto_archiver/base_processors/database.py +++ b/src/auto_archiver/base_processors/database.py @@ -3,13 +3,11 @@ from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from auto_archiver.core import Metadata, Step +from auto_archiver.core import Metadata, BaseModule @dataclass -class Database(Step, ABC): - - name = "database" +class Database(BaseModule): def started(self, item: Metadata) -> None: """signals the DB that the given item archival has started""" diff --git a/src/auto_archiver/base_processors/enricher.py b/src/auto_archiver/base_processors/enricher.py index 3cc1a29..fe0d05f 100644 --- a/src/auto_archiver/base_processors/enricher.py +++ b/src/auto_archiver/base_processors/enricher.py @@ -11,12 +11,11 @@ Enrichers are optional but highly useful for making the archived data more power from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC -from auto_archiver.core import Metadata, Step +from auto_archiver.core import Metadata, BaseModule @dataclass -class Enricher(Step, ABC): +class Enricher(BaseModule): """Base classes and utilities for enrichers in the Auto-Archiver 
system.""" - name = "enricher" @abstractmethod def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/base_processors/extractor.py b/src/auto_archiver/base_processors/extractor.py index c772325..321b053 100644 --- a/src/auto_archiver/base_processors/extractor.py +++ b/src/auto_archiver/base_processors/extractor.py @@ -25,7 +25,7 @@ class Extractor: Subclasses must implement the `download` method to define platform-specific behavior. """ - def setup(self) -> None: + def setup(self, *args, **kwargs) -> None: # used when extractors need to login or do other one-time setup pass diff --git a/src/auto_archiver/base_processors/feeder.py b/src/auto_archiver/base_processors/feeder.py index 0ff541e..e539f5f 100644 --- a/src/auto_archiver/base_processors/feeder.py +++ b/src/auto_archiver/base_processors/feeder.py @@ -2,12 +2,11 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod from auto_archiver.core import Metadata -from auto_archiver.core import Step +from auto_archiver.core import BaseModule @dataclass -class Feeder(Step): - name = "feeder" +class Feeder(BaseModule): @abstractmethod def __iter__(self) -> Metadata: return None \ No newline at end of file diff --git a/src/auto_archiver/base_processors/formatter.py b/src/auto_archiver/base_processors/formatter.py index 4c59af8..beb0c0d 100644 --- a/src/auto_archiver/base_processors/formatter.py +++ b/src/auto_archiver/base_processors/formatter.py @@ -1,20 +1,11 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from auto_archiver.core import Metadata, Media, Step +from auto_archiver.core import Metadata, Media, BaseModule @dataclass -class Formatter(Step): - name = "formatter" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - def init(name: str, config: dict) -> Formatter: - # only for code typing - return Step.init(name, 
config, Formatter) +class Formatter(BaseModule): @abstractmethod def format(self, item: Metadata) -> Media: return None \ No newline at end of file diff --git a/src/auto_archiver/base_processors/storage.py b/src/auto_archiver/base_processors/storage.py index da6b2ef..e167024 100644 --- a/src/auto_archiver/base_processors/storage.py +++ b/src/auto_archiver/base_processors/storage.py @@ -6,19 +6,14 @@ import os from auto_archiver.utils.misc import random_str -from auto_archiver.core import Media, Step, ArchivingContext, Metadata +from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from loguru import logger from slugify import slugify @dataclass -class Storage(Step): - name = "storage" - - def init(name: str, config: dict) -> Storage: - # only for typing... - return Step.init(name, config, Storage) +class Storage(BaseModule): def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None: if media.is_stored(): diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index b78df83..10213b2 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -3,7 +3,7 @@ """ from .metadata import Metadata from .media import Media -from .step import Step +from .module import BaseModule from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index fd5d49b..81a1c10 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -9,7 +9,7 @@ import argparse from ruamel.yaml import YAML, CommentedMap, add_representer from copy import deepcopy -from .loader import MODULE_TYPES +from .module import MODULE_TYPES from typing import Any, List, Type diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py deleted file mode 100644 index 
bbd686e..0000000 --- a/src/auto_archiver/core/loader.py +++ /dev/null @@ -1,173 +0,0 @@ -import ast -from typing import Type -from importlib.util import find_spec -from dataclasses import dataclass -import os -import copy -from os.path import join, dirname -from typing import List -from loguru import logger -import sys -import shutil - -_LOADED_MODULES = {} - -MODULE_TYPES = [ - 'feeder', - 'enricher', - 'extractor', - 'database', - 'storage', - 'formatter' -] - -MANIFEST_FILE = "__manifest__.py" -_DEFAULT_MANIFEST = { - 'name': '', - 'author': 'Bellingcat', - 'type': [], - 'requires_setup': True, - 'description': '', - 'dependencies': {}, - 'entry_point': '', - 'version': '1.0', - 'configs': {} -} - -@dataclass -class Module: - name: str - display_name: str - type: list - dependencies: dict - requires_setup: bool - configs: dict - description: str - path: str - manifest: dict - - def __init__(self, module_name, path, manifest): - self.name = module_name - self.path = path - self.manifest = manifest - if manifest: - self.display_name = manifest['name'] - self.type = manifest['type'] - self._entry_point = manifest['entry_point'] - self.dependencies = manifest['dependencies'] - self.requires_setup = manifest['requires_setup'] - self.configs = manifest['configs'] - self.description = manifest['description'] - - @property - def entry_point(self): - if not self._entry_point: - # try to create the entry point from the module name - self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}" - return self._entry_point - - def __repr__(self): - return f"Module<'{self.display_name}' ({self.name})>" - -def load_module(module: str) -> object: # TODO: change return type to Step - - if module in _LOADED_MODULES: - return _LOADED_MODULES[module] - - # load a module by name - module = get_module(module) - if not module: - return None - # check external dependencies are installed - def check_deps(deps, check): - for dep in deps: - if not check(dep): 
- logger.error(f"Module '{module.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{module.name}' module? See the README for more information.") - exit(1) - - check_deps(module.dependencies.get('python', []), lambda dep: find_spec(dep)) - check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - - qualname = f'auto_archiver.modules.{module.name}' - - logger.info(f"Loading module '{module.display_name}'...") - # first import the whole module, to make sure it's working properly - __import__(qualname) - - - # then import the file for the entry point - file_name, class_name = module.entry_point.split('::') - sub_qualname = f'{qualname}.{file_name}' - - __import__(f'{qualname}.{file_name}', fromlist=[module.entry_point]) - - # finally, get the class instance - instance = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = module.name - - _LOADED_MODULES[module.name] = instance - return _LOADED_MODULES[module.name] - - - # finally, load the module - -def load_manifest(module_path): - # print(f"Loading manifest for module {module_path}") - # load the manifest file - manifest = copy.deepcopy(_DEFAULT_MANIFEST) - - with open(join(module_path, MANIFEST_FILE)) as f: - try: - manifest.update(ast.literal_eval(f.read())) - except ( ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: - logger.error(f"Error loading manifest from file {module_path}/{MANIFEST_FILE}: {e}") - return manifest - return manifest - -def get_module(module_name): - # get a module by name - try: - return available_modules(limit_to_modules=[module_name], with_manifest=True)[0] - except IndexError: - return None - -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[Module]: - # search through all valid 'modules' paths. 
Default is 'modules' in the current directory - - # see odoo/modules/module.py -> get_modules - def is_really_module(name): - if os.path.isfile(join(name, MANIFEST_FILE)): - return True - - default_path = [join(dirname(dirname((__file__))), "modules")] - all_modules = [] - - for module_folder in default_path + additional_paths: - # walk through each module in module_folder and check if it has a valid manifest - try: - possible_modules = os.listdir(module_folder) - except FileNotFoundError: - logger.warning(f"Module folder {module_folder} does not exist") - continue - - for possible_module in possible_modules: - if limit_to_modules and possible_module not in limit_to_modules: - continue - - possible_module_path = join(module_folder, possible_module) - if not is_really_module(possible_module_path): - continue - # parse manifest and add to list of available modules - if with_manifest: - manifest = load_manifest(possible_module_path) - else: - manifest = {} - all_modules.append(Module(possible_module, possible_module_path, manifest)) - - if not suppress_warnings: - for module in limit_to_modules: - if not any(module == m.name for m in all_modules): - logger.warning(f"Module '{module}' not found. Are you sure it's installed?") - - return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py new file mode 100644 index 0000000..96a8e5e --- /dev/null +++ b/src/auto_archiver/core/module.py @@ -0,0 +1,196 @@ +""" +Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline +by handling user configuration, validating the steps properties, and implementing dynamic instantiation. 
+ +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import List +from abc import ABC +import shutil +import ast +import copy +import sys +from importlib.util import find_spec +import os +from os.path import join, dirname +from loguru import logger + +_LAZY_LOADED_MODULES = {} + +MODULE_TYPES = [ + 'feeder', + 'extractor', + 'enricher', + 'database', + 'storage', + 'formatter' +] + +MANIFEST_FILE = "__manifest__.py" +_DEFAULT_MANIFEST = { + 'name': '', + 'author': 'Bellingcat', + 'type': [], + 'requires_setup': True, + 'description': '', + 'dependencies': {}, + 'entry_point': '', + 'version': '1.0', + 'configs': {} +} + +class BaseModule(ABC): + + config: dict + name: str + + def setup(self, config: dict): + self.config = config + for key, val in config.get(self.name, {}).items(): + setattr(self, key, val) + +def get_module(module_name: str, additional_paths: List[str] = []): + if module_name in _LAZY_LOADED_MODULES: + return _LAZY_LOADED_MODULES[module_name] + + module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0] + _LAZY_LOADED_MODULES[module_name] = module + return module + +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + # search through all valid 'modules' paths. 
Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(module_path): + if os.path.isfile(join(module_path, MANIFEST_FILE)): + return True + + default_path = [join(dirname(dirname((__file__))), "modules")] + all_modules = [] + + for module_folder in default_path + additional_paths: + # walk through each module in module_folder and check if it has a valid manifest + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") + continue + + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue + + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): + continue + + all_modules.append(LazyBaseModule(possible_module, possible_module_path)) + + if not suppress_warnings: + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module '{module}' not found. 
Are you sure it's installed?") + + return all_modules + +@dataclass +class LazyBaseModule: + name: str + display_name: str + type: list + requires_setup: bool + description: str + path: str + + _manifest: dict = None + _instance: BaseModule = None + _entry_point: str = None + + def __init__(self, module_name, path): + self.name = module_name + self.path = path + + @property + def entry_point(self): + if not self._entry_point and not self.manifest['entry_point']: + # try to create the entry point from the module name + self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}" + return self._entry_point + + @property + def dependencies(self): + return self.manifest['dependencies'] + + @property + def configs(self): + return self.manifest['configs'] + + @property + def manifest(self): + if self._manifest: + return self._manifest + # print(f"Loading manifest for module {module_path}") + # load the manifest file + manifest = copy.deepcopy(_DEFAULT_MANIFEST) + + with open(join(self.path, MANIFEST_FILE)) as f: + try: + manifest.update(ast.literal_eval(f.read())) + except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: + logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") + + self._manifest = manifest + self.display_name = manifest['name'] + self.type = manifest['type'] + self._entry_point = manifest['entry_point'] + self.requires_setup = manifest['requires_setup'] + self.description = manifest['description'] + + return manifest + + def load(self): + if self._instance: + return self._instance + + # check external dependencies are installed + def check_deps(deps, check): + for dep in deps: + if not check(dep): + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? 
See the README for more information.") + exit(1) + + check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) + check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) + + + logger.debug(f"Loading module '{self.display_name}'...") + + for qualname in [self.name, f'auto_archiver.modules.{self.name}']: + try: + # first import the whole module, to make sure it's working properly + __import__(qualname) + break + except ImportError: + pass + + # then import the file for the entry point + file_name, class_name = self.entry_point.split('::') + sub_qualname = f'{qualname}.{file_name}' + + __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) + + # finally, get the class instance + instance = getattr(sys.modules[sub_qualname], class_name)() + if not getattr(instance, 'name', None): + instance.name = self.name + + if not getattr(instance, 'display_name', None): + instance.display_name = self.display_name + + self._instance = instance + return instance + + def __repr__(self): + return f"Module<'{self.display_name}' ({self.name})>" \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 2419b03..2a5cf4a 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,8 +19,9 @@ from .context import ArchivingContext from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES, load_module +from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module from . 
import validators +from .module import BaseModule import tempfile, traceback from loguru import logger @@ -107,7 +108,7 @@ class ArchivingOrchestrator: else: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -147,22 +148,27 @@ class ArchivingOrchestrator: parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) - def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): + # additional modules + parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction) + + def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None): if not modules: modules = available_modules(with_manifest=True) - module: Module + module: LazyBaseModule for module in modules: if not module.configs: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) 
continue group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") + for name, kwargs in module.configs.items(): # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something kwargs.pop('cli_set', None) + kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" try: kwargs['type'] = __builtins__.get(kwargs.get('type'), str) @@ -210,10 +216,11 @@ class ArchivingOrchestrator: logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") exit() - for i, module in enumerate(modules_to_load): + for module in modules_to_load: if module in invalid_modules: continue - loaded_module = load_module(module) + loaded_module: BaseModule = get_module(module).load() + loaded_module.setup(self.config) if not loaded_module: invalid_modules.append(module) continue @@ -238,6 +245,8 @@ class ArchivingOrchestrator: if basic_config.help: self.show_help() + logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") + # load the config file yaml_config = {} @@ -252,12 +261,9 @@ class ArchivingOrchestrator: self.install_modules() - logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders'])) - logger.info("EXTRACTORS: " + ", ".join(m.name for m in self.config['steps']['extractors'])) - logger.info("ENRICHERS: " + ", ".join(m.name for m in self.config['steps']['enrichers'])) - logger.info("DATABASES: " + ", ".join(m.name for m in self.config['steps']['databases'])) - logger.info("STORAGES: " + ", ".join(m.name for m in self.config['steps']['storages'])) - logger.info("FORMATTERS: " + ", ".join(m.name for m in self.config['steps']['formatters'])) + # log out the modules that were loaded + for module_type in MODULE_TYPES: + logger.info(f"{module_type.upper()}S: 
" + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"])) for item in self.feed(): pass diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py deleted file mode 100644 index 2be99c1..0000000 --- a/src/auto_archiver/core/step.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline -by handling user configuration, validating the steps properties, and implementing dynamic instantiation. - -""" - -from __future__ import annotations - -class Step: - # Nothing to see here :) - pass \ No newline at end of file diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py index 2bd662a..681d564 100644 --- a/src/auto_archiver/core/validators.py +++ b/src/auto_archiver/core/validators.py @@ -3,3 +3,5 @@ def example_validator(value): return "example" in value +def positive_number(value): + return value > 0 \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 1769a60..4790a25 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -8,9 +8,9 @@ 'entry_point': 'cli_feeder::CLIFeeder', "configs": { "urls": { - "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", "nargs": "+", + "required": True, }, }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index c5f3b23..09c46d4 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -5,11 +5,10 @@ from auto_archiver.core import Metadata, ArchivingContext class CLIFeeder(Feeder): - name = "cli_feeder" def __iter__(self) -> Metadata: for url in self.urls: - logger.debug(f"Processing {url}") + 
logger.debug(f"Processing URL: '{url}'") yield Metadata().set_url(url) ArchivingContext.set("folder", "cli") diff --git a/src/auto_archiver/modules/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py index d97d179..3131188 100644 --- a/src/auto_archiver/modules/csv_db/__manifest__.py +++ b/src/auto_archiver/modules/csv_db/__manifest__.py @@ -1,5 +1,5 @@ { - "name": "csv_db", + "name": "CSV Database", "type": ["database"], "requires_setup": False, "external_dependencies": {"python": ["loguru"] From e3074013d01cae74f722732124d188871a43f7fc Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 27 Jan 2025 14:28:04 +0100 Subject: [PATCH 033/110] Fix loading/saving to orchestration file with comments --- src/auto_archiver/core/config.py | 52 +++++++------------------- src/auto_archiver/core/module.py | 8 ++-- src/auto_archiver/core/orchestrator.py | 7 ++-- 3 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 81a1c10..f724828 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -13,47 +13,23 @@ from .module import MODULE_TYPES from typing import Any, List, Type -# configurable_parents = [ -# Feeder, -# Enricher, -# Extractor, -# Database, -# Storage, -# Formatter -# # Util -# ] -# feeder: Feeder -# formatter: Formatter -# extractors: List[Extractor] = field(default_factory=[]) -# enrichers: List[Enricher] = field(default_factory=[]) -# storages: List[Storage] = field(default_factory=[]) -# databases: List[Database] = field(default_factory=[]) +yaml = YAML() -# def __init__(self) -> None: -# self.defaults = {} -# self.cli_ops = {} -# self.config = {} +EMPTY_CONFIG = yaml.load(""" +# Auto Archiver Configuration +# Steps are the modules that will be run in the order they are defined - # def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}): - # """ - # if yaml_config_filename is provided, 
the --config argument is ignored, - # useful for library usage when the config values are preloaded - # overwrite_configs is a dict that overwrites the yaml file contents - # """ - # # 1. parse CLI values - # if use_cli: - # parser = argparse.ArgumentParser( - # # prog = "auto-archiver", - # description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", - # epilog="Check the code at https://github.com/bellingcat/auto-archiver" - # ) +steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \ +""" - # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') - # parser.add_argument('--version', action='version', version=__version__) +# Global configuration +# These are the global configurations that are used by the modules + +logging: + level: INFO +""") +# note: 'logging' is explicitly added above in order to better format the config file -EMPTY_CONFIG = CommentedMap(**{ - "steps": dict((f"{module_type}s", []) for module_type in MODULE_TYPES) -}) def to_dot_notation(yaml_conf: CommentedMap | dict) -> argparse.ArgumentParser: dotdict = {} @@ -112,8 +88,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap: return yaml_dict -yaml = YAML() - def read_yaml(yaml_filename: str) -> CommentedMap: config = None try: diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 96a8e5e..29f9769 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -100,7 +100,6 @@ class LazyBaseModule: name: str display_name: str type: list - requires_setup: bool description: str path: str @@ -111,7 +110,7 @@ class LazyBaseModule: def __init__(self, module_name, path): self.name = module_name self.path = path - 
+ @property def entry_point(self): if not self._entry_point and not self.manifest['entry_point']: @@ -126,6 +125,10 @@ class LazyBaseModule: @property def configs(self): return self.manifest['configs'] + + @property + def requires_setup(self): + return self.manifest['requires_setup'] @property def manifest(self): @@ -145,7 +148,6 @@ class LazyBaseModule: self.display_name = manifest['name'] self.type = manifest['type'] self._entry_point = manifest['entry_point'] - self.requires_setup = manifest['requires_setup'] self.description = manifest['description'] return manifest diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 2a5cf4a..967f652 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -168,13 +168,14 @@ class ArchivingOrchestrator: # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something kwargs.pop('cli_set', None) - + should_store = kwargs.pop('should_store', False) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" try: kwargs['type'] = __builtins__.get(kwargs.get('type'), str) except KeyError: kwargs['type'] = getattr(validators, kwargs['type']) - group.add_argument(f"--{module.name}.{name}", **kwargs) + arg = group.add_argument(f"--{module.name}.{name}", **kwargs) + arg.should_store = should_store def show_help(self): # for the help message, we want to load *all* possible modules and show the help @@ -255,7 +256,7 @@ class ArchivingOrchestrator: exit() yaml_config = read_yaml(basic_config.config_file) - + self.setup_complete_parser(basic_config, yaml_config, unused_args) From e1a937333666862217ddda1e9baea869535d3377 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 19:03:02 +0000 Subject: [PATCH 034/110] Refactoring for new config setup --- poetry.lock | 55 ++++++------- 
src/auto_archiver/base_processors/__init__.py | 6 -- src/auto_archiver/core/__init__.py | 9 ++- .../{base_processors => core}/database.py | 0 .../{base_processors => core}/enricher.py | 0 .../{base_processors => core}/extractor.py | 8 +- .../{base_processors => core}/feeder.py | 0 .../{base_processors => core}/formatter.py | 0 src/auto_archiver/core/module.py | 69 ++++++++-------- src/auto_archiver/core/orchestrator.py | 27 ++++--- .../{base_processors => core}/storage.py | 0 src/auto_archiver/modules/api_db/api_db.py | 3 +- src/auto_archiver/modules/atlos/atlos.py | 6 +- .../modules/atlos_db/atlos_db.py | 8 +- .../modules/atlos_feeder/atlos_feeder.py | 3 +- .../modules/cli_feeder/cli_feeder.py | 2 +- .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 4 +- .../modules/gdrive_storage/gdrive_storage.py | 3 +- .../modules/generic_extractor/__manifest__.py | 80 ++++++++++++++----- .../modules/generic_extractor/bluesky.py | 2 +- .../modules/generic_extractor/dropin.py | 2 +- .../generic_extractor/generic_extractor.py | 7 +- .../modules/generic_extractor/truth.py | 2 +- .../modules/generic_extractor/twitter.py | 2 +- .../modules/gsheet_db/gsheet_db.py | 7 +- .../modules/gsheet_feeder/gsheet_feeder.py | 3 +- .../modules/hash_enricher/__manifest__.py | 4 +- .../modules/hash_enricher/hash_enricher.py | 13 ++- .../modules/html_formatter/html_formatter.py | 28 ++++--- .../instagram_api_extractor.py | 4 +- .../instagram_extractor.py | 4 +- .../instagram_tbot_extractor.py | 9 +-- .../modules/local_storage/local_storage.py | 7 +- .../modules/meta_enricher/meta_enricher.py | 2 +- .../metadata_enricher/metadata_enricher.py | 2 +- .../modules/mute_formatter/mute_formatter.py | 3 +- .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- src/auto_archiver/modules/s3_storage/s3.py | 5 +- .../screenshot_enricher.py | 6 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- 
.../telegram_extractor/telegram_extractor.py | 2 +- .../telethon_extractor/telethon_extractor.py | 8 +- .../thumbnail_enricher/thumbnail_enricher.py | 4 +- .../timestamping_enricher.py | 8 +- .../twitter_api_extractor.py | 3 +- .../modules/vk_extractor/vk_extractor.py | 3 +- .../modules/wacz_enricher/wacz_enricher.py | 7 +- .../wayback_enricher/wayback_enricher.py | 9 +-- .../whisper_enricher/whisper_enricher.py | 10 +-- src/auto_archiver/utils/gsheet.py | 4 +- 52 files changed, 219 insertions(+), 242 deletions(-) delete mode 100644 src/auto_archiver/base_processors/__init__.py rename src/auto_archiver/{base_processors => core}/database.py (100%) rename src/auto_archiver/{base_processors => core}/enricher.py (100%) rename src/auto_archiver/{base_processors => core}/extractor.py (94%) rename src/auto_archiver/{base_processors => core}/feeder.py (100%) rename src/auto_archiver/{base_processors => core}/formatter.py (100%) rename src/auto_archiver/{base_processors => core}/storage.py (100%) diff --git a/poetry.lock b/poetry.lock index 128ede2..6d6ad8c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -64,14 +64,14 @@ typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} [[package]] name = "attrs" -version = "24.3.0" +version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, - {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] @@ -152,18 +152,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.3" +version = "1.36.6" 
description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"}, - {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"}, + {file = "boto3-1.36.6-py3-none-any.whl", hash = "sha256:6d473f0f340d02b4e9ad5b8e68786a09728101a8b950231b89ebdaf72b6dca21"}, + {file = "boto3-1.36.6.tar.gz", hash = "sha256:b36feae061dc0793cf311468956a0a9e99215ce38bc99a1a4e55a5b105f16297"}, ] [package.dependencies] -botocore = ">=1.36.3,<1.37.0" +botocore = ">=1.36.6,<1.37.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -172,14 +172,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.3" +version = "1.36.6" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"}, - {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"}, + {file = "botocore-1.36.6-py3-none-any.whl", hash = "sha256:f77bbbb03fb420e260174650fb5c0cc142ec20a96967734eed2b0ef24334ef34"}, + {file = "botocore-1.36.6.tar.gz", hash = "sha256:4864c53d638da191a34daf3ede3ff1371a3719d952cc0c6bd24ce2836a38dd77"}, ] [package.dependencies] @@ -798,14 +798,14 @@ uritemplate = ">=3.0.1,<5" [[package]] name = "google-auth" -version = "2.37.0" +version = "2.38.0" description = "Google Authentication Library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0"}, - {file = "google_auth-2.37.0.tar.gz", hash = 
"sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00"}, + {file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"}, + {file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"}, ] [package.dependencies] @@ -958,13 +958,14 @@ files = [ [[package]] name = "instaloader" -version = "4.14" +version = "4.14.1" description = "Download pictures (or videos) along with their captions and other metadata from Instagram." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "instaloader-4.14.tar.gz", hash = "sha256:754425eb17af44ce4bb6056e4eacd044a518d13b5efc11b9d80eb229bb96c652"}, + {file = "instaloader-4.14.1-py3-none-any.whl", hash = "sha256:43356f696231621ea5a93354f9a4578124fe131940ee9aa1e83c20f57e18f26d"}, + {file = "instaloader-4.14.1.tar.gz", hash = "sha256:a41a7372a18fb096b3ed545469479884de9cf768e12020c0e0e67c488d9d599c"}, ] [package.dependencies] @@ -1135,14 +1136,14 @@ files = [ [[package]] name = "marshmallow" -version = "3.25.1" +version = "3.26.0" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "marshmallow-3.25.1-py3-none-any.whl", hash = "sha256:ec5d00d873ce473b7f2ffcb7104286a376c354cab0c2fa12f5573dab03e87210"}, - {file = "marshmallow-3.25.1.tar.gz", hash = "sha256:f4debda3bb11153d81ac34b0d582bf23053055ee11e791b54b4b35493468040a"}, + {file = "marshmallow-3.26.0-py3-none-any.whl", hash = "sha256:1287bca04e6a5f4094822ac153c03da5e214a0a60bcd557b140f3e66991b8ca1"}, + {file = "marshmallow-3.26.0.tar.gz", hash = "sha256:eb36762a1cc76d7abf831e18a3a1b26d3d481bbc74581b8e532a3d3a8115e1cb"}, ] [package.dependencies] @@ -2087,14 +2088,14 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.11.1" +version = "0.11.2" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"}, - {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"}, + {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, + {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, ] [package.dependencies] @@ -2105,14 +2106,14 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] [[package]] name = "selenium" -version = "4.28.0" +version = "4.28.1" description = "Official Python bindings for Selenium WebDriver" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"}, - {file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"}, + {file = "selenium-4.28.1-py3-none-any.whl", hash = 
"sha256:4238847e45e24e4472cfcf3554427512c7aab9443396435b1623ef406fff1cc1"}, + {file = "selenium-4.28.1.tar.gz", hash = "sha256:0072d08670d7ec32db901bd0107695a330cecac9f196e3afb3fa8163026e022a"}, ] [package.dependencies] @@ -2421,14 +2422,14 @@ test = ["pytest"] [[package]] name = "starlette" -version = "0.45.2" +version = "0.45.3" description = "The little ASGI library that shines." optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "starlette-0.45.2-py3-none-any.whl", hash = "sha256:4daec3356fb0cb1e723a5235e5beaf375d2259af27532958e2d79df549dad9da"}, - {file = "starlette-0.45.2.tar.gz", hash = "sha256:bba1831d15ae5212b22feab2f218bab6ed3cd0fc2dc1d4442443bb1ee52260e0"}, + {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"}, + {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"}, ] [package.dependencies] diff --git a/src/auto_archiver/base_processors/__init__.py b/src/auto_archiver/base_processors/__init__.py deleted file mode 100644 index 4995457..0000000 --- a/src/auto_archiver/base_processors/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .database import Database -from .enricher import Enricher -from .feeder import Feeder -from .storage import Storage -from .extractor import Extractor -from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 10213b2..858bdfd 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -8,4 +8,11 @@ from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config \ No newline at end of file +# from .config import Config + +from .database import Database +from .enricher import Enricher +from .feeder import Feeder +from 
.storage import Storage +from .extractor import Extractor +from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/base_processors/database.py b/src/auto_archiver/core/database.py similarity index 100% rename from src/auto_archiver/base_processors/database.py rename to src/auto_archiver/core/database.py diff --git a/src/auto_archiver/base_processors/enricher.py b/src/auto_archiver/core/enricher.py similarity index 100% rename from src/auto_archiver/base_processors/enricher.py rename to src/auto_archiver/core/enricher.py diff --git a/src/auto_archiver/base_processors/extractor.py b/src/auto_archiver/core/extractor.py similarity index 94% rename from src/auto_archiver/base_processors/extractor.py rename to src/auto_archiver/core/extractor.py index 321b053..8d509ec 100644 --- a/src/auto_archiver/base_processors/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -15,20 +15,16 @@ import mimetypes, requests from loguru import logger from retrying import retry -from ..core import Metadata, ArchivingContext +from ..core import Metadata, ArchivingContext, BaseModule @dataclass -class Extractor: +class Extractor(BaseModule): """ Base class for implementing extractors in the media archiving framework. Subclasses must implement the `download` method to define platform-specific behavior. 
""" - def setup(self, *args, **kwargs) -> None: - # used when extractors need to login or do other one-time setup - pass - def cleanup(self) -> None: # called when extractors are done, or upon errors, cleanup any resources pass diff --git a/src/auto_archiver/base_processors/feeder.py b/src/auto_archiver/core/feeder.py similarity index 100% rename from src/auto_archiver/base_processors/feeder.py rename to src/auto_archiver/core/feeder.py diff --git a/src/auto_archiver/base_processors/formatter.py b/src/auto_archiver/core/formatter.py similarity index 100% rename from src/auto_archiver/base_processors/formatter.py rename to src/auto_archiver/core/formatter.py diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 29f9769..3ef43e5 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -153,46 +153,47 @@ class LazyBaseModule: return manifest def load(self): - if self._instance: - return self._instance - # check external dependencies are installed - def check_deps(deps, check): - for dep in deps: - if not check(dep): - logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") - exit(1) - - check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) - check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - + if self._instance: + return self._instance - logger.debug(f"Loading module '{self.display_name}'...") + # check external dependencies are installed + def check_deps(deps, check): + for dep in deps: + if not check(dep): + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? 
See the README for more information.") + exit(1) - for qualname in [self.name, f'auto_archiver.modules.{self.name}']: - try: - # first import the whole module, to make sure it's working properly - __import__(qualname) - break - except ImportError: - pass + check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) + check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - # then import the file for the entry point - file_name, class_name = self.entry_point.split('::') - sub_qualname = f'{qualname}.{file_name}' - __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) - - # finally, get the class instance - instance = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = self.name - - if not getattr(instance, 'display_name', None): - instance.display_name = self.display_name + logger.debug(f"Loading module '{self.display_name}'...") - self._instance = instance - return instance + for qualname in [self.name, f'auto_archiver.modules.{self.name}']: + try: + # first import the whole module, to make sure it's working properly + __import__(qualname) + break + except ImportError: + pass + + # then import the file for the entry point + file_name, class_name = self.entry_point.split('::') + sub_qualname = f'{qualname}.{file_name}' + + __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) + + # finally, get the class instance + instance = getattr(sys.modules[sub_qualname], class_name)() + if not getattr(instance, 'name', None): + instance.name = self.name + + if not getattr(instance, 'display_name', None): + instance.display_name = self.display_name + + self._instance = instance + return instance def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 967f652..4f155db 100644 --- a/src/auto_archiver/core/orchestrator.py 
+++ b/src/auto_archiver/core/orchestrator.py @@ -227,6 +227,10 @@ class ArchivingOrchestrator: continue if loaded_module: step_items.append(loaded_module) + # TODO temp solution + if module_type == "storage": + ArchivingContext.set("storages", step_items, keep_on_reset=True) + check_steps_ok() self.config['steps'][f"{module_type}s"] = step_items @@ -256,10 +260,7 @@ class ArchivingOrchestrator: exit() yaml_config = read_yaml(basic_config.config_file) - - self.setup_complete_parser(basic_config, yaml_config, unused_args) - self.install_modules() # log out the modules that were loaded @@ -301,7 +302,7 @@ class ArchivingOrchestrator: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') for d in self.config['steps']['databases']: if type(e) == AssertionError: d.failed(item, str(e)) - else: d.failed(item) + else: d.failed(item, reason="unexpected error") def archive(self, result: Metadata) -> Union[Metadata, None]: @@ -319,27 +320,27 @@ class ArchivingOrchestrator: # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs url = original_url - for a in self.archivers: url = a.sanitize_url(url) + for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url) result.set_url(url) if original_url != url: result.set("original_url", original_url) # 2 - notify start to DBs, propagate already archived if feature enabled in DBs cached_result = None - for d in self.databases: + for d in self.config["steps"]["databases"]: d.started(result) if (local_result := d.fetch(result)): cached_result = (cached_result or Metadata()).merge(local_result) if cached_result: logger.debug("Found previously archived entry") - for d in self.databases: + for d in self.config["steps"]["databases"]: try: d.done(cached_result, cached=True) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") return cached_result - # 3 - call archivers until one succeeds - for a in self.archivers: - 
logger.info(f"Trying archiver {a.name} for {url}") + # 3 - call extractors until one succeeds + for a in self.config["steps"]["extractors"]: + logger.info(f"Trying extractor {a.name} for {url}") try: result.merge(a.download(result)) if result.is_success(): break @@ -347,7 +348,7 @@ class ArchivingOrchestrator: logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content - for e in self.enrichers: + for e in self.config["steps"]["enrichers"]: try: e.enrich(result) except Exception as exc: logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") @@ -356,7 +357,7 @@ class ArchivingOrchestrator: result.store() # 6 - format and store formatted if needed - if (final_media := self.formatter.format(result)): + if final_media := self.config["steps"]["formatters"][0].format(result): final_media.store(url=url, metadata=result) result.set_final_media(final_media) @@ -364,7 +365,7 @@ class ArchivingOrchestrator: result.status = "nothing archived" # signal completion to databases and archivers - for d in self.databases: + for d in self.config["steps"]["databases"]: try: d.done(result) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") diff --git a/src/auto_archiver/base_processors/storage.py b/src/auto_archiver/core/storage.py similarity index 100% rename from src/auto_archiver/base_processors/storage.py rename to src/auto_archiver/core/storage.py diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index d2b43b7..a893aee 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata @@ -10,7 +10,6 @@ class AAApiDb(Database): """ Connects to 
auto-archiver-api instance """ - name = "auto_archiver_api_db" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 6a175d3..abc8a1a 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -5,15 +5,11 @@ import requests import hashlib from auto_archiver.core import Media, Metadata -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage from auto_archiver.utils import get_atlos_config_options class AtlosStorage(Storage): - name = "atlos_storage" - - def __init__(self, config: dict) -> None: - super().__init__(config) def get_cdn_url(self, _media: Media) -> str: # It's not always possible to provide an exact URL, because it's diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index 2e24491..c45e215 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -6,7 +6,7 @@ from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options @@ -16,12 +16,6 @@ class AtlosDb(Database): Outputs results to Atlos """ - name = "atlos_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - def failed(self, item: Metadata, reason: str) -> None: """Update DB accordingly for failure""" # If the item has no Atlos ID, there's nothing for us to do diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 262f21b..9811a82 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ 
b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,13 +1,12 @@ from loguru import logger import requests -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options class AtlosFeeder(Feeder): - name = "atlos_feeder" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 09c46d4..62cb659 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index c581552..48609b0 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 189b137..b5985e2 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 7bff16e..ad0a035 100644 --- 
a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,14 +1,12 @@ from loguru import logger import csv -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): - name = "csv_feeder" - def __iter__(self) -> Metadata: url_column = self.column or 0 for file in self.files: diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 4bcdb90..c2d326d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -10,11 +10,10 @@ from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request from auto_archiver.core import Media -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage class GDriveStorage(Storage): - name = "gdrive_storage" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 73c264d..d5f363f 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -1,13 +1,13 @@ { - 'name': 'Generic Extractor', - 'version': '0.1.0', - 'author': 'Bellingcat', - 'type': ['extractor'], - 'requires_setup': False, - 'dependencies': { - 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], + "name": "Generic Extractor", + "version": "0.1.0", + "author": "Bellingcat", + "type": ["extractor"], + "requires_setup": False, + "dependencies": { + "python": ["yt_dlp", "requests", "loguru", "slugify"], }, - 'description': """ + "description": """ This is the generic extractor used by auto-archiver, which uses 
`yt-dlp` under the hood. This module is responsible for downloading and processing media content from platforms @@ -28,17 +28,53 @@ the broader archiving framework. custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). """, - 'configs': { - "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, - "subtitles": {"default": True, "help": "download subtitles if available"}, - "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, - "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"}, - "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."}, - "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"}, - "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, - 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, - "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 
'inf' means no limit."}, - "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, - "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, - } -} \ No newline at end of file + "configs": { + "facebook_cookie": { + "default": None, + "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'", + }, + "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"}, + "comments": { + "default": False, + "help": "download all comments if available, may lead to large metadata", + "type": "bool", + }, + "livestreams": { + "default": False, + "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control", + "type": "bool", + }, + "live_from_start": { + "default": False, + "help": "if set, will download live streams from their earliest available moment, otherwise starts now.", + "type": "bool", + }, + "proxy": { + "default": "", + "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port", + }, + "end_means_success": { + "default": True, + "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.", + "type": "bool", + }, + "allow_playlist": { + "default": False, + "help": "If True will also download playlists, set to False if the expectation is to download a single video.", + "type": "bool", + }, + "max_downloads": { + "default": "inf", + "help": 
"Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.", + }, + "cookies_from_browser": { + "default": None, + "type": "str", + "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale", + }, + "cookie_file": { + "default": None, + "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp", + }, + }, +} diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index c75c373..1f92fd8 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 99cd71b..c5749ff 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. 
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 8ceaabc..57924d9 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,11 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from ...core import Metadata, Media, ArchivingContext class GenericExtractor(Extractor): - name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} def suitable_extractors(self, url: str) -> list[str]: @@ -268,7 +267,7 @@ class GenericExtractor(Extractor): if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie - + ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} if item.netloc in ['youtube.com', 'www.youtube.com']: @@ -285,6 +284,6 @@ class GenericExtractor(Extractor): result = self.download_for_extractor(info_extractor, url, ydl) if result: return result - + return False diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index f52a748..e65b4b1 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from 
auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 11399d4..83c1f4f 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 239bc06..e7e8e5c 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -5,7 +5,7 @@ from urllib.parse import quote from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.modules.gsheet_feeder import GWorksheet @@ -15,11 +15,6 @@ class GsheetsDb(Database): NB: only works if GsheetFeeder is used. 
could be updated in the future to support non-GsheetFeeder metadata """ - name = "gsheet_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index b57174f..66dd014 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -14,13 +14,12 @@ import gspread from loguru import logger from slugify import slugify -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from . import GWorksheet class GsheetsFeeder(Feeder): - name = "gsheet_feeder" def __init__(self) -> None: """ diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index 6e3cde3..f306808 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -8,9 +8,9 @@ "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, # TODO add non-negative requirement to match previous implementation? 
- "chunksize": {"default": 1.6e7, + "chunksize": {"default": 16000000, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB", - 'type': 'positive_number', + 'type': 'int', }, }, "description": """ diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 39ec75c..827b65f 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. import hashlib from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext @@ -19,6 +19,17 @@ class HashEnricher(Enricher): Calculates hashes for Media instances """ + def __init__(self, config: dict = None): + """ + Initialize the HashEnricher with a configuration dictionary. + """ + super().__init__() + # TODO set these from the manifest? 
+ # Set default values + self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256" + self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7) + + def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index afa367b..e6e5e58 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -9,24 +9,30 @@ import base64 from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.base_processors import Formatter +from auto_archiver.core import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str @dataclass class HtmlFormatter(Formatter): + environment: Environment = None + template: any = None - # TODO: fix setting up template with new config method - # def __init__(self, config: dict) -> None: - # # without this STEP.__init__ is not called - # super().__init__(config) - # self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True) - # # JinjaHelper class static methods are added as filters - # self.environment.filters.update({ - # k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) - # }) - # self.template = self.environment.get_template("html_template.html") + def setup(self, config: dict) -> None: + """Sets up the Jinja2 environment and loads the template.""" + super().setup(config) # Ensure the base class logic is executed + template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/") + self.environment = Environment(loader=FileSystemLoader(template_dir), 
autoescape=True) + + # JinjaHelper class static methods are added as filters + self.environment.filters.update({ + k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) + }) + + # Load a specific template or default to "html_template.html" + template_name = self.config.get("template_name", "html_template.html") + self.template = self.environment.get_template(template_name) def format(self, item: Metadata) -> Media: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index c1271fc..3d7f9e5 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -16,7 +16,7 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata @@ -28,8 +28,6 @@ class InstagramAPIExtractor(Extractor): # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ - name = "instagram_api_extractor" - global_pattern = re.compile( r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
) diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 2b9bece..1a246fb 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -7,7 +7,7 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media @@ -15,8 +15,6 @@ class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ - name = "instagram_extractor" - # NB: post regex should be tested before profile # https://regex101.com/r/MGPquX/1 post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 36c8a06..60fa397 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -15,7 +15,7 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str @@ -26,13 +26,6 @@ class InstagramTbotExtractor(Extractor): https://github.com/adw0rd/instagrapi https://t.me/instagram_load_bot """ - name = "instagram_tbot_extractor" - - def __init__(self, config: dict) -> 
None: - super().__init__(config) - self.assert_valid_string("api_id") - self.assert_valid_string("api_hash") - self.timeout = int(self.timeout) def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 5d65414..4c44e9c 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -5,17 +5,12 @@ import os from loguru import logger from auto_archiver.core import Media -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage class LocalStorage(Storage): name = "local_storage" - def __init__(self) -> None: - super().__init__() - # TODO: fix up passing config values to 'steps' - # os.makedirs(self.save_to, exist_ok=True) - def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? dest = os.path.join(self.save_to, media.key) diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index fa86818..03fb01e 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 20a278f..c052d0a 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import 
Metadata diff --git a/src/auto_archiver/modules/mute_formatter/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py index addb454..1c7cca2 100644 --- a/src/auto_archiver/modules/mute_formatter/mute_formatter.py +++ b/src/auto_archiver/modules/mute_formatter/mute_formatter.py @@ -2,11 +2,10 @@ from __future__ import annotations from dataclasses import dataclass from auto_archiver.core import Metadata, Media -from auto_archiver.base_processors import Formatter +from auto_archiver.core import Formatter @dataclass class MuteFormatter(Formatter): - name = "mute_formatter" def format(self, item: Metadata) -> Media: return None diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 65b0e59..e812e8b 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index a637259..10d5f61 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -4,14 +4,13 @@ import boto3, os from auto_archiver.utils.misc import random_str from auto_archiver.core import Media -from auto_archiver.base_processors import Storage -# TODO +from auto_archiver.core import Storage + from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger NO_DUPLICATES_FOLDER = "no-dups/" class S3Storage(Storage): - name = "s3_storage" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py 
b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 0140875..be775ce 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,15 +5,11 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext class ScreenshotEnricher(Enricher): - name = "screenshot_enricher" - - def __init__(self, config: dict) -> None: - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index d15ee95..52237ee 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index aa7e46f..d612e24 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py 
b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 8b49a10..f378e7e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -8,21 +8,15 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str class TelethonArchiver(Extractor): - name = "telethon_extractor" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def __init__(self, config: dict) -> None: - super().__init__(config) - self.assert_valid_string("api_id") - self.assert_valid_string("api_hash") - def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 4a5a1db..b27243b 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. 
import ffmpeg, os from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str @@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher): logger.error(f"error getting duration of video {m.filename}: {e}") return - num_thumbs = int(min(max(1, duration * self.thumbnails_per_second), self.max_thumbnails)) + num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails)) timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)] thumbnails_media = [] diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index c90d42c..a7a0aee 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor class TimestampingEnricher(Enricher): @@ -21,10 +21,6 @@ class TimestampingEnricher(Enricher): See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities. 
""" - name = "timestamping_enricher" - - def __init__(self, config: dict) -> None: - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index ea669b4..6a4930a 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -8,11 +8,10 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): - name = "twitter_api_extractor" link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index eb4c171..1bce167 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -2,7 +2,7 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext @@ -11,7 +11,6 @@ class VkExtractor(Extractor): VK videos are handled by YTDownloader, this archiver gets posts text and images. 
Currently only works for /wall posts """ - name = "vk_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 9ba43ae..1eb7398 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,7 +6,7 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.base_processors import Extractor, Enricher +from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str @@ -17,11 +17,6 @@ class WaczExtractorEnricher(Enricher, Extractor): it can become quite powerful for archiving private content. When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. """ - name = "wacz_archiver_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 6942727..0e25440 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,7 +2,7 @@ import json from loguru import logger import time, requests -from auto_archiver.base_processors import Extractor, Enricher +from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata @@ -12,13 +12,6 @@ class WaybackExtractorEnricher(Enricher, Extractor): The Wayback machine will rate-limit IP heavy usage. 
""" - name = "wayback_archiver_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" - assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" def download(self, item: Metadata) -> Metadata: # this new Metadata object is required to avoid duplication diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index d14c537..09eb3db 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,7 +2,7 @@ import traceback import requests, time from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.modules.s3_storage import S3Storage @@ -13,14 +13,6 @@ class WhisperEnricher(Enricher): whisper API repository: https://github.com/bellingcat/whisperbox-transcribe/ Only works if an S3 compatible storage is used """ - name = "whisper_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint" - assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key" - self.timeout = int(self.timeout) def enrich(self, to_enrich: Metadata) -> None: if not self._get_s3_storage(): diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 485344f..7a8862f 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ 
-1,9 +1,9 @@ import json, gspread -from ..core import Step +from ..core import BaseModule -class Gsheets(Step): +class Gsheets(BaseModule): name = "gsheets" def __init__(self, config: dict) -> None: From 6c67effd8c458a451b0549d477027e9b0baf34ce Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 19:17:18 +0000 Subject: [PATCH 035/110] remove name reference in local_storage.py --- src/auto_archiver/modules/local_storage/local_storage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 4c44e9c..b995577 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -9,7 +9,6 @@ from auto_archiver.core import Storage class LocalStorage(Storage): - name = "local_storage" def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? @@ -22,7 +21,7 @@ class LocalStorage(Storage): # override parent so that we can use shutil.copy2 and keep metadata dest = os.path.join(self.save_to, media.key) os.makedirs(os.path.dirname(dest), exist_ok=True) - logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}') + logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') res = shutil.copy2(media.filename, dest) logger.info(res) return True From 57b3bec9351237f24116e91b6b5665a7db300033 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 20:13:12 +0000 Subject: [PATCH 036/110] Google sheets feeder and database implemented. 
--- .../modules/gsheet_db/__manifest__.py | 1 + .../modules/gsheet_feeder/__manifest__.py | 56 ++++++++++--------- .../modules/gsheet_feeder/gsheet_feeder.py | 43 +++----------- 3 files changed, 39 insertions(+), 61 deletions(-) diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index 8c54fe5..f2f1c35 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -1,6 +1,7 @@ { "name": "Google Sheets Database", "type": ["database"], + "entry_point": "gsheet_db::GsheetsDb", "requires_setup": True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index e1a89a2..3d9cb08 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -7,30 +7,36 @@ "python": ["loguru", "gspread", "python-slugify"], }, "configs": { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "type": "auto_archiver.utils.json_loader", + "sheet": 
{"default": None, "help": "name of the sheet to archive"}, + "sheet_id": { + "default": None, + "help": "(alternative to sheet name) the id of the sheet to archive", + }, + "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"}, + "service_account": { + "default": "secrets/service_account.json", + "help": "service account JSON file path", + }, + "columns": { + "default": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", }, + "help": "names of columns in the google sheet (stringified JSON object)", + "type": "auto_archiver.utils.json_loader", + }, "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", @@ -43,7 +49,7 @@ "default": True, "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "type": "bool", - } + }, }, "description": """ GsheetsFeeder @@ -61,5 +67,5 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. - """ + """, } diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 66dd014..235dd63 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -21,41 +21,13 @@ from . import GWorksheet class GsheetsFeeder(Feeder): - def __init__(self) -> None: - """ - Initializes the GsheetsFeeder with preloaded configurations. 
- """ - super().__init__() - # Initialize the gspread client with the provided service account file - # self.gsheets_client = gspread.service_account(filename=self.config["service_account"]) - # - # # Set up feeder-specific configurations from the config - # self.sheet_name = config.get("sheet") - # self.sheet_id = config.get("sheet_id") - # self.header = config.get("header", 1) - # self.columns = config.get("columns", {}) - # assert self.sheet_name or self.sheet_id, ( - # "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." - # ) - - - # # Configuration attributes - # self.sheet = config.get("sheet") - # self.sheet_id = config.get("sheet_id") - # self.header = config.get("header", 1) - # self.columns = config.get("columns", {}) - # self.allow_worksheets = config.get("allow_worksheets", set()) - # self.block_worksheets = config.get("block_worksheets", set()) - # self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True) - - # Ensure the header is an integer - # try: - # self.header = int(self.header) - # except ValueError: - # pass - # assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}" - # assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined." - # + def setup(self, config: dict): + super().setup(config) + self.gsheets_client = gspread.service_account(filename=self.service_account) + # TODO mv to validators + assert self.sheet or self.sheet_id, ( + "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." 
+ ) def open_sheet(self): if self.sheet: @@ -63,7 +35,6 @@ class GsheetsFeeder(Feeder): else: # self.sheet_id return self.gsheets_client.open_by_key(self.sheet_id) - def __iter__(self) -> Metadata: sh = self.open_sheet() for ii, wks in enumerate(sh.worksheets()): From 1d2a1d4db7be58073428eff9a310e1cfc4268b5e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 28 Jan 2025 11:14:12 +0100 Subject: [PATCH 037/110] Allow framework for config settings that should not be stored in config (e.g. cli_feeder.urls Use 'do_not_store': True in the config settings to apply this. Also: fix up generic archiver dropins loading + local_storage defaults (same as what's in example orchestration) --- src/auto_archiver/core/config.py | 15 ++++++-- src/auto_archiver/core/orchestrator.py | 34 +++++++++---------- .../modules/cli_feeder/__manifest__.py | 1 + .../generic_extractor/generic_extractor.py | 3 ++ .../modules/local_storage/__manifest__.py | 6 ++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f724828..f98d64d 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from copy import deepcopy from .module import MODULE_TYPES -from typing import Any, List, Type +from typing import Any, List, Type, Tuple yaml = YAML() @@ -101,6 +101,15 @@ def read_yaml(yaml_filename: str) -> CommentedMap: return config -def store_yaml(config: CommentedMap, yaml_filename: str): +# TODO: make this tidier/find a way to notify of which keys should not be stored + + +def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None: + config_to_save = deepcopy(config) + + for key1, key2 in do_not_store_keys: + if key1 in config_to_save and key2 in config_to_save[key1]: + del config_to_save[key1][key2] + with open(yaml_filename, "w", encoding="utf-8") as outf: 
- yaml.dump(config, outf) \ No newline at end of file + yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 4f155db..bc897ef 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -39,20 +39,7 @@ class UniqueAppendAction(argparse.Action): class ArchivingOrchestrator: - # def __init__(self, config: Config) -> None: - # self.feeder: Feeder = config.feeder - # self.formatter: Formatter = config.formatter - # self.enrichers: List[Enricher] = config.enrichers - # self.archivers: List[Archiver] = config.archivers - # self.databases: List[Database] = config.databases - # self.storages: List[Storage] = config.storages - # ArchivingContext.set("storages", self.storages, keep_on_reset=True) - - # try: - # for a in self.all_archivers_for_setup(): a.setup() - # except (KeyboardInterrupt, Exception) as e: - # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") - # self.cleanup() + _do_not_store_keys = [] def setup_basic_parser(self): parser = argparse.ArgumentParser( @@ -125,10 +112,10 @@ class ArchivingOrchestrator: if unknown: logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") - + if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_yaml(self.config, basic_config.config_file) + store_yaml(self.config, basic_config.config_file, self._do_not_store_keys) return self.config @@ -167,6 +154,10 @@ class ArchivingOrchestrator: for name, kwargs in module.configs.items(): # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something + do_not_store = kwargs.pop('do_not_store', False) + if 
do_not_store: + self._do_not_store_keys.append((module.name, name)) + kwargs.pop('cli_set', None) should_store = kwargs.pop('should_store', False) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" @@ -193,7 +184,7 @@ class ArchivingOrchestrator: logging_config = self.config['logging'] logger.add(sys.stderr, level=logging_config['level']) if log_file := logging_config['file']: - logger.add(log_file, rotation=logging_config['logging.rotation']) + logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) def install_modules(self): @@ -221,7 +212,14 @@ class ArchivingOrchestrator: if module in invalid_modules: continue loaded_module: BaseModule = get_module(module).load() - loaded_module.setup(self.config) + try: + loaded_module.setup(self.config) + except (KeyboardInterrupt, Exception) as e: + logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") + if module_type == 'extractor': + loaded_module.cleanup() + exit() + if not loaded_module: invalid_modules.append(module) continue diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 4790a25..01ef2e7 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -11,6 +11,7 @@ "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", "nargs": "+", "required": True, + "do_not_store": True, }, }, "description": """ diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 57924d9..36fb71e 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -121,6 +121,7 @@ class GenericExtractor(Extractor): ie_instance = info_extractor(downloader=ydl) dropin = 
self.dropin_for_name(info_extractor.ie_key()) + if not dropin: # TODO: add a proper link to 'how to create your own dropin' logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}. @@ -172,6 +173,8 @@ class GenericExtractor(Extractor): def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]: + dropin_name = dropin_name.lower() + if dropin_name == "generic": # no need for a dropin for the generic extractor (?) return None diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index c012be0..ce00953 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -7,16 +7,16 @@ }, "configs": { "path_generator": { - "default": "url", + "default": "flat", "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", "choices": ["flat", "url", "random"], }, "filename_generator": { - "default": "random", + "default": "static", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", "choices": ["random", "static"], }, - "save_to": {"default": "./archived", "help": "folder where to save archived content"}, + "save_to": {"default": "./local_archive", "help": "folder where to save archived content"}, "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. 
formatters (WARN: leaks the file structure)"}, }, "description": """ From 27b25c5bd42cd2724dbeb8301da7a5456a2c3754 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 28 Jan 2025 11:37:23 +0100 Subject: [PATCH 038/110] Validate orchestration.yaml file inputs - so if a user enters invalid values, it also validates them --- src/auto_archiver/core/config.py | 18 +++++++++++++++++- src/auto_archiver/core/orchestrator.py | 16 +++++++++++----- .../modules/cli_feeder/__manifest__.py | 1 + .../modules/wayback_enricher/__manifest__.py | 5 +++-- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f98d64d..245fdaf 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -8,6 +8,8 @@ flexible setup in various environments. import argparse from ruamel.yaml import YAML, CommentedMap, add_representer +from loguru import logger + from copy import deepcopy from .module import MODULE_TYPES @@ -30,8 +32,22 @@ logging: """) # note: 'logging' is explicitly added above in order to better format the config file +class DefaultValidatingParser(argparse.ArgumentParser): + def parse_known_args(self, args=None, namespace=None): + for action in self._actions: + if not namespace or action.dest not in namespace: + if action.default is not None: + try: + self._check_value(action, action.default) + except argparse.ArgumentError as e: + logger.error(f"You have an invalid setting in your configuration file ({action.dest}):") + logger.error(e) + exit() -def to_dot_notation(yaml_conf: CommentedMap | dict) -> argparse.ArgumentParser: + return super().parse_known_args(args, namespace) + + +def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict: dotdict = {} def process_subdict(subdict, prefix=""): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index bc897ef..16cf9c4 100644 --- a/src/auto_archiver/core/orchestrator.py +++ 
b/src/auto_archiver/core/orchestrator.py @@ -18,7 +18,7 @@ from .context import ArchivingContext from .metadata import Metadata from ..version import __version__ -from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG +from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module from . import validators from .module import BaseModule @@ -56,12 +56,12 @@ class ArchivingOrchestrator: parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') # override the default 'help' so we can inject all the configs and show those parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') - parser.add_argument('-s', '--store', dest='store', default=True, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) + parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) self.basic_parser = parser def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: - parser = argparse.ArgumentParser( + parser = DefaultValidatingParser( add_help=False, ) self.add_additional_args(parser) @@ -149,6 +149,7 @@ class ArchivingOrchestrator: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) 
continue + group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") for name, kwargs in module.configs.items(): @@ -157,6 +158,10 @@ class ArchivingOrchestrator: do_not_store = kwargs.pop('do_not_store', False) if do_not_store: self._do_not_store_keys.append((module.name, name)) + + if not kwargs.get('metavar', None): + # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR] + kwargs['metavar'] = name.upper() kwargs.pop('cli_set', None) should_store = kwargs.pop('should_store', False) @@ -248,8 +253,6 @@ class ArchivingOrchestrator: if basic_config.help: self.show_help() - logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - # load the config file yaml_config = {} @@ -257,8 +260,11 @@ class ArchivingOrchestrator: logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") exit() + yaml_config = read_yaml(basic_config.config_file) self.setup_complete_parser(basic_config, yaml_config, unused_args) + + logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") self.install_modules() # log out the modules that were loaded diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 01ef2e7..fe784c3 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -12,6 +12,7 @@ "nargs": "+", "required": True, "do_not_store": True, + "metavar": "INPUT URLS", }, }, "description": """ diff --git a/src/auto_archiver/modules/wayback_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_enricher/__manifest__.py index b3af284..bff10af 100644 --- a/src/auto_archiver/modules/wayback_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wayback_enricher/__manifest__.py @@ -5,11 +5,12 @@ 
"external_dependencies": { "python": ["loguru", "requests"], }, + "entry_point": "wayback_enricher::WaybackExtractorEnricher", "configs": { "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, - "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, - "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}, + "key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, + "secret": {"default": None, "required": True, "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php"}, "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, }, From 9635449ac081870c55af25b1139714afc1aa3486 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 28 Jan 2025 11:44:52 +0100 Subject: [PATCH 039/110] more user friendly error logging when config issues are found --- src/auto_archiver/core/config.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 245fdaf..529e1c2 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -33,7 +33,19 @@ logging: # note: 'logging' is explicitly added above in order to better format the config file class DefaultValidatingParser(argparse.ArgumentParser): + + def error(self, message): + """ + Override of error to format a nicer looking error message using logger + """ + logger.error("Problem with configuration file (tip: use --help to see the available options):") + logger.error(message) + self.exit(2) + def parse_known_args(self, args=None, namespace=None): + """ + Override of parse_known_args to also check the 'defaults' values - which are passed in from the config file + """ for action in self._actions: if not namespace or action.dest not in namespace: if action.default is not None: From 7a4871db6bf8994087f513a90a88b07f68d04921 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 28 Jan 2025 14:40:12 +0100 Subject: [PATCH 040/110] Fix up unit tests for new structure --- src/auto_archiver/core/module.py | 13 ++--- .../generic_extractor/generic_extractor.py | 1 - .../twitter_api_extractor.py | 9 +--- tests/conftest.py | 33 +++++++++++- tests/databases/test_csv_db.py | 6 +-- tests/enrichers/test_hash_enricher.py | 24 ++++----- 
tests/{archivers => extractors}/__init__.py | 0 .../test_extractor_base.py} | 13 ++--- .../test_generic_extractor.py} | 54 ++++++++++--------- .../test_twitter_api_extractor.py} | 42 ++++++++------- tests/formatters/test_html_formatter.py | 5 +- tests/test_modules.py | 38 +++++++++++++ 12 files changed, 150 insertions(+), 88 deletions(-) rename tests/{archivers => extractors}/__init__.py (100%) rename tests/{archivers/test_archiver_base.py => extractors/test_extractor_base.py} (60%) rename tests/{archivers/test_generic_archiver.py => extractors/test_generic_extractor.py} (85%) rename tests/{archivers/test_twitter_api_archiver.py => extractors/test_twitter_api_extractor.py} (84%) create mode 100644 tests/test_modules.py diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 3ef43e5..18f791b 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -51,7 +51,7 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) -def get_module(module_name: str, additional_paths: List[str] = []): +def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule: if module_name in _LAZY_LOADED_MODULES: return _LAZY_LOADED_MODULES[module_name] @@ -119,19 +119,19 @@ class LazyBaseModule: return self._entry_point @property - def dependencies(self): + def dependencies(self) -> dict: return self.manifest['dependencies'] @property - def configs(self): + def configs(self) -> dict: return self.manifest['configs'] @property - def requires_setup(self): + def requires_setup(self) -> bool: return self.manifest['requires_setup'] @property - def manifest(self): + def manifest(self) -> dict: if self._manifest: return self._manifest # print(f"Loading manifest for module {module_path}") @@ -149,10 +149,11 @@ class LazyBaseModule: self.type = manifest['type'] self._entry_point = manifest['entry_point'] self.description = manifest['description'] + self.version = 
manifest['version'] return manifest - def load(self): + def load(self) -> BaseModule: if self._instance: return self._instance diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 36fb71e..e643c21 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -172,7 +172,6 @@ class GenericExtractor(Extractor): return self.add_metadata(data, info_extractor, url, result) def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]: - dropin_name = dropin_name.lower() if dropin_name == "generic": diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 6a4930a..ede0239 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -14,21 +14,16 @@ from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - def __init__(self, config: dict) -> None: - super().__init__(config) + def setup(self, config: dict) -> None: + super().setup(config) self.api_index = 0 self.apis = [] if len(self.bearer_tokens): self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens]) if self.bearer_token: - self.assert_valid_string("bearer_token") self.apis.append(Api(bearer_token=self.bearer_token)) if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret: - self.assert_valid_string("consumer_key") - self.assert_valid_string("consumer_secret") - self.assert_valid_string("access_token") - self.assert_valid_string("access_secret") 
self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret)) assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results." diff --git a/tests/conftest.py b/tests/conftest.py index 553b573..c2c74f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,9 +3,10 @@ pytest conftest file, for shared fixtures and configuration """ from typing import Dict, Tuple - +import hashlib import pytest from auto_archiver.core.metadata import Metadata +from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES # Test names inserted into this list will be run last. This is useful for expensive/costly tests # that you only want to run if everything else succeeds (e.g. API calls). The order here is important @@ -13,6 +14,36 @@ from auto_archiver.core.metadata import Metadata # format is the name of the module (python file) without the .py extension TESTS_TO_RUN_LAST = ['test_twitter_api_archiver'] +@pytest.fixture +def setup_module(request): + def _setup_module(module_name, config={}): + + if isinstance(module_name, type): + # get the module name: + # if the class does not have a .name, use the name of the parent folder + module_name = module_name.__module__.rsplit(".",2)[-2] + + m = get_module(module_name).load() + m.name = module_name + m.setup({module_name : config}) + + + def cleanup(): + _LAZY_LOADED_MODULES.pop(module_name) + request.addfinalizer(cleanup) + + return m + + return _setup_module + +@pytest.fixture +def check_hash(): + def _check_hash(filename: str, hash: str): + with open(filename, "rb") as f: + buf = f.read() + assert hash == hashlib.sha256(buf).hexdigest() + + return _check_hash @pytest.fixture def make_item(): diff --git a/tests/databases/test_csv_db.py 
b/tests/databases/test_csv_db.py index 989f1e9..afca0d8 100644 --- a/tests/databases/test_csv_db.py +++ b/tests/databases/test_csv_db.py @@ -3,13 +3,11 @@ from auto_archiver.modules.csv_db import CSVDb from auto_archiver.core import Metadata -def test_store_item(tmp_path): +def test_store_item(tmp_path, setup_module): """Tests storing an item in the CSV database""" temp_db = tmp_path / "temp_db.csv" - db = CSVDb({ - "csv_db": {"csv_file": temp_db.as_posix()} - }) + db = setup_module(CSVDb, {"csv_file": temp_db.as_posix()}) item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver") diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 1477cde..63e4824 100644 --- a/tests/enrichers/test_hash_enricher.py +++ b/tests/enrichers/test_hash_enricher.py @@ -2,6 +2,7 @@ import pytest from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media +from auto_archiver.core.module import get_module @pytest.mark.parametrize("algorithm, filename, expected_hash", [ ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), @@ -9,36 +10,29 @@ from auto_archiver.core import Metadata, Media ("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"), ("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6") ]) -def test_calculate_hash(algorithm, filename, expected_hash): +def test_calculate_hash(algorithm, filename, expected_hash, setup_module): # test SHA-256 - he = HashEnricher({"algorithm": algorithm, "chunksize": 1}) + he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 1}) assert he.calculate_hash(filename) == expected_hash -def 
test_default_config_values(): - he = HashEnricher(config={}) +def test_default_config_values(setup_module): + he = setup_module(HashEnricher) assert he.algorithm == "SHA-256" assert he.chunksize == 16000000 -def test_invalid_chunksize(): - with pytest.raises(AssertionError): - he = HashEnricher({"chunksize": "-100"}) - -def test_invalid_algorithm(): - with pytest.raises(AssertionError): - HashEnricher({"algorithm": "SHA-123"}) - def test_config(): # test default config - c = HashEnricher.configs() + c = get_module('hash_enricher').configs assert c["algorithm"]["default"] == "SHA-256" assert c["chunksize"]["default"] == 16000000 assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] assert c["algorithm"]["help"] == "hash algorithm to use" assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB" -def test_hash_media(): - he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1}) +def test_hash_media(setup_module): + + he = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 1}) # generate metadata with two test files m = Metadata().set_url("https://example.com") diff --git a/tests/archivers/__init__.py b/tests/extractors/__init__.py similarity index 100% rename from tests/archivers/__init__.py rename to tests/extractors/__init__.py diff --git a/tests/archivers/test_archiver_base.py b/tests/extractors/test_extractor_base.py similarity index 60% rename from tests/archivers/test_archiver_base.py rename to tests/extractors/test_extractor_base.py index 6223879..bb78794 100644 --- a/tests/archivers/test_archiver_base.py +++ b/tests/extractors/test_extractor_base.py @@ -1,17 +1,18 @@ import pytest from auto_archiver.core.metadata import Metadata -from auto_archiver.base_processors.extractor import Extractor -class TestArchiverBase(object): +from auto_archiver.core.extractor import Extractor +from auto_archiver.core.module import get_module +class 
TestExtractorBase(object): - archiver_class: str = None + extractor_module: str = None config: dict = None @pytest.fixture(autouse=True) - def setup_archiver(self): - assert self.archiver_class is not None, "self.archiver_class must be set on the subclass" + def setup_archiver(self, setup_module): + assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config}) + self.extractor: Extractor = setup_module(self.extractor_module, self.config) def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""): assert test_response is not False diff --git a/tests/archivers/test_generic_archiver.py b/tests/extractors/test_generic_extractor.py similarity index 85% rename from tests/archivers/test_generic_archiver.py rename to tests/extractors/test_generic_extractor.py index b0190b6..c70a51f 100644 --- a/tests/archivers/test_generic_archiver.py +++ b/tests/extractors/test_generic_extractor.py @@ -6,13 +6,15 @@ from os.path import dirname import pytest -from auto_archiver.archivers.generic_extractor.generic_extractor import GenericExtractor -from .test_archiver_base import TestArchiverBase +from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor +from .test_extractor_base import TestExtractorBase -class TestGenericExtractor(TestArchiverBase): - """Tests Base Archiver +class TestGenericExtractor(TestExtractorBase): + """Tests Generic Extractor """ - archiver_class = GenericExtractor + extractor_module = 'generic_extractor' + extractor: GenericExtractor + config = { 'subtitles': False, 'comments': False, @@ -28,12 +30,12 @@ class TestGenericExtractor(TestArchiverBase): def test_load_dropin(self): # test loading dropins that are in the generic_archiver package - package = 
"auto_archiver.archivers.generic_archiver" - assert self.archiver.dropin_for_name("bluesky", package=package) + package = "auto_archiver.modules.generic_extractor" + assert self.extractor.dropin_for_name("bluesky", package=package) # test loading dropings via filepath path = os.path.join(dirname(dirname(__file__)), "data/") - assert self.archiver.dropin_for_name("dropin", additional_paths=[path]) + assert self.extractor.dropin_for_name("dropin", additional_paths=[path]) @@ -51,12 +53,12 @@ class TestGenericExtractor(TestArchiverBase): This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for, and then if and only if all archivers fails, does it fall back to the generic archiver) """ - assert self.archiver.suitable(url) == is_suitable + assert self.extractor.suitable(url) == is_suitable @pytest.mark.download def test_download_tiktok(self, make_item): item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970" @pytest.mark.download @@ -72,7 +74,7 @@ class TestGenericExtractor(TestArchiverBase): It should return 'False' """ item = make_item(url) - result = self.archiver.download(item) + result = self.extractor.download(item) assert not result @@ -80,7 +82,7 @@ class TestGenericExtractor(TestArchiverBase): def test_youtube_download(self, make_item): # url https://www.youtube.com/watch?v=5qap5aO4i9A item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ" assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!" assert result.get('description') == "Buy NEW Keyboard Cat Merch! 
https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/" @@ -91,78 +93,78 @@ class TestGenericExtractor(TestArchiverBase): @pytest.mark.download def test_bluesky_download_multiple_images(self, make_item): item = make_item("https://bsky.app/profile/bellingcat.com/post/3lffjoxcu7k2w") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_bluesky_download_single_image(self, make_item): item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_bluesky_download_no_media(self, make_item): item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_bluesky_download_video(self, make_item): item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_truthsocial_download_video(self, make_item): item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579") - result = self.archiver.download(item) + result = self.extractor.download(item) assert len(result.media) == 1 assert result is not False @pytest.mark.download def test_truthsocial_download_no_media(self, make_item): item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_truthsocial_download_poll(self, make_item): item = 
make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098") - result = self.archiver.download(item) + result = self.extractor.download(item) assert result is not False @pytest.mark.download def test_truthsocial_download_single_image(self, make_item): item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006") - result = self.archiver.download(item) + result = self.extractor.download(item) assert len(result.media) == 1 assert result is not False @pytest.mark.download def test_truthsocial_download_multiple_images(self, make_item): item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135") - result = self.archiver.download(item) + result = self.extractor.download(item) assert len(result.media) == 3 @pytest.mark.download def test_twitter_download_nonexistend_tweet(self, make_item): # this tweet does not exist url = "https://x.com/Bellingcat/status/17197025860711058" - response = self.archiver.download(make_item(url)) + response = self.extractor.download(make_item(url)) assert not response @pytest.mark.download def test_twitter_download_malformed_tweetid(self, make_item): # this tweet does not exist url = "https://x.com/Bellingcat/status/1719702a586071100058" - response = self.archiver.download(make_item(url)) + response = self.extractor.download(make_item(url)) assert not response @pytest.mark.download def test_twitter_download_tweet_no_media(self, make_item): item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w") - post = self.archiver.download(item) + post = self.extractor.download(item) self.assertValidResponseMetadata( post, @@ -174,7 +176,7 @@ class TestGenericExtractor(TestArchiverBase): @pytest.mark.download def test_twitter_download_video(self, make_item): url = "https://x.com/bellingcat/status/1871552600346415571" - post = self.archiver.download(make_item(url)) + post = self.extractor.download(make_item(url)) self.assertValidResponseMetadata( post, 
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services", @@ -193,7 +195,7 @@ class TestGenericExtractor(TestArchiverBase): """Download tweets with sensitive media""" - post = self.archiver.download(make_item(url)) + post = self.extractor.download(make_item(url)) self.assertValidResponseMetadata( post, title, diff --git a/tests/archivers/test_twitter_api_archiver.py b/tests/extractors/test_twitter_api_extractor.py similarity index 84% rename from tests/archivers/test_twitter_api_archiver.py rename to tests/extractors/test_twitter_api_extractor.py index a95f2c7..d9a8eb0 100644 --- a/tests/archivers/test_twitter_api_archiver.py +++ b/tests/extractors/test_twitter_api_extractor.py @@ -1,17 +1,18 @@ import os import datetime - +import hashlib import pytest from pytwitter.models.media import MediaVariant -from .test_archiver_base import TestArchiverBase -from auto_archiver.archivers import TwitterApiArchiver +from .test_extractor_base import TestExtractorBase +from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor @pytest.mark.incremental -class TestTwitterApiArchiver(TestArchiverBase): +class TestTwitterApiExtractor(TestExtractorBase): + + extractor_module = 'twitter_api_extractor' - archiver_class = TwitterApiArchiver config = { "bearer_tokens": [], "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN", "TEST_KEY"), @@ -30,7 +31,7 @@ class TestTwitterApiArchiver(TestArchiverBase): ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs ]) def test_sanitize_url(self, url, expected): - assert expected == self.archiver.sanitize_url(url) + assert expected == self.extractor.sanitize_url(url) 
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [ ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"), @@ -39,7 +40,7 @@ class TestTwitterApiArchiver(TestArchiverBase): ]) def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid): - username, tweet_id = self.archiver.get_username_tweet_id(url) + username, tweet_id = self.extractor.get_username_tweet_id(url) assert exptected_username == username assert exptected_tweetid == tweet_id @@ -50,7 +51,7 @@ class TestTwitterApiArchiver(TestArchiverBase): MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'), MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12') ] - chosen_variant = self.archiver.choose_variant(variant_list) + chosen_variant = self.extractor.choose_variant(variant_list) assert chosen_variant == variant_list[3] @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided") @@ -58,7 +59,7 @@ class TestTwitterApiArchiver(TestArchiverBase): def test_download_nonexistent_tweet(self, make_item): # this tweet does not exist url = "https://x.com/Bellingcat/status/17197025860711058" - response = self.archiver.download(make_item(url)) + response = self.extractor.download(make_item(url)) assert not response @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided") @@ -66,7 +67,7 @@ class TestTwitterApiArchiver(TestArchiverBase): def test_download_malformed_tweetid(self, make_item): # this tweet does not exist url = "https://x.com/Bellingcat/status/1719702586071100058" - response = self.archiver.download(make_item(url)) + response = self.extractor.download(make_item(url)) assert not response 
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided") @@ -74,7 +75,7 @@ class TestTwitterApiArchiver(TestArchiverBase): def test_download_tweet_no_media(self, make_item): item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w") - post = self.archiver.download(item) + post = self.extractor.download(item) self.assertValidResponseMetadata( post, @@ -87,7 +88,7 @@ class TestTwitterApiArchiver(TestArchiverBase): @pytest.mark.download def test_download_video(self, make_item): url = "https://x.com/bellingcat/status/1871552600346415571" - post = self.archiver.download(make_item(url)) + post = self.extractor.download(make_item(url)) self.assertValidResponseMetadata( post, "This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8", @@ -95,22 +96,23 @@ class TestTwitterApiArchiver(TestArchiverBase): ) @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided") - @pytest.mark.parametrize("url, title, timestamp, image_src", [ - ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"), - ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"), - ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", 
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"), - ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"), + @pytest.mark.parametrize("url, title, timestamp", [ + ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)), + ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)), + ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)), + ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)), ]) @pytest.mark.download - def test_download_sensitive_media(self, url, title, timestamp, image_src, make_item): + def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item): """Download tweets with sensitive media""" - post = self.archiver.download(make_item(url)) + post = self.extractor.download(make_item(url)) self.assertValidResponseMetadata( post, title, timestamp ) assert len(post.media) == 1 - assert post.media[0].get('src') == image_src \ No newline at end of file + # check the SHA1 hash (quick) of the media, to make sure it's valid + check_hash(post.media[0].filename, 
"3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733") \ No newline at end of file diff --git a/tests/formatters/test_html_formatter.py b/tests/formatters/test_html_formatter.py index 2719033..60abaa7 100644 --- a/tests/formatters/test_html_formatter.py +++ b/tests/formatters/test_html_formatter.py @@ -2,8 +2,9 @@ from auto_archiver.modules.html_formatter import HtmlFormatter from auto_archiver.core import Metadata, Media -def test_format(): - formatter = HtmlFormatter({}) +def test_format(setup_module): + formatter = setup_module(HtmlFormatter) + metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com') final_media = formatter.format(metadata) diff --git a/tests/test_modules.py b/tests/test_modules.py new file mode 100644 index 0000000..619906b --- /dev/null +++ b/tests/test_modules.py @@ -0,0 +1,38 @@ +import pytest +from auto_archiver.core.module import get_module, BaseModule, LazyBaseModule + +@pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) +def test_load_modules(module_name): + # test that specific modules can be loaded + module = get_module(module_name) + assert module is not None + assert isinstance(module, LazyBaseModule) + assert module.name == module_name + + loaded_module = module.load() + assert isinstance(loaded_module, BaseModule) + + # test module setup + loaded_module.setup(config={}) + + assert loaded_module.config == {} + + +@pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) +def test_lazy_base_module(module_name): + lazy_module = get_module(module_name) + + assert lazy_module is not None + assert isinstance(lazy_module, LazyBaseModule) + assert lazy_module.name == module_name + assert len(lazy_module.display_name) > 0 + assert module_name in lazy_module.path + assert isinstance(lazy_module.manifest, dict) + + assert lazy_module.requires_setup == 
lazy_module.manifest.get("requires_setup", True) + assert len(lazy_module.entry_point) > 0 + assert len(lazy_module.configs) > 0 + assert len(lazy_module.description) > 0 + assert len(lazy_module.version) > 0 + + From dcd5576f297e19b65d698594e8037c2edc82ed4c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 29 Jan 2025 00:10:40 +0100 Subject: [PATCH 041/110] set metadata enricher to requires_setup=True (requires exiftool which isn't installed by default on most machines) --- src/auto_archiver/modules/metadata_enricher/__manifest__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/metadata_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_enricher/__manifest__.py index bfc9b75..50064e9 100644 --- a/src/auto_archiver/modules/metadata_enricher/__manifest__.py +++ b/src/auto_archiver/modules/metadata_enricher/__manifest__.py @@ -1,11 +1,10 @@ { "name": "Media Metadata Enricher", "type": ["enricher"], - "requires_setup": False, + "requires_setup": True, "external_dependencies": { "python": ["loguru"], "bin": ["exiftool"] - }, "description": """ Extracts metadata information from files using ExifTool. From 3d37c494aaadea59169cd563011216c444d83569 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 29 Jan 2025 18:42:12 +0100 Subject: [PATCH 042/110] Tidy ups + unit tests: 1. Allow loading modules from --module_paths=/extra/path/here 2. Improved unit tests for module loading 3. 
Further small tidy ups/clean ups --- src/auto_archiver/core/config.py | 4 +- src/auto_archiver/core/module.py | 161 ++++++++++++++---- src/auto_archiver/core/orchestrator.py | 29 ++-- .../modules/hash_enricher/hash_enricher.py | 10 -- .../modules/html_formatter/html_formatter.py | 4 +- src/auto_archiver/utils/misc.py | 5 +- tests/conftest.py | 5 +- tests/data/example_module/__init__.py | 1 + tests/data/example_module/__manifest__.py | 10 ++ tests/data/example_module/example_module.py | 4 + tests/enrichers/test_hash_enricher.py | 6 +- tests/extractors/test_extractor_base.py | 3 +- tests/test_modules.py | 55 +++++- 13 files changed, 216 insertions(+), 81 deletions(-) create mode 100644 tests/data/example_module/__init__.py create mode 100644 tests/data/example_module/__manifest__.py create mode 100644 tests/data/example_module/example_module.py diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 529e1c2..46dbe28 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from loguru import logger from copy import deepcopy -from .module import MODULE_TYPES +from .module import BaseModule from typing import Any, List, Type, Tuple @@ -21,7 +21,7 @@ EMPTY_CONFIG = yaml.load(""" # Auto Archiver Configuration # Steps are the modules that will be run in the order they are defined -steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \ +steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \ """ # Global configuration diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 18f791b..0888378 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -16,33 +16,53 @@ from importlib.util import find_spec import os from os.path import join, dirname from loguru import logger +import auto_archiver _LAZY_LOADED_MODULES = {} -MODULE_TYPES = [ 
- 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' -] - MANIFEST_FILE = "__manifest__.py" -_DEFAULT_MANIFEST = { - 'name': '', - 'author': 'Bellingcat', - 'type': [], - 'requires_setup': True, - 'description': '', - 'dependencies': {}, - 'entry_point': '', - 'version': '1.0', - 'configs': {} -} class BaseModule(ABC): + """ + Base module class. All modules should inherit from this class. + + The exact methods a class implements will depend on the type of module it is, + however all modules have a .setup(config: dict) method to run any setup code + (e.g. logging in to a site, spinning up a browser etc.) + + See BaseModule.MODULE_TYPES for the types of modules you can create, noting that + a subclass can be of multiple types. For example, a module that extracts data from + a website and stores it in a database would be both an 'extractor' and a 'database' module. + + Each module is a python package, and should have a __manifest__.py file in the + same directory as the module file. The __manifest__.py specifies the module information + like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the + default manifest structure. + + """ + + MODULE_TYPES = [ + 'feeder', + 'extractor', + 'enricher', + 'database', + 'storage', + 'formatter' + ] + + _DEFAULT_MANIFEST = { + 'name': '', # the display name of the module + 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! + 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES + 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare + 'description': '', # a description of the module + 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format + 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. 
This can be left blank to use the default entry point of module_name::ModuleName + 'version': '1.0', # the version of the module + 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line +} + config: dict name: str @@ -51,15 +71,51 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) -def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule: + def repr(self): + return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" + + +def setup_paths(paths: list[str]) -> None: + """ + Sets up the paths for the modules to be loaded from + + This is necessary for the modules to be imported correctly + + """ + for path in paths: + # see odoo/module/module.py -> initialize_sys_path + if path not in auto_archiver.modules.__path__: + auto_archiver.modules.__path__.append(path) + + # sort based on the length of the path, so that the longest path is last in the list + auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) + + +def get_module(module_name: str, config: dict) -> BaseModule: + """ + Gets and sets up a module using the provided config + + This will actually load and instantiate the module, and load all its dependencies (i.e. 
not lazy) + + """ + return get_module_lazy(module_name).load(config) + +def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: + """ + Lazily loads a module, returning a LazyBaseModule + + This has all the information about the module, but does not load the module itself or its dependencies + + To load an actual module, call .setup() on a laz module + + """ if module_name in _LAZY_LOADED_MODULES: return _LAZY_LOADED_MODULES[module_name] - module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0] - _LAZY_LOADED_MODULES[module_name] = module + module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0] return module -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]: +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: # search through all valid 'modules' paths. 
Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -67,10 +123,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] if os.path.isfile(join(module_path, MANIFEST_FILE)): return True - default_path = [join(dirname(dirname((__file__))), "modules")] all_modules = [] - for module_folder in default_path + additional_paths: + for module_folder in auto_archiver.modules.__path__: # walk through each module in module_folder and check if it has a valid manifest try: possible_modules = os.listdir(module_folder) @@ -85,8 +140,12 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] possible_module_path = join(module_folder, possible_module) if not is_really_module(possible_module_path): continue - - all_modules.append(LazyBaseModule(possible_module, possible_module_path)) + if _LAZY_LOADED_MODULES.get(possible_module): + continue + lazy_module = LazyBaseModule(possible_module, possible_module_path) + _LAZY_LOADED_MODULES[possible_module] = lazy_module + + all_modules.append(lazy_module) if not suppress_warnings: for module in limit_to_modules: @@ -97,8 +156,14 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] @dataclass class LazyBaseModule: + + """ + A lazy module class, which only loads the manifest and does not load the module itself. + + This is useful for getting information about a module without actually loading it. 
+ + """ name: str - display_name: str type: list description: str path: str @@ -129,6 +194,10 @@ class LazyBaseModule: @property def requires_setup(self) -> bool: return self.manifest['requires_setup'] + + @property + def display_name(self) -> str: + return self.manifest['name'] @property def manifest(self) -> dict: @@ -136,7 +205,7 @@ class LazyBaseModule: return self._manifest # print(f"Loading manifest for module {module_path}") # load the manifest file - manifest = copy.deepcopy(_DEFAULT_MANIFEST) + manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST) with open(join(self.path, MANIFEST_FILE)) as f: try: @@ -145,7 +214,6 @@ class LazyBaseModule: logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") self._manifest = manifest - self.display_name = manifest['name'] self.type = manifest['type'] self._entry_point = manifest['entry_point'] self.description = manifest['description'] @@ -153,7 +221,7 @@ class LazyBaseModule: return manifest - def load(self) -> BaseModule: + def load(self, config) -> BaseModule: if self._instance: return self._instance @@ -162,10 +230,27 @@ class LazyBaseModule: def check_deps(deps, check): for dep in deps: if not check(dep): - logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? 
See the README for more information.") exit(1) - check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) + def check_python_dep(dep): + # first check if it's a module: + try: + m = get_module_lazy(dep, suppress_warnings=True) + try: + # we must now load this module and set it up with the config + m.load(config) + return True + except: + logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'") + return False + except IndexError: + # not a module, continue + pass + + return find_spec(dep) + + check_deps(self.dependencies.get('python', []), check_python_dep) check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) @@ -184,9 +269,8 @@ class LazyBaseModule: sub_qualname = f'{qualname}.{file_name}' __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) - # finally, get the class instance - instance = getattr(sys.modules[sub_qualname], class_name)() + instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() if not getattr(instance, 'name', None): instance.name = self.name @@ -194,6 +278,11 @@ class LazyBaseModule: instance.display_name = self.display_name self._instance = instance + + # merge the default config with the user config + default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) + config[self.name] = default_config | config.get(self.name, {}) + instance.setup(config) return instance def __repr__(self): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 16cf9c4..dc15809 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,7 +19,7 @@ from .context import ArchivingContext from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser -from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module +from .module 
import available_modules, LazyBaseModule, get_module, setup_paths from . import validators from .module import BaseModule @@ -57,6 +57,7 @@ class ArchivingOrchestrator: # override the default 'help' so we can inject all the configs and show those parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) + parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction) self.basic_parser = parser @@ -72,19 +73,21 @@ class ArchivingOrchestrator: # if full, we'll load all modules # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' # but should we add them? Or should we just add them to the 'complete' parser? + if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? 
enabled_modules = [] - for module_type in MODULE_TYPES: + for module_type in BaseModule.MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' - for module_type in MODULE_TYPES: + for module_type in BaseModule.MODULE_TYPES: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser) + avail_modules = available_modules(with_manifest=True, limit_to_modules=list(dict.fromkeys(enabled_modules)), suppress_warnings=True) + self.add_module_args(avail_modules, parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -135,10 +138,7 @@ class ArchivingOrchestrator: parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) - # additional modules - parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction) - - def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None): + def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: if not modules: modules = available_modules(with_manifest=True) @@ -173,7 +173,7 @@ class ArchivingOrchestrator: arg = group.add_argument(f"--{module.name}.{name}", **kwargs) arg.should_store = should_store - def show_help(self): + def show_help(self, basic_config: dict): # for the help 
message, we want to load *all* possible modules and show the help # add configs as arg parser arguments @@ -198,7 +198,7 @@ class ArchivingOrchestrator: """ invalid_modules = [] - for module_type in MODULE_TYPES: + for module_type in BaseModule.MODULE_TYPES: step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] @@ -216,9 +216,8 @@ class ArchivingOrchestrator: for module in modules_to_load: if module in invalid_modules: continue - loaded_module: BaseModule = get_module(module).load() try: - loaded_module.setup(self.config) + loaded_module: BaseModule = get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") if module_type == 'extractor': @@ -249,9 +248,11 @@ class ArchivingOrchestrator: # load the config file to get the list of enabled items basic_config, unused_args = self.basic_parser.parse_known_args() + setup_paths(basic_config.module_paths) + # if help flag was called, then show the help if basic_config.help: - self.show_help() + self.show_help(basic_config) # load the config file yaml_config = {} @@ -268,7 +269,7 @@ class ArchivingOrchestrator: self.install_modules() # log out the modules that were loaded - for module_type in MODULE_TYPES: + for module_type in BaseModule.MODULE_TYPES: logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"])) for item in self.feed(): diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 827b65f..94b5dce 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -19,16 +19,6 @@ class HashEnricher(Enricher): Calculates hashes for Media instances """ - def __init__(self, config: dict = None): - """ - Initialize the HashEnricher with a configuration dictionary. 
- """ - super().__init__() - # TODO set these from the manifest? - # Set default values - self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256" - self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7) - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index e6e5e58..570fc6f 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -12,7 +12,7 @@ from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.core import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str - +from auto_archiver.core.module import get_module @dataclass class HtmlFormatter(Formatter): @@ -53,7 +53,7 @@ class HtmlFormatter(Formatter): outf.write(content) final_media = Media(filename=html_path, _mimetype="text/html") - he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) + he = get_module('hash_enricher', self.config) if len(hd := he.calculate_hash(final_media.filename)): final_media.set("hash", f"{he.algorithm}:{hd}") diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index e985e3e..300a710 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -1,7 +1,10 @@ -import os, json, requests + +import os +import json import uuid from datetime import datetime +import requests from loguru import logger diff --git a/tests/conftest.py b/tests/conftest.py index c2c74f2..af0fd6d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,10 +23,7 @@ def setup_module(request): # if the class does not have a .name, use the name of the parent folder module_name = module_name.__module__.rsplit(".",2)[-2] - m = 
get_module(module_name).load() - m.name = module_name - m.setup({module_name : config}) - + m = get_module(module_name, {module_name: config}) def cleanup(): _LAZY_LOADED_MODULES.pop(module_name) diff --git a/tests/data/example_module/__init__.py b/tests/data/example_module/__init__.py new file mode 100644 index 0000000..560a9b9 --- /dev/null +++ b/tests/data/example_module/__init__.py @@ -0,0 +1 @@ +from .example_module import ExampleModule \ No newline at end of file diff --git a/tests/data/example_module/__manifest__.py b/tests/data/example_module/__manifest__.py new file mode 100644 index 0000000..ca3a678 --- /dev/null +++ b/tests/data/example_module/__manifest__.py @@ -0,0 +1,10 @@ +{ + "name": "Example Module", + "type": ["extractor"], + "requires_setup": False, + "external_dependencies": {"python": ["loguru"] + }, + "configs": { + "csv_file": {"default": "db.csv", "help": "CSV file name"} + }, +} \ No newline at end of file diff --git a/tests/data/example_module/example_module.py b/tests/data/example_module/example_module.py new file mode 100644 index 0000000..b752743 --- /dev/null +++ b/tests/data/example_module/example_module.py @@ -0,0 +1,4 @@ +from auto_archiver.core.extractor import Extractor + +class ExampleModule(Extractor): + pass \ No newline at end of file diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 63e4824..4b61fc2 100644 --- a/tests/enrichers/test_hash_enricher.py +++ b/tests/enrichers/test_hash_enricher.py @@ -2,7 +2,7 @@ import pytest from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media -from auto_archiver.core.module import get_module +from auto_archiver.core.module import get_module_lazy @pytest.mark.parametrize("algorithm, filename, expected_hash", [ ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), @@ -12,7 +12,7 @@ from auto_archiver.core.module import get_module ]) def 
test_calculate_hash(algorithm, filename, expected_hash, setup_module): # test SHA-256 - he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 1}) + he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 100}) assert he.calculate_hash(filename) == expected_hash def test_default_config_values(setup_module): @@ -22,7 +22,7 @@ def test_default_config_values(setup_module): def test_config(): # test default config - c = get_module('hash_enricher').configs + c = get_module_lazy('hash_enricher').configs assert c["algorithm"]["default"] == "SHA-256" assert c["chunksize"]["default"] == 16000000 assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] diff --git a/tests/extractors/test_extractor_base.py b/tests/extractors/test_extractor_base.py index bb78794..f6be70b 100644 --- a/tests/extractors/test_extractor_base.py +++ b/tests/extractors/test_extractor_base.py @@ -2,7 +2,7 @@ import pytest from auto_archiver.core.metadata import Metadata from auto_archiver.core.extractor import Extractor -from auto_archiver.core.module import get_module + class TestExtractorBase(object): extractor_module: str = None @@ -12,6 +12,7 @@ class TestExtractorBase(object): def setup_archiver(self, setup_module): assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" + self.extractor: Extractor = setup_module(self.extractor_module, self.config) def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""): diff --git a/tests/test_modules.py b/tests/test_modules.py index 619906b..109bc52 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,26 +1,65 @@ +import sys import pytest -from auto_archiver.core.module import get_module, BaseModule, LazyBaseModule +from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES +from 
auto_archiver.core.extractor import Extractor + +@pytest.fixture +def example_module(): + yield get_module_lazy("example_module", ["tests/data/"]) + # cleanup + _LAZY_LOADED_MODULES.pop("example_module") + +def test_get_module_lazy(example_module): + assert example_module.name == "example_module" + assert example_module.display_name == "Example Module" + + assert example_module.manifest is not None + + +def test_load_module_abc_check(example_module): + + # example_module is an extractor but doesn't have the 'download' method, should raise an ABC error + with pytest.raises(TypeError) as load_error: + example_module.load({}) + assert "Can't instantiate abstract class ExampleModule with abstract method download" in str(load_error.value) + + +def test_load_module(example_module, monkeypatch): + # hack - remove the 'download' method from the required methods of Extractor + monkeypatch.setattr(Extractor, "__abstractmethods__", set()) + + # setup the module, and check that config is set to the default values + loaded_module = example_module.load({}) + assert loaded_module is not None + assert isinstance(loaded_module, BaseModule) + assert loaded_module.name == "example_module" + assert loaded_module.display_name == "Example Module" + assert loaded_module.config["example_module"] == {"csv_file" : "db.csv"} + + # check that the vlaue is set on the module itself + assert loaded_module.csv_file == "db.csv" @pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_load_modules(module_name): # test that specific modules can be loaded - module = get_module(module_name) + module = get_module_lazy(module_name) assert module is not None assert isinstance(module, LazyBaseModule) assert module.name == module_name - loaded_module = module.load() + loaded_module = module.load({}) assert isinstance(loaded_module, BaseModule) + assert loaded_module.name == module_name + assert loaded_module.display_name == 
module.display_name - # test module setup - loaded_module.setup(config={}) - - assert loaded_module.config == {} + # check that default settings are applied + default_config = module.configs + assert loaded_module.name in loaded_module.config.keys() @pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_lazy_base_module(module_name): - lazy_module = get_module(module_name) + lazy_module = get_module_lazy(module_name) assert lazy_module is not None assert isinstance(lazy_module, LazyBaseModule) From 00a7018f365b651b9b36da93319b94c4c71a375e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 29 Jan 2025 19:25:22 +0100 Subject: [PATCH 043/110] Fix up dependency checking (use 'dependencies' instead of 'external_dependencies' -> simpler/easier to remember --- src/auto_archiver/core/module.py | 4 ++ .../modules/api_db/__manifest__.py | 2 +- .../modules/atlos/__manifest__.py | 2 +- .../modules/atlos_db/__manifest__.py | 2 +- .../modules/atlos_feeder/__manifest__.py | 2 +- .../modules/cli_feeder/__manifest__.py | 2 +- .../modules/console_db/__manifest__.py | 2 +- .../modules/csv_db/__manifest__.py | 2 +- .../modules/csv_feeder/__manifest__.py | 2 +- .../modules/gdrive_storage/__manifest__.py | 2 +- .../modules/gsheet_db/__manifest__.py | 2 +- .../modules/gsheet_feeder/__manifest__.py | 2 +- .../modules/hash_enricher/__manifest__.py | 2 +- .../modules/html_formatter/__manifest__.py | 4 +- .../modules/html_formatter/html_formatter.py | 1 + .../instagram_api_extractor/__manifest__.py | 2 +- .../instagram_extractor/__manifest__.py | 2 +- .../instagram_tbot_extractor/__manifest__.py | 2 +- .../modules/local_storage/__manifest__.py | 2 +- .../modules/meta_enricher/__manifest__.py | 2 +- .../modules/metadata_enricher/__manifest__.py | 2 +- .../modules/mute_formatter/__manifest__.py | 2 +- .../modules/pdq_hash_enricher/__manifest__.py | 2 +- .../modules/s3_storage/__manifest__.py | 2 +- 
.../screenshot_enricher/__manifest__.py | 2 +- .../modules/ssl_enricher/__manifest__.py | 2 +- .../telegram_extractor/__manifest__.py | 2 +- .../telethon_extractor/__manifest__.py | 2 +- .../thumbnail_enricher/__manifest__.py | 2 +- .../timestamping_enricher/__manifest__.py | 2 +- .../twitter_api_extractor/__manifest__.py | 2 +- .../modules/vk_extractor/__manifest__.py | 2 +- .../modules/wacz_enricher/__manifest__.py | 2 +- .../modules/wayback_enricher/__manifest__.py | 2 +- .../modules/whisper_enricher/__manifest__.py | 2 +- tests/data/example_module/__manifest__.py | 2 +- tests/data/example_module/example_module.py | 4 +- tests/test_modules.py | 51 ++++++++++++++----- 38 files changed, 81 insertions(+), 49 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 0888378..cb380cf 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -143,6 +143,7 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [] if _LAZY_LOADED_MODULES.get(possible_module): continue lazy_module = LazyBaseModule(possible_module, possible_module_path) + _LAZY_LOADED_MODULES[possible_module] = lazy_module all_modules.append(lazy_module) @@ -229,6 +230,9 @@ class LazyBaseModule: # check external dependencies are installed def check_deps(deps, check): for dep in deps: + if not len(dep): + # clear out any empty strings that a user may have erroneously added + continue if not check(dep): logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? 
See the README for more information.") exit(1) diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index c89165f..d22fa59 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -3,7 +3,7 @@ "type": ["database"], "entry_point": "api_db:AAApiDb", "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["requests", "loguru"], }, diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py index 459fefe..7ba2f72 100644 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -2,7 +2,7 @@ "name": "atlos_storage", "type": ["storage"], "requires_setup": True, - "external_dependencies": {"python": ["loguru", "requests"], "bin": [""]}, + "dependencies": {"python": ["loguru", "requests"], "bin": [""]}, "configs": { "path_generator": { "default": "url", diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 42ce560..8f9473f 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -3,7 +3,7 @@ "type": ["database"], "entry_point": "atlos_db:AtlosDb", "requires_setup": True, - "external_dependencies": + "dependencies": {"python": ["loguru", ""], "bin": [""]}, diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index 0d90c8b..f2772f2 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -2,7 +2,7 @@ "name": "Atlos Feeder", "type": ["feeder"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "requests"], }, "configs": { diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py 
b/src/auto_archiver/modules/cli_feeder/__manifest__.py index fe784c3..cf5c1b7 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -2,7 +2,7 @@ "name": "CLI Feeder", "type": ["feeder"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], }, 'entry_point': 'cli_feeder::CLIFeeder', diff --git a/src/auto_archiver/modules/console_db/__manifest__.py b/src/auto_archiver/modules/console_db/__manifest__.py index cd40496..a1d0d48 100644 --- a/src/auto_archiver/modules/console_db/__manifest__.py +++ b/src/auto_archiver/modules/console_db/__manifest__.py @@ -2,7 +2,7 @@ "name": "Console Database", "type": ["database"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], }, "description": """ diff --git a/src/auto_archiver/modules/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py index 3131188..507ce14 100644 --- a/src/auto_archiver/modules/csv_db/__manifest__.py +++ b/src/auto_archiver/modules/csv_db/__manifest__.py @@ -2,7 +2,7 @@ "name": "CSV Database", "type": ["database"], "requires_setup": False, - "external_dependencies": {"python": ["loguru"] + "dependencies": {"python": ["loguru"] }, 'entry_point': 'csv_db::CSVDb', "configs": { diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index 81c4dcd..b062ee6 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -2,7 +2,7 @@ "name": "CSV Feeder", "type": ["feeder"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], "bin": [""] }, diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index b81b717..e24f21b 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ 
b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -2,7 +2,7 @@ "name": "Google Drive Storage", "type": ["storage"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": [ "loguru", "google-api-python-client", diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index f2f1c35..f926adc 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -3,7 +3,7 @@ "type": ["database"], "entry_point": "gsheet_db::GsheetsDb", "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "gspread", "python-slugify"], }, "configs": { diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 3d9cb08..1c9acab 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -3,7 +3,7 @@ "type": ["feeder"], "entry_point": "gsheet_feeder::GsheetsFeeder", "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "gspread", "python-slugify"], }, "configs": { diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index f306808..c7a023e 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Hash Enricher", "type": ["enricher"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], }, "configs": { diff --git a/src/auto_archiver/modules/html_formatter/__manifest__.py b/src/auto_archiver/modules/html_formatter/__manifest__.py index 259a3d1..ec19cf8 100644 --- a/src/auto_archiver/modules/html_formatter/__manifest__.py +++ b/src/auto_archiver/modules/html_formatter/__manifest__.py @@ -2,8 +2,8 @@ 
"name": "HTML Formatter", "type": ["formatter"], "requires_setup": False, - "external_dependencies": { - "python": ["loguru", "jinja2"], + "dependencies": { + "python": ["hash_enricher", "loguru", "jinja2"], "bin": [""] }, "configs": { diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 570fc6f..8f006e0 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -53,6 +53,7 @@ class HtmlFormatter(Formatter): outf.write(content) final_media = Media(filename=html_path, _mimetype="text/html") + # get the already instantiated hash_enricher module he = get_module('hash_enricher', self.config) if len(hd := he.calculate_hash(final_media.filename)): final_media.set("hash", f"{he.algorithm}:{hd}") diff --git a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index cdaf635..57f378e 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Instagram API Extractor", "type": ["extractor"], - "external_dependencies": + "dependencies": {"python": ["requests", "loguru", "retrying", diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index f1857c2..6e7518e 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Instagram Extractor", "type": ["extractor"], - "external_dependencies": { + "dependencies": { "python": [ "instaloader", "loguru", diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py index 95d6808..8a1f74f 100644 --- 
a/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Instagram Telegram Bot Extractor", "type": ["extractor"], - "external_dependencies": {"python": ["loguru", + "dependencies": {"python": ["loguru", "telethon",], }, "requires_setup": True, diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index ce00953..6d9cf53 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -2,7 +2,7 @@ "name": "Local Storage", "type": ["storage"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], }, "configs": { diff --git a/src/auto_archiver/modules/meta_enricher/__manifest__.py b/src/auto_archiver/modules/meta_enricher/__manifest__.py index 10acf71..37c9201 100644 --- a/src/auto_archiver/modules/meta_enricher/__manifest__.py +++ b/src/auto_archiver/modules/meta_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Archive Metadata Enricher", "type": ["enricher"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru"], }, "description": """ diff --git a/src/auto_archiver/modules/metadata_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_enricher/__manifest__.py index 50064e9..f8ccdc6 100644 --- a/src/auto_archiver/modules/metadata_enricher/__manifest__.py +++ b/src/auto_archiver/modules/metadata_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Media Metadata Enricher", "type": ["enricher"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru"], "bin": ["exiftool"] }, diff --git a/src/auto_archiver/modules/mute_formatter/__manifest__.py b/src/auto_archiver/modules/mute_formatter/__manifest__.py index 77f2784..e81dc4c 100644 --- a/src/auto_archiver/modules/mute_formatter/__manifest__.py +++ 
b/src/auto_archiver/modules/mute_formatter/__manifest__.py @@ -2,7 +2,7 @@ "name": "Mute Formatter", "type": ["formatter"], "requires_setup": True, - "external_dependencies": { + "dependencies": { }, "description": """ Default formatter. """, diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py index 7b418b1..6353d12 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "PDQ Hash Enricher", "type": ["enricher"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru", "pdqhash", "numpy", "Pillow"], }, "description": """ diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index 811c703..16ac7bd 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -2,7 +2,7 @@ "name": "S3 Storage", "type": ["storage"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["boto3", "loguru"], }, "configs": { diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py index c1a30e7..52842c9 100644 --- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py +++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Screenshot Enricher", "type": ["enricher"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "selenium"], "bin": ["chromedriver"] }, diff --git a/src/auto_archiver/modules/ssl_enricher/__manifest__.py b/src/auto_archiver/modules/ssl_enricher/__manifest__.py index ccde957..0fb7cd9 100644 --- a/src/auto_archiver/modules/ssl_enricher/__manifest__.py +++ b/src/auto_archiver/modules/ssl_enricher/__manifest__.py @@ -2,7 
+2,7 @@ "name": "SSL Certificate Enricher", "type": ["enricher"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru", "python-slugify"], }, 'entry_point': 'ssl_enricher::SSLEnricher', diff --git a/src/auto_archiver/modules/telegram_extractor/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py index 86b5e0f..e1c49c2 100644 --- a/src/auto_archiver/modules/telegram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py @@ -2,7 +2,7 @@ "name": "Telegram Extractor", "type": ["extractor"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": [ "requests", "bs4", diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 2cf1e42..6b37654 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -2,7 +2,7 @@ "name": "telethon_extractor", "type": ["extractor"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["telethon", "loguru", "tqdm", diff --git a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py index 2b0f167..bd7836d 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py +++ b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Thumbnail Enricher", "type": ["enricher"], "requires_setup": False, - "external_dependencies": { + "dependencies": { "python": ["loguru", "ffmpeg-python"], "bin": ["ffmpeg"] }, diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index 496d211..6ad9c57 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ 
b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Timestamping Enricher", "type": ["enricher"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": [ "loguru", "slugify", diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 02d0d6c..05d1ac0 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -2,7 +2,7 @@ "name": "Twitter API Extractor", "type": ["extractor"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["requests", "loguru", "pytwitter", diff --git a/src/auto_archiver/modules/vk_extractor/__manifest__.py b/src/auto_archiver/modules/vk_extractor/__manifest__.py index bdcaf99..116b430 100644 --- a/src/auto_archiver/modules/vk_extractor/__manifest__.py +++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py @@ -3,7 +3,7 @@ "type": ["extractor"], "requires_setup": True, "depends": ["core", "utils"], - "external_dependencies": { + "dependencies": { "python": ["loguru", "vk_url_scraper"], }, diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_enricher/__manifest__.py index 07983d9..bb9d290 100644 --- a/src/auto_archiver/modules/wacz_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "WACZ Enricher", "type": ["enricher", "archiver"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": [ "loguru", "jsonlines", diff --git a/src/auto_archiver/modules/wayback_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_enricher/__manifest__.py index bff10af..5d1fe25 100644 --- a/src/auto_archiver/modules/wayback_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wayback_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Wayback 
Machine Enricher", "type": ["enricher", "archiver"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "requests"], }, "entry_point": "wayback_enricher::WaybackExtractorEnricher", diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 25eae25..0adf9ff 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -2,7 +2,7 @@ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, - "external_dependencies": { + "dependencies": { "python": ["loguru", "requests"], }, "configs": { diff --git a/tests/data/example_module/__manifest__.py b/tests/data/example_module/__manifest__.py index ca3a678..19a85f9 100644 --- a/tests/data/example_module/__manifest__.py +++ b/tests/data/example_module/__manifest__.py @@ -2,7 +2,7 @@ "name": "Example Module", "type": ["extractor"], "requires_setup": False, - "external_dependencies": {"python": ["loguru"] + "dependencies": {"python": ["loguru"] }, "configs": { "csv_file": {"default": "db.csv", "help": "CSV file name"} diff --git a/tests/data/example_module/example_module.py b/tests/data/example_module/example_module.py index b752743..bce8ba4 100644 --- a/tests/data/example_module/example_module.py +++ b/tests/data/example_module/example_module.py @@ -1,4 +1,4 @@ from auto_archiver.core.extractor import Extractor - class ExampleModule(Extractor): - pass \ No newline at end of file + def download(self, item): + print("do something") \ No newline at end of file diff --git a/tests/test_modules.py b/tests/test_modules.py index 109bc52..decc616 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,13 +1,24 @@ import sys import pytest from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES -from auto_archiver.core.extractor import Extractor @pytest.fixture def 
example_module(): - yield get_module_lazy("example_module", ["tests/data/"]) + import auto_archiver + + previous_path = auto_archiver.modules.__path__ + auto_archiver.modules.__path__.append("tests/data/") + + module = get_module_lazy("example_module") + yield module # cleanup - _LAZY_LOADED_MODULES.pop("example_module") + try: + del module._manifest + except AttributeError: + pass + del _LAZY_LOADED_MODULES["example_module"] + sys.modules.pop("auto_archiver.modules.example_module.example_module", None) + auto_archiver.modules.__path__ = previous_path def test_get_module_lazy(example_module): assert example_module.name == "example_module" @@ -15,18 +26,34 @@ def test_get_module_lazy(example_module): assert example_module.manifest is not None +def test_python_dependency_check(example_module): + # example_module requires loguru, which is not installed + # monkey patch the manifest to include a nonexistnet dependency + example_module.manifest["dependencies"]["python"] = ["does_not_exist"] -def test_load_module_abc_check(example_module): - - # example_module is an extractor but doesn't have the 'download' method, should raise an ABC error - with pytest.raises(TypeError) as load_error: + with pytest.raises(SystemExit) as load_error: example_module.load({}) - assert "Can't instantiate abstract class ExampleModule with abstract method download" in str(load_error.value) - -def test_load_module(example_module, monkeypatch): - # hack - remove the 'download' method from the required methods of Extractor - monkeypatch.setattr(Extractor, "__abstractmethods__", set()) + assert load_error.value.code == 1 + +def test_binary_dependency_check(example_module): + # example_module requires ffmpeg, which is not installed + # monkey patch the manifest to include a nonexistnet dependency + example_module.manifest["dependencies"]["binary"] = ["does_not_exist"] + +def test_module_dependency_check_loads_module(example_module): + # example_module requires cli_feeder, which is not installed + 
# monkey patch the manifest to include a nonexistnet dependency + example_module.manifest["dependencies"]["python"] = ["hash_enricher"] + + loaded_module = example_module.load({}) + assert loaded_module is not None + + # check the dependency is loaded + assert _LAZY_LOADED_MODULES["hash_enricher"] is not None + assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None + +def test_load_module(example_module): # setup the module, and check that config is set to the default values loaded_module = example_module.load({}) From 18ff36ce154dad2f083c9f4488b0f8027bcbe278 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 29 Jan 2025 19:37:41 +0100 Subject: [PATCH 044/110] Add ruamel to dependencies (replaces pyyaml) --- poetry.lock | 80 ++++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 2 +- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6d6ad8c..e8a899a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1818,7 +1818,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["main", "docs"] +groups = ["docs"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2086,6 +2086,82 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "ruamel-yaml" +version = "0.18.10" +description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"}, + {file = "ruamel.yaml-0.18.10.tar.gz", hash = 
"sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"}, +] + +[package.dependencies] +"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""} + +[package.extras] +docs = ["mercurial (>5.7)", "ryd"] +jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] + +[[package]] +name = "ruamel-yaml-clib" +version = "0.2.12" +description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "platform_python_implementation == \"CPython\"" +files = [ + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"}, + {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"}, + {file = 
"ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"}, + {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"}, + {file = 
"ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"}, + {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"}, + 
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"}, + {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"}, + {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = 
"sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"}, + {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"}, +] + [[package]] name = "s3transfer" version = "0.11.2" @@ -3006,4 +3082,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "d1af74e7fc7c919eda55dd383208edab906508353b4a9eff8e979967484823f8" +content-hash = "1556d53c5a94392c120ebaafc495d3b322daf64dac4a19f9726588c7f3d84bca" diff --git a/pyproject.toml b/pyproject.toml index ec78212..b3a2456 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ dependencies = [ "pdqhash (>=0.0.0)", "pillow (>=0.0.0)", "python-slugify (>=0.0.0)", - "pyyaml (>=0.0.0)", "dateparser (>=0.0.0)", "python-twitter-v2 (>=0.0.0)", "instaloader (>=0.0.0)", @@ -58,6 +57,7 @@ dependencies = [ "tsp-client (>=0.0.0)", "certvalidator (>=0.0.0)", "rich-argparse (>=1.6.0,<2.0.0)", + "ruamel-yaml (>=0.18.10,<0.19.0)", ] [tool.poetry.group.dev.dependencies] From cddae65a90a0dc225f9fcac26cdb5fce21448ccc Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 30 Jan 2025 08:42:23 +0000 Subject: [PATCH 045/110] Update modules for new core structure. 
--- scripts/create_update_gdrive_oauth_token.py | 43 +++++---- scripts/telegram_setup.py | 29 ++++++ src/auto_archiver/core/orchestrator.py | 2 +- src/auto_archiver/core/storage.py | 2 +- src/auto_archiver/modules/api_db/__init__.py | 2 +- .../modules/api_db/__manifest__.py | 42 ++++++-- src/auto_archiver/modules/api_db/api_db.py | 16 +--- src/auto_archiver/modules/atlos/__init__.py | 1 - .../modules/atlos/__manifest__.py | 40 -------- .../modules/atlos_db/atlos_db.py | 8 +- .../modules/atlos_feeder/__manifest__.py | 3 +- .../modules/atlos_feeder/atlos_feeder.py | 11 +-- .../atlos_storage.py} | 10 +- .../modules/gdrive_storage/__manifest__.py | 73 ++++++++++++-- .../modules/gdrive_storage/gdrive_storage.py | 96 +++++++++---------- .../modules/gsheet_db/__manifest__.py | 3 +- .../modules/gsheet_feeder/__manifest__.py | 2 +- .../instagram_api_extractor/__manifest__.py | 13 ++- .../instagram_api_extractor.py | 9 +- .../instagram_extractor/__manifest__.py | 13 ++- .../instagram_extractor.py | 12 +-- .../instagram_tbot_extractor/__manifest__.py | 15 ++- .../instagram_tbot_extractor.py | 7 +- .../modules/pdq_hash_enricher/__manifest__.py | 2 +- .../modules/s3_storage/__init__.py | 2 +- .../modules/s3_storage/__manifest__.py | 12 ++- .../s3_storage/{s3.py => s3_storage.py} | 29 +++--- .../modules/ssl_enricher/__manifest__.py | 2 +- .../thumbnail_enricher/__manifest__.py | 2 +- .../modules/vk_extractor/__manifest__.py | 19 ++-- .../modules/vk_extractor/vk_extractor.py | 6 +- .../modules/wacz_enricher/__manifest__.py | 4 +- .../modules/wacz_enricher/wacz_enricher.py | 4 +- .../modules/whisper_enricher/__manifest__.py | 2 +- .../whisper_enricher/whisper_enricher.py | 4 +- 35 files changed, 307 insertions(+), 233 deletions(-) create mode 100644 scripts/telegram_setup.py delete mode 100644 src/auto_archiver/modules/atlos/__init__.py delete mode 100644 src/auto_archiver/modules/atlos/__manifest__.py rename src/auto_archiver/modules/{atlos/atlos.py => 
atlos_storage/atlos_storage.py} (96%) rename src/auto_archiver/modules/s3_storage/{s3.py => s3_storage.py} (88%) diff --git a/scripts/create_update_gdrive_oauth_token.py b/scripts/create_update_gdrive_oauth_token.py index ec8a120..eb6fdbe 100644 --- a/scripts/create_update_gdrive_oauth_token.py +++ b/scripts/create_update_gdrive_oauth_token.py @@ -12,7 +12,7 @@ from googleapiclient.errors import HttpError # Code below from https://developers.google.com/drive/api/quickstart/python # Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json -SCOPES = ['https://www.googleapis.com/auth/drive'] +SCOPES = ["https://www.googleapis.com/auth/drive.file"] @click.command( @@ -23,7 +23,7 @@ SCOPES = ['https://www.googleapis.com/auth/drive'] "-c", type=click.Path(exists=True), help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials", - required=True + required=True, ) @click.option( "--token", @@ -31,59 +31,62 @@ SCOPES = ['https://www.googleapis.com/auth/drive'] type=click.Path(exists=False), default="gd-token.json", help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json", - required=True + required=True, ) def main(credentials, token): # The file token.json stores the user's access and refresh tokens, and is # created automatically when the authorization flow completes for the first time. creds = None if os.path.exists(token): - with open(token, 'r') as stream: + with open(token, "r") as stream: creds_json = json.load(stream) # creds = Credentials.from_authorized_user_file(creds_json, SCOPES) - creds_json['refresh_token'] = creds_json.get("refresh_token", "") + creds_json["refresh_token"] = creds_json.get("refresh_token", "") creds = Credentials.from_authorized_user_info(creds_json, SCOPES) # If there are no (valid) credentials available, let the user log in. 
if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: - print('Requesting new token') + print("Requesting new token") creds.refresh(Request()) else: - print('First run through so putting up login dialog') + print("First run through so putting up login dialog") # credentials.json downloaded from https://console.cloud.google.com/apis/credentials flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES) creds = flow.run_local_server(port=55192) # Save the credentials for the next run - with open(token, 'w') as token: - print('Saving new token') + with open(token, "w") as token: + print("Saving new token") token.write(creds.to_json()) else: - print('Token valid') + print("Token valid") try: - service = build('drive', 'v3', credentials=creds) + service = build("drive", "v3", credentials=creds) # About the user results = service.about().get(fields="*").execute() - emailAddress = results['user']['emailAddress'] + emailAddress = results["user"]["emailAddress"] print(emailAddress) # Call the Drive v3 API and return some files - results = service.files().list( - pageSize=10, fields="nextPageToken, files(id, name)").execute() - items = results.get('files', []) + results = ( + service.files() + .list(pageSize=10, fields="nextPageToken, files(id, name)") + .execute() + ) + items = results.get("files", []) if not items: - print('No files found.') + print("No files found.") return - print('Files:') + print("Files:") for item in items: - print(u'{0} ({1})'.format(item['name'], item['id'])) + print("{0} ({1})".format(item["name"], item["id"])) except HttpError as error: - print(f'An error occurred: {error}') + print(f"An error occurred: {error}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/telegram_setup.py b/scripts/telegram_setup.py new file mode 100644 index 0000000..e6fa43c --- /dev/null +++ b/scripts/telegram_setup.py @@ -0,0 +1,29 @@ +""" +This script is used to create a new session file for 
the Telegram client. +To do this you must first create a Telegram application at https://my.telegram.org/apps +And store your id and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH. +Create a .env file, or add the following to your environment : +``` +export TELEGRAM_API_ID=[YOUR_ID_HERE] +export TELEGRAM_API_HASH=[YOUR_HASH_HERE] +``` +Then run this script to create a new session file. + +You will need to provide your phone number and a 2FA code the first time you run this script. +""" + + +import os +from telethon.sync import TelegramClient +from loguru import logger + + +# Create a +API_ID = os.getenv("TELEGRAM_API_ID") +API_HASH = os.getenv("TELEGRAM_API_HASH") +SESSION_FILE = "secrets/anon-insta" + +os.makedirs("secrets", exist_ok=True) +with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client: + logger.success(f"New session file created: {SESSION_FILE}.session") + diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index dc15809..b305963 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -220,7 +220,7 @@ class ArchivingOrchestrator: loaded_module: BaseModule = get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") - if module_type == 'extractor': + if module_type == 'extractor' and loaded_module.name == module: loaded_module.cleanup() exit() diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index e167024..5274204 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -30,7 +30,7 @@ class Storage(BaseModule): def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass def upload(self, media: Media, **kwargs) -> bool: - logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}') + logger.debug(f'[{self.__class__.__name__}] storing 
file {media.filename} with key {media.key}') with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) diff --git a/src/auto_archiver/modules/api_db/__init__.py b/src/auto_archiver/modules/api_db/__init__.py index 2070b06..a4f39a1 100644 --- a/src/auto_archiver/modules/api_db/__init__.py +++ b/src/auto_archiver/modules/api_db/__init__.py @@ -1 +1 @@ -from api_db import AAApiDb \ No newline at end of file +from .api_db import AAApiDb \ No newline at end of file diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index d22fa59..3874496 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -4,19 +4,41 @@ "entry_point": "api_db:AAApiDb", "requires_setup": True, "dependencies": { - "python": ["requests", - "loguru"], + "python": ["requests", "loguru"], }, "configs": { - "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, - "api_token": {"default": None, "help": "API Bearer token."}, - "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, - "author_id": {"default": None, "help": "which email to assign as author"}, - "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",}, - "store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",}, - "tags": {"default": [], "help": "what tags to add to the archived URL",} + "api_endpoint": { + "default": None, + "required": True, + "help": "API endpoint where calls are made to", }, + "api_token": {"default": None, + "help": "API Bearer token."}, + "public": { + "default": False, + "type": "bool", + "help": 
"whether the URL should be publicly available via the API", + }, + "author_id": {"default": None, "help": "which email to assign as author"}, + "group_id": { + "default": None, + "help": "which group of users have access to the archive in case public=false as author", + }, + "allow_rearchive": { + "default": True, + "type": "bool", + "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", + }, + "store_results": { + "default": True, + "type": "bool", + "help": "when set, will send the results to the API database.", + }, + "tags": { + "default": [], + "help": "what tags to add to the archived URL", + }, + }, "description": """ Provides integration with the Auto-Archiver API for querying and storing archival data. diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index a893aee..e1f67ce 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -1,5 +1,7 @@ from typing import Union -import requests, os + +import os +import requests from loguru import logger from auto_archiver.core import Database @@ -7,17 +9,7 @@ from auto_archiver.core import Metadata class AAApiDb(Database): - """ - Connects to auto-archiver-api instance - """ - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.allow_rearchive = bool(self.allow_rearchive) - self.store_results = bool(self.store_results) - self.assert_valid_string("api_endpoint") - + """Connects to auto-archiver-api instance""" def fetch(self, item: Metadata) -> Union[Metadata, bool]: """ query the database for the existence of this item. 
diff --git a/src/auto_archiver/modules/atlos/__init__.py b/src/auto_archiver/modules/atlos/__init__.py deleted file mode 100644 index de7fead..0000000 --- a/src/auto_archiver/modules/atlos/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .atlos import AtlosStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py deleted file mode 100644 index 7ba2f72..0000000 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ /dev/null @@ -1,40 +0,0 @@ -{ - "name": "atlos_storage", - "type": ["storage"], - "requires_setup": True, - "dependencies": {"python": ["loguru", "requests"], "bin": [""]}, - "configs": { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - }, - "api_token": { - "default": None, - "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "type": "str", - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str", - }, - }, - "description": """ - AtlosStorage: A storage module for saving media files to the Atlos platform. - - ### Features - - Uploads media files to Atlos using Atlos-specific APIs. - - Automatically calculates SHA-256 hashes of media files for integrity verification. - - Skips uploads for files that already exist on Atlos with the same hash. - - Supports attaching metadata, such as `atlos_id`, to the uploaded files. - - Provides CDN-like URLs for accessing uploaded media. 
- - ### Notes - - Requires Atlos API configuration, including `atlos_url` and `api_token`. - - Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials. - """, -} diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index c45e215..baa9fef 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -1,14 +1,10 @@ -import os - from typing import Union -from loguru import logger -from csv import DictWriter -from dataclasses import asdict + import requests +from loguru import logger from auto_archiver.core import Database from auto_archiver.core import Metadata -from auto_archiver.utils import get_atlos_config_options class AtlosDb(Database): diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index f2772f2..5ae3540 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -8,8 +8,9 @@ "configs": { "api_token": { "default": None, + "type": "str", + "required": True, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "type": "str" }, "atlos_url": { "default": "https://platform.atlos.org", diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 9811a82..bbf06f6 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,19 +1,12 @@ -from loguru import logger import requests +from loguru import logger from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext -from auto_archiver.utils import get_atlos_config_options +from auto_archiver.core import Metadata class AtlosFeeder(Feeder): - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - if type(self.api_token) != str: - raise Exception("Atlos Feeder did not receive an Atlos API token") - def __iter__(self) -> Metadata: # Get all the urls from the Atlos API count = 0 diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos_storage/atlos_storage.py similarity index 96% rename from src/auto_archiver/modules/atlos/atlos.py rename to src/auto_archiver/modules/atlos_storage/atlos_storage.py index abc8a1a..f8eef68 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos_storage/atlos_storage.py @@ -1,12 +1,12 @@ -import os -from typing import IO, List, Optional -from loguru import logger -import requests import hashlib +import os +from typing import IO, Optional + +import requests +from loguru import logger from auto_archiver.core import Media, Metadata from auto_archiver.core import Storage -from auto_archiver.utils import get_atlos_config_options class AtlosStorage(Storage): diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index e24f21b..2ca7e27 100644 --- 
a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -1,14 +1,14 @@ { "name": "Google Drive Storage", "type": ["storage"], + "author": "Dave Mateer", + "entry_point": "gdrive_storage::GDriveStorage", "requires_setup": True, "dependencies": { "python": [ "loguru", - "google-api-python-client", - "google-auth", - "google-auth-oauthlib", - "google-auth-httplib2" + "googleapiclient", + "google", ], }, "configs": { @@ -18,17 +18,24 @@ "choices": ["flat", "url", "random"], }, "filename_generator": { - "default": "random", + "default": "static", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", "choices": ["random", "static"], }, - "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, - "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, + "root_folder_id": {"default": None, + # "required": True, + "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, + "oauth_token": {"default": None, + "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. 
NOTE: storage used will count towards the developer account."}, }, "description": """ + GDriveStorage: A storage module for saving archived content to Google Drive. + Author: Dave Mateer, (And maintained by: ) + Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python + ### Features - Saves media files to Google Drive, organizing them into folders based on the provided path structure. - Supports OAuth token-based authentication or service account credentials for API access. @@ -39,5 +46,55 @@ - Requires setup with either a Google OAuth token or a service account JSON file. - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure. - Automatically handles Google Drive API token refreshes for long-running jobs. - """ + + ## Overview +This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication). + +## Features +- Saves files to Google Drive, organizing them into structured folders. +- Supports both **service account** and **OAuth token** authentication. +- Automatically creates folders if they don't exist. +- Generates public URLs for easy file sharing. + +## Setup Guide +1. **Enable Google Drive API** + - Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/) + - Enable the **Google Drive API**. + +2. **Set Up a Google Drive Folder** + - Create a folder in **Google Drive** and copy its **folder ID** from the URL. + - Add the **folder ID** to your configuration (`orchestration.yaml`): + ```yaml + root_folder_id: "FOLDER_ID" + ``` + +3. **Authentication Options** + - **Option 1: Service Account (Recommended)** + - Create a **service account** in Google Cloud IAM. 
+ - Download the JSON key file and save it as: + ``` + secrets/service_account.json + ``` + - **Share your Drive folder** with the service account’s `client_email` (found in the JSON file). + + - **Option 2: OAuth Token (User Authentication)** + - Create OAuth **Desktop App credentials** in Google Cloud. + - Save the credentials as: + ``` + secrets/oauth_credentials.json + ``` + - Generate an OAuth token by running: + ```sh + python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json + ``` + + + Notes on the OAuth token: + Tokens are refreshed after 1 hour however keep working for 7 days (tbc) + so as long as the job doesn't last for 7 days then this method of refreshing only once per run will work + see this link for details on the token: + https://davemateer.com/2022/04/28/google-drive-with-python#tokens + + +""" } diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index c2d326d..b764f1d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -1,68 +1,69 @@ -import shutil, os, time, json +import json +import os +import time from typing import IO -from loguru import logger -from googleapiclient.discovery import build -from googleapiclient.http import MediaFileUpload +from google.auth.transport.requests import Request from google.oauth2 import service_account from google.oauth2.credentials import Credentials -from google.auth.transport.requests import Request +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload +from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage + + class GDriveStorage(Storage): - def __init__(self, config: dict) -> None: - super().__init__(config) + def setup(self, config: dict) -> None: + # Step 1: Call the BaseModule setup to dynamically assign configs + 
super().setup(config) + self.scopes = ['https://www.googleapis.com/auth/drive'] + # Initialize Google Drive service + self._setup_google_drive_service() - SCOPES = ['https://www.googleapis.com/auth/drive'] - - if self.oauth_token is not None: - """ - Tokens are refreshed after 1 hour - however keep working for 7 days (tbc) - so as long as the job doesn't last for 7 days - then this method of refreshing only once per run will work - see this link for details on the token - https://davemateer.com/2022/04/28/google-drive-with-python#tokens - """ - logger.debug(f'Using GD OAuth token {self.oauth_token}') - # workaround for missing 'refresh_token' in from_authorized_user_file - with open(self.oauth_token, 'r') as stream: - creds_json = json.load(stream) - creds_json['refresh_token'] = creds_json.get("refresh_token", "") - creds = Credentials.from_authorized_user_info(creds_json, SCOPES) - # creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES) - - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - logger.debug('Requesting new GD OAuth token') - creds.refresh(Request()) - else: - raise Exception("Problem with creds - create the token again") - - # Save the credentials for the next run - with open(self.oauth_token, 'w') as token: - logger.debug('Saving new GD OAuth token') - token.write(creds.to_json()) - else: - logger.debug('GD OAuth Token valid') + def _setup_google_drive_service(self): + """Initialize Google Drive service based on provided credentials.""" + if self.oauth_token: + logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}") + self.service = self._initialize_with_oauth_token() + elif self.service_account: + logger.debug(f"Using Google Drive service account: {self.service_account}") + self.service = self._initialize_with_service_account() else: - gd_service_account = self.service_account - logger.debug(f'Using GD Service Account {gd_service_account}') - creds = 
service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) + raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.") - self.service = build('drive', 'v3', credentials=creds) + def _initialize_with_oauth_token(self): + """Initialize Google Drive service with OAuth token.""" + with open(self.oauth_token, 'r') as stream: + creds_json = json.load(stream) + creds_json['refresh_token'] = creds_json.get("refresh_token", "") + + creds = Credentials.from_authorized_user_info(creds_json, self.scopes) + if not creds.valid and creds.expired and creds.refresh_token: + creds.refresh(Request()) + with open(self.oauth_token, 'w') as token_file: + logger.debug("Saving refreshed OAuth token.") + token_file.write(creds.to_json()) + elif not creds.valid: + raise ValueError("Invalid OAuth token. Please regenerate the token.") + + return build('drive', 'v3', credentials=creds) + + def _initialize_with_service_account(self): + """Initialize Google Drive service with service account.""" + creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes) + return build('drive', 'v3', credentials=creds) def get_cdn_url(self, media: Media) -> str: """ only support files saved in a folder for GD S3 supports folder and all stored in the root """ - # full_name = os.path.join(self.folder, media.key) parent_id, folder_id = self.root_folder_id, None path_parts = media.key.split(os.path.sep) @@ -77,7 +78,7 @@ class GDriveStorage(Storage): return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: - logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}') + logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') """ 1. for each sub-folder in the path check if exists or create 2. 
upload file to root_id/other_paths.../filename @@ -168,8 +169,3 @@ class GDriveStorage(Storage): gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute() return gd_folder.get('id') - # def exists(self, key): - # try: - # self.get_cdn_url(key) - # return True - # except: return False diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index f926adc..cf95245 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -4,7 +4,7 @@ "entry_point": "gsheet_db::GsheetsDb", "requires_setup": True, "dependencies": { - "python": ["loguru", "gspread", "python-slugify"], + "python": ["loguru", "gspread", "slugify"], }, "configs": { "allow_worksheets": { @@ -17,6 +17,7 @@ }, "use_sheet_names_in_stored_paths": { "default": True, + "type": "bool", "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", } }, diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 1c9acab..7b74072 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -4,7 +4,7 @@ "entry_point": "gsheet_feeder::GsheetsFeeder", "requires_setup": True, "dependencies": { - "python": ["loguru", "gspread", "python-slugify"], + "python": ["loguru", "gspread", "slugify"], }, "configs": { "sheet": {"default": None, "help": "name of the sheet to archive"}, diff --git a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index 57f378e..a958a99 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -1,6 +1,7 @@ { "name": "Instagram API Extractor", "type": ["extractor"], + 
"entry_point": "instagram_api_extractor::InstagramAPIExtractor", "dependencies": {"python": ["requests", "loguru", @@ -9,24 +10,32 @@ }, "requires_setup": True, "configs": { - "access_token": {"default": None, "help": "a valid instagrapi-api token"}, - "api_endpoint": {"default": None, "help": "API endpoint to use"}, + "access_token": {"default": None, + "help": "a valid instagrapi-api token"}, + "api_endpoint": {"default": None, + # "required": True, + "help": "API endpoint to use"}, "full_profile": { "default": False, + "type": "bool", "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", }, "full_profile_max_posts": { "default": 0, + "type": "int", "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", }, "minimize_json_output": { "default": True, + "type": "bool", "help": "if true, will remove empty values from the json output", }, }, "description": """ Archives various types of Instagram content using the Instagrapi API. +Requires setting up an Instagrapi API deployment and providing an access token and API endpoint. + ### Features - Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content. 
- Supports advanced configuration options, including: diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 3d7f9e5..4a18228 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -32,16 +32,11 @@ class InstagramAPIExtractor(Extractor): r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" ) - def __init__(self, config: dict) -> None: - super().__init__(config) - self.assert_valid_string("access_token") - self.assert_valid_string("api_endpoint") - self.full_profile_max_posts = int(self.full_profile_max_posts) + def setup(self, config: dict) -> None: + super().setup(config) if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] - self.full_profile = bool(self.full_profile) - self.minimize_json_output = bool(self.minimize_json_output) def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index 6e7518e..d8e4a9b 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -9,9 +9,12 @@ }, "requires_setup": True, "configs": { - "username": {"default": None, "help": "a valid Instagram username"}, + "username": {"default": None, + "required": True, + "help": "a valid Instagram username"}, "password": { "default": None, + "required": True, "help": "the corresponding Instagram account password", }, "download_folder": { @@ -25,9 +28,11 @@ # TODO: fine-grain # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, }, - "description": """Uses the Instaloader 
library to download content from Instagram. This class handles both individual posts - and user profiles, downloading as much information as possible, including images, videos, text, stories, - highlights, and tagged posts. Authentication is required via username/password or a session file. + "description": """ + Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts + and user profiles, downloading as much information as possible, including images, videos, text, stories, + highlights, and tagged posts. + Authentication is required via username/password or a session file. """, } diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 1a246fb..1cdb0b1 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -4,7 +4,7 @@ """ import re, os, shutil, traceback -import instaloader # https://instaloader.github.io/as-module.html +import instaloader from loguru import logger from auto_archiver.core import Extractor @@ -22,13 +22,9 @@ class InstagramExtractor(Extractor): profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") # TODO: links to stories - def __init__(self, config: dict) -> None: - super().__init__(config) - # TODO: refactor how configuration validation is done - self.assert_valid_string("username") - self.assert_valid_string("password") - self.assert_valid_string("download_folder") - self.assert_valid_string("session_file") + def setup(self, config: dict) -> None: + super().setup(config) + self.insta = instaloader.Instaloader( download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" ) diff --git 
a/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py index 8a1f74f..a24a864 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py @@ -1,15 +1,16 @@ { "name": "Instagram Telegram Bot Extractor", "type": ["extractor"], - "dependencies": {"python": ["loguru", - "telethon",], + "dependencies": {"python": ["loguru", "telethon",], }, "requires_setup": True, "configs": { "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, - "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, + "timeout": {"default": 45, + "type": "int", + "help": "timeout to fetch the instagram content in seconds."}, }, "description": """ The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, @@ -28,6 +29,12 @@ returned as part of a `Metadata` object. To use the `InstagramTbotExtractor`, you need to provide the following configuration settings: - **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps). - **Session File**: Optional path to store the Telegram session file for future use. - +- The session file is created automatically and should be unique for each instance. 
+- You may need to enter your Telegram credentials (phone) and use the a 2FA code sent to you the first time you run the extractor.: +```2025-01-30 00:43:49.348 | INFO | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login... +Please enter your phone (or bot token): +447123456789 +Please enter the code you received: 00000 +Signed in successfully as E C; remember to not break the ToS or you will risk an account ban! +``` """, } diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 60fa397..791b9c0 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -27,15 +27,19 @@ class InstagramTbotExtractor(Extractor): https://t.me/instagram_load_bot """ - def setup(self) -> None: + def setup(self, configs) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. 
checks if the session file is valid """ + super().setup(configs) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") + if not os.path.exists(f"{self.session_file}.session"): + raise FileNotFoundError(f"session file {self.session_file}.session not found, " + f"to set this up run the setup script in scripts/telegram_setup.py") shutil.copy(self.session_file + ".session", new_session_file) self.session_file = new_session_file.replace(".session", "") @@ -43,7 +47,6 @@ class InstagramTbotExtractor(Extractor): self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") - with self.client.start(): logger.success(f"SETUP {self.name} login works.") diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py index 6353d12..133fef7 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py @@ -3,7 +3,7 @@ "type": ["enricher"], "requires_setup": False, "dependencies": { - "python": ["loguru", "pdqhash", "numpy", "Pillow"], + "python": ["loguru", "pdqhash", "numpy", "PIL"], }, "description": """ PDQ Hash Enricher for generating perceptual hashes of media files. 
diff --git a/src/auto_archiver/modules/s3_storage/__init__.py b/src/auto_archiver/modules/s3_storage/__init__.py index 1c826fd..cbf3237 100644 --- a/src/auto_archiver/modules/s3_storage/__init__.py +++ b/src/auto_archiver/modules/s3_storage/__init__.py @@ -1 +1 @@ -from .s3 import S3Storage \ No newline at end of file +from .s3_storage import S3Storage \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index 16ac7bd..df05055 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -7,12 +7,12 @@ }, "configs": { "path_generator": { - "default": "url", + "default": "flat", "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", "choices": ["flat", "url", "random"], }, "filename_generator": { - "default": "random", + "default": "static", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", "choices": ["random", "static"], }, @@ -20,7 +20,9 @@ "region": {"default": None, "help": "S3 region name"}, "key": {"default": None, "help": "S3 API key"}, "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"}, + "random_no_duplicate": {"default": False, + "type": "bool", + "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. 
Creates a new root folder path `no-dups/`"}, "endpoint_url": { "default": 'https://{region}.digitaloceanspaces.com', "help": "S3 bucket endpoint, {region} are inserted at runtime" @@ -29,7 +31,9 @@ "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" }, - "private": {"default": False, "help": "if true S3 files will not be readable online"}, + "private": {"default": False, + "type": "bool", + "help": "if true S3 files will not be readable online"}, }, "description": """ S3Storage: A storage module for saving media files to an S3-compatible object storage. diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3_storage.py similarity index 88% rename from src/auto_archiver/modules/s3_storage/s3.py rename to src/auto_archiver/modules/s3_storage/s3_storage.py index 10d5f61..f324d5c 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -1,19 +1,21 @@ from typing import IO -import boto3, os -from auto_archiver.utils.misc import random_str -from auto_archiver.core import Media -from auto_archiver.core import Storage - -from auto_archiver.modules.hash_enricher import HashEnricher +import boto3 +import os from loguru import logger -NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage): +from auto_archiver.core import Media +from auto_archiver.core import Storage +from auto_archiver.modules.hash_enricher import HashEnricher +from auto_archiver.utils.misc import random_str - def __init__(self, config: dict) -> None: - super().__init__(config) +NO_DUPLICATES_FOLDER = "no-dups/" + +class S3Storage(Storage, HashEnricher): + + def setup(self, config: dict) -> None: + super().setup(config) self.s3 = boto3.client( 's3', region_name=self.region, @@ -21,7 +23,6 @@ class S3Storage(Storage): aws_access_key_id=self.key, aws_secret_access_key=self.secret ) - self.random_no_duplicate = 
bool(self.random_no_duplicate) if self.random_no_duplicate: logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.") @@ -48,8 +49,7 @@ class S3Storage(Storage): def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}}) - hd = he.calculate_hash(media.filename) + hd = self.calculate_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): @@ -61,8 +61,7 @@ class S3Storage(Storage): _, ext = os.path.splitext(media.key) media.key = os.path.join(path, f"{random_str(24)}{ext}") return True - - + def file_in_folder(self, path:str) -> str: # checks if path exists and is not an empty folder if not path.endswith('/'): diff --git a/src/auto_archiver/modules/ssl_enricher/__manifest__.py b/src/auto_archiver/modules/ssl_enricher/__manifest__.py index 0fb7cd9..9028f14 100644 --- a/src/auto_archiver/modules/ssl_enricher/__manifest__.py +++ b/src/auto_archiver/modules/ssl_enricher/__manifest__.py @@ -3,7 +3,7 @@ "type": ["enricher"], "requires_setup": False, "dependencies": { - "python": ["loguru", "python-slugify"], + "python": ["loguru", "slugify"], }, 'entry_point': 'ssl_enricher::SSLEnricher', "configs": { diff --git a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py index bd7836d..e47397f 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py +++ b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py @@ -3,7 +3,7 @@ "type": ["enricher"], "requires_setup": False, "dependencies": { - "python": ["loguru", "ffmpeg-python"], + "python": ["loguru", "ffmpeg"], "bin": ["ffmpeg"] }, "configs": { diff --git a/src/auto_archiver/modules/vk_extractor/__manifest__.py 
b/src/auto_archiver/modules/vk_extractor/__manifest__.py index 116b430..033fe50 100644 --- a/src/auto_archiver/modules/vk_extractor/__manifest__.py +++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py @@ -4,14 +4,20 @@ "requires_setup": True, "depends": ["core", "utils"], "dependencies": { - "python": ["loguru", - "vk_url_scraper"], + "python": ["loguru", "vk_url_scraper"], }, "configs": { - "username": {"default": None, "help": "valid VKontakte username"}, - "password": {"default": None, "help": "valid VKontakte password"}, - "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, + "username": {"default": None, + "required": True, + "help": "valid VKontakte username"}, + "password": {"default": None, + "required": True, + "help": "valid VKontakte password"}, + "session_file": { + "default": "secrets/vk_config.v2.json", + "help": "valid VKontakte password", }, + }, "description": """ The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract @@ -31,6 +37,5 @@ To use the `VkArchiver`, you must provide valid VKontakte login credentials and Credentials can be set in the configuration file or directly via environment variables. Ensure you have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/). 
-""" -, +""", } diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index 1bce167..301fa89 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -12,10 +12,8 @@ class VkExtractor(Extractor): Currently only works for /wall posts """ - def __init__(self, config: dict) -> None: - super().__init__(config) - self.assert_valid_string("username") - self.assert_valid_string("password") + def setup(self, config: dict) -> None: + super().setup(config) self.vks = VkScraper(self.username, self.password, session_file=self.session_file) def download(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_enricher/__manifest__.py index bb9d290..46ce05e 100644 --- a/src/auto_archiver/modules/wacz_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_enricher/__manifest__.py @@ -1,6 +1,7 @@ { "name": "WACZ Enricher", "type": ["enricher", "archiver"], + "entry_point": "wacz_enricher::WaczExtractorEnricher", "requires_setup": True, "dependencies": { "python": [ @@ -25,6 +26,7 @@ }, "description": """ Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. + [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format. ### Features - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. @@ -33,7 +35,7 @@ - Generates metadata from the archived page's content and structure (e.g., titles, text). ### Notes - - Requires Docker for running `browsertrix-crawler` unless explicitly disabled. + - Requires Docker for running `browsertrix-crawler` . - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings. 
""" } diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 1eb7398..8810b84 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -18,7 +18,9 @@ class WaczExtractorEnricher(Enricher, Extractor): When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. """ - def setup(self) -> None: + def setup(self, configs) -> None: + super().setup(configs) + self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 0adf9ff..f7ad1b3 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -3,7 +3,7 @@ "type": ["enricher"], "requires_setup": True, "dependencies": { - "python": ["loguru", "requests"], + "python": ["s3_storage", "loguru", "requests"], }, "configs": { "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 09eb3db..b8fe634 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -5,7 +5,7 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.modules.s3_storage import S3Storage - +from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): 
""" @@ -53,7 +53,7 @@ class WhisperEnricher(Enricher): to_enrich.set_content(f"\n[automatic video transcript]: {v}") def submit_job(self, media: Media): - s3 = self._get_s3_storage() + s3 = get_module("s3_storage", self.config) s3_url = s3.get_cdn_url(media) assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls " payload = { From b7d9145f6c1b9c50af572156f76df764f4373182 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 13:21:10 +0100 Subject: [PATCH 046/110] Further tidyups + refactoring for new structure * Add implementation tests for orchestrator + logging tests * Standardise method/class vars for extractors to see if they are suitable * Fix bugs with removing default loguru logger (allows further customisation) * Fix bug loading required fields from file * --- poetry.lock | 24 +++- pyproject.toml | 1 + src/auto_archiver/__main__.py | 3 +- src/auto_archiver/core/authentication.py | 0 src/auto_archiver/core/config.py | 4 + src/auto_archiver/core/extractor.py | 14 +- src/auto_archiver/core/module.py | 5 + src/auto_archiver/core/orchestrator.py | 60 +++++---- .../instagram_api_extractor.py | 4 +- .../instagram_extractor.py | 7 +- .../telethon_extractor/telethon_extractor.py | 4 +- .../twitter_api_extractor.py | 4 +- src/auto_archiver/utils/url.py | 12 +- tests/data/example_module/example_module.py | 4 - .../example_module/__init__.py | 0 .../example_module/__manifest__.py | 5 +- .../example_module/example_module.py | 28 ++++ tests/data/test_orchestration.yaml | 16 +++ tests/extractors/test_extractor_base.py | 2 +- tests/extractors/test_instagram_extractor.py | 21 +++ tests/test_modules.py | 2 +- tests/test_orchestrator.py | 123 ++++++++++++++++++ 22 files changed, 292 insertions(+), 51 deletions(-) create mode 100644 src/auto_archiver/core/authentication.py delete mode 100644 tests/data/example_module/example_module.py rename tests/data/{ => test_modules}/example_module/__init__.py (100%) rename 
tests/data/{ => test_modules}/example_module/__manifest__.py (55%) create mode 100644 tests/data/test_modules/example_module/example_module.py create mode 100644 tests/data/test_orchestration.yaml create mode 100644 tests/extractors/test_instagram_extractor.py create mode 100644 tests/test_orchestrator.py diff --git a/poetry.lock b/poetry.lock index e8a899a..088fc70 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1025,7 +1025,7 @@ version = "0.7.3" description = "Python logging made (stupidly) simple" optional = false python-versions = "<4.0,>=3.5" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"}, {file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"}, @@ -1750,6 +1750,24 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-loguru" +version = "0.4.0" +description = "Pytest Loguru" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pytest_loguru-0.4.0-py3-none-any.whl", hash = "sha256:3cc7b9c6b22cb158209ccbabf0d678dacd3f3c7497d6f46f1c338c13bee1ac77"}, + {file = "pytest_loguru-0.4.0.tar.gz", hash = "sha256:0d9e4e72ae9bfd92f774c666e7353766af11b0b78edd59c290e89be116050f03"}, +] + +[package.dependencies] +loguru = "*" + +[package.extras] +test = ["pytest", "pytest-cov"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3032,7 +3050,7 @@ version = "1.2.0" description = "A small Python utility to set file creation time on Windows" optional = false python-versions = ">=3.5" -groups = ["main"] +groups = ["main", "dev"] markers = "sys_platform == \"win32\"" files = [ {file = "win32_setctime-1.2.0-py3-none-any.whl", hash = 
"sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"}, @@ -3082,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "1556d53c5a94392c120ebaafc495d3b322daf64dac4a19f9726588c7f3d84bca" +content-hash = "5a54c84ba388db7b77d1c28973b710fc99aa3822a2860b30acaf5b02ba1927bd" diff --git a/pyproject.toml b/pyproject.toml index b3a2456..3cd47e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ dependencies = [ [tool.poetry.group.dev.dependencies] pytest = "^8.3.4" autopep8 = "^2.3.1" +pytest-loguru = "^0.4.0" [tool.poetry.group.docs.dependencies] sphinx = "^8.1.3" diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index d31ec5c..0e2f54f 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,8 +1,9 @@ """ Entry point for the auto_archiver package. """ from auto_archiver.core.orchestrator import ArchivingOrchestrator +import sys def main(): - ArchivingOrchestrator().run() + ArchivingOrchestrator().run(sys.argv) if __name__ == "__main__": main() diff --git a/src/auto_archiver/core/authentication.py b/src/auto_archiver/core/authentication.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 46dbe28..ca8ed25 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -48,6 +48,10 @@ class DefaultValidatingParser(argparse.ArgumentParser): """ for action in self._actions: if not namespace or action.dest not in namespace: + # for actions that are required and already have a default value, remove the 'required' check + if action.required and action.default is not None: + action.required = False + if action.default is not None: try: self._check_value(action, action.default) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 
8d509ec..51d784f 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -11,9 +11,12 @@ from abc import abstractmethod from dataclasses import dataclass import mimetypes import os -import mimetypes, requests +import mimetypes + +import requests from loguru import logger from retrying import retry +import re from ..core import Metadata, ArchivingContext, BaseModule @@ -25,6 +28,8 @@ class Extractor(BaseModule): Subclasses must implement the `download` method to define platform-specific behavior. """ + valid_url: re.Pattern = None + def cleanup(self) -> None: # called when extractors are done, or upon errors, cleanup any resources pass @@ -32,13 +37,20 @@ class Extractor(BaseModule): def sanitize_url(self, url: str) -> str: # used to clean unnecessary URL parameters OR unfurl redirect links return url + + def match_link(self, url: str) -> re.Match: + return self.valid_url.match(url) def suitable(self, url: str) -> bool: """ Returns True if this extractor can handle the given URL Should be overridden by subclasses + """ + if self.valid_url: + return self.match_link(url) is not None + return True def _guess_file_type(self, path: str) -> str: diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index cb380cf..4542b88 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -83,6 +83,11 @@ def setup_paths(paths: list[str]) -> None: """ for path in paths: + # check path exists, if it doesn't, log a warning + if not os.path.exists(path): + logger.warning(f"Path '{path}' does not exist. 
Skipping...") + continue + # see odoo/module/module.py -> initialize_sys_path if path not in auto_archiver.modules.__path__: auto_archiver.modules.__path__.append(path) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b305963..ba46492 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -43,6 +43,7 @@ class ArchivingOrchestrator: def setup_basic_parser(self): parser = argparse.ArgumentParser( + prog="auto-archiver", add_help=False, description=""" Auto Archiver is a CLI tool to archive media/metadata from online URLs; @@ -51,15 +52,16 @@ class ArchivingOrchestrator: epilog="Check the code at https://github.com/bellingcat/auto-archiver", formatter_class=RichHelpFormatter, ) - parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) + parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit') parser.add_argument('--version', action='version', version=__version__) + parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') # override the default 'help' so we can inject all the configs and show those - parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction) 
self.basic_parser = parser + return parser def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: parser = DefaultValidatingParser( @@ -78,15 +80,15 @@ class ArchivingOrchestrator: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? enabled_modules = [] - for module_type in BaseModule.MODULE_TYPES: - enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) - # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' - for module_type in BaseModule.MODULE_TYPES: - if modules := getattr(basic_config, f"{module_type}s", []): - enabled_modules.extend(modules) + # first loads the modules from the config file, then from the command line + for config in [yaml_config['steps'], basic_config.__dict__]: + for module_type in BaseModule.MODULE_TYPES: + enabled_modules.extend(config.get(f"{module_type}s", [])) - avail_modules = available_modules(with_manifest=True, limit_to_modules=list(dict.fromkeys(enabled_modules)), suppress_warnings=True) + # clear out duplicates, but keep the order + enabled_modules = list(dict.fromkeys(enabled_modules)) + avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True) self.add_module_args(avail_modules, parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] @@ -163,6 +165,10 @@ class ArchivingOrchestrator: # make a nicer metavar, metavar is what's used in the help, e.g. 
--cli_feeder.urls [METAVAR] kwargs['metavar'] = name.upper() + if kwargs.get('required', False): + # required args shouldn't have a 'default' value, remove it + kwargs.pop('default', None) + kwargs.pop('cli_set', None) should_store = kwargs.pop('should_store', False) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" @@ -179,13 +185,12 @@ class ArchivingOrchestrator: self.add_additional_args(self.basic_parser) self.add_module_args(parser=self.basic_parser) - self.basic_parser.print_help() - exit() + self.basic_parser.exit() def setup_logging(self): # setup loguru logging - logger.remove() # remove the default logger + logger.remove(0) # remove the default logger logging_config = self.config['logging'] logger.add(sys.stderr, level=logging_config['level']) if log_file := logging_config['file']: @@ -194,14 +199,18 @@ class ArchivingOrchestrator: def install_modules(self): """ - Swaps out the previous 'strings' in the config with the actual modules + Swaps out the previous 'strings' in the config with the actual modules and loads them """ invalid_modules = [] for module_type in BaseModule.MODULE_TYPES: + step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] + assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \ + in your configuration file or on the command line (using --{module_type}s)" + def check_steps_ok(): if not len(step_items): logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.") @@ -239,30 +248,29 @@ class ArchivingOrchestrator: assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again." self.config['steps'][f"{module_type}s"] = step_items + + def load_config(self, config_file: str) -> dict: + if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE: + logger.error(f"The configuration file {config_file} was not found. 
Make sure the file exists and try again, or run without the --config file to use the default settings.") + exit() - def run(self) -> None: + return read_yaml(config_file) + + def run(self, args: list) -> None: + self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) + basic_config, unused_args = self.basic_parser.parse_known_args(args) - # load the config file to get the list of enabled items - basic_config, unused_args = self.basic_parser.parse_known_args() - + # setup any custom module paths, so they'll show in the help and for arg parsing setup_paths(basic_config.module_paths) # if help flag was called, then show the help if basic_config.help: self.show_help(basic_config) - # load the config file - yaml_config = {} - - if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") - exit() - - - yaml_config = read_yaml(basic_config.config_file) + yaml_config = self.load_config(basic_config.config_file) self.setup_complete_parser(basic_config, yaml_config, unused_args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 4a18228..5dad0ba 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -28,7 +28,7 @@ class InstagramAPIExtractor(Extractor): # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ - global_pattern = re.compile( + valid_url = re.compile( 
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" ) @@ -44,7 +44,7 @@ class InstagramAPIExtractor(Extractor): url.replace("instagr.com", "instagram.com").replace( "instagr.am", "instagram.com" ) - insta_matches = self.global_pattern.findall(url) + insta_matches = self.valid_url.findall(url) logger.info(f"{insta_matches=}") if not len(insta_matches) or len(insta_matches[0]) != 3: return diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 1cdb0b1..3cf0362 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -16,10 +16,13 @@ class InstagramExtractor(Extractor): Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ # NB: post regex should be tested before profile + + valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/") + # https://regex101.com/r/MGPquX/1 - post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") + post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url)) # https://regex101.com/r/6Wbsxa/1 - profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") + profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url)) # TODO: links to stories def setup(self, config: dict) -> None: diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index f378e7e..8a08954 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ 
b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -14,7 +14,7 @@ from auto_archiver.utils import random_str class TelethonArchiver(Extractor): - link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") + valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") @@ -92,7 +92,7 @@ class TelethonArchiver(Extractor): """ url = item.get_url() # detect URLs that we definitely cannot handle - match = self.link_pattern.search(url) + match = self.valid_url.search(url) logger.debug(f"TELETHON: {match=}") if not match: return False diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index ede0239..0434190 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -12,7 +12,7 @@ from auto_archiver.core import Extractor from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): - link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") + valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def setup(self, config: dict) -> None: super().setup(config) @@ -54,7 +54,7 @@ class TwitterApiExtractor(Extractor): def get_username_tweet_id(self, url): # detect URLs that we definitely cannot handle - matches = self.link_pattern.findall(url) + matches = self.valid_url.findall(url) if not len(matches): return False, False username, tweet_id = matches[0] # only one URL supported diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 7586cca..3b67514 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -2,8 +2,11 @@ import re from urllib.parse import urlparse, urlunparse class UrlUtil: - telegram_private = 
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)") - is_istagram = re.compile(r"https:\/\/www\.instagram\.com") + + AUTHWALL_URLS = [ + re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels + re.compile(r"https:\/\/www\.instagram\.com"), # instagram + ] @staticmethod def clean(url: str) -> str: return url @@ -13,8 +16,9 @@ class UrlUtil: """ checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work """ - if UrlUtil.telegram_private.match(url): return True - if UrlUtil.is_istagram.match(url): return True + for regex in UrlUtil.AUTHWALL_URLS: + if regex.match(url): + return True return False diff --git a/tests/data/example_module/example_module.py b/tests/data/example_module/example_module.py deleted file mode 100644 index bce8ba4..0000000 --- a/tests/data/example_module/example_module.py +++ /dev/null @@ -1,4 +0,0 @@ -from auto_archiver.core.extractor import Extractor -class ExampleModule(Extractor): - def download(self, item): - print("do something") \ No newline at end of file diff --git a/tests/data/example_module/__init__.py b/tests/data/test_modules/example_module/__init__.py similarity index 100% rename from tests/data/example_module/__init__.py rename to tests/data/test_modules/example_module/__init__.py diff --git a/tests/data/example_module/__manifest__.py b/tests/data/test_modules/example_module/__manifest__.py similarity index 55% rename from tests/data/example_module/__manifest__.py rename to tests/data/test_modules/example_module/__manifest__.py index 19a85f9..f2ebdbf 100644 --- a/tests/data/example_module/__manifest__.py +++ b/tests/data/test_modules/example_module/__manifest__.py @@ -1,10 +1,11 @@ { "name": "Example Module", - "type": ["extractor"], + "type": ["extractor", "feeder", "formatter", "storage", "enricher", "database"], "requires_setup": False, "dependencies": {"python": ["loguru"] }, "configs": { - "csv_file": {"default": "db.csv", "help": "CSV file name"} + "csv_file": 
{"default": "db.csv", "help": "CSV file name"}, + "required_field": {"required": True, "help": "required field in the CSV file"}, }, } \ No newline at end of file diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py new file mode 100644 index 0000000..7def054 --- /dev/null +++ b/tests/data/test_modules/example_module/example_module.py @@ -0,0 +1,28 @@ +from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata + +class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): + def download(self, item): + print("download") + + def __iter__(self): + yield Metadata().set_url("https://example.com") + + + def done(self, result): + print("done") + + def enrich(self, to_enrich): + print("enrich") + + def get_cdn_url(self, media): + return "nice_url" + + def save(self, item): + print("save") + + def uploadf(self, file, key, **kwargs): + print("uploadf") + + + def format(self, item): + print("format") diff --git a/tests/data/test_orchestration.yaml b/tests/data/test_orchestration.yaml new file mode 100644 index 0000000..ec6af35 --- /dev/null +++ b/tests/data/test_orchestration.yaml @@ -0,0 +1,16 @@ +steps: + feeders: + - example_module + extractors: + - example_module + formatters: + - example_module + storages: + - example_module + databases: + - example_module + enrichers: + - example_module + + +# Global configuration \ No newline at end of file diff --git a/tests/extractors/test_extractor_base.py b/tests/extractors/test_extractor_base.py index f6be70b..24689b4 100644 --- a/tests/extractors/test_extractor_base.py +++ b/tests/extractors/test_extractor_base.py @@ -9,7 +9,7 @@ class TestExtractorBase(object): config: dict = None @pytest.fixture(autouse=True) - def setup_archiver(self, setup_module): + def setup_extractor(self, setup_module): assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" assert 
self.config is not None, "self.config must be a dict set on the subclass" diff --git a/tests/extractors/test_instagram_extractor.py b/tests/extractors/test_instagram_extractor.py new file mode 100644 index 0000000..7efe1b1 --- /dev/null +++ b/tests/extractors/test_instagram_extractor.py @@ -0,0 +1,21 @@ +import pytest + +from auto_archiver.modules.instagram_extractor import InstagramExtractor +from .test_extractor_base import TestExtractorBase + +class TestInstagramExtractor(TestExtractorBase): + + extractor_module: str = 'instagram_extractor' + config: dict = {} + + @pytest.mark.parametrize("url", [ + "https://www.instagram.com/p/", + "https://www.instagram.com/p/1234567890/", + "https://www.instagram.com/reel/1234567890/", + "https://www.instagram.com/username/", + "https://www.instagram.com/username/stories/", + "https://www.instagram.com/username/highlights/", + ]) + def test_regex_matches(self, url): + # post + assert InstagramExtractor.valid_url.match(url) diff --git a/tests/test_modules.py b/tests/test_modules.py index decc616..a4c0ec8 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -7,7 +7,7 @@ def example_module(): import auto_archiver previous_path = auto_archiver.modules.__path__ - auto_archiver.modules.__path__.append("tests/data/") + auto_archiver.modules.__path__.append("tests/data/test_modules/") module = get_module_lazy("example_module") yield module diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py new file mode 100644 index 0000000..9e81df7 --- /dev/null +++ b/tests/test_orchestrator.py @@ -0,0 +1,123 @@ +import pytest +import sys +from argparse import ArgumentParser +from auto_archiver.core.orchestrator import ArchivingOrchestrator +from auto_archiver.version import __version__ +from auto_archiver.core.config import read_yaml, store_yaml + +TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" +TEST_MODULES = "tests/data/test_modules/" + +@pytest.fixture +def test_args(): + return ["--config", 
TEST_ORCHESTRATION, + "--module_paths", TEST_MODULES, + "--example_module.required_field", "some_value"] # just set this for normal testing, we will remove it later + +@pytest.fixture +def orchestrator(): + yield ArchivingOrchestrator() + # hack - the loguru logger starts with one logger, but if orchestrator has run before + # it'll remove the default logger, add it back in: + + from loguru import logger + + if not logger._core.handlers.get(0): + logger._core.handlers_count = 0 + logger.add(sys.stderr) + # and remove the custom logger + if logger._core.handlers.get(1): + logger.remove(1) + +@pytest.fixture +def basic_parser(orchestrator) -> ArgumentParser: + return orchestrator.setup_basic_parser() + +def test_setup_orchestrator(orchestrator): + assert orchestrator is not None + +def test_parse_config(): + pass + +def test_parse_basic(basic_parser): + args = basic_parser.parse_args(["--config", TEST_ORCHESTRATION]) + assert args.config_file == TEST_ORCHESTRATION + +@pytest.mark.parametrize("mode", ["simple", "full"]) +def test_mode(basic_parser, mode): + args = basic_parser.parse_args(["--mode", mode]) + assert args.mode == mode + +def test_mode_invalid(basic_parser, capsys): + with pytest.raises(SystemExit) as exit_error: + basic_parser.parse_args(["--mode", "invalid"]) + assert exit_error.value.code == 2 + assert "invalid choice" in capsys.readouterr().err + +def test_version(basic_parser, capsys): + with pytest.raises(SystemExit) as exit_error: + basic_parser.parse_args(["--version"]) + assert exit_error.value.code == 0 + assert capsys.readouterr().out == f"{__version__}\n" + +def test_help(orchestrator, basic_parser, capsys): + + args = basic_parser.parse_args(["--help"]) + assert args.help == True + + # test the show_help() on orchestrator + with pytest.raises(SystemExit) as exit_error: + orchestrator.show_help(args) + + assert exit_error.value.code == 0 + assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out + + 
+def test_add_custom_modules_path(orchestrator, test_args): + orchestrator.run(test_args) + + import auto_archiver + assert "tests/data/test_modules/" in auto_archiver.modules.__path__ + +def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args): + + orchestrator.run(test_args + # we still need to load the real path to get the example_module + ["--module_paths", "tests/data/invalid_test_modules/"]) + + # assert False + assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..." + + +def test_check_required_values(orchestrator, caplog, test_args): + # drop the example_module.required_field from the test_args + test_args = test_args[:-2] + + with pytest.raises(SystemExit) as exit_error: + orchestrator.run(test_args) + + assert caplog.records[1].message == "the following arguments are required: --example_module.required_field" + +def test_get_required_values_from_config(orchestrator, test_args, tmp_path): + + # load the default example yaml, add a required field, then run the orchestrator + test_yaml = read_yaml(TEST_ORCHESTRATION) + test_yaml['example_module'] = {'required_field': 'some_value'} + # write it to a temp file + tmp_file = (tmp_path / "temp_config.yaml").as_posix() + store_yaml(test_yaml, tmp_file) + + # run the orchestrator + orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES]) + + # should run OK, since there are no missing required fields + + # basic_args = basic_parser.parse_known_args(test_args) + # test_yaml = read_yaml(TEST_ORCHESTRATION) + # test_yaml['example_module'] = {'required_field': 'some_value'} + + # # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True + # # load the module first + # m = get_module_lazy("example_module") + + # orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[]) + # assert orchestrator.config is not None \ No newline at end of file From fade68c6f48bcc6cc69c6dcf05e4b398e5439dd0 Mon 
Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 13:45:24 +0100 Subject: [PATCH 047/110] Fix up unit tests - dataclass + subclasses not having @dataclass was breaking it --- src/auto_archiver/core/extractor.py | 1 - .../modules/twitter_api_extractor/twitter_api_extractor.py | 5 +++-- tests/extractors/test_extractor_base.py | 5 ++++- tests/test_orchestrator.py | 5 +++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 51d784f..ed261eb 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -21,7 +21,6 @@ import re from ..core import Metadata, ArchivingContext, BaseModule -@dataclass class Extractor(BaseModule): """ Base class for implementing extractors in the media archiving framework. diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 0434190..6573475 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -9,10 +9,11 @@ from pytwitter import Api from slugify import slugify from auto_archiver.core import Extractor -from auto_archiver.core import Metadata,Media +from auto_archiver.core import Metadata, Media class TwitterApiExtractor(Extractor): - valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") + + valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def setup(self, config: dict) -> None: super().setup(config) diff --git a/tests/extractors/test_extractor_base.py b/tests/extractors/test_extractor_base.py index 24689b4..6e77ec3 100644 --- a/tests/extractors/test_extractor_base.py +++ b/tests/extractors/test_extractor_base.py @@ -1,8 +1,11 @@ +from typing import Type + import pytest from auto_archiver.core.metadata 
import Metadata from auto_archiver.core.extractor import Extractor + class TestExtractorBase(object): extractor_module: str = None @@ -13,7 +16,7 @@ class TestExtractorBase(object): assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.extractor: Extractor = setup_module(self.extractor_module, self.config) + self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""): assert test_response is not False diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 9e81df7..03cb521 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.version import __version__ from auto_archiver.core.config import read_yaml, store_yaml +from auto_archiver.core.module import _LAZY_LOADED_MODULES TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_MODULES = "tests/data/test_modules/" @@ -29,6 +30,10 @@ def orchestrator(): if logger._core.handlers.get(1): logger.remove(1) + # delete out any loaded modules + _LAZY_LOADED_MODULES.clear() + + @pytest.fixture def basic_parser(orchestrator) -> ArgumentParser: return orchestrator.setup_basic_parser() From 527438826c65cf5340b1d3560e1f001b77017324 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 30 Jan 2025 13:04:51 +0000 Subject: [PATCH 048/110] Fix manifests for required configs. 
--- .../modules/api_db/__manifest__.py | 1 - .../modules/atlos_feeder/__manifest__.py | 1 - .../modules/gdrive_storage/__manifest__.py | 3 +- .../instagram_api_extractor/__manifest__.py | 3 +- .../instagram_extractor/__manifest__.py | 4 +- .../modules/vk_extractor/__manifest__.py | 6 +- .../modules/wayback_enricher/__init__.py | 1 - .../modules/wayback_enricher/__manifest__.py | 30 ---------- .../wayback_extractor_enricher/__init__.py | 1 + .../__manifest__.py | 56 +++++++++++++++++++ .../wayback_extractor_enricher.py} | 0 11 files changed, 62 insertions(+), 44 deletions(-) delete mode 100644 src/auto_archiver/modules/wayback_enricher/__init__.py delete mode 100644 src/auto_archiver/modules/wayback_enricher/__manifest__.py create mode 100644 src/auto_archiver/modules/wayback_extractor_enricher/__init__.py create mode 100644 src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py rename src/auto_archiver/modules/{wayback_enricher/wayback_enricher.py => wayback_extractor_enricher/wayback_extractor_enricher.py} (100%) diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 3874496..698c2e4 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -8,7 +8,6 @@ }, "configs": { "api_endpoint": { - "default": None, "required": True, "help": "API endpoint where calls are made to", }, diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index 5ae3540..d59f420 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -7,7 +7,6 @@ }, "configs": { "api_token": { - "default": None, "type": "str", "required": True, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index 2ca7e27..632e52b 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -22,8 +22,7 @@ "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", "choices": ["random", "static"], }, - "root_folder_id": {"default": None, - # "required": True, + "root_folder_id": {"required": True, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, diff --git a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index a958a99..2d8f1d9 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -12,8 +12,7 @@ "configs": { "access_token": {"default": None, "help": "a valid instagrapi-api token"}, - "api_endpoint": {"default": None, - # "required": True, + "api_endpoint": {"required": True, "help": "API endpoint to use"}, "full_profile": { "default": False, diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index d8e4a9b..05cae19 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -9,11 +9,9 @@ }, "requires_setup": True, 
"configs": { - "username": {"default": None, - "required": True, + "username": {"required": True, "help": "a valid Instagram username"}, "password": { - "default": None, "required": True, "help": "the corresponding Instagram account password", }, diff --git a/src/auto_archiver/modules/vk_extractor/__manifest__.py b/src/auto_archiver/modules/vk_extractor/__manifest__.py index 033fe50..61e454e 100644 --- a/src/auto_archiver/modules/vk_extractor/__manifest__.py +++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py @@ -7,11 +7,9 @@ "python": ["loguru", "vk_url_scraper"], }, "configs": { - "username": {"default": None, - "required": True, + "username": {"required": True, "help": "valid VKontakte username"}, - "password": {"default": None, - "required": True, + "password": {"required": True, "help": "valid VKontakte password"}, "session_file": { "default": "secrets/vk_config.v2.json", diff --git a/src/auto_archiver/modules/wayback_enricher/__init__.py b/src/auto_archiver/modules/wayback_enricher/__init__.py deleted file mode 100644 index 9782831..0000000 --- a/src/auto_archiver/modules/wayback_enricher/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .wayback_enricher import WaybackExtractorEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/wayback_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_enricher/__manifest__.py deleted file mode 100644 index 5d1fe25..0000000 --- a/src/auto_archiver/modules/wayback_enricher/__manifest__.py +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "Wayback Machine Enricher", - "type": ["enricher", "archiver"], - "requires_setup": True, - "dependencies": { - "python": ["loguru", "requests"], - }, - "entry_point": "wayback_enricher::WaybackExtractorEnricher", - "configs": { - "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, - "if_not_archived_within": 
{"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, - "key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, - "secret": {"default": None, "required": True, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}, - "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, - "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, - }, - "description": """ - Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL. - - ### Features - - Archives URLs using the Internet Archive's Wayback Machine API. - - Supports conditional archiving based on the existence of prior archives within a specified time range. - - Provides proxies for HTTP and HTTPS requests. - - Fetches and confirms the archive URL or provides a job ID for later status checks. - - ### Notes - - Requires a valid Wayback Machine API key and secret. - - Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff. 
- """ -} diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/__init__.py b/src/auto_archiver/modules/wayback_extractor_enricher/__init__.py new file mode 100644 index 0000000..b69332d --- /dev/null +++ b/src/auto_archiver/modules/wayback_extractor_enricher/__init__.py @@ -0,0 +1 @@ +from .wayback_extractor_enricher import WaybackExtractorEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py new file mode 100644 index 0000000..baecc14 --- /dev/null +++ b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py @@ -0,0 +1,56 @@ +{ + "name": "Wayback Machine Enricher", + "type": ["enricher", "archiver"], + "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher", + "requires_setup": True, + "dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "timeout": { + "default": 15, + "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.", + }, + "if_not_archived_within": { + "default": None, + "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA", + }, + "key": { + "required": True, + "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php", + }, + "secret": { + "required": True, + "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php", + }, + "proxy_http": { + "default": None, + "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port", + }, + "proxy_https": { + "default": None, + "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port", + }, + }, + "description": """ + Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL. + + ### Features + - Archives URLs using the Internet Archive's Wayback Machine API. + - Supports conditional archiving based on the existence of prior archives within a specified time range. + - Provides proxies for HTTP and HTTPS requests. + - Fetches and confirms the archive URL or provides a job ID for later status checks. + + ### Notes + - Requires a valid Wayback Machine API key and secret. + - Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff. + + ### Steps to Get a Wayback API Key: + - Sign up for an account at [Internet Archive](https://archive.org/account/signup). + - Log in to your account. + - Navigate to your [account settings](https://archive.org/account). + - or: https://archive.org/developers/tutorial-get-ia-credentials.html + - Under Wayback Machine API Keys, generate a new key. + - Note down your API key and secret, as they will be required for authentication.
+ """, +} diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py similarity index 100% rename from src/auto_archiver/modules/wayback_enricher/wayback_enricher.py rename to src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py From 953011f36851b887715788679e66c052484d573f Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 14:39:52 +0100 Subject: [PATCH 049/110] Don't make modules 'dataclasses' --- src/auto_archiver/core/database.py | 5 +---- src/auto_archiver/core/enricher.py | 4 +--- src/auto_archiver/core/feeder.py | 3 --- src/auto_archiver/core/formatter.py | 2 -- src/auto_archiver/core/storage.py | 8 +++----- .../modules/html_formatter/html_formatter.py | 2 -- .../modules/mute_formatter/mute_formatter.py | 2 -- 7 files changed, 5 insertions(+), 21 deletions(-) diff --git a/src/auto_archiver/core/database.py b/src/auto_archiver/core/database.py index f7deaef..0eb5d81 100644 --- a/src/auto_archiver/core/database.py +++ b/src/auto_archiver/core/database.py @@ -1,12 +1,9 @@ from __future__ import annotations -from dataclasses import dataclass -from abc import abstractmethod, ABC +from abc import abstractmethod from typing import Union from auto_archiver.core import Metadata, BaseModule - -@dataclass class Database(BaseModule): def started(self, item: Metadata) -> None: diff --git a/src/auto_archiver/core/enricher.py b/src/auto_archiver/core/enricher.py index fe0d05f..0e50fa9 100644 --- a/src/auto_archiver/core/enricher.py +++ b/src/auto_archiver/core/enricher.py @@ -9,11 +9,9 @@ the archiving step and before storage or formatting. Enrichers are optional but highly useful for making the archived data more powerful. 
""" from __future__ import annotations -from dataclasses import dataclass -from abc import abstractmethod, ABC +from abc import abstractmethod from auto_archiver.core import Metadata, BaseModule -@dataclass class Enricher(BaseModule): """Base classes and utilities for enrichers in the Auto-Archiver system.""" diff --git a/src/auto_archiver/core/feeder.py b/src/auto_archiver/core/feeder.py index e539f5f..352cfd9 100644 --- a/src/auto_archiver/core/feeder.py +++ b/src/auto_archiver/core/feeder.py @@ -1,11 +1,8 @@ from __future__ import annotations -from dataclasses import dataclass from abc import abstractmethod from auto_archiver.core import Metadata from auto_archiver.core import BaseModule - -@dataclass class Feeder(BaseModule): @abstractmethod diff --git a/src/auto_archiver/core/formatter.py b/src/auto_archiver/core/formatter.py index beb0c0d..cf27cb3 100644 --- a/src/auto_archiver/core/formatter.py +++ b/src/auto_archiver/core/formatter.py @@ -1,10 +1,8 @@ from __future__ import annotations -from dataclasses import dataclass from abc import abstractmethod from auto_archiver.core import Metadata, Media, BaseModule -@dataclass class Formatter(BaseModule): @abstractmethod diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 5274204..b40c5cc 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -1,18 +1,16 @@ from __future__ import annotations from abc import abstractmethod -from dataclasses import dataclass from typing import IO, Optional import os +from loguru import logger +from slugify import slugify + from auto_archiver.utils.misc import random_str from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher -from loguru import logger -from slugify import slugify - -@dataclass class Storage(BaseModule): def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None: diff --git 
a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 8f006e0..bfc2efa 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -1,5 +1,4 @@ from __future__ import annotations -from dataclasses import dataclass import mimetypes, os, pathlib from jinja2 import Environment, FileSystemLoader from urllib.parse import quote @@ -14,7 +13,6 @@ from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str from auto_archiver.core.module import get_module -@dataclass class HtmlFormatter(Formatter): environment: Environment = None template: any = None diff --git a/src/auto_archiver/modules/mute_formatter/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py index 1c7cca2..129ddcb 100644 --- a/src/auto_archiver/modules/mute_formatter/mute_formatter.py +++ b/src/auto_archiver/modules/mute_formatter/mute_formatter.py @@ -1,11 +1,9 @@ from __future__ import annotations -from dataclasses import dataclass from auto_archiver.core import Metadata, Media from auto_archiver.core import Formatter -@dataclass class MuteFormatter(Formatter): def format(self, item: Metadata) -> Media: return None From d6b4b7a932b7c8840265890583b79dc7e5038b47 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 16:43:09 +0100 Subject: [PATCH 050/110] Further cleanup * Removes (partly) the ArchivingOrchestrator * Removes the cli_feeder module, and makes it the 'default', allowing you to pass URLs directly on the command line, without having to use the cumbersome --cli_feeder.urls. 
Just do auto-archiver https://my.url.com * More unit tests * Improved error handling --- src/auto_archiver/__main__.py | 2 +- src/auto_archiver/core/base_module.py | 100 +++++++++ src/auto_archiver/core/config.py | 33 ++- src/auto_archiver/core/context.py | 10 +- src/auto_archiver/core/extractor.py | 3 +- src/auto_archiver/core/module.py | 54 +---- src/auto_archiver/core/orchestrator.py | 200 ++++++++++++++---- .../enrichers/screenshot_enricher.py | 40 ++++ src/auto_archiver/feeders/csv_feeder.py | 38 ++++ .../modules/atlos_feeder/atlos_feeder.py | 2 - .../modules/cli_feeder/__init__.py | 1 - .../modules/cli_feeder/__manifest__.py | 27 --- .../modules/cli_feeder/cli_feeder.py | 15 -- .../modules/csv_feeder/__manifest__.py | 1 - .../modules/csv_feeder/csv_feeder.py | 4 +- .../generic_extractor/generic_extractor.py | 6 +- .../modules/html_formatter/html_formatter.py | 4 +- .../screenshot_enricher.py | 6 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../telethon_extractor/telethon_extractor.py | 4 +- .../thumbnail_enricher/thumbnail_enricher.py | 2 +- .../timestamping_enricher.py | 8 +- .../modules/vk_extractor/vk_extractor.py | 4 +- .../modules/wacz_enricher/wacz_enricher.py | 6 +- tests/__init__.py | 3 +- tests/conftest.py | 6 + tests/test_orchestrator.py | 27 ++- 27 files changed, 417 insertions(+), 191 deletions(-) create mode 100644 src/auto_archiver/core/base_module.py create mode 100644 src/auto_archiver/enrichers/screenshot_enricher.py create mode 100644 src/auto_archiver/feeders/csv_feeder.py delete mode 100644 src/auto_archiver/modules/cli_feeder/__init__.py delete mode 100644 src/auto_archiver/modules/cli_feeder/__manifest__.py delete mode 100644 src/auto_archiver/modules/cli_feeder/cli_feeder.py diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 0e2f54f..0023a59 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator 
class BaseModule(ABC):

    """
    Base module class. All modules should inherit from this class.

    The exact methods a class implements will depend on the type of module it is,
    however all modules have a .setup(config: dict) method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
    default manifest structure.

    """

    MODULE_TYPES = [
        'feeder',
        'extractor',
        'enricher',
        'database',
        'storage',
        'formatter'
    ]

    _DEFAULT_MANIFEST = {
        'name': '',  # the display name of the module
        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
        'description': '',  # a description of the module
        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
        'version': '1.0',  # the version of the module
        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
    }

    config: Mapping[str, Any]
    authentication: Mapping[str, Mapping[str, str]]
    name: str

    # Set by the orchestrator prior to archiving. The orchestrator assigns the
    # *path* of a TemporaryDirectory (its `.name`), so this is a string.
    tmp_dir: str = None

    def setup(self, config: dict):
        """
        Store a private copy of `config` on the module and expose its own options as attributes.

        - `self.config` is a deep copy of `config` without the 'authentication' entry.
        - `self.authentication` maps one site name to its authentication dict; keys that
          list several comma separated sites (e.g. 'x.com,twitter.com') are expanded so
          that every site gets its own entry.
        - every key/value under `config[self.name]` is set as an attribute on the instance.

        The `config` object passed in is never modified.
        """
        # This is important: each instance works on its own deep copy, so modules cannot
        # change values that affect other modules. Copy BEFORE touching anything.
        config = deepcopy(config)
        raw_authentication = config.pop('authentication', None) or {}

        # expand concatenated sites ('a.com, b.com') into one entry per site
        authentication = {}
        for key, val in raw_authentication.items():
            for site in str(key).split(","):
                site = site.strip()
                if site:
                    authentication[site] = val

        self.authentication = authentication
        self.config = config
        for key, val in (config.get(self.name) or {}).items():
            setattr(self, key, val)

    def repr(self):
        """
        Return a short human readable description of the module and its own config section.

        NOTE(review): this is a plain method called `repr`, not `__repr__` - confirm that is intended.
        """
        display_name = getattr(self, 'display_name', self.name)
        # .get(): a module without a config section must not raise KeyError here
        return f"Module<'{display_name}' (config: {self.config.get(self.name, {})})>"

    def auth_for_site(self, site: str) -> dict:
        """
        Return the authentication dict configured for `site`, or {} when there is none.

        `site` may be a full URL ('https://x.com/user') or a bare domain ('x.com').
        Only exact matches on the domain (or its 'www.' variant) are returned.
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now, just hard code those.

        # SECURITY: parse the domain using urllib
        domain = urlparse(site).netloc
        if not domain:
            # urlparse only fills netloc when the input contains '//': a bare 'x.com'
            # ends up in .path, so re-parse it as a network location
            domain = urlparse(f"//{site}").netloc
        if not domain:
            # nothing usable to look up; an empty string would also "fuzzy match" every key below
            return {}

        # add the 'www' version of the site to the list of sites to check
        for to_try in [domain, f"www.{domain}"]:
            if to_try in self.authentication:
                return self.authentication[to_try]

        # do a fuzzy string match just to print a warning - don't use it since it's insecure
        for key in self.authentication.keys():
            if key in domain or domain in key:
                logger.warning(
                    f"Could not find exact authentication information for site '{domain}'. "
                    f"did find information for '{key}' which is close, is this what you meant? "
                    "If so, edit your authentication settings to make sure it exactly matches."
                )

        return {}
# TODO: make this tidier/find a way to notify of which keys should not be stored
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
    """
    Write `config` to `yaml_filename` as YAML (UTF-8), leaving out the 'urls' entry.

    The URLs are per-invocation input and must not be persisted to the configuration
    file. `config` itself is left untouched: the key is removed from a deep copy, so
    callers can still read config['urls'] after the file has been stored.
    """
    config_to_save = deepcopy(config)
    # pop from the copy, not from the live config that is still in use by the caller
    config_to_save.pop('urls', None)
    with open(yaml_filename, "w", encoding="utf-8") as outf:
        yaml.dump(config_to_save, outf)
dataclasses import dataclass import mimetypes import os import mimetypes - import requests from loguru import logger from retrying import retry @@ -71,7 +70,7 @@ class Extractor(BaseModule): to_filename = url.split('/')[-1].split('?')[0] if len(to_filename) > 64: to_filename = to_filename[-64:] - to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename) + to_filename = os.path.join(self.tmp_dir, to_filename) if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 4542b88..501f238 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -7,7 +7,6 @@ from __future__ import annotations from dataclasses import dataclass from typing import List -from abc import ABC import shutil import ast import copy @@ -17,63 +16,12 @@ import os from os.path import join, dirname from loguru import logger import auto_archiver +from .base_module import BaseModule _LAZY_LOADED_MODULES = {} MANIFEST_FILE = "__manifest__.py" -class BaseModule(ABC): - - """ - Base module class. All modules should inherit from this class. - - The exact methods a class implements will depend on the type of module it is, - however all modules have a .setup(config: dict) method to run any setup code - (e.g. logging in to a site, spinning up a browser etc.) - - See BaseModule.MODULE_TYPES for the types of modules you can create, noting that - a subclass can be of multiple types. For example, a module that extracts data from - a website and stores it in a database would be both an 'extractor' and a 'database' module. - - Each module is a python package, and should have a __manifest__.py file in the - same directory as the module file. 
The __manifest__.py specifies the module information - like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the - default manifest structure. - - """ - - MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' - ] - - _DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName - 'version': '1.0', # the version of the module - 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line -} - - config: dict - name: str - - def setup(self, config: dict): - self.config = config - for key, val in config.get(self.name, {}).items(): - setattr(self, key, val) - - def repr(self): - return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" - def setup_paths(paths: list[str]) -> None: """ diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ba46492..ad11849 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,12 +5,15 @@ """ from __future__ import annotations -from typing import Generator, Union, List +from typing import Generator, Union, List, Type from urllib.parse import urlparse from ipaddress import ip_address import argparse import os import sys +import json +from 
class JsonParseAction(argparse.Action):
    """argparse action that parses the option value as JSON and stores the result."""

    def __call__(self, parser, namespace, values, option_string=None):
        try:
            setattr(namespace, self.dest, json.loads(values))
        except json.JSONDecodeError as e:
            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") from e


class AuthenticationJsonParseAction(JsonParseAction):
    """
    argparse action for the authentication option.

    Accepts either an inline JSON object, or the path to a JSON/YAML file that
    contains one. The result must be a dictionary mapping site names (str) to
    dictionaries with the authentication details for that site.

    Raises argparse.ArgumentTypeError when the value is neither valid JSON nor a
    readable file, or when the parsed value does not have the expected shape.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        try:
            auth_dict = json.loads(values)
        except json.JSONDecodeError:
            # not inline JSON: a plain (unquoted) file path lands here
            auth_dict = values

        if isinstance(auth_dict, str):
            auth_dict = self._load_auth_file(auth_dict)

        if not isinstance(auth_dict, dict):
            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
        for site, auth in auth_dict.items():
            if not isinstance(site, str) or not isinstance(auth, dict):
                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
        setattr(namespace, self.dest, auth_dict)

    def _load_auth_file(self, path: str):
        """Read `path` and parse its content as JSON, falling back to YAML."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                # read once: both parsers get the full text, not an exhausted file handle
                content = f.read()
        except OSError as e:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON input for argument '{self.dest}': not valid JSON and not a readable file ({e})") from e

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            pass
        # maybe it's yaml, try that
        try:
            return yaml.load(content)
        except Exception as e:
            raise argparse.ArgumentTypeError(
                f"Could not parse authentication file '{path}' as JSON or YAML: {e}") from e
def __call__(self, parser, namespace, values, option_string=None): if not hasattr(namespace, self.dest): @@ -38,9 +70,7 @@ class UniqueAppendAction(argparse.Action): getattr(namespace, self.dest).append(value) class ArchivingOrchestrator: - - _do_not_store_keys = [] - + def setup_basic_parser(self): parser = argparse.ArgumentParser( prog="auto-archiver", @@ -52,7 +82,7 @@ class ArchivingOrchestrator: epilog="Check the code at https://github.com/bellingcat/auto-archiver", formatter_class=RichHelpFormatter, ) - parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit') + parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit') parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') @@ -80,7 +110,6 @@ class ArchivingOrchestrator: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? 
enabled_modules = [] - # first loads the modules from the config file, then from the command line for config in [yaml_config['steps'], basic_config.__dict__]: for module_type in BaseModule.MODULE_TYPES: @@ -120,7 +149,7 @@ class ArchivingOrchestrator: if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_yaml(self.config, basic_config.config_file, self._do_not_store_keys) + store_yaml(self.config, basic_config.config_file) return self.config @@ -128,18 +157,29 @@ class ArchivingOrchestrator: if not parser: parser = self.parser - parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction) + + # allow passing URLs directly on the command line + parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml') + + parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction) parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction) parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction) parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction) parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction) parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction) + parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \ + (token, username etc.) that extractors can use to log into \ + a website. 
If passing this on the command line, use a JSON string. \ + You may also pass a path to a valid JSON/YAML file which will be parsed.',\ + default={}, + action=AuthenticationJsonParseAction) # logging arguments parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO') parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) + def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: if not modules: @@ -147,6 +187,7 @@ class ArchivingOrchestrator: module: LazyBaseModule for module in modules: + if not module.configs: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) @@ -155,12 +196,6 @@ class ArchivingOrchestrator: group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") for name, kwargs in module.configs.items(): - # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set - # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something - do_not_store = kwargs.pop('do_not_store', False) - if do_not_store: - self._do_not_store_keys.append((module.name, name)) - if not kwargs.get('metavar', None): # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR] kwargs['metavar'] = name.upper() @@ -208,8 +243,7 @@ class ArchivingOrchestrator: step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] - assert modules_to_load, f"No {module_type}s were configured. 
Make sure to set at least one {module_type} \ - in your configuration file or on the command line (using --{module_type}s)" + assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" def check_steps_ok(): if not len(step_items): @@ -223,12 +257,37 @@ class ArchivingOrchestrator: exit() for module in modules_to_load: + if module == 'cli_feeder': + urls = self.config['urls'] + if not urls: + logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.") + self.basic_parser.print_help() + exit() + # cli_feeder is a pseudo module, it just takes the command line args + def feed(self) -> Generator[Metadata]: + for url in urls: + logger.debug(f"Processing URL: '{url}'") + yield Metadata().set_url(url) + ArchivingContext.set("folder", "cli") + + pseudo_module = type('CLIFeeder', (Feeder,), { + 'name': 'cli_feeder', + 'display_name': 'CLI Feeder', + '__iter__': feed + + })() + + + pseudo_module.__iter__ = feed + step_items.append(pseudo_module) + continue + if module in invalid_modules: continue try: loaded_module: BaseModule = get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: - logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") + logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") if module_type == 'extractor' and loaded_module.name == module: loaded_module.cleanup() exit() @@ -285,13 +344,18 @@ class ArchivingOrchestrator: def cleanup(self)->None: logger.info("Cleaning up") - for e in self.config['steps']['extractors']: + for e in self.extractors: e.cleanup() def feed(self) -> Generator[Metadata]: - for feeder in self.config['steps']['feeders']: + + url_count = 0 + for feeder in self.feeders: for item in feeder: yield self.feed_item(item) + url_count += 1 + + logger.success(f"Processed {url_count} URL(s)") self.cleanup() def 
feed_item(self, item: Metadata) -> Metadata: @@ -300,22 +364,33 @@ class ArchivingOrchestrator: - catches keyboard interruptions to do a clean exit - catches any unexpected error, logs it, and does a clean exit """ + tmp_dir: TemporaryDirectory = None try: - ArchivingContext.reset() - with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - ArchivingContext.set_tmp_dir(tmp_dir) - return self.archive(item) + tmp_dir = TemporaryDirectory(dir="./") + # set tmp_dir on all modules + for m in self.all_modules: + m.tmp_dir = tmp_dir.name + return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit logger.warning(f"caught interrupt on {item=}") - for d in self.config['steps']['databases']: d.aborted(item) + for d in self.databases: + d.aborted(item) self.cleanup() exit() except Exception as e: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') - for d in self.config['steps']['databases']: - if type(e) == AssertionError: d.failed(item, str(e)) - else: d.failed(item, reason="unexpected error") + for d in self.databases: + if type(e) == AssertionError: + d.failed(item, str(e)) + else: + d.failed(item, reason="unexpected error") + finally: + if tmp_dir: + # remove the tmp_dir from all modules + for m in self.all_modules: + m.tmp_dir = None + tmp_dir.cleanup() def archive(self, result: Metadata) -> Union[Metadata, None]: @@ -328,31 +403,38 @@ class ArchivingOrchestrator: 5. Store all downloaded/generated media 6. 
Call selected Formatter and store formatted if needed """ + original_url = result.get_url().strip() - self.assert_valid_url(original_url) + try: + self.assert_valid_url(original_url) + except AssertionError as e: + logger.error(f"Error archiving URL {original_url}: {e}") + raise e # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs url = original_url - for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url) + for a in self.extractors: + url = a.sanitize_url(url) + result.set_url(url) if original_url != url: result.set("original_url", original_url) # 2 - notify start to DBs, propagate already archived if feature enabled in DBs cached_result = None - for d in self.config["steps"]["databases"]: + for d in self.databases: d.started(result) if (local_result := d.fetch(result)): cached_result = (cached_result or Metadata()).merge(local_result) if cached_result: logger.debug("Found previously archived entry") - for d in self.config["steps"]["databases"]: + for d in self.databases: try: d.done(cached_result, cached=True) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") return cached_result # 3 - call extractors until one succeeds - for a in self.config["steps"]["extractors"]: + for a in self.extractors: logger.info(f"Trying extractor {a.name} for {url}") try: result.merge(a.download(result)) @@ -361,7 +443,7 @@ class ArchivingOrchestrator: logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content - for e in self.config["steps"]["enrichers"]: + for e in self.enrichers: try: e.enrich(result) except Exception as exc: logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") @@ -370,7 +452,7 @@ class ArchivingOrchestrator: result.store() # 6 - format and store formatted if needed - if final_media := self.config["steps"]["formatters"][0].format(result): + if final_media := self.formatters[0].format(result): 
final_media.store(url=url, metadata=result) result.set_final_media(final_media) @@ -378,7 +460,7 @@ class ArchivingOrchestrator: result.status = "nothing archived" # signal completion to databases and archivers - for d in self.config["steps"]["databases"]: + for d in self.databases: try: d.done(result) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") @@ -403,4 +485,44 @@ class ArchivingOrchestrator: assert ip.is_global, f"Invalid IP used" assert not ip.is_reserved, f"Invalid IP used" assert not ip.is_link_local, f"Invalid IP used" - assert not ip.is_private, f"Invalid IP used" \ No newline at end of file + assert not ip.is_private, f"Invalid IP used" + + + # Helper Properties + + @property + def feeders(self) -> List[Type[Feeder]]: + return self._get_property('feeders') + + @property + def extractors(self) -> List[Type[Extractor]]: + return self._get_property('extractors') + + @property + def enrichers(self) -> List[Type[Enricher]]: + return self._get_property('enrichers') + + @property + def databases(self) -> List[Type[Database]]: + return self._get_property('databases') + + @property + def storages(self) -> List[Type[Storage]]: + return self._get_property('storages') + + @property + def formatters(self) -> List[Type[Formatter]]: + return self._get_property('formatters') + + @property + def all_modules(self) -> List[Type[BaseModule]]: + return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters + + def _get_property(self, prop): + try: + f = self.config['steps'][prop] + if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)): + raise TypeError + return f + except: + exit("Property called prior to full initialisation") \ No newline at end of file diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py new file mode 100644 index 0000000..0d05d92 --- /dev/null +++ 
class ScreenshotEnricher(Enricher):
    """Enricher that opens the item's URL in a webdriver and attaches a PNG screenshot of it."""

    name = "screenshot_enricher"

    @staticmethod
    def configs() -> dict:
        """Return this enricher's options, each with its default value and help text."""
        options = {}
        options["width"] = {"default": 1280, "help": "width of the screenshots"}
        options["height"] = {"default": 720, "help": "height of the screenshots"}
        options["timeout"] = {"default": 60, "help": "timeout for taking the screenshot"}
        options["sleep_before_screenshot"] = {
            "default": 4,
            "help": "seconds to wait for the pages to load before taking screenshot",
        }
        options["http_proxy"] = {
            "default": "",
            "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
        }
        return options

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Take a screenshot of the URL of `to_enrich` and add it as media with id "screenshot".

        URLs that UrlUtil reports as being behind an auth wall are skipped. Errors raised
        while loading the page or saving the screenshot are logged, not propagated.
        """
        # NOTE: the local must stay named `url`, the `{url=}` log messages print the name
        url = to_enrich.get_url()

        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return

        logger.debug(f"Enriching screenshot for {url=}")
        mentions_facebook = 'facebook.com' in url
        webdriver = Webdriver(self.width, self.height, self.timeout, mentions_facebook, http_proxy=self.http_proxy)
        with webdriver as driver:
            try:
                driver.get(url)
                # give the page time to render before capturing it
                wait_seconds = int(self.sleep_before_screenshot)
                time.sleep(wait_seconds)
                png_name = f"screenshot_{random_str(8)}.png"
                screenshot_file = os.path.join(self.tmp_dir, png_name)
                driver.save_screenshot(screenshot_file)
                screenshot = Media(filename=screenshot_file)
                to_enrich.add_media(screenshot, id="screenshot")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as err:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {err}")
import Feeder +from ..core import Metadata, ArchivingContext +from ..utils import url_or_none + +class CSVFeeder(Feeder): + + @staticmethod + def configs() -> dict: + return { + "files": { + "default": None, + "help": "Path to the input file(s) to read the URLs from, comma separated. \ + Input files should be formatted with one URL per line", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + "column": { + "default": None, + "help": "Column number or name to read the URLs from, 0-indexed", + } + } + + + def __iter__(self) -> Metadata: + url_column = self.column or 0 + for file in self.files: + with open(file, "r") as f: + reader = csv.reader(f) + first_row = next(reader) + if not(url_or_none(first_row[url_column])): + # it's a header row, skip it + for row in reader: + url = row[0] + logger.debug(f"Processing {url}") + yield Metadata().set_url(url) + ArchivingContext.set("folder", "cli") \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index bbf06f6..8c8f9cb 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -40,5 +40,3 @@ class AtlosFeeder(Feeder): if len(data["results"]) == 0 or cursor is None: break - - logger.success(f"Processed {count} URL(s)") diff --git a/src/auto_archiver/modules/cli_feeder/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py deleted file mode 100644 index 9c85787..0000000 --- a/src/auto_archiver/modules/cli_feeder/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .cli_feeder import CLIFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py deleted file mode 100644 index cf5c1b7..0000000 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "CLI Feeder", - "type": ["feeder"], - 
"requires_setup": False, - "dependencies": { - "python": ["loguru"], - }, - 'entry_point': 'cli_feeder::CLIFeeder', - "configs": { - "urls": { - "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "nargs": "+", - "required": True, - "do_not_store": True, - "metavar": "INPUT URLS", - }, - }, - "description": """ - Processes URLs to archive passed via the command line and feeds them into the archiving pipeline. - - ### Features - - Takes a single URL or a list of URLs provided via the command line. - - Converts each URL into a `Metadata` object and yields it for processing. - - Ensures URLs are processed only if they are explicitly provided. - - """ -} diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py deleted file mode 100644 index 62cb659..0000000 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ /dev/null @@ -1,15 +0,0 @@ -from loguru import logger - -from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext - - -class CLIFeeder(Feeder): - - def __iter__(self) -> Metadata: - for url in self.urls: - logger.debug(f"Processing URL: '{url}'") - yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") - - logger.success(f"Processed {len(self.urls)} URL(s)") diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index b062ee6..7249395 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -26,7 +26,6 @@ - Supports reading URLs from multiple input files, specified as a comma-separated list. - Allows specifying the column number or name to extract URLs from. - Skips header rows if the first value is not a valid URL. - - Integrates with the `ArchivingContext` to manage URL feeding. ### Setu N - Input files should be formatted with one URL per line. 
diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index ad0a035..1cd9022 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -20,6 +20,4 @@ class CSVFeeder(Feeder): url = row[0] logger.debug(f"Processing {url}") yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") - - logger.success(f"Processed {len(self.urls)} URL(s)") \ No newline at end of file + ArchivingContext.set("folder", "cli") \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index e643c21..2879c05 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -270,7 +270,11 @@ class GenericExtractor(Extractor): logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie - ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} + ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), + 'quiet': False, 'noplaylist': not self.allow_playlist , + 'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles, + "live_from_start": self.live_from_start, "proxy": self.proxy, + "max_downloads": self.max_downloads, "playlistend": self.max_downloads} if item.netloc in ['youtube.com', 'www.youtube.com']: if self.cookies_from_browser: diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index bfc2efa..4da82c8 100644 --- 
a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -7,7 +7,7 @@ import json import base64 from auto_archiver.version import __version__ -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.core import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str @@ -46,7 +46,7 @@ class HtmlFormatter(Formatter): version=__version__ ) - html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html") + html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html") with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) final_media = Media(filename=html_path, _mimetype="text/html") diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index be775ce..8e7639a 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException from auto_archiver.core import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str -from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): @@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher): try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") + screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") if self.save_to_pdf: - pdf_file = 
os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf") pdf = driver.print_page(driver.print_options) with open(pdf_file, "wb") as f: f.write(base64.b64decode(pdf)) diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 52237ee..76784fa 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -23,6 +23,6 @@ class SSLEnricher(Enricher): logger.debug(f"fetching SSL certificate for {domain=} in {url=}") cert = ssl.get_server_certificate((domain, 443)) - cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem") + cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem") with open(cert_fn, "w") as f: f.write(cert) to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 8a08954..3e952e8 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -9,7 +9,7 @@ from tqdm import tqdm import re, time, json, os from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str @@ -120,7 +120,7 @@ class TelethonArchiver(Extractor): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py 
b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index b27243b..429ba38 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher): logger.debug(f"generating thumbnails for {to_enrich.get_url()}") for m_id, m in enumerate(to_enrich.media[::]): if m.is_video(): - folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24)) + folder = os.path.join(self.tmp_dir, random_str(24)) os.makedirs(folder, exist_ok=True) logger.debug(f"generating thumbnails for {m.filename}") duration = m.get("duration") diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index a7a0aee..078c1ba 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -9,9 +9,7 @@ from asn1crypto import pem import certifi from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.core import Extractor - +from auto_archiver.core import Metadata, Media class TimestampingEnricher(Enricher): """ @@ -33,7 +31,7 @@ class TimestampingEnricher(Enricher): logger.warning(f"No hashes found in {url=}") return - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir hashes_fn = os.path.join(tmp_dir, "hashes.txt") data_to_sign = "\n".join(hashes) @@ -93,7 +91,7 @@ class TimestampingEnricher(Enricher): cert_chain = [] for cert in path: - cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt") + cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt") with open(cert_fn, "wb") as f: f.write(cert.dump()) cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"])) diff --git 
a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index 301fa89..2d09138 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media class VkExtractor(Extractor): @@ -35,7 +35,7 @@ class VkExtractor(Extractor): result.set_content(dump_payload(vk_scrapes)) - filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir()) + filenames = self.vks.download_media(vk_scrapes, self.tmp_dir) for filename in filenames: result.add_media(Media(filename)) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 8810b84..3f67b7c 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -5,7 +5,7 @@ from zipfile import ZipFile from loguru import logger from warcio.archiveiterator import ArchiveIterator -from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.core import Media, Metadata from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str @@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor): url = to_enrich.get_url() collection = random_str(8) - browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir()) + browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir) browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host cmd = [ @@ -154,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor): logger.info(f"WACZ extract_media or 
extract_screenshot flag is set, extracting media from {wacz_filename=}") # unzipping the .wacz - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir unzipped_dir = os.path.join(tmp_dir, "unzipped") with ZipFile(wacz_filename, 'r') as z_obj: z_obj.extractall(path=unzipped_dir) diff --git a/tests/__init__.py b/tests/__init__.py index 3d66aff..31f38cb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,5 +2,4 @@ import tempfile from auto_archiver.core.context import ArchivingContext -ArchivingContext.reset(full_reset=True) -ArchivingContext.set_tmp_dir(tempfile.gettempdir()) \ No newline at end of file +ArchivingContext.reset(full_reset=True) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index af0fd6d..3bd382b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ pytest conftest file, for shared fixtures and configuration """ +from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib import pytest @@ -25,8 +26,13 @@ def setup_module(request): m = get_module(module_name, {module_name: config}) + # add the tmp_dir to the module + tmp_dir = TemporaryDirectory() + m.tmp_dir = tmp_dir + def cleanup(): _LAZY_LOADED_MODULES.pop(module_name) + tmp_dir.cleanup() request.addfinalizer(cleanup) return m diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 03cb521..68417aa 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1,6 +1,6 @@ import pytest import sys -from argparse import ArgumentParser +from argparse import ArgumentParser, ArgumentTypeError from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.version import __version__ from auto_archiver.core.config import read_yaml, store_yaml @@ -113,16 +113,23 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path): # run the orchestrator orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES]) + assert 
orchestrator.config is not None - # should run OK, since there are no missing required fields +def test_load_authentication_string(orchestrator, test_args): - # basic_args = basic_parser.parse_known_args(test_args) - # test_yaml = read_yaml(TEST_ORCHESTRATION) - # test_yaml['example_module'] = {'required_field': 'some_value'} + orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}']) + assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}} - # # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True - # # load the module first - # m = get_module_lazy("example_module") +def test_load_authentication_string_concat_site(orchestrator, test_args): + + orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}']) + assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"}, + "twitter.com": {"api_key": "my_key"}} - # orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[]) - # assert orchestrator.config is not None \ No newline at end of file +def test_load_invalid_authentication_string(orchestrator, test_args): + with pytest.raises(ArgumentTypeError): + orchestrator.run(test_args + ["--authentication", "{\''invalid_json"]) + +def test_load_authentication_invalid_dict(orchestrator, test_args): + with pytest.raises(ArgumentTypeError): + orchestrator.run(test_args + ["--authentication", "[true, false]"]) \ No newline at end of file From d76063c3f3c287230b4fee06267c64699f6f802a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 16:46:53 +0100 Subject: [PATCH 051/110] Fix unit tests --- tests/conftest.py | 2 +- tests/test_modules.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3bd382b..f909bfb 100644 --- a/tests/conftest.py +++ 
b/tests/conftest.py @@ -28,7 +28,7 @@ def setup_module(request): # add the tmp_dir to the module tmp_dir = TemporaryDirectory() - m.tmp_dir = tmp_dir + m.tmp_dir = tmp_dir.name def cleanup(): _LAZY_LOADED_MODULES.pop(module_name) diff --git a/tests/test_modules.py b/tests/test_modules.py index a4c0ec8..854edb5 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -66,7 +66,7 @@ def test_load_module(example_module): # check that the vlaue is set on the module itself assert loaded_module.csv_file == "db.csv" -@pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) +@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_load_modules(module_name): # test that specific modules can be loaded module = get_module_lazy(module_name) @@ -84,7 +84,7 @@ def test_load_modules(module_name): assert loaded_module.name in loaded_module.config.keys() -@pytest.mark.parametrize("module_name", ["cli_feeder", "local_storage", "generic_extractor", "html_formatter", "csv_db"]) +@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_lazy_base_module(module_name): lazy_module = get_module_lazy(module_name) From c25d5cae84de7521aea517920463ab97aec8506d Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 30 Jan 2025 17:50:54 +0100 Subject: [PATCH 052/110] Remove ArchivingContext completely Context for a specific url/item is now passed around via the metadata (metadata.set_context('key', 'val') and metadata.get_context('key', default='something') The only other thing that was passed around in ArchivingContext was the storage info, which is already accessible now via self.config --- src/auto_archiver/core/__init__.py | 1 - src/auto_archiver/core/base_module.py | 12 ++-- src/auto_archiver/core/context.py | 56 ------------------- src/auto_archiver/core/extractor.py | 2 +- 
src/auto_archiver/core/media.py | 10 ++-- src/auto_archiver/core/metadata.py | 15 +++-- src/auto_archiver/core/module.py | 2 +- src/auto_archiver/core/orchestrator.py | 12 ++-- src/auto_archiver/core/storage.py | 23 ++++---- src/auto_archiver/feeders/csv_feeder.py | 5 +- .../modules/csv_feeder/csv_feeder.py | 5 +- .../generic_extractor/generic_extractor.py | 2 +- .../modules/gsheet_db/gsheet_db.py | 5 +- .../modules/gsheet_feeder/gsheet_feeder.py | 12 ++-- .../modules/hash_enricher/hash_enricher.py | 2 +- .../instagram_tbot_extractor.py | 4 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../whisper_enricher/whisper_enricher.py | 6 +- tests/__init__.py | 5 -- 19 files changed, 59 insertions(+), 122 deletions(-) delete mode 100644 src/auto_archiver/core/context.py diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 858bdfd..ae4c41c 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -4,7 +4,6 @@ from .metadata import Metadata from .media import Media from .module import BaseModule -from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index a9a904f..2c1e8a3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -56,6 +56,10 @@ class BaseModule(ABC): # this is set by the orchestrator prior to archiving tmp_dir: TemporaryDirectory = None + @property + def storages(self) -> list: + return self.config.get('storages', []) + def setup(self, config: dict): authentication = config.get('authentication', {}) @@ -75,9 +79,6 @@ class BaseModule(ABC): self.config = config for key, val in config.get(self.name, {}).items(): setattr(self, key, val) - - def repr(self): - return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" def auth_for_site(self, 
site: str) -> dict: # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com) @@ -97,4 +98,7 @@ class BaseModule(ABC): did find information for '{key}' which is close, is this what you meant? \ If so, edit your authentication settings to make sure it exactly matches.") - return {} \ No newline at end of file + return {} + + def repr(self): + return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" \ No newline at end of file diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py deleted file mode 100644 index 0db5359..0000000 --- a/src/auto_archiver/core/context.py +++ /dev/null @@ -1,56 +0,0 @@ -""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process. - -This singleton class allows for: -- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle. -- Marking certain values to persist across resets using `keep_on_reset`. -- Managing temporary directories and other shared data used during the archiving process. - -### Key Features: -- Creates a single global instance. -- Reset functionality allows for clearing configurations, with options for partial or full resets. -- Custom getters and setters for commonly used context values like temporary directories. - -""" - -class ArchivingContext: - """ - Singleton context class for managing global configurations and temporary data. 
- - ArchivingContext._get_instance() to retrieve it if needed - otherwise just - ArchivingContext.set(key, value) - and - ArchivingContext.get(key, default) - - When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True) - reset(full_reset=True) will recreate everything including the keep_on_reset status - """ - _instance = None - - def __init__(self): - self.configs = {} - self.keep_on_reset = set() - - @staticmethod - def get_instance(): - if ArchivingContext._instance is None: - ArchivingContext._instance = ArchivingContext() - return ArchivingContext._instance - - @staticmethod - def set(key, value, keep_on_reset: bool = False): - ac = ArchivingContext.get_instance() - ac.configs[key] = value - if keep_on_reset: ac.keep_on_reset.add(key) - - @staticmethod - def get(key: str, default=None): - return ArchivingContext.get_instance().configs.get(key, default) - - @staticmethod - def reset(full_reset: bool = False): - ac = ArchivingContext.get_instance() - if full_reset: ac.keep_on_reset = set() - ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset} - - # ---- custom getters/setters for widely used context values \ No newline at end of file diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index b0d80bc..98f1370 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -17,7 +17,7 @@ from loguru import logger from retrying import retry import re -from ..core import Metadata, ArchivingContext, BaseModule +from ..core import Metadata, BaseModule class Extractor(BaseModule): diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index e5026af..2cb6fc9 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,8 +11,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -from .context import ArchivingContext - from loguru import 
logger @@ -36,12 +34,11 @@ class Media: _mimetype: str = None # eg: image/jpeg _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude - def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None): + def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None: # 'Any' typing for metadata to avoid circular imports. Stores the media # into the provided/available storages [Storage] repeats the process for # its properties, in case they have inner media themselves for now it # only goes down 1 level but it's easy to make it recursive if needed. - storages = override_storages or ArchivingContext.get("storages") if not len(storages): logger.warning(f"No storages found in local context or provided directly for {self.filename}.") return @@ -66,8 +63,9 @@ class Media: for inner_media in prop_media.all_inner_media(include_self=True): yield inner_media - def is_stored(self) -> bool: - return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages")) + def is_stored(self, in_storage) -> bool: + # checks if the media is already stored in the given storage + return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u]) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 04683dd..d20ea5e 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt from loguru import logger from .media import Media -from .context import ArchivingContext - @dataclass_json # annotation order matters @dataclass @@ -32,6 +30,7 @@ class Metadata: def __post_init__(self): self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc)) + self._context = {} def merge(self: Metadata, right: Metadata, 
overwrite_left=True) -> Metadata: """ @@ -57,12 +56,11 @@ class Metadata: return right.merge(self) return self - def store(self: Metadata, override_storages: List = None): + def store(self, storages=[]): # calls .store for all contained media. storages [Storage] self.remove_duplicate_media_by_hash() - storages = override_storages or ArchivingContext.get("storages") for media in self.media: - media.store(override_storages=storages, url=self.get_url(), metadata=self) + media.store(url=self.get_url(), metadata=self, storages=storages) def set(self, key: str, val: Any) -> Metadata: self.metadata[key] = val @@ -206,3 +204,10 @@ class Metadata: if len(r.media) > len(most_complete.media): most_complete = r elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r return most_complete + + def set_context(self, key: str, val: Any) -> Metadata: + self._context[key] = val + return self + + def get_context(self, key: str, default: Any = None) -> Any: + return self._context.get(key, default) \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 501f238..dec67e1 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None: # sort based on the length of the path, so that the longest path is last in the list auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) - def get_module(module_name: str, config: dict) -> BaseModule: """ Gets and sets up a module using the provided config @@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa return module def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + # search through all valid 'modules' paths. 
Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ad11849..f046bfe 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -17,9 +17,8 @@ import traceback from rich_argparse import RichHelpFormatter -from .context import ArchivingContext -from .metadata import Metadata +from .metadata import Metadata, Media from ..version import __version__ from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths @@ -268,7 +267,6 @@ class ArchivingOrchestrator: for url in urls: logger.debug(f"Processing URL: '{url}'") yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") pseudo_module = type('CLIFeeder', (Feeder,), { 'name': 'cli_feeder', @@ -297,9 +295,6 @@ class ArchivingOrchestrator: continue if loaded_module: step_items.append(loaded_module) - # TODO temp solution - if module_type == "storage": - ArchivingContext.set("storages", step_items, keep_on_reset=True) check_steps_ok() self.config['steps'][f"{module_type}s"] = step_items @@ -449,11 +444,12 @@ class ArchivingOrchestrator: logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") # 5 - store all downloaded/generated media - result.store() + result.store(storages=self.storages) # 6 - format and store formatted if needed + final_media: Media if final_media := self.formatters[0].format(result): - final_media.store(url=url, metadata=result) + final_media.store(url=url, metadata=result, storages=self.storages) result.set_final_media(final_media) if result.is_empty(): diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index b40c5cc..9373ff9 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -8,16 +8,16 @@ from 
slugify import slugify from auto_archiver.utils.misc import random_str -from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata +from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher - +from auto_archiver.core.module import get_module class Storage(BaseModule): - def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None: - if media.is_stored(): + def store(self, media: Media, url: str, metadata: Metadata=None) -> None: + if media.is_stored(in_storage=self): logger.debug(f"{media.key} already stored, skipping") return - self.set_key(media, url) + self.set_key(media, url, metadata) self.upload(media, metadata=metadata) media.add_url(self.get_cdn_url(media)) @@ -32,30 +32,31 @@ class Storage(BaseModule): with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) - def set_key(self, media: Media, url) -> None: + def set_key(self, media: Media, url, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" if media.key is not None and len(media.key) > 0: return - folder = ArchivingContext.get("folder", "") + folder = metadata.folder filename, ext = os.path.splitext(media.filename) # Handle path_generator logic - path_generator = ArchivingContext.get("path_generator", "url") + path_generator = self.config.get("path_generator", "url") if path_generator == "flat": path = "" filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": path = slugify(url) elif path_generator == "random": - path = ArchivingContext.get("random_path", random_str(24), True) + path = self.config.get("random_path", random_str(24), True) else: raise ValueError(f"Invalid path_generator: {path_generator}") # Handle filename_generator logic - filename_generator = ArchivingContext.get("filename_generator", "random") + filename_generator = self.config.get("filename_generator", "random") if 
filename_generator == "random": filename = random_str(24) elif filename_generator == "static": - he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) + # load the hash_enricher module + he = get_module(HashEnricher, self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py index e9da518..b1aedb7 100644 --- a/src/auto_archiver/feeders/csv_feeder.py +++ b/src/auto_archiver/feeders/csv_feeder.py @@ -2,7 +2,7 @@ from loguru import logger import csv from . import Feeder -from ..core import Metadata, ArchivingContext +from ..core import Metadata from ..utils import url_or_none class CSVFeeder(Feeder): @@ -34,5 +34,4 @@ class CSVFeeder(Feeder): for row in reader: url = row[0] logger.debug(f"Processing {url}") - yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") \ No newline at end of file + yield Metadata().set_url(url) \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 1cd9022..15dfa85 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -2,7 +2,7 @@ from loguru import logger import csv from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.core import Metadata from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): @@ -19,5 +19,4 @@ class CSVFeeder(Feeder): for row in reader: url = row[0] logger.debug(f"Processing {url}") - yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") \ No newline at end of file + yield Metadata().set_url(url) \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 
2879c05..4838489 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger from auto_archiver.core.extractor import Extractor -from ...core import Metadata, Media, ArchivingContext +from ...core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index e7e8e5c..5e1ed1e 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -6,7 +6,7 @@ from urllib.parse import quote from loguru import logger from auto_archiver.core import Database -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet @@ -93,8 +93,7 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now - if gsheet := ArchivingContext.get("gsheet"): + if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 235dd63..d129182 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -15,7 +15,7 @@ from loguru import logger from slugify import slugify from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext +from 
auto_archiver.core import Metadata from . import GWorksheet @@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder): # All checks done - archival process starts here m = Metadata().set_url(url) - ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True) if gw.get_cell_or_default(row, 'folder', "") is None: folder = '' else: folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder): - if self.use_sheet_names_in_stored_paths: - ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True) - else: - ArchivingContext.set("folder", folder, True) + if len(folder) and self.use_sheet_names_in_stored_paths: + folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) + m.set_context('folder', folder) + m.set_context('worksheet', {"row": row, "worksheet": gw}) yield m logger.success(f'Finished worksheet {wks.title}') diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 94b5dce..58c6abe 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -11,7 +11,7 @@ import hashlib from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.core import Metadata class HashEnricher(Enricher): diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 791b9c0..5b49484 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -16,7 +16,7 @@ from loguru import logger from telethon.sync import TelegramClient from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from 
auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str @@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor): if not "instagram.com" in url: return False result = Metadata() - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir with self.client.start(): chat = self.client.get_entity("instagram_load_bot") since_id = self.client.send_message(entity=chat, message=url).id diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 76784fa..b429163 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, ArchivingContext, Media +from auto_archiver.core import Metadata, Media class SSLEnricher(Enricher): diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index b8fe634..8ca2131 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -3,7 +3,7 @@ import requests, time from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module @@ -25,7 +25,7 @@ class WhisperEnricher(Enricher): job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich) + m.store(url=url, metadata=to_enrich, storages=self.storages) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -110,7 +110,7 @@ class WhisperEnricher(Enricher): def _get_s3_storage(self) -> S3Storage: try: - return 
next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage) + return next(s for s in self.storages if s.__class__ == S3Storage) except: logger.warning("No S3Storage instance found in storages") return diff --git a/tests/__init__.py b/tests/__init__.py index 31f38cb..e69de29 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +0,0 @@ -import tempfile - -from auto_archiver.core.context import ArchivingContext - -ArchivingContext.reset(full_reset=True) \ No newline at end of file From 9a8c94b641581b61e0696d24b5e4ed7bdb778e32 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 16:02:17 +0100 Subject: [PATCH 053/110] Fix getting/setting folder context for metadata --- src/auto_archiver/core/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 9373ff9..9f355f6 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -35,7 +35,7 @@ class Storage(BaseModule): def set_key(self, media: Media, url, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" if media.key is not None and len(media.key) > 0: return - folder = metadata.folder + folder = metadata.get_context('folder', '') filename, ext = os.path.splitext(media.filename) # Handle path_generator logic From 9c9e9b370e73675da6ec9028b4a798b2ba81cf53 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 16:02:38 +0100 Subject: [PATCH 054/110] Remove lingering reference to ArchivingContext --- .../modules/thumbnail_enricher/thumbnail_enricher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 429ba38..e0ac937 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ 
b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -10,7 +10,7 @@ import ffmpeg, os from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.core import Media, Metadata from auto_archiver.utils.misc import random_str From 7a2be5a0da13713980ced0a34aed37cc0b891979 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 16:03:07 +0100 Subject: [PATCH 055/110] Add cookie extraction to 'authentication' options, get generic_extractor working using this info --- src/auto_archiver/core/base_module.py | 57 +++++++++++++++---- src/auto_archiver/core/orchestrator.py | 3 +- .../generic_extractor/generic_extractor.py | 29 ++++++---- 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 2c1e8a3..d23643c 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -1,5 +1,4 @@ - from urllib.parse import urlparse from typing import Mapping, Any from abc import ABC @@ -80,25 +79,63 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) - def auth_for_site(self, site: str) -> dict: + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: + """ + Returns the authentication information for a given site. This is used to authenticate + with a site before extracting data. The site should be the domain of the site, e.g. 
'twitter.com' + + extract_cookies: bool - whether or not to extract cookies from the given browser and return the + cookie jar (disabling can speed up) processing if you don't actually need the cookies jar + + Currently, the dict can have keys of the following types: + - username: str - the username to use for login + - password: str - the password to use for login + - api_key: str - the API key to use for login + - api_secret: str - the API secret to use for login + - cookie: str - a cookie string to use for login (specific to this site) + - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`) + """ # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com) - # for now, just hard code those. + # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code? # SECURITY: parse the domain using urllib site = urlparse(site).netloc # add the 'www' version of the site to the list of sites to check + authdict = {} + + for to_try in [site, f"www.{site}"]: if to_try in self.authentication: - return self.authentication[to_try] + authdict.update(self.authentication[to_try]) + break # do a fuzzy string match just to print a warning - don't use it since it's insecure - for key in self.authentication.keys(): - if key in site or site in key: - logger.warning(f"Could not find exact authentication information for site '{site}'. \ - did find information for '{key}' which is close, is this what you meant? \ - If so, edit your authentication settings to make sure it exactly matches.") + if not authdict: + for key in self.authentication.keys(): + if key in site or site in key: + logger.debug(f"Could not find exact authentication information for site '{site}'. \ + did find information for '{key}' which is close, is this what you meant? 
\ + If so, edit your authentication settings to make sure it exactly matches.") - return {} + + def get_ytdlp_cookiejar(args): + import yt_dlp + from yt_dlp import parse_options + + # parse_options returns a named tuple as follows, we only need the ydl_options part + # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts')) + ytdlp_opts = getattr(parse_options(args), 'ydl_opts') + return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar + + # get the cookies jar, prefer the browser cookies than the file + if 'cookies_from_browser' in self.authentication: + authdict['cookies_from_browser'] = self.authentication['cookies_from_browser'] + authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']]) + elif 'cookies_file' in self.authentication: + authdict['cookies_file'] = self.authentication['cookies_file'] + authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']]) + + return authdict def repr(self): return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f046bfe..85b3d61 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -259,8 +259,7 @@ class ArchivingOrchestrator: if module == 'cli_feeder': urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.") - self.basic_parser.print_help() + logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. 
Use --help for more information.") exit() # cli_feeder is a pseudo module, it just takes the command line args def feed(self) -> Generator[Metadata]: diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 4838489..bc884a6 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -266,23 +266,30 @@ class GenericExtractor(Extractor): def download(self, item: Metadata) -> Metadata: url = item.get_url() - if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: - logger.debug('Using Facebook cookie') - yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} - - if item.netloc in ['youtube.com', 'www.youtube.com']: - if self.cookies_from_browser: - logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube') - ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,) - elif self.cookie_file: - logger.debug(f'Using cookies from file {self.cookie_file}') - ydl_options['cookiefile'] = self.cookie_file + + # set up auth + auth = self.auth_for_site(url) + # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file + if auth: + if 'username' in auth and 'password' in auth: + logger.debug(f'Using provided auth username and password for {url}') + ydl_options['username'] = auth['username'] + ydl_options['password'] = auth['password'] + elif 'cookie' in auth: + logger.debug(f'Using provided auth cookie for {url}') + yt_dlp.utils.std_headers['cookie'] = auth['cookie'] 
+ elif 'cookie_from_browser' in auth: + logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}') + ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser'] + elif 'cookies_file' in auth: + logger.debug(f'Using cookies from file {self.cookie_file} for {url}') + ydl_options['cookiesfile'] = auth['cookies_file'] ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" From 7ec328ab409064e4e81a443f84565195b1848655 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 16:04:36 +0100 Subject: [PATCH 056/110] Remove cookie options from generic_extractor - it now uses 'authentication' global settings :D --- .../modules/generic_extractor/__manifest__.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index d5f363f..caa3ae1 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -20,6 +20,7 @@ the broader archiving framework. - Retrieves metadata like titles, descriptions, upload dates, and durations. - Downloads subtitles and comments when enabled. - Configurable options for handling live streams, proxies, and more. +- Supports authentication of websites using the 'authentication' settings from your orchestration. ### Dropins - For websites supported by `yt-dlp` that also contain posts in addition to videos @@ -29,10 +30,6 @@ custom dropins can be created to handle additional websites and passed to the ar via the command line using the `--dropins` option (TODO!). 
""", "configs": { - "facebook_cookie": { - "default": None, - "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'", - }, "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"}, "comments": { "default": False, @@ -67,14 +64,5 @@ via the command line using the `--dropins` option (TODO!). "default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.", }, - "cookies_from_browser": { - "default": None, - "type": "str", - "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale", - }, - "cookie_file": { - "default": None, - "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp", - }, }, } From c574b694ed0db50792b0719504486252848adfdd Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 17:25:59 +0100 Subject: [PATCH 057/110] Set up screenshot enricher to use authentication/cookies --- src/auto_archiver/core/base_module.py | 15 +- src/auto_archiver/core/orchestrator.py | 2 +- .../enrichers/screenshot_enricher.py | 2 +- .../generic_extractor/generic_extractor.py | 2 +- .../modules/generic_extractor/twitter.py | 2 +- .../screenshot_enricher.py | 6 +- .../modules/wacz_enricher/wacz_enricher.py | 2 +- .../wayback_extractor_enricher.py | 2 +- src/auto_archiver/utils/__init__.py | 1 - src/auto_archiver/utils/url.py | 129 +++++++++--------- src/auto_archiver/utils/webdriver.py | 86 +++++++++--- 11 files changed, 153 insertions(+), 96 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index d23643c..fcfe9ea 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -4,6 +4,7 @@ from 
typing import Mapping, Any from abc import ABC from copy import deepcopy, copy from tempfile import TemporaryDirectory +from auto_archiver.utils import url as UrlUtil from loguru import logger @@ -78,7 +79,7 @@ class BaseModule(ABC): self.config = config for key, val in config.get(self.name, {}).items(): setattr(self, key, val) - + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: """ Returns the authentication information for a given site. This is used to authenticate @@ -98,8 +99,7 @@ class BaseModule(ABC): # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com) # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code? - # SECURITY: parse the domain using urllib - site = urlparse(site).netloc + site = UrlUtil.domain_for_url(site) # add the 'www' version of the site to the list of sites to check authdict = {} @@ -116,12 +116,11 @@ class BaseModule(ABC): logger.debug(f"Could not find exact authentication information for site '{site}'. \ did find information for '{key}' which is close, is this what you meant? 
\ If so, edit your authentication settings to make sure it exactly matches.") - def get_ytdlp_cookiejar(args): import yt_dlp from yt_dlp import parse_options - + logger.debug(f"Extracting cookies from settings: {args[1]}") # parse_options returns a named tuple as follows, we only need the ydl_options part # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts')) ytdlp_opts = getattr(parse_options(args), 'ydl_opts') @@ -130,10 +129,12 @@ class BaseModule(ABC): # get the cookies jar, prefer the browser cookies than the file if 'cookies_from_browser' in self.authentication: authdict['cookies_from_browser'] = self.authentication['cookies_from_browser'] - authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']]) + if extract_cookies: + authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']]) elif 'cookies_file' in self.authentication: authdict['cookies_file'] = self.authentication['cookies_file'] - authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']]) + if extract_cookies: + authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']]) return authdict diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 85b3d61..dbc8a33 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -174,7 +174,7 @@ class ArchivingOrchestrator: default={}, action=AuthenticationJsonParseAction) # logging arguments - parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO') + parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper) 
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 0d05d92..abb1e16 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException from auto_archiver.core import Enricher -from ..utils import Webdriver, UrlUtil, random_str +from ..utils import Webdriver, url as UrlUtil, random_str from ..core import Media, Metadata class ScreenshotEnricher(Enricher): diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index bc884a6..d1b1fb6 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -274,7 +274,7 @@ class GenericExtractor(Extractor): "max_downloads": self.max_downloads, "playlistend": self.max_downloads} # set up auth - auth = self.auth_for_site(url) + auth = self.auth_for_site(url, extract_cookies=False) # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file if auth: if 'username' in auth and 'password' in auth: diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 83c1f4f..3faed6b 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -5,7 +5,7 @@ from loguru import logger from slugify import slugify from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.utils import UrlUtil +from auto_archiver.utils import url as 
UrlUtil from auto_archiver.core.extractor import Extractor from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 8e7639a..e1da99d 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -6,7 +6,7 @@ from selenium.common.exceptions import TimeoutException from auto_archiver.core import Enricher -from auto_archiver.utils import Webdriver, UrlUtil, random_str +from auto_archiver.utils import Webdriver, url as UrlUtil, random_str from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): @@ -19,7 +19,9 @@ class ScreenshotEnricher(Enricher): return logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver: + auth = self.auth_for_site(url) + with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, + http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver: try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 3f67b7c..1586b75 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -7,7 +7,7 @@ from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata from auto_archiver.core import Extractor, Enricher -from auto_archiver.utils import UrlUtil, random_str +from auto_archiver.utils import url as UrlUtil, random_str class WaczExtractorEnricher(Enricher, Extractor): diff --git 
a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py index 0e25440..1763b12 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py @@ -3,7 +3,7 @@ from loguru import logger import time, requests from auto_archiver.core import Extractor, Enricher -from auto_archiver.utils import UrlUtil +from auto_archiver.utils import url as UrlUtil from auto_archiver.core import Metadata class WaybackExtractorEnricher(Enricher, Extractor): diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py index d2063d0..ed2d3bb 100644 --- a/src/auto_archiver/utils/__init__.py +++ b/src/auto_archiver/utils/__init__.py @@ -2,7 +2,6 @@ # we need to explicitly expose the available imports here from .misc import * from .webdriver import Webdriver -from .url import UrlUtil from .atlos import get_atlos_config_options # handy utils from ytdlp diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 3b67514..40884da 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -1,83 +1,84 @@ import re from urllib.parse import urlparse, urlunparse -class UrlUtil: - AUTHWALL_URLS = [ - re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels - re.compile(r"https:\/\/www\.instagram\.com"), # instagram - ] +AUTHWALL_URLS = [ + re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels + re.compile(r"https:\/\/www\.instagram\.com"), # instagram +] - @staticmethod - def clean(url: str) -> str: return url +def domain_for_url(url: str) -> str: + """ + SECURITY: parse the domain using urllib to avoid any potential security issues + """ + return urlparse(url).netloc - @staticmethod - def is_auth_wall(url: str) -> bool: - """ - checks if URL is behind 
an authentication wall meaning steps like wayback, wacz, ... may not work - """ - for regex in UrlUtil.AUTHWALL_URLS: - if regex.match(url): - return True +def clean(url: str) -> str: + return url - return False +def is_auth_wall(url: str) -> bool: + """ + checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work + """ + for regex in AUTHWALL_URLS: + if regex.match(url): + return True - @staticmethod - def remove_get_parameters(url: str) -> str: - # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4 - # useful for mimetypes to work - parsed_url = urlparse(url) - new_url = urlunparse(parsed_url._replace(query='')) - return new_url + return False - @staticmethod - def is_relevant_url(url: str) -> bool: - """ - Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc. - """ - clean_url = UrlUtil.remove_get_parameters(url) +def remove_get_parameters(url: str) -> str: + # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4 + # useful for mimetypes to work + parsed_url = urlparse(url) + new_url = urlunparse(parsed_url._replace(query='')) + return new_url - # favicons - if "favicon" in url: return False - # ifnore icons - if clean_url.endswith(".ico"): return False - # ignore SVGs - if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False +def is_relevant_url(url: str) -> bool: + """ + Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc. 
+ """ + clean_url = remove_get_parameters(url) - # twitter profile pictures - if "twimg.com/profile_images" in url: return False - if "twimg.com" in url and "/default_profile_images" in url: return False + # favicons + if "favicon" in url: return False + # ifnore icons + if clean_url.endswith(".ico"): return False + # ignore SVGs + if remove_get_parameters(url).endswith(".svg"): return False - # instagram profile pictures - if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False - # instagram recurring images - if "https://static.cdninstagram.com/rsrc.php/" in url: return False + # twitter profile pictures + if "twimg.com/profile_images" in url: return False + if "twimg.com" in url and "/default_profile_images" in url: return False - # telegram - if "https://telegram.org/img/emoji/" in url: return False + # instagram profile pictures + if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False + # instagram recurring images + if "https://static.cdninstagram.com/rsrc.php/" in url: return False - # youtube - if "https://www.youtube.com/s/gaming/emoji/" in url: return False - if "https://yt3.ggpht.com" in url and "default-user=" in url: return False - if "https://www.youtube.com/s/search/audio/" in url: return False + # telegram + if "https://telegram.org/img/emoji/" in url: return False - # ok - if " https://ok.ru/res/i/" in url: return False + # youtube + if "https://www.youtube.com/s/gaming/emoji/" in url: return False + if "https://yt3.ggpht.com" in url and "default-user=" in url: return False + if "https://www.youtube.com/s/search/audio/" in url: return False - # vk - if "https://vk.com/emoji/" in url: return False - if "vk.com/images/" in url: return False - if "vk.com/images/reaction/" in url: return False + # ok + if " https://ok.ru/res/i/" in url: return False - # wikipedia - if "wikipedia.org/static" in url: return False + # vk + if "https://vk.com/emoji/" in url: return False + if "vk.com/images/" in url: 
return False + if "vk.com/images/reaction/" in url: return False - return True + # wikipedia + if "wikipedia.org/static" in url: return False - @staticmethod - def twitter_best_quality_url(url: str) -> str: - """ - some twitter image URLs point to a less-than best quality - this returns the URL pointing to the highest (original) quality - """ - return re.sub(r"name=(\w+)", "name=orig", url, 1) + return True + +def twitter_best_quality_url(url: str) -> str: + """ + some twitter image URLs point to a less-than best quality + this returns the URL pointing to the highest (original) quality + """ + return re.sub(r"name=(\w+)", "name=orig", url, 1) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index cf84c35..efb1102 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -9,12 +9,72 @@ from loguru import logger from selenium.webdriver.common.by import By import time +#import domain_for_url +from urllib.parse import urlparse, urlunparse +from http.cookiejar import MozillaCookieJar +class CookieSettingDriver(webdriver.Firefox): + + facebook_accept_cookies: bool + cookies: str + cookiejar: MozillaCookieJar + + def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs): + super(CookieSettingDriver, self).__init__(*args, **kwargs) + self.cookies = cookies + self.cookiejar = cookiejar + self.facebook_accept_cookies = facebook_accept_cookies + + def get(self, url: str): + if self.cookies or self.cookiejar: + # set up the driver to make it not 'cookie averse' (needs a context/URL) + # get the 'robots.txt' file which should be quick and easy + robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment='')) + super(CookieSettingDriver, self).get(robots_url) + + if self.cookies: + # an explicit cookie is set for this site, use that first + for cookie in self.cookies.split(";"): + for name, value in cookie.split("="): + self.driver.add_cookie({'name': 
name, 'value': value}) + elif self.cookiejar: + domain = urlparse(url).netloc.lstrip("www.") + for cookie in self.cookiejar: + if domain in cookie.domain: + try: + self.add_cookie({ + 'name': cookie.name, + 'value': cookie.value, + 'path': cookie.path, + 'domain': cookie.domain, + 'secure': bool(cookie.secure), + 'expiry': cookie.expires + }) + except Exception as e: + logger.warning(f"Failed to add cookie to webdriver: {e}") + + if self.facebook_accept_cookies: + try: + logger.debug(f'Trying fb click accept cookie popup.') + super(CookieSettingDriver, self).get("http://www.facebook.com") + essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]") + essential_only.click() + logger.debug(f'fb click worked') + # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page + time.sleep(2) + except Exception as e: + logger.warning(f'Failed on fb accept cookies.', e) + # now get the actual URL + super(CookieSettingDriver, self).get(url) + class Webdriver: - def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver: + def __init__(self, width: int, height: int, timeout_seconds: int, + facebook_accept_cookies: bool = False, http_proxy: str = "", + print_options: dict = {}, auth: dict = {}) -> webdriver: self.width = width self.height = height self.timeout_seconds = timeout_seconds + self.auth = auth self.facebook_accept_cookies = facebook_accept_cookies self.http_proxy = http_proxy # create and set print options @@ -23,32 +83,26 @@ class Webdriver: setattr(self.print_options, k, v) def __enter__(self) -> webdriver: + options = webdriver.FirefoxOptions() - options.add_argument("--headless") + # options.add_argument("--headless") options.add_argument(f'--proxy-server={self.http_proxy}') options.set_preference('network.protocol-handler.external.tg', False) + # if facebook cookie 
popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option + if self.facebook_accept_cookies: + options.add_argument('--lang=en') + try: - self.driver = webdriver.Firefox(options=options) + self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'), + facebook_accept_cookies=self.facebook_accept_cookies, options=options) self.driver.set_window_size(self.width, self.height) self.driver.set_page_load_timeout(self.timeout_seconds) self.driver.print_options = self.print_options except TimeoutException as e: logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") - if self.facebook_accept_cookies: - try: - logger.debug(f'Trying fb click accept cookie popup.') - self.driver.get("http://www.facebook.com") - foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") - foo.click() - logger.debug(f'fb click worked') - # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page - time.sleep(2) - except: - logger.warning(f'Failed on fb accept cookies.') - return self.driver - + def __exit__(self, exc_type, exc_val, exc_tb): self.driver.close() self.driver.quit() From 72b5ea9ab61d8ae367339cf06b380ed7de1323f2 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Feb 2025 17:40:40 +0100 Subject: [PATCH 058/110] Restore headless arg --- src/auto_archiver/utils/webdriver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index efb1102..005f49d 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -85,7 +85,7 @@ class Webdriver: def __enter__(self) -> webdriver: options = webdriver.FirefoxOptions() - # options.add_argument("--headless") + options.add_argument("--headless") 
options.add_argument(f'--proxy-server={self.http_proxy}') options.set_preference('network.protocol-handler.external.tg', False) # if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option From a873e56b8726c20a3eb93c951451e7bc84133d9a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 12:57:35 +0100 Subject: [PATCH 059/110] Remove old csv_feeder file - now inside a module --- src/auto_archiver/feeders/csv_feeder.py | 37 ------------------------- 1 file changed, 37 deletions(-) delete mode 100644 src/auto_archiver/feeders/csv_feeder.py diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py deleted file mode 100644 index b1aedb7..0000000 --- a/src/auto_archiver/feeders/csv_feeder.py +++ /dev/null @@ -1,37 +0,0 @@ -from loguru import logger -import csv - -from . import Feeder -from ..core import Metadata -from ..utils import url_or_none - -class CSVFeeder(Feeder): - - @staticmethod - def configs() -> dict: - return { - "files": { - "default": None, - "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ - Input files should be formatted with one URL per line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) - }, - "column": { - "default": None, - "help": "Column number or name to read the URLs from, 0-indexed", - } - } - - - def __iter__(self) -> Metadata: - url_column = self.column or 0 - for file in self.files: - with open(file, "r") as f: - reader = csv.reader(f) - first_row = next(reader) - if not(url_or_none(first_row[url_column])): - # it's a header row, skip it - for row in reader: - url = row[0] - logger.debug(f"Processing {url}") - yield Metadata().set_url(url) \ No newline at end of file From b301f60ea3ba8d8b10658ab2e5a8f592bb3e2af4 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 13:36:05 +0100 Subject: [PATCH 060/110] Fix using validators set in __manifest__.py E.g. you can use the validator 'is_file' to check if a config is a valid file --- src/auto_archiver/core/orchestrator.py | 4 ++-- src/auto_archiver/core/validators.py | 18 +++++++++++++++--- .../modules/csv_feeder/__manifest__.py | 3 +++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index dbc8a33..8a634de 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -207,9 +207,9 @@ class ArchivingOrchestrator: should_store = kwargs.pop('should_store', False) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" try: + kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__')) + except AttributeError: kwargs['type'] = __builtins__.get(kwargs.get('type'), str) - except KeyError: - kwargs['type'] = getattr(validators, kwargs['type']) arg = group.add_argument(f"--{module.name}.{name}", **kwargs) arg.should_store = should_store diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py index 681d564..b868ddf 100644 --- a/src/auto_archiver/core/validators.py +++ 
b/src/auto_archiver/core/validators.py @@ -1,7 +1,19 @@ -# used as validators for config values. +# used as validators for config values. Should raise an exception if the value is invalid. +from pathlib import Path +import argparse def example_validator(value): - return "example" in value + if "example" not in value: + raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument") + return value def positive_number(value): - return value > 0 \ No newline at end of file + if value < 0: + raise argparse.ArgumentTypeError(f"{value} is not a positive number") + return value + + +def valid_file(value): + if not Path(value).is_file(): + raise argparse.ArgumentTypeError(f"File '{value}' does not exist.") + return value \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index 7249395..b6d7543 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -13,6 +13,9 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ Input files should be formatted with one URL per line", + "required": True, + "type": "valid_file", + "nargs": "+", }, "column": { "default": None, From 78e6418249fbf5806e6d0ac110cad81fe526cf8c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 13:37:17 +0100 Subject: [PATCH 061/110] Unit tests for csv feeder + fix some bugs --- .../modules/csv_feeder/csv_feeder.py | 24 ++++++-- tests/data/csv_no_headers.csv | 2 + tests/data/csv_with_headers.csv | 3 + tests/feeders/test_csv_feeder.py | 57 +++++++++++++++++++ 4 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 tests/data/csv_no_headers.csv create mode 100644 tests/data/csv_with_headers.csv create mode 100644 tests/feeders/test_csv_feeder.py diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 15dfa85..c3f6eea 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -7,16 +7,32 @@ from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): + column = None + + def __iter__(self) -> Metadata: - url_column = self.column or 0 for file in self.files: with open(file, "r") as f: reader = csv.reader(f) first_row = next(reader) - if not(url_or_none(first_row[url_column])): - # it's a header row, skip it + url_column = self.column or 0 + if isinstance(url_column, str): + try: + url_column = first_row.index(url_column) + except ValueError: + logger.error(f"Column {url_column} not found in header row: {first_row}. 
Did you set the 'column' config correctly?") + return + elif not(url_or_none(first_row[url_column])): + # it's a header row, but we've been given a column number already logger.debug(f"Skipping header row: {first_row}") + else: + # first row isn't a header row, rewind the file + f.seek(0) + for row in reader: - url = row[0] + if not url_or_none(row[url_column]): + logger.warning(f"Not a valid URL in row: {row}, skipping") + continue + url = row[url_column] logger.debug(f"Processing {url}") yield Metadata().set_url(url) \ No newline at end of file diff --git a/tests/data/csv_no_headers.csv b/tests/data/csv_no_headers.csv new file mode 100644 index 0000000..cd66b33 --- /dev/null +++ b/tests/data/csv_no_headers.csv @@ -0,0 +1,2 @@ +https://example.com/1/,data 1 +https://example.com/2/,data 2 \ No newline at end of file diff --git a/tests/data/csv_with_headers.csv b/tests/data/csv_with_headers.csv new file mode 100644 index 0000000..c3e296d --- /dev/null +++ b/tests/data/csv_with_headers.csv @@ -0,0 +1,3 @@ +webpages,other data +https://example.com/1/,data 1 +https://example.com/2/,data 2 \ No newline at end of file diff --git a/tests/feeders/test_csv_feeder.py b/tests/feeders/test_csv_feeder.py new file mode 100644 index 0000000..546c3a7 --- /dev/null +++ b/tests/feeders/test_csv_feeder.py @@ -0,0 +1,57 @@ +import pytest + +@pytest.fixture +def headerless_csv_file(): + return "tests/data/csv_no_headers.csv" + +@pytest.fixture +def header_csv_file(): + return "tests/data/csv_with_headers.csv" + +@pytest.fixture +def header_csv_file_non_default_column(): + return "tests/data/csv_with_headers_non_default_column.csv" + + +def test_csv_feeder_no_headers(headerless_csv_file, setup_module): + from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder + + feeder = setup_module(CSVFeeder, {"files": [headerless_csv_file]}) + + urls = list(feeder) + assert len(urls) == 2 + assert urls[0].get_url() == "https://example.com/1/" + assert urls[1].get_url() == 
"https://example.com/2/" + +def test_csv_feeder_with_headers(header_csv_file, setup_module): + from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder + + feeder = setup_module(CSVFeeder, {"files": [header_csv_file]}) + + urls = list(feeder) + assert len(urls) == 2 + assert urls[0].get_url() == "https://example.com/1/" + assert urls[1].get_url() == "https://example.com/2/" + +def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog): + from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder + + + with caplog.at_level("WARNING"): + feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": 1}) + urls = list(feeder) + + assert len(urls) == 0 + assert "Not a valid URL in row" in caplog.text + assert len(caplog.records) == 2 + + +def test_csv_feeder_column_by_name(header_csv_file, setup_module): + from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder + + feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": "webpages"}) + + urls = list(feeder) + assert len(urls) == 2 + assert urls[0].get_url() == "https://example.com/1/" + assert urls[1].get_url() == "https://example.com/2/" \ No newline at end of file From 034197a81f83210d6a4350010d8432e823226231 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 13:40:07 +0100 Subject: [PATCH 062/110] Fix typos in csv feeder docs (in manifest) --- src/auto_archiver/modules/csv_feeder/__manifest__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index b6d7543..6d4c7bf 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -30,7 +30,8 @@ - Allows specifying the column number or name to extract URLs from. - Skips header rows if the first value is not a valid URL. - ### Setu N - - Input files should be formatted with one URL per line. 
+ ### Setup + - Input files should be formatted with one URL per line, with or without a header row. + - If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option. """ } From 0633e17998807e6d3c4c564b103003e93df39b98 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 14:18:46 +0100 Subject: [PATCH 063/110] Close the facebook 'login' window if it's there - to allow for proper screenshots --- src/auto_archiver/utils/webdriver.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index 005f49d..db26d04 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -66,6 +66,13 @@ class CookieSettingDriver(webdriver.Firefox): logger.warning(f'Failed on fb accept cookies.', e) # now get the actual URL super(CookieSettingDriver, self).get(url) + if self.facebook_accept_cookies: + # try and click the 'close' button on the 'login' window to close it + close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']") + if close_button: + close_button.click() + + class Webdriver: def __init__(self, width: int, height: int, timeout_seconds: int, From 91ca325fd510c0c1989b6b31050571331a074c3b Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 17:46:46 +0100 Subject: [PATCH 064/110] Update yt-dlp to latest version + remove code no longer needed from bluesky dropin --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- .../modules/generic_extractor/bluesky.py | 13 +------------ .../modules/generic_extractor/facebook.py | 17 +++++++++++++++++ 4 files changed, 23 insertions(+), 17 deletions(-) create mode 100644 src/auto_archiver/modules/generic_extractor/facebook.py diff --git a/poetry.lock b/poetry.lock index 088fc70..8fb48ec 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3077,14 +3077,14 @@ h11 = ">=0.9.0,<1" [[package]] name = "yt-dlp" 
-version = "2025.1.12" +version = "2025.1.26" description = "A feature-rich command-line audio/video downloader" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"}, - {file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"}, + {file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"}, + {file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"}, ] [package.extras] @@ -3100,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "5a54c84ba388db7b77d1c28973b710fc99aa3822a2860b30acaf5b02ba1927bd" +content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0" diff --git a/pyproject.toml b/pyproject.toml index 3cd47e7..f1be273 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "cryptography (>=41.0.0,<42.0.0)", "boto3 (>=1.28.0,<2.0.0)", "dataclasses-json (>=0.0.0)", - "yt-dlp (==2025.1.12)", + "yt-dlp (>=2025.1.26,<2026.0.0)", "numpy (==2.1.3)", "vk-url-scraper (>=0.0.0)", "requests[socks] (>=0.0.0)", diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 1f92fd8..f2086b0 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -23,19 +23,8 @@ class Bluesky(GenericDropin): def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict: # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below - # handle, video_id = 
ie_instance._match_valid_url(url).group('handle', 'id') - # return ie_instance._extract_post(handle=handle, post_id=video_id) - handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id') - return ie_instance._download_json( - 'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread', - video_id, query={ - 'uri': f'at://{handle}/app.bsky.feed.post/{video_id}', - 'depth': 0, - 'parentHeight': 0, - })['thread']['post'] - - + return ie_instance._extract_post(handle=handle, post_id=video_id) def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]: """ diff --git a/src/auto_archiver/modules/generic_extractor/facebook.py b/src/auto_archiver/modules/generic_extractor/facebook.py new file mode 100644 index 0000000..352d44e --- /dev/null +++ b/src/auto_archiver/modules/generic_extractor/facebook.py @@ -0,0 +1,17 @@ +from .dropin import GenericDropin + + +class Facebook(GenericDropin): + def extract_post(self, url: str, ie_instance): + video_id = ie_instance._match_valid_url(url).group('id') + ie_instance._download_webpage( + url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) + webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id')) + + post_data = ie_instance._extract_from_url.extract_metadata(webpage) + return post_data + + def create_metadata(self, post: dict, ie_instance, archiver, url): + metadata = archiver.create_metadata(url) + metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post) + return metadata \ No newline at end of file From 48abb5e66b989f7b97dad5b3ea2479b22c90e2d0 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 4 Feb 2025 18:16:03 +0100 Subject: [PATCH 065/110] Remove dangling screenshot_enricher file. 
Moved to modules/screenshot_enricher --- .../enrichers/screenshot_enricher.py | 40 ------------------- 1 file changed, 40 deletions(-) delete mode 100644 src/auto_archiver/enrichers/screenshot_enricher.py diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py deleted file mode 100644 index abb1e16..0000000 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ /dev/null @@ -1,40 +0,0 @@ -from loguru import logger -import time, os -from selenium.common.exceptions import TimeoutException - - -from auto_archiver.core import Enricher -from ..utils import Webdriver, url as UrlUtil, random_str -from ..core import Media, Metadata - -class ScreenshotEnricher(Enricher): - name = "screenshot_enricher" - - @staticmethod - def configs() -> dict: - return { - "width": {"default": 1280, "help": "width of the screenshots"}, - "height": {"default": 720, "help": "height of the screenshots"}, - "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - } - - def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - if UrlUtil.is_auth_wall(url): - logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") - return - - logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: - try: - driver.get(url) - time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") - driver.save_screenshot(screenshot_file) - to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") - except TimeoutException: - logger.info("TimeoutException loading page for 
screenshot") - except Exception as e: - logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") From 52542812dcbd171f1606a4f7502becb1101bd570 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Feb 2025 16:42:58 +0000 Subject: [PATCH 066/110] Merge tests from version with context. --- .../modules/gsheet_db/gsheet_db.py | 15 +- .../instagram_tbot_extractor.py | 80 ++++-- .../modules/telethon_extractor/__init__.py | 2 +- .../telethon_extractor/telethon_extractor.py | 2 +- tests/conftest.py | 19 +- tests/databases/test_gsheet_db.py | 140 +++++++++ .../test_instagram_api_extractor.py | 108 +++++++ .../test_instagram_tbot_extractor.py | 111 ++++++++ tests/feeders/test_gsheet_feeder.py | 268 ++++++++++++++++++ tests/feeders/test_gworksheet.py | 144 ++++++++++ tests/storages/test_S3_storage.py | 100 +++++++ tests/storages/test_gdrive_storage.py | 43 +++ tests/storages/test_storage_base.py | 23 ++ 13 files changed, 1022 insertions(+), 33 deletions(-) create mode 100644 tests/databases/test_gsheet_db.py create mode 100644 tests/extractors/test_instagram_api_extractor.py create mode 100644 tests/extractors/test_instagram_tbot_extractor.py create mode 100644 tests/feeders/test_gsheet_feeder.py create mode 100644 tests/feeders/test_gworksheet.py create mode 100644 tests/storages/test_S3_storage.py create mode 100644 tests/storages/test_gdrive_storage.py create mode 100644 tests/storages/test_storage_base.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 5e1ed1e..644015e 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -12,10 +12,11 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. 
+ could be updated in the future to support non-GsheetFeeder metadata """ + def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) @@ -57,7 +58,7 @@ class GsheetsDb(Database): media: Media = item.get_final_media() if hasattr(media, "urls"): batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) + batch_if_valid('date', True, self._get_current_datetime_iso()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")) batch_if_valid('timestamp', item.get_timestamp()) @@ -85,6 +86,12 @@ class GsheetsDb(Database): gw.batch_set_cell(cell_updates) + @staticmethod + def _get_current_datetime_iso() -> str: + """Helper method to generate the current datetime in ISO format.""" + return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) @@ -93,9 +100,11 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") + # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 5b49484..5660cd2 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -34,19 +34,30 @@ class InstagramTbotExtractor(Extractor): """ super().setup(configs) logger.info(f"SETUP {self.name} checking login...") + 
self._prepare_session_file() + self._initialize_telegram_client() - # make a copy of the session that is used exclusively with this archiver instance + def _prepare_session_file(self): + """ + Creates a copy of the session file for exclusive use with this archiver instance. + Ensures that a valid session file exists before proceeding. + """ new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") if not os.path.exists(f"{self.session_file}.session"): - raise FileNotFoundError(f"session file {self.session_file}.session not found, " - f"to set this up run the setup script in scripts/telegram_setup.py") + raise FileNotFoundError(f"Session file {self.session_file}.session not found.") shutil.copy(self.session_file + ".session", new_session_file) self.session_file = new_session_file.replace(".session", "") + def _initialize_telegram_client(self): + """Initializes the Telegram client.""" try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: - logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") + logger.error( + f"Unable to access the {self.session_file} session. " + "Ensure that you don't use the same session file here and in telethon_extractor. 
" + "If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}" + ) with self.client.start(): logger.success(f"SETUP {self.name} login works.") @@ -63,32 +74,49 @@ class InstagramTbotExtractor(Extractor): result = Metadata() tmp_dir = self.tmp_dir with self.client.start(): - chat = self.client.get_entity("instagram_load_bot") - since_id = self.client.send_message(entity=chat, message=url).id - attempts = 0 - seen_media = [] - message = "" - time.sleep(3) - # media is added before text by the bot so it can be used as a stop-logic mechanism - while attempts < (self.timeout - 3) and (not message or not len(seen_media)): - attempts += 1 - time.sleep(1) - for post in self.client.iter_messages(chat, min_id=since_id): - since_id = max(since_id, post.id) - if post.media and post.id not in seen_media: - filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') - media = self.client.download_media(post.media, filename_dest) - if media: - result.add_media(Media(media)) - seen_media.append(post.id) - if post.message: message += post.message + chat, since_id = self._send_url_to_bot(url) + message = self._process_messages(chat, since_id, tmp_dir, result) - if "You must enter a URL to a post" in message: + if "You must enter a URL to a post" in message: logger.debug(f"invalid link {url=} for {self.name}: {message}") return False - + # # TODO: It currently returns this as a success - is that intentional? + # if "Media not found or unavailable" in message: + # logger.debug(f"invalid link {url=} for {self.name}: {message}") + # return False + if message: result.set_content(message).set_title(message[:128]) - return result.success("insta-via-bot") + + def _send_url_to_bot(self, url: str): + """ + Sends the URL to the 'instagram_load_bot' and returns (chat, since_id). 
+ """ + chat = self.client.get_entity("instagram_load_bot") + since_message = self.client.send_message(entity=chat, message=url) + return chat, since_message.id + + def _process_messages(self, chat, since_id, tmp_dir, result): + attempts = 0 + seen_media = [] + message = "" + time.sleep(3) + # media is added before text by the bot so it can be used as a stop-logic mechanism + while attempts < (self.timeout - 3) and (not message or not len(seen_media)): + attempts += 1 + time.sleep(1) + for post in self.client.iter_messages(chat, min_id=since_id): + since_id = max(since_id, post.id) + # Skip known filler message: + if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi': + continue + if post.media and post.id not in seen_media: + filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') + media = self.client.download_media(post.media, filename_dest) + if media: + result.add_media(Media(media)) + seen_media.append(post.id) + if post.message: message += post.message + return message.strip() \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index a837fdf..2eaa57c 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_extractor import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3e952e8..0147ff2 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -13,7 +13,7 @@ from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str -class TelethonArchiver(Extractor): +class 
TelethonExtractor(Extractor): valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/tests/conftest.py b/tests/conftest.py index f909bfb..8675fbc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,8 @@ """ pytest conftest file, for shared fixtures and configuration """ - +import os +import pickle from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib @@ -113,4 +114,18 @@ def pytest_runtest_setup(item): test_name = _test_failed_incremental[cls_name].get((), None) # if name found, test has failed for the combination of class name & test name if test_name is not None: - pytest.xfail(f"previous test failed ({test_name})") \ No newline at end of file + pytest.xfail(f"previous test failed ({test_name})") + + + +@pytest.fixture() +def unpickle(): + """ + Returns a helper function that unpickles a file + ** gets the file from the test_files directory: tests/data/test_files ** + """ + def _unpickle(path): + test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") + with open(os.path.join(test_data_dir, path), "rb") as f: + return pickle.load(f) + return _unpickle \ No newline at end of file diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py new file mode 100644 index 0000000..bdc2811 --- /dev/null +++ b/tests/databases/test_gsheet_db.py @@ -0,0 +1,140 @@ +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +import pytest + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.gsheet_db import GsheetsDb +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +@pytest.fixture +def mock_gworksheet(): + mock_gworksheet = MagicMock(spec=GWorksheet) + mock_gworksheet.col_exists.return_value = True + mock_gworksheet.get_cell.return_value = "" + mock_gworksheet.get_row.return_value = {} + return mock_gworksheet + + +@pytest.fixture +def 
mock_metadata(): + metadata: Metadata = MagicMock(spec=Metadata) + metadata.get_url.return_value = "http://example.com" + metadata.status = "done" + metadata.get_title.return_value = "Example Title" + metadata.get.return_value = "Example Content" + metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z" + metadata.get_final_media.return_value = MagicMock(spec=Media) + metadata.get_all_media.return_value = [] + metadata.get_media_by_id.return_value = None + metadata.get_first_image.return_value = None + return metadata + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) + metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) + metadata.set_url("http://example.com") + metadata.set_title("Example Title") + metadata.set_content("Example Content") + metadata.success("my-archiver") + metadata.set("timestamp", "2025-01-01T00:00:00Z") + metadata.set("date", "2025-02-04T18:22:24.909112+00:00") + return metadata + + +@pytest.fixture +def mock_media(): + """Fixture for a mock Media object.""" + mock_media = MagicMock(spec=Media) + mock_media.urls = ["http://example.com/media"] + mock_media.get.return_value = "not-calculated" + return mock_media + +@pytest.fixture +def gsheets_db(mock_gworksheet, setup_module): + db = setup_module("gsheet_db", { + "allow_worksheets": "set()", + "block_worksheets": "set()", + "use_sheet_names_in_stored_paths": "True", + }) + db._retrieve_gsheet = MagicMock(return_value=(mock_gworksheet, 1)) + return db + + +@pytest.fixture +def fixed_timestamp(): + """Fixture for a fixed timestamp.""" + return datetime(2025, 1, 1, tzinfo=timezone.utc) + + +@pytest.fixture +def expected_calls(mock_media, fixed_timestamp): + """Fixture for the expected cell updates.""" + return [ + (1, 'status', 'my-archiver: success'), + 
(1, 'archive', 'http://example.com/screenshot.png'), + (1, 'date', '2025-02-01T00:00:00+00:00'), + (1, 'title', 'Example Title'), + (1, 'text', 'Example Content'), + (1, 'timestamp', '2025-01-01T00:00:00+00:00'), + (1, 'hash', 'not-calculated'), + # (1, 'screenshot', 'http://example.com/screenshot.png'), + # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), + # (1, 'wacz', 'http://example.com/browsertrix.wacz'), + # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') + ] + +def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): + gw, row = gsheets_db._retrieve_gsheet(metadata) + assert gw == mock_gworksheet + assert row == 1 + + +def test_started(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.started(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress') + +def test_failed(gsheets_db, mock_metadata, mock_gworksheet): + reason = "Test failure" + gsheets_db.failed(mock_metadata, reason) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}') + +def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.aborted(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') + + +def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) + + +def test_done_cached(gsheets_db, metadata, mock_gworksheet): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata, cached=True) + + # Verify the status message includes "[cached]" + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + assert any(call[2].startswith("[cached]") for call in call_args) + + +def 
test_done_missing_media(gsheets_db, metadata, mock_gworksheet): + # clear media from metadata + metadata.media = [] + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + # Verify nothing media-related gets updated + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'} + assert all(call[1] not in media_fields for call in call_args) + +def test_safe_status_update(gsheets_db, metadata, mock_gworksheet): + gsheets_db._safe_status_update(metadata, "Test status") + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status') + + diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py new file mode 100644 index 0000000..7a19233 --- /dev/null +++ b/tests/extractors/test_instagram_api_extractor.py @@ -0,0 +1,108 @@ +from datetime import datetime +from typing import Type + +import pytest +from unittest.mock import patch, MagicMock + +from auto_archiver.core import Metadata +from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor import InstagramAPIExtractor +from .test_extractor_base import TestExtractorBase + + +@pytest.fixture +def mock_user_response(): + return { + "user": { + "pk": "123", + "username": "test_user", + "full_name": "Test User", + "profile_pic_url_hd": "http://example.com/profile.jpg", + "profile_pic_url": "http://example.com/profile_lowres.jpg" + } + } + +@pytest.fixture +def mock_post_response(): + return { + "id": "post_123", + "code": "abc123", + "caption_text": "Test Caption", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/video.mp4", + "thumbnail_url": "http://example.com/thumbnail.jpg" + } + +@pytest.fixture +def mock_story_response(): + return [{ + "id": "story_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/story.mp4" + }] + 
+@pytest.fixture +def mock_highlight_response(): + return { + "response": { + "reels": { + "highlight:123": { + "id": "123", + "title": "Test Highlight", + "items": [{ + "id": "item_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/highlight.mp4" + }] + } + } + } + } + + +# @pytest.mark.incremental +class TestInstagramAPIExtractor(TestExtractorBase): + """ + Test suite for InstagramAPIExtractor. + """ + + extractor_module = "instagram_api_extractor" + extractor: InstagramAPIExtractor + + config = { + "access_token": "test_access_token", + "api_endpoint": "https://api.instagram.com/v1", + # "full_profile": False, + # "full_profile_max_posts": 0, + # "minimize_json_output": True, + } + + @pytest.mark.parametrize("url,expected", [ + ("https://instagram.com/user", [("", "user", "")]), + ("https://instagr.am/p/post_id", []), + ("https://youtube.com", []), + ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]), + ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]), + ("https://instagram.com/stories/user/123", [("stories", "user", "123")]), + ]) + def test_url_parsing(self, url, expected): + assert self.extractor.valid_url.findall(url) == expected + + def test_initialize(self): + self.extractor.initialise() + assert self.extractor.api_endpoint[-1] != "/" + + @pytest.mark.parametrize("input_dict,expected", [ + ({"x": 0, "valid": "data"}, {"valid": "data"}), + ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}), + ]) + def test_cleanup_dict(self, input_dict, expected): + assert self.extractor.cleanup_dict(input_dict) == expected + + def test_download_post(self): + # test with context=reel + # test with context=post + # test with multiple images + # test gets text (metadata title) + + + pass \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py new file mode 100644 index 0000000..4fe80be 
--- /dev/null +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -0,0 +1,111 @@ +import os +import pickle +from typing import Type +from unittest.mock import patch, MagicMock + +import pytest + +from auto_archiver.core.extractor import Extractor +from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor + + +TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") + + +@pytest.fixture +def test_session_file(tmpdir): + """Fixture to create a test session file.""" + session_file = os.path.join(tmpdir, "test_session.session") + with open(session_file, "w") as f: + f.write("mock_session_data") + return session_file.replace(".session", "") + + +@pytest.mark.incremental +class TestInstagramTbotExtractor(object): + """ + Test suite for InstagramTbotExtractor. + """ + + extractor_module = "instagram_tbot_extractor" + extractor: InstagramTbotExtractor + config = { + "api_id": 12345, + "api_hash": "test_api_hash", + # "session_file" + } + + @pytest.fixture(autouse=True) + def setup_extractor(self, setup_module): + assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) + return extractor + + @pytest.fixture + def mock_telegram_client(self): + """Fixture to mock TelegramClient interactions.""" + with patch("auto_archiver.modules.instagram_tbot_extractor._initialize_telegram_client") as mock_client: + instance = MagicMock() + mock_client.return_value = instance + yield instance + + + # @pytest.fixture + # def mock_session_file(self, temp_session_file): + # """Patch the extractor’s session file setup to use a temporary path.""" + # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): + # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): + # yield # Mocks are applied for the 
duration of the test + + @pytest.fixture + def metadata_sample(self): + """Loads a Metadata object from a pickle file.""" + with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: + return pickle.load(f) + + + @pytest.mark.download + @pytest.mark.parametrize("url, expected_status, bot_responses", [ + ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), + ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol")]), + # todo tbot not working for stories :( + ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, [MagicMock(id=101, media=None, message="Media not found or unavailable")]), + ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), + ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), + ]) + def test_download(self, url, expected_status, bot_responses, metadata_sample): + """Test the `download()` method with various Instagram URLs.""" + metadata_sample.set_url(url) + self.extractor.initialise() + result = self.extractor.download(metadata_sample) + if expected_status: + assert result.is_success() + assert result.status == expected_status + assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + else: + assert result is False + # self.extractor.cleanup() + + # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') + # @patch.object(InstagramTbotExtractor, '_process_messages') + # def test_download_invalid_link_returns_false( + # self, mock_process, mock_send, extractor, metadata_instagram + # ): + # # Setup Mocks + # # 
_send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) + # mock_chat = MagicMock() + # mock_send.return_value = (mock_chat, 100) + # # _process_messages -> simulate it returns the text "You must enter a URL to a post" + # mock_process.return_value = "You must enter a URL to a post" + # result = extractor.download(metadata_instagram) + # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + + + + + # Test story +# Test expired story +# Test requires login/ access (?) +# Test post +# Test multiple images? \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py new file mode 100644 index 0000000..dbd2416 --- /dev/null +++ b/tests/feeders/test_gsheet_feeder.py @@ -0,0 +1,268 @@ +from typing import Type + +import gspread +import pytest +from unittest.mock import patch, MagicMock +from auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.core import Metadata, Feeder, ArchivingContext + + +def test_initialise_without_sheet_and_sheet_id(setup_module): + """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
+ (shouldn't really be asserting in there) + """ + with patch("gspread.service_account"): + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": None}) + with pytest.raises(AssertionError): + feeder.initialise() + + +@pytest.fixture +def gsheet_feeder(setup_module) -> GsheetsFeeder: + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + ) + feeder.gsheets_client = MagicMock() + return feeder + + +@pytest.fixture() +def worksheet(unpickle): + # Load the worksheet data from the pickle file + # only works for simple usage, cant reauthenticate but give structure + return unpickle("test_worksheet.pickle") + + +class TestWorksheet(): + """ + mimics the bits we need from gworksheet + """ + + class SheetSheet: + title = "TestSheet" + + rows = [ + { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 4, "url": "", "status": "", "folder": "" }, + { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, + { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, + ] + + def __init__(self): + self.wks = self.SheetSheet() + + def count_rows(self): + if not self.rows: + return 0 + return max(r["row"] for r in self.rows) + + def get_cell(self, row, col_name, fresh=False): + matching 
= next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, "") + + def get_cell_or_default(self, row, col_name, default): + matching = next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, default) + +def test__process_rows(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + metadata_items = list(gsheet_feeder._process_rows(testworksheet)) + assert len(metadata_items) == 3 + assert isinstance(metadata_items[0], Metadata) + assert metadata_items[0].get("url") == "http://example.com" + +def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 1) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +@pytest.mark.skip(reason="Not recognising folder column") +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 7) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + testworksheet.wks.title = "TestSheet" + gsheet_feeder._set_context(testworksheet, 6) + assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} + assert ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + + +@pytest.mark.usefixtures("setup_module") +@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ + ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") +]) +def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): + """Ensure open_sheet() correctly opens by name or ID based on configuration.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + 
mock_client.open.return_value = "MockSheet" + mock_client.open_by_key.return_value = "MockSheet" + + # Setup module with parameterized values + feeder = setup_module("gsheet_feeder", { + "service_account": "dummy.json", + "sheet": sheet, + "sheet_id": sheet_id + }) + feeder.initialise() + sheet_result = feeder.open_sheet() + # Validate the correct method was called + getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + assert sheet_result == "MockSheet", f"Failed: {description}" + + +@pytest.mark.usefixtures("setup_module") +def test_open_sheet_with_sheet_id(setup_module): + """Ensure open_sheet() correctly opens a sheet by ID.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + mock_client.open_by_key.return_value = "MockSheet" + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": "ABC123"}) + feeder.initialise() + sheet = feeder.open_sheet() + mock_client.open_by_key.assert_called_once_with("ABC123") + assert sheet == "MockSheet" + + +def test_should_process_sheet(setup_module): + gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}} + ) + assert gdb.should_process_sheet("TestSheet") == True + assert gdb.should_process_sheet("Sheet3") == False + # False if allow_worksheets is set + assert gdb.should_process_sheet("AnotherSheet") == False + + + +@pytest.mark.skip +class TestGSheetsFeederReal: + + """ Testing GSheetsFeeder class """ + module_name: str = 'gsheet_feeder' + feeder: GsheetsFeeder + config: dict = { + # TODO: Create test creds + "service_account": "secrets/service_account.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": 
"destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + + @pytest.fixture(autouse=True) + def setup_feeder(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.feeder: Type[Feeder] = setup_module( + self.module_name, self.config + ) + + def reset_test_sheet(self): + """Clears test sheet and re-adds headers to ensure consistent test results.""" + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + worksheet.clear() + worksheet.append_row(["Link", "Archive Status"]) + + def test_initialise(self): + self.feeder.initialise() + assert hasattr(self.feeder, "gsheets_client") + + @pytest.mark.download + def test_open_sheet_real_connection(self): + """Ensure open_sheet() connects to a real Google Sheets instance.""" + self.feeder.initialise() + sheet = self.feeder.open_sheet() + assert sheet is not None, "open_sheet() should return a valid sheet instance" + assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + + @pytest.mark.download + def test_iter_yields_metadata_real_data(self): + """Ensure __iter__() yields Metadata objects for real test sheet data.""" + self.reset_test_sheet() + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + # Insert test rows as a temp method + # Next we will refactor the feeder for better 
testing + test_rows = [ + ["https://example.com", ""], + ["", ""], + ["https://example.com", "done"], + ] + worksheet.append_rows(test_rows) + self.feeder.initialise() + metadata_list = list(self.feeder) + + # Validate that only the first row is processed + assert len(metadata_list) == 1 + assert metadata_list[0].metadata.get("url") == "https://example.com" + + + +# TODO + +# Test two sheets +# test two sheets with different columns +# test folder implementation diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py new file mode 100644 index 0000000..e6f5cc6 --- /dev/null +++ b/tests/feeders/test_gworksheet.py @@ -0,0 +1,144 @@ +import pytest +from unittest.mock import MagicMock + +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +class TestGWorksheet: + @pytest.fixture + def mock_worksheet(self): + mock_ws = MagicMock() + mock_ws.get_values.return_value = [ + ["Link", "Archive Status", "Archive Location", "Archive Date"], + ["url1", "archived", "filepath1", "2023-01-01"], + ["url2", "pending", "filepath2", "2023-01-02"], + ] + return mock_ws + + @pytest.fixture + def gworksheet(self, mock_worksheet): + return GWorksheet(mock_worksheet) + + # Test initialization and basic properties + def test_initialization_sets_headers(self, gworksheet): + assert gworksheet.headers == ["link", "archive status", "archive location", "archive date"] + + def test_count_rows_returns_correct_value(self, gworksheet): + # inc header row + assert gworksheet.count_rows() == 3 + + # Test column validation and lookup + @pytest.mark.parametrize( + "col,expected_index", + [ + ("url", 0), + ("status", 1), + ("archive", 2), + ("date", 3), + ], + ) + def test_col_index_returns_correct_index(self, gworksheet, col, expected_index): + assert gworksheet._col_index(col) == expected_index + + def test_check_col_exists_raises_for_invalid_column(self, gworksheet): + with pytest.raises(Exception, match="Column invalid_col"): + 
gworksheet._check_col_exists("invalid_col") + + # Test data retrieval + @pytest.mark.parametrize( + "row,expected", + [ + (1, ["Link", "Archive Status", "Archive Location", "Archive Date"]), + (2, ["url1", "archived", "filepath1", "2023-01-01"]), + (3, ["url2", "pending", "filepath2", "2023-01-02"]), + ], + ) + def test_get_row_returns_correct_data(self, gworksheet, row, expected): + assert gworksheet.get_row(row) == expected + + @pytest.mark.parametrize( + "row,col,expected", + [ + (2, "url", "url1"), + (2, "status", "archived"), + (3, "date", "2023-01-02"), + ], + ) + def test_get_cell_returns_correct_value(self, gworksheet, row, col, expected): + assert gworksheet.get_cell(row, col) == expected + + def test_get_cell_handles_fresh_data(self, mock_worksheet, gworksheet): + mock_worksheet.cell.return_value.value = "fresh_value" + result = gworksheet.get_cell(2, "url", fresh=True) + assert result == "fresh_value" + mock_worksheet.cell.assert_called_once_with(2, 1) + + # Test edge cases and error handling + @pytest.mark.parametrize( + "when_empty,expected", + [ + (True, "default"), + (False, ""), + ], + ) + def test_get_cell_or_default_handles_empty_values( + self, mock_worksheet, when_empty, expected + ): + mock_worksheet.get_values.return_value[1][0] = "" # Empty URL cell + g = GWorksheet(mock_worksheet) + assert ( + g.get_cell_or_default( + 2, "url", default="default", when_empty_use_default=when_empty + ) + == expected + ) + + def test_get_cell_or_default_handles_missing_columns(self, gworksheet): + assert ( + gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe" + ) + + # Test write operations + def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet): + gworksheet.set_cell(2, "url", "new_url") + mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url") + + def test_batch_set_cell_formats_requests_correctly( + self, mock_worksheet, gworksheet + ): + updates = [(2, "url", "new_url"), (3, "status", "processed")] + 
gworksheet.batch_set_cell(updates) + expected_batch = [ + {"range": "A2", "values": [["new_url"]]}, + {"range": "B3", "values": [["processed"]]}, + ] + mock_worksheet.batch_update.assert_called_once_with( + expected_batch, value_input_option="USER_ENTERED" + ) + + def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet): + long_value = "x" * 50000 + gworksheet.batch_set_cell([(1, "url", long_value)]) + submitted_value = mock_worksheet.batch_update.call_args[0][0][0]["values"][0][0] + assert len(submitted_value) == 49999 + + # Test coordinate conversion + @pytest.mark.parametrize( + "row,col,expected", + [ + (1, "url", "A1"), + (2, "status", "B2"), + (3, "archive", "C3"), + (4, "date", "D4"), + ], + ) + def test_to_a1_conversion(self, gworksheet, row, col, expected): + assert gworksheet.to_a1(row, col) == expected + + # Test empty worksheet + def test_empty_worksheet_initialization(self): + mock_ws = MagicMock() + mock_ws.get_values.return_value = [] + g = GWorksheet(mock_ws) + assert g.headers == [] + assert g.count_rows() == 0 diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py new file mode 100644 index 0000000..df1c1f1 --- /dev/null +++ b/tests/storages/test_S3_storage.py @@ -0,0 +1,100 @@ +from typing import Type +import pytest +from unittest.mock import MagicMock, patch, mock_open +from auto_archiver.core import Media +from auto_archiver.modules.s3_storage import s3_storage +from tests.storages.test_storage_base import TestStorageBase + + +class TestGDriveStorage: + """ + Test suite for GDriveStorage. 
+ """ + module_name: str = "s3_storage" + storage: Type[s3_storage] + s3: MagicMock + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + + @patch('boto3.client') + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + self.storage = setup_module(self.module_name, self.config) + self.storage.initialise() + + @patch('boto3.client') + def test_client_initialization(self, mock_boto_client, setup_module): + """Test that S3 client is initialized with correct parameters""" + self.storage.initialise() + mock_boto_client.assert_called_once_with( + 's3', + region_name='test-region', + endpoint_url='https://test-region.example.com', + aws_access_key_id='test-key', + aws_secret_access_key='test-secret' + ) + + def test_get_cdn_url_generation(self): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = self.storage.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + + @patch.object(s3_storage.S3Storage, 'file_in_folder') + def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + self.storage.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify 
upload + assert self.storage.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + result = self.storage.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + + @patch.object(s3_storage.S3Storage, 'is_upload_needed') + def test_uploads_with_correct_parameters(self, mock_upload_needed): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + self.storage.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) \ No newline at end of file diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py new file mode 100644 index 0000000..b7417ad --- /dev/null +++ b/tests/storages/test_gdrive_storage.py @@ -0,0 +1,43 @@ +from typing import Type +import pytest +from unittest.mock import MagicMock, patch +from auto_archiver.core import Media +from auto_archiver.modules.gdrive_storage import GDriveStorage +from auto_archiver.core.metadata import Metadata +from tests.storages.test_storage_base import TestStorageBase + + +class TestGDriveStorage(TestStorageBase): + """ + Test suite for GDriveStorage. 
+ """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + 'root_folder_id': "fake_root_folder_id", + 'oauth_token': None, + 'service_account': 'fake_service_account.json' + } + + @pytest.mark.skip(reason="Requires real credentials") + @pytest.mark.download + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + self.storage.service_account = 'secrets/service_account.json' # Path to real credentials + self.storage.initialise() + assert self.storage.service is not None + + + def test_initialize_fails_with_non_existent_creds(self): + """ + Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist. + """ + # Act and Assert + with pytest.raises(FileNotFoundError) as exc_info: + self.storage.initialise() + assert "No such file or directory" in str(exc_info.value) + diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py new file mode 100644 index 0000000..50d8846 --- /dev/null +++ b/tests/storages/test_storage_base.py @@ -0,0 +1,23 @@ +from typing import Type + +import pytest + +from auto_archiver.core.context import ArchivingContext +from auto_archiver.core.metadata import Metadata +from auto_archiver.core.storage import Storage + + +class TestStorageBase(object): + + module_name: str = None + config: dict = None + + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.storage: Type[Storage] = setup_module( + self.module_name, self.config + ) From 6ab8fd2ee49d5ec4c1bc5f7e22cddf732a0e9371 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 5 Feb 2025 20:39:53 +0100 Subject: [PATCH 067/110] Tidy up 
setting modules as Orchestrator attributes on startup. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't override the values in config['steps'] – the config should be left as is --- src/auto_archiver/core/orchestrator.py | 52 ++++++-------------------- tests/test_orchestrator.py | 1 - 2 files changed, 11 insertions(+), 42 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 8a634de..5ac091c 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -69,6 +69,13 @@ class UniqueAppendAction(argparse.Action): getattr(namespace, self.dest).append(value) class ArchivingOrchestrator: + + feeders: List[Type[Feeder]] + extractors: List[Type[Extractor]] + enrichers: List[Type[Enricher]] + databases: List[Type[Database]] + storages: List[Type[Storage]] + formatters: List[Type[Formatter]] def setup_basic_parser(self): parser = argparse.ArgumentParser( @@ -296,11 +303,7 @@ class ArchivingOrchestrator: step_items.append(loaded_module) check_steps_ok() - self.config['steps'][f"{module_type}s"] = step_items - - - assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again." 
- self.config['steps'][f"{module_type}s"] = step_items + setattr(self, f"{module_type}s", step_items) def load_config(self, config_file: str) -> dict: if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE: @@ -331,9 +334,9 @@ class ArchivingOrchestrator: # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: - logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"])) + logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) - for item in self.feed(): + for _ in self.feed(): pass def cleanup(self)->None: @@ -484,40 +487,7 @@ class ArchivingOrchestrator: # Helper Properties - - @property - def feeders(self) -> List[Type[Feeder]]: - return self._get_property('feeders') - - @property - def extractors(self) -> List[Type[Extractor]]: - return self._get_property('extractors') - - @property - def enrichers(self) -> List[Type[Enricher]]: - return self._get_property('enrichers') - - @property - def databases(self) -> List[Type[Database]]: - return self._get_property('databases') - - @property - def storages(self) -> List[Type[Storage]]: - return self._get_property('storages') - - @property - def formatters(self) -> List[Type[Formatter]]: - return self._get_property('formatters') @property def all_modules(self) -> List[Type[BaseModule]]: - return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters - - def _get_property(self, prop): - try: - f = self.config['steps'][prop] - if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)): - raise TypeError - return f - except: - exit("Property called prior to full initialisation") \ No newline at end of file + return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters \ No newline at end of file diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py 
index 68417aa..5ba57d0 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -89,7 +89,6 @@ def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args): orchestrator.run(test_args + # we still need to load the real path to get the example_module ["--module_paths", "tests/data/invalid_test_modules/"]) - # assert False assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..." From a506f2a88f6e073a2c45367a0f4cb8bab1f79bb7 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 6 Feb 2025 10:19:28 +0100 Subject: [PATCH 068/110] Clarify that an extractor's method can also return False if no valid data was found --- src/auto_archiver/core/extractor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 98f1370..57320df 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -95,5 +95,11 @@ class Extractor(BaseModule): logger.warning(f"Failed to fetch the Media URL: {e}") @abstractmethod - def download(self, item: Metadata) -> Metadata: + def download(self, item: Metadata) -> Metadata | False: + """ + Downloads the media from the given URL and returns a Metadata object with the downloaded media. + + If the URL is not supported or the download fails, this method should return False. 
+ + """ pass \ No newline at end of file From 5b0bad832f0bcf787979f18c5b8027f10b95b0a6 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 10:11:56 +0000 Subject: [PATCH 069/110] Updated test, test metadata --- .../modules/gsheet_db/gsheet_db.py | 1 - .../modules/gsheet_feeder/gsheet_feeder.py | 59 ++++--- .../test_instagram_api_extractor.py | 89 +++++++++- tests/feeders/test_gsheet_feeder.py | 10 +- tests/test_metadata.py | 161 ++++++++++++++++++ 5 files changed, 284 insertions(+), 36 deletions(-) create mode 100644 tests/test_metadata.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 644015e..3bb27b7 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -104,7 +104,6 @@ class GsheetsDb(Database): if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") - # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index d129182..a51574e 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -37,41 +37,48 @@ class GsheetsFeeder(Feeder): def __iter__(self) -> Metadata: sh = self.open_sheet() - for ii, wks in enumerate(sh.worksheets()): - if not self.should_process_sheet(wks.title): - logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules") + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") continue - - logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}') - gw = GWorksheet(wks, header_row=self.header, columns=self.columns) - + logger.info(f'Opening worksheet {ii=}: 
{worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") continue - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) - # TODO: custom status parser(?) aka should_retry_from_status - if status not in ['', None]: continue + def _process_rows(self, gw: GWorksheet) -> Metadata: + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) 
aka should_retry_from_status + if status not in ['', None]: continue - # All checks done - archival process starts here - m = Metadata().set_url(url) - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder) and self.use_sheet_names_in_stored_paths: - folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m - m.set_context('folder', folder) - m.set_context('worksheet', {"row": row, "worksheet": gw}) - yield m + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) - logger.success(f'Finished worksheet {wks.title}') def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index 7a19233..d3f7bd6 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -9,6 +9,7 @@ from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor impor from .test_extractor_base import TestExtractorBase + @pytest.fixture def mock_user_response(): return { @@ -71,11 +72,18 @@ class TestInstagramAPIExtractor(TestExtractorBase): config = { "access_token": "test_access_token", "api_endpoint": "https://api.instagram.com/v1", - # 
"full_profile": False, + "full_profile": False, # "full_profile_max_posts": 0, # "minimize_json_output": True, } + @pytest.fixture + def metadata(self): + m = Metadata() + m.set_url("https://instagram.com/test_user") + m.set("netloc", "instagram.com") + return m + @pytest.mark.parametrize("url,expected", [ ("https://instagram.com/user", [("", "user", "")]), ("https://instagr.am/p/post_id", []), @@ -88,7 +96,6 @@ class TestInstagramAPIExtractor(TestExtractorBase): assert self.extractor.valid_url.findall(url) == expected def test_initialize(self): - self.extractor.initialise() assert self.extractor.api_endpoint[-1] != "/" @pytest.mark.parametrize("input_dict,expected", [ @@ -98,11 +105,85 @@ class TestInstagramAPIExtractor(TestExtractorBase): def test_cleanup_dict(self, input_dict, expected): assert self.extractor.cleanup_dict(input_dict) == expected - def test_download_post(self): + def test_download(self): + pass + + def test_download_post(self, metadata, mock_user_response): # test with context=reel # test with context=post # test with multiple images # test gets text (metadata title) + pass + def test_download_profile_basic(self, metadata, mock_user_response): + """Test basic profile download without full_profile""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_from_url') as mock_download: + # Mock API responses + mock_call.return_value = mock_user_response + mock_download.return_value = "profile.jpg" - pass \ No newline at end of file + result = self.extractor.download_profile(metadata, "test_user") + assert result.status == "insta profile: success" + assert result.get_title() == "Test User" + assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"]) + # Verify profile picture download + mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"}) + mock_download.assert_called_once_with("http://example.com/profile.jpg") + assert len(result.media) == 1 + 
assert result.media[0].filename == "profile.jpg" + + def test_download_profile_full(self, metadata, mock_user_response, mock_story_response): + """Test full profile download with stories/posts""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as mock_stories: + + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + mock_story_response + ] + mock_highlights.return_value = None + mock_stories.return_value = mock_story_response + mock_posts.return_value = None + mock_tagged.return_value = None + + result = self.extractor.download_profile(metadata, "test_user") + assert result.get("#stories") == len(mock_story_response) + mock_posts.assert_called_once_with(result, "123") + assert "errors" not in result.metadata + + def test_download_profile_not_found(self, metadata): + """Test profile not found error""" + with patch.object(self.extractor, 'call_api') as mock_call: + mock_call.return_value = {"user": None} + with pytest.raises(AssertionError) as exc_info: + self.extractor.download_profile(metadata, "invalid_user") + assert "User invalid_user not found" in str(exc_info.value) + + def test_download_profile_error_handling(self, metadata, mock_user_response): + """Test error handling in full profile mode""" + with (patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as stories_tagged, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts + ): + self.extractor.full_profile = True + mock_call.side_effect = [ + 
mock_user_response, + Exception("Stories API failed"), + Exception("Posts API failed") + ] + mock_highlights.return_value = None + mock_tagged.return_value = None + stories_tagged.return_value = None + mock_posts.return_value = None + result = self.extractor.download_profile(metadata, "test_user") + + assert result.is_success() + assert "Error downloading stories for test_user" in result.metadata["errors"] + # assert "Error downloading posts for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index dbd2416..62380f5 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -4,7 +4,7 @@ import gspread import pytest from unittest.mock import patch, MagicMock from auto_archiver.modules.gsheet_feeder import GsheetsFeeder -from auto_archiver.core import Metadata, Feeder, ArchivingContext +from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): @@ -100,21 +100,21 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 1) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @pytest.mark.skip(reason="Not recognising folder column") def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 7) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() testworksheet.wks.title = "TestSheet" gsheet_feeder._set_context(testworksheet, 6) - assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} - assert 
ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..7270c80 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,161 @@ +import pytest +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Any +from auto_archiver.core.metadata import Metadata + + +@pytest.fixture +def basic_metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set("title", "Test Page") + return m + + +@dataclass +class MockMedia: + filename: str = "" + mimetype: str = "" + data: dict = None + + def get(self, key: str, default: Any = None) -> Any: + return self.data.get(key, default) if self.data else default + + def set(self, key: str, value: Any) -> None: + if not self.data: + self.data = {} + self.data[key] = value + + +@pytest.fixture +def media_file(): + def _create(filename="test.txt", mimetype="text/plain", hash_value=None): + m = MockMedia(filename=filename, mimetype=mimetype) + if hash_value: + m.set("hash", hash_value) + return m + + return _create + + +def test_initial_state(): + m = Metadata() + assert m.status == "no archiver" + assert m.metadata == {"_processed_at": m.get("_processed_at")} + assert m.media == [] + assert isinstance(m.get("_processed_at"), datetime) + + +def test_url_properties(basic_metadata): + assert basic_metadata.get_url() == "https://example.com" + assert basic_metadata.netloc == "example.com" + + +def test_simple_merge(basic_metadata): + right = Metadata(status="success") + right.set("title", "Test Title") + + basic_metadata.merge(right) + assert basic_metadata.status == "success" + assert basic_metadata.get("title") == "Test Title" + + +def test_left_merge(): + left = ( + 
Metadata() + .set("tags", ["a"]) + .set("stats", {"views": 10}) + .set("status", "success") + ) + right = ( + Metadata() + .set("tags", ["b"]) + .set("stats", {"likes": 5}) + .set("status", "no archiver") + ) + + left.merge(right, overwrite_left=True) + assert left.get("status") == "no archiver" + assert left.get("tags") == ["a", "b"] + assert left.get("stats") == {"views": 10, "likes": 5} + + +def test_media_management(basic_metadata, media_file): + media1 = media_file(hash_value="abc") + media2 = media_file(hash_value="abc") # Duplicate + media3 = media_file(hash_value="def") + + basic_metadata.add_media(media1, "m1") + basic_metadata.add_media(media2, "m2") + basic_metadata.add_media(media3) + + assert len(basic_metadata.media) == 3 + basic_metadata.remove_duplicate_media_by_hash() + assert len(basic_metadata.media) == 2 + assert basic_metadata.get_media_by_id("m1") == media1 + + +def test_success(): + m = Metadata() + assert not m.is_success() + m.success("context") + assert m.is_success() + assert m.status == "context: success" + + +def test_is_empty(): + m = Metadata() + assert m.is_empty() + # meaningless ids + ( + m.set("url", "example.com") + .set("total_bytes", 100) + .set("archive_duration_seconds", 10) + .set("_processed_at", datetime.now(timezone.utc)) + ) + assert m.is_empty() + + +def test_store(): + pass + +# Test Media operations + + +# Test custom getter/setters + + +def test_get_set_url(): + m = Metadata() + m.set_url("http://example.com") + assert m.get_url() == "http://example.com" + with pytest.raises(AssertionError): + m.set_url("") + assert m.get("url") == "http://example.com" + + +def test_set_content(): + m = Metadata() + m.set_content("Some content") + assert m.get("content") == "Some content" + # Test appending + m.set_content("New content") + # Do we want to add a line break to the method? 
+ assert m.get("content") == "Some contentNew content" + + +def test_choose_most_complex(): + pass + + +def test_get_context(): + m = Metadata() + m.set_context("somekey", "somevalue") + assert m.get_context("somekey") == "somevalue" + assert m.get_context("nonexistent") is None + m.set_context("anotherkey", "anothervalue") + # check the previous is retained + assert m.get_context("somekey") == "somevalue" + assert m.get_context("anotherkey") == "anothervalue" + assert len(m._context) == 2 From 266c7a14e6606cfd1c478cb4ed0ece602646035d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 16:53:00 +0000 Subject: [PATCH 070/110] Context related fixes, some more tests. --- .../modules/gsheet_feeder/gsheet_feeder.py | 4 +- .../modules/s3_storage/__manifest__.py | 3 +- .../modules/s3_storage/s3_storage.py | 6 +- src/auto_archiver/utils/gsheet.py | 53 ----- tests/enrichers/test_meta_enricher.py | 103 +++++++++ .../test_instagram_tbot_extractor.py | 88 +++---- tests/feeders/test_gsheet_feeder.py | 216 +++++++++--------- tests/storages/test_S3_storage.py | 123 ++++++++-- tests/storages/test_storage_base.py | 1 - 9 files changed, 370 insertions(+), 227 deletions(-) delete mode 100644 src/auto_archiver/utils/gsheet.py create mode 100644 tests/enrichers/test_meta_enricher.py diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index a51574e..50bf430 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder): logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") + logger.warning(f"SKIPPED worksheet 
'{worksheet.title}' due to missing required column(s) for {missing_cols}") continue # process and yield metadata here: yield from self._process_rows(gw) logger.success(f'Finished worksheet {worksheet.title}') - def _process_rows(self, gw: GWorksheet) -> Metadata: + def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index df05055..bf032e7 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -3,7 +3,7 @@ "type": ["storage"], "requires_setup": True, "dependencies": { - "python": ["boto3", "loguru"], + "python": ["hash_enricher", "boto3", "loguru"], }, "configs": { "path_generator": { @@ -49,5 +49,6 @@ - Requires S3 credentials (API key and secret) and a bucket name to function. - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures. - Uses `boto3` for interaction with the S3 API. + - Depends on the `HashEnricher` module for hash calculation. 
""" } diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index f324d5c..0c0e275 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -9,10 +9,11 @@ from auto_archiver.core import Media from auto_archiver.core import Storage from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str +from auto_archiver.core.module import get_module NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage, HashEnricher): +class S3Storage(Storage): def setup(self, config: dict) -> None: super().setup(config) @@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher): def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - hd = self.calculate_hash(media.filename) + he = get_module('hash_enricher', self.config) + hd = he.calculate_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py deleted file mode 100644 index 7a8862f..0000000 --- a/src/auto_archiver/utils/gsheet.py +++ /dev/null @@ -1,53 +0,0 @@ -import json, gspread - -from ..core import BaseModule - - -class Gsheets(BaseModule): - name = "gsheets" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO: config should be responsible for conversions - try: self.header = int(self.header) - except: pass - assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" - assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when 
using gsheets." - - # TODO merge this into gsheets processors manifest - @staticmethod - def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py new file mode 100644 index 0000000..a09aaa9 --- /dev/null +++ b/tests/enrichers/test_meta_enricher.py @@ -0,0 +1,103 @@ +import datetime +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch + +import pytest + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.meta_enricher import MetaEnricher + + +@pytest.fixture +def mock_metadata(): + """Creates a mock Metadata object.""" + mock: Metadata = MagicMock(spec=Metadata) + mock.get_url.return_value = "https://example.com" + mock.is_empty.return_value = False # Default to not empty + mock.get_all_media.return_value = [] + return mock + 
+@pytest.fixture +def mock_media(): + """Creates a mock Media object.""" + mock: Media = MagicMock(spec=Media) + mock.filename = "mock_file.txt" + return mock + +@pytest.fixture +def metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set_title("Test Title") + m.set_content("Test Content") + return m + + +@pytest.fixture(autouse=True) +def meta_enricher(setup_module): + return setup_module(MetaEnricher, {}) + + +def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata): + """Test that enrich() does nothing when Metadata is empty.""" + mock_metadata.is_empty.return_value = True + meta_enricher.enrich(mock_metadata) + mock_metadata.get_url.assert_called_once() + + +def test_enrich_file_sizes(meta_enricher, metadata, tmp_path): + """Test that enrich_file_sizes() calculates and sets file sizes correctly.""" + file1 = tmp_path / "testfile_1.txt" + file2 = tmp_path / "testfile_2.txt" + file1.write_text("A" * 1000) + file2.write_text("B" * 2000) + metadata.add_media(Media(str(file1))) + metadata.add_media(Media(str(file2))) + + meta_enricher.enrich_file_sizes(metadata) + + # Verify individual media file sizes + media1 = metadata.get_all_media()[0] + media2 = metadata.get_all_media()[1] + + assert media1.get("bytes") == 1000 + assert media1.get("size") == "1000.0 bytes" + assert media2.get("bytes") == 2000 + assert media2.get("size") == "2.0 KB" + + assert metadata.get("total_bytes") == 3000 + assert metadata.get("total_size") == "2.9 KB" + +@pytest.mark.parametrize( + "size, expected", + [ + (500, "500.0 bytes"), + (1024, "1.0 KB"), + (2048, "2.0 KB"), + (1048576, "1.0 MB"), + (1073741824, "1.0 GB"), + ], +) +def test_human_readable_bytes(size, expected): + """Test that human_readable_bytes() converts sizes correctly.""" + enricher = MetaEnricher() + assert enricher.human_readable_bytes(size) == expected + +def test_enrich_file_sizes_no_media(meta_enricher, metadata): + """Test that enrich_file_sizes() handles empty media list gracefully.""" + 
meta_enricher.enrich_file_sizes(metadata) + assert metadata.get("total_bytes") == 0 + assert metadata.get("total_size") == "0.0 bytes" + + +def test_enrich_archive_duration(meta_enricher, metadata): + # Set fixed "processed at" time in the past + processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30) + metadata.set("_processed_at", processed_at) + # patch datetime + with patch("datetime.datetime") as mock_datetime: + mock_now = datetime.now(timezone.utc) + mock_datetime.now.return_value = mock_now + meta_enricher.enrich_archive_duration(metadata) + + assert metadata.get("archive_duration_seconds") == 630 \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index 4fe80be..b82641d 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -5,15 +5,16 @@ from unittest.mock import patch, MagicMock import pytest +from auto_archiver.core import Metadata from auto_archiver.core.extractor import Extractor from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor - +from tests.extractors.test_extractor_base import TestExtractorBase TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") @pytest.fixture -def test_session_file(tmpdir): +def session_file(tmpdir): """Fixture to create a test session file.""" session_file = os.path.join(tmpdir, "test_session.session") with open(session_file, "w") as f: @@ -21,27 +22,34 @@ def test_session_file(tmpdir): return session_file.replace(".session", "") -@pytest.mark.incremental -class TestInstagramTbotExtractor(object): - """ - Test suite for InstagramTbotExtractor. 
- """ +@pytest.fixture(autouse=True) +def patch_extractor_methods(request, setup_module): + with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \ + patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None): + if hasattr(request, 'cls') and hasattr(request.cls, 'config'): + request.cls.extractor = setup_module("instagram_tbot_extractor", request.cls.config) + + yield + +@pytest.fixture +def metadata_sample(): + m = Metadata() + m.set_title("Test Title") + m.set_timestamp("2021-01-01T00:00:00Z") + m.set_url("https://www.instagram.com/p/1234567890") + return m + + +class TestInstagramTbotExtractor: extractor_module = "instagram_tbot_extractor" extractor: InstagramTbotExtractor config = { "api_id": 12345, "api_hash": "test_api_hash", - # "session_file" + "session_file": "test_session", } - @pytest.fixture(autouse=True) - def setup_extractor(self, setup_module): - assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" - assert self.config is not None, "self.config must be a dict set on the subclass" - extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) - return extractor - @pytest.fixture def mock_telegram_client(self): """Fixture to mock TelegramClient interactions.""" @@ -50,22 +58,11 @@ class TestInstagramTbotExtractor(object): mock_client.return_value = instance yield instance - - # @pytest.fixture - # def mock_session_file(self, temp_session_file): - # """Patch the extractor’s session file setup to use a temporary path.""" - # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): - # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): - # yield # Mocks are applied for the duration of the test - - @pytest.fixture - def metadata_sample(self): - """Loads a Metadata object from a pickle file.""" - with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: - return 
pickle.load(f) + def test_extractor_is_initialized(self): + assert self.extractor is not None - @pytest.mark.download + @patch("time.sleep") @pytest.mark.parametrize("url, expected_status, bot_responses", [ ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol")]), @@ -74,32 +71,19 @@ class TestInstagramTbotExtractor(object): ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), ]) - def test_download(self, url, expected_status, bot_responses, metadata_sample): + def test_download(self, mock_sleep, url, expected_status, bot_responses, metadata_sample): """Test the `download()` method with various Instagram URLs.""" metadata_sample.set_url(url) - self.extractor.initialise() + self.extractor.client = MagicMock() result = self.extractor.download(metadata_sample) - if expected_status: - assert result.is_success() - assert result.status == expected_status - assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] - else: - assert result is False - # self.extractor.cleanup() - - # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') - # @patch.object(InstagramTbotExtractor, '_process_messages') - # def test_download_invalid_link_returns_false( - # self, mock_process, mock_send, extractor, metadata_instagram - # ): - # # Setup Mocks - # # _send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) - # mock_chat = MagicMock() - # mock_send.return_value = 
(mock_chat, 100) - # # _process_messages -> simulate it returns the text "You must enter a URL to a post" - # mock_process.return_value = "You must enter a URL to a post" - # result = extractor.download(metadata_instagram) - # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + pass + # TODO fully mock or use as authenticated test + # if expected_status: + # assert result.is_success() + # assert result.status == expected_status + # assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + # else: + # assert result is False diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 62380f5..103610e 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -9,57 +9,52 @@ from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
- (shouldn't really be asserting in there) + (shouldn't really be asserting in there) """ with patch("gspread.service_account"): - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": None}) with pytest.raises(AssertionError): - feeder.initialise() + setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, + ) @pytest.fixture def gsheet_feeder(setup_module) -> GsheetsFeeder: - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": "test-auto-archiver", - "sheet_id": None, - "header": 1, - "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, - "allow_worksheets": set(), - "block_worksheets": set(), - "use_sheet_names_in_stored_paths": True, - } - ) + with patch("gspread.service_account"): + feeder = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + }, + ) feeder.gsheets_client = MagicMock() return feeder -@pytest.fixture() -def worksheet(unpickle): - # Load the worksheet data from the pickle 
file - # only works for simple usage, cant reauthenticate but give structure - return unpickle("test_worksheet.pickle") - - -class TestWorksheet(): +class TestWorksheet: """ mimics the bits we need from gworksheet """ @@ -68,12 +63,17 @@ class TestWorksheet(): title = "TestSheet" rows = [ - { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 4, "url": "", "status": "", "folder": "" }, - { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, - { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, - ] + {"row": 2, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 3, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 4, "url": "", "status": "", "folder": ""}, + {"row": 5, "url": "https://another.com", "status": None, "folder": ""}, + { + "row": 6, + "url": "https://another.com", + "status": "success", + "folder": "some_folder", + }, + ] def __init__(self): self.wks = self.SheetSheet() @@ -91,6 +91,7 @@ class TestWorksheet(): matching = next((r for r in self.rows if r["row"] == row), {}) return matching.get(col_name, default) + def test__process_rows(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) @@ -98,9 +99,12 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert isinstance(metadata_items[0], Metadata) assert metadata_items[0].get("url") == "http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): - gsheet_feeder._set_context(worksheet, 1) - assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} + +def test__set_metadata(gsheet_feeder: GsheetsFeeder): + worksheet = TestWorksheet() + metadata = Metadata() + gsheet_feeder._set_context(metadata, worksheet, 1) + assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} 
@pytest.mark.skip(reason="Not recognising folder column") @@ -111,18 +115,24 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() + metadata = Metadata() testworksheet.wks.title = "TestSheet" - gsheet_feeder._set_context(testworksheet, 6) - assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} - assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" + gsheet_feeder._set_context(metadata, testworksheet, 6) + assert metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") -@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ - ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), - (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") -]) -def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): +@pytest.mark.parametrize( + "sheet, sheet_id, expected_method, expected_arg, description", + [ + ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"), + ], +) +def test_open_sheet_with_name_or_id( + setup_module, sheet, sheet_id, expected_method, expected_arg, description +): """Ensure open_sheet() correctly opens by name or ID based on configuration.""" with patch("gspread.service_account") as mock_service_account: mock_client = MagicMock() @@ -131,15 +141,16 @@ def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_meth mock_client.open_by_key.return_value = "MockSheet" # Setup module with parameterized values - feeder = setup_module("gsheet_feeder", { - "service_account": "dummy.json", - "sheet": sheet, - "sheet_id": sheet_id - }) + 
feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, + ) feeder.initialise() sheet_result = feeder.open_sheet() # Validate the correct method was called - getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + getattr(mock_client, expected_method).assert_called_once_with( + expected_arg + ), f"Failed: {description}" assert sheet_result == "MockSheet", f"Failed: {description}" @@ -150,10 +161,10 @@ def test_open_sheet_with_sheet_id(setup_module): mock_client = MagicMock() mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": "ABC123"}) + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, + ) feeder.initialise() sheet = feeder.open_sheet() mock_client.open_by_key.assert_called_once_with("ABC123") @@ -161,47 +172,51 @@ def test_open_sheet_with_sheet_id(setup_module): def test_should_process_sheet(setup_module): - gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", - "sheet": "TestSheet", - "sheet_id": None, - "allow_worksheets": {"TestSheet", "Sheet2"}, - "block_worksheets": {"Sheet3"}} - ) + with patch("gspread.service_account"): + gdb = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}, + }, + ) assert gdb.should_process_sheet("TestSheet") == True assert gdb.should_process_sheet("Sheet3") == False # False if allow_worksheets is set assert gdb.should_process_sheet("AnotherSheet") == False - -@pytest.mark.skip +# @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: + """Testing GSheetsFeeder class""" - """ Testing GSheetsFeeder class """ - module_name: 
str = 'gsheet_feeder' + module_name: str = "gsheet_feeder" feeder: GsheetsFeeder + # You must follow the setup process explain in the docs for this to work config: dict = { - # TODO: Create test creds "service_account": "secrets/service_account.json", "sheet": "test-auto-archiver", "sheet_id": None, "header": 1, "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, "allow_worksheets": set(), "block_worksheets": set(), "use_sheet_names_in_stored_paths": True, @@ -213,9 +228,7 @@ class TestGSheetsFeederReal: self.module_name is not None ), "self.module_name must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.feeder: Type[Feeder] = setup_module( - self.module_name, self.config - ) + self.feeder: Type[Feeder] = setup_module(self.module_name, self.config) def reset_test_sheet(self): """Clears test sheet and re-adds headers to ensure consistent test results.""" @@ -225,19 +238,17 @@ class TestGSheetsFeederReal: worksheet.clear() worksheet.append_row(["Link", "Archive Status"]) - def test_initialise(self): - self.feeder.initialise() + def test_setup(self): assert hasattr(self.feeder, "gsheets_client") - @pytest.mark.download def test_open_sheet_real_connection(self): 
"""Ensure open_sheet() connects to a real Google Sheets instance.""" - self.feeder.initialise() sheet = self.feeder.open_sheet() assert sheet is not None, "open_sheet() should return a valid sheet instance" - assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + assert hasattr( + sheet, "worksheets" + ), "Returned object should have worksheets method" - @pytest.mark.download def test_iter_yields_metadata_real_data(self): """Ensure __iter__() yields Metadata objects for real test sheet data.""" self.reset_test_sheet() @@ -260,7 +271,6 @@ class TestGSheetsFeederReal: assert metadata_list[0].metadata.get("url") == "https://example.com" - # TODO # Test two sheets diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index df1c1f1..60b40e6 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,9 +1,101 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, patch, PropertyMock from auto_archiver.core import Media +from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -from tests.storages.test_storage_base import TestStorageBase + + +@patch('boto3.client') +@pytest.fixture +def s3_store(setup_module): + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + s3_storage = setup_module("s3_storage", config) + return s3_storage + +def test_client_initialization(s3_store): + """Test that S3 client is initialized with correct parameters""" + assert s3_store.s3 is not None + assert s3_store.s3.meta.region_name == 'test-region' + + +def 
test_get_cdn_url_generation(s3_store): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = s3_store.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert s3_store.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + +@patch.object(s3_storage.S3Storage, 'file_in_folder') +def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + s3_store.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify upload + assert s3_store.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + result = s3_store.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + +@patch.object(s3_storage.S3Storage, 'is_upload_needed') +def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + s3_store.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) + + + + + + + + +# ============================================================ + + + class 
TestGDriveStorage: @@ -29,20 +121,13 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): + he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - self.storage.initialise() - @patch('boto3.client') - def test_client_initialization(self, mock_boto_client, setup_module): + def test_client_initialization(self, setup_storage): """Test that S3 client is initialized with correct parameters""" - self.storage.initialise() - mock_boto_client.assert_called_once_with( - 's3', - region_name='test-region', - endpoint_url='https://test-region.example.com', - aws_access_key_id='test-key', - aws_secret_access_key='test-secret' - ) + assert self.storage.s3 is not None + assert self.storage.s3.meta.region_name == 'test-region' def test_get_cdn_url_generation(self): """Test CDN URL formatting """ @@ -53,6 +138,18 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_upload_decision_logic(self): + """Test is_upload_needed under different conditions""" + media = Media("test.txt") + + # Test random_no_duplicate disabled + assert self.storage.is_upload_needed(media) is True + + # Test duplicate exists + self.storage.random_no_duplicate = True + with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + assert self.storage.is_upload_needed(media) is False + assert media.key == 'existing.txt' @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 50d8846..7578acd 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -2,7 +2,6 @@ from typing import Type import pytest -from auto_archiver.core.context import ArchivingContext from auto_archiver.core.metadata import Metadata from 
auto_archiver.core.storage import Storage From e9ad1e1b85dbea08354189e775ae4718b4ea52cb Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 22:01:55 +0000 Subject: [PATCH 071/110] Pass media to storage cdn_call --- src/auto_archiver/core/media.py | 2 +- .../modules/gdrive_storage/gdrive_storage.py | 11 +- tests/storages/test_S3_storage.py | 149 +++++------------- 3 files changed, 49 insertions(+), 113 deletions(-) diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 2cb6fc9..952a025 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -65,7 +65,7 @@ class Media: def is_stored(self, in_storage) -> bool: # checks if the media is already stored in the given storage - return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u]) + return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u]) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index b764f1d..cc9cf3d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -74,7 +74,8 @@ class GDriveStorage(Storage): parent_id = folder_id # get id of file inside folder (or sub folder) - file_id = self._get_id_from_parent_and_name(folder_id, filename) + # TODO: supressing the error as being checked before first upload + file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: @@ -106,7 +107,13 @@ class GDriveStorage(Storage): # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass - def _get_id_from_parent_and_name(self, parent_id: str, name: str, 
retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): + def _get_id_from_parent_and_name(self, parent_id: str, + name: str, + retries: int = 1, + sleep_seconds: int = 10, + use_mime_type: bool = False, + raise_on_missing: bool = True, + use_cache=False): """ Retrieves the id of a folder or file from its @name and the @parent_id folder Optionally does multiple @retries and sleeps @sleep_seconds between them diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 60b40e6..2594e73 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,103 +1,11 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, PropertyMock +from unittest.mock import MagicMock, patch from auto_archiver.core import Media from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -@patch('boto3.client') -@pytest.fixture -def s3_store(setup_module): - config: dict = { - "path_generator": "flat", - "filename_generator": "static", - "bucket": "test-bucket", - "region": "test-region", - "key": "test-key", - "secret": "test-secret", - "random_no_duplicate": False, - "endpoint_url": "https://{region}.example.com", - "cdn_url": "https://cdn.example.com/{key}", - "private": False, - } - s3_storage = setup_module("s3_storage", config) - return s3_storage - -def test_client_initialization(s3_store): - """Test that S3 client is initialized with correct parameters""" - assert s3_store.s3 is not None - assert s3_store.s3.meta.region_name == 'test-region' - - -def test_get_cdn_url_generation(s3_store): - """Test CDN URL formatting """ - media = Media("test.txt") - media.key = "path/to/file.txt" - url = s3_store.get_cdn_url(media) - assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" - assert s3_store.get_cdn_url(media) == 
"https://cdn.example.com/another/path.jpg" - - -@patch.object(s3_storage.S3Storage, 'file_in_folder') -def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): - """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration - s3_store.random_no_duplicate = True - mock_file_in_folder.return_value = "existing_folder/existing_file.txt" - # Create test media with calculated hash - media = Media("test.txt") - media.key = "original_path.txt" - - # Mock hash calculation - with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" - # Verify upload - assert s3_store.is_upload_needed(media) is False - assert media.key == "existing_folder/existing_file.txt" - assert media.get("previously archived") is True - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - result = s3_store.uploadf(None, media) - mock_upload.assert_not_called() - assert result is True - -@patch.object(s3_storage.S3Storage, 'is_upload_needed') -def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): - media = Media("test.txt") - mock_upload_needed.return_value = True - media.mimetype = 'image/png' - mock_file = MagicMock() - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - s3_store.uploadf(mock_file, media) - - # Verify core upload parameters - mock_upload.assert_called_once_with( - mock_file, - Bucket='test-bucket', - # Key='original_key.txt', - Key=None, - ExtraArgs={ - 'ACL': 'public-read', - 'ContentType': 'image/png' - } - ) - - - - - - - - -# ============================================================ - - - - - class TestGDriveStorage: """ Test suite for GDriveStorage. 
@@ -121,10 +29,9 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): - he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - def test_client_initialization(self, setup_storage): + def test_client_initialization(self): """Test that S3 client is initialized with correct parameters""" assert self.storage.s3 is not None assert self.storage.s3.meta.region_name == 'test-region' @@ -138,37 +45,55 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_uploadf_sets_acl_public(self): + media = Media("test.txt") + mock_file = MagicMock() + with patch.object(self.storage.s3, 'upload_fileobj') as mock_s3_upload, \ + patch.object(self.storage, 'is_upload_needed', return_value=True): + self.storage.uploadf(mock_file, media) + mock_s3_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + Key=media.key, + ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} + ) + def test_upload_decision_logic(self): """Test is_upload_needed under different conditions""" media = Media("test.txt") - - # Test random_no_duplicate disabled + # Test default state (random_no_duplicate=False) assert self.storage.is_upload_needed(media) is True + # Set duplicate checking config to true: - # Test duplicate exists self.storage.random_no_duplicate = True - with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: + mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' + mock_file_in_folder.return_value = 'existing_key.txt' + # Test duplicate result assert self.storage.is_upload_needed(media) is False - assert media.key == 'existing.txt' + assert media.key == 'existing_key.txt' + 
mock_file_in_folder.assert_called_with( + # (first 24 chars of hash) + 'no-dups/beepboop123beepboop123be' + ) + @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration self.storage.random_no_duplicate = True mock_file_in_folder.return_value = "existing_folder/existing_file.txt" # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - # Mock hash calculation - with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" assert media.get("previously archived") is True - with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: result = self.storage.uploadf(None, media) mock_upload.assert_not_called() @@ -177,21 +102,25 @@ class TestGDriveStorage: @patch.object(s3_storage.S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") + media.key = "original_key.txt" mock_upload_needed.return_value = True media.mimetype = 'image/png' mock_file = MagicMock() with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: self.storage.uploadf(mock_file, media) - - # Verify core upload parameters + # verify call occured with these params mock_upload.assert_called_once_with( mock_file, Bucket='test-bucket', - # Key='original_key.txt', - Key=None, + Key='original_key.txt', ExtraArgs={ 'ACL': 'public-read', 'ContentType': 'image/png' } - ) \ No newline at end of file + ) + + def test_file_in_folder_exists(self): + with 
patch.object(self.storage.s3, 'list_objects') as mock_list_objects: + mock_list_objects.return_value = {'Contents': [{'Key': 'path/to/file.txt'}]} + assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' \ No newline at end of file From 2920cf685f8c556cbdfa8d805f1eb20b8fe41d66 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 12:35:40 +0000 Subject: [PATCH 072/110] Small fixes to whisper_enricher.py. --- src/auto_archiver/modules/whisper_enricher/__manifest__.py | 6 ++++-- .../modules/whisper_enricher/whisper_enricher.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index f7ad1b3..884de66 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -6,8 +6,10 @@ "python": ["s3_storage", "loguru", "requests"], }, "configs": { - "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, - "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, + "api_endpoint": {"required": True, + "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, + "api_key": {"required": True, + "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py 
b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 8ca2131..a7298e4 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -110,7 +110,7 @@ class WhisperEnricher(Enricher): def _get_s3_storage(self) -> S3Storage: try: - return next(s for s in self.storages if s.__class__ == S3Storage) + return next(s for s in self.config['steps']['storages'] if s == 's3_storage') except: logger.warning("No S3Storage instance found in storages") return From 950624dd4bb0e917abbe58c98351bbabd26d0bb3 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 20:26:00 +0000 Subject: [PATCH 073/110] Fix S3 storage to media in whisper_enricher.py. --- .../modules/whisper_enricher/__manifest__.py | 7 +++++-- .../whisper_enricher/whisper_enricher.py | 19 ++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 884de66..1539df6 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -{ +a={ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, @@ -12,7 +12,9 @@ "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + "action": {"default": "translate", + "help": "which Whisper operation to execute", + "choices": ["transcribe", "translate", "language_detection"]}, }, "description": """ Integrates with a Whisper API service 
to transcribe, translate, or detect the language of audio and video files. @@ -27,6 +29,7 @@ ### Notes - Requires a Whisper API endpoint and API key for authentication. - Only compatible with S3-compatible storage systems for media file accessibility. + - ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files. - Handles multiple jobs and retries for failed or incomplete processing. """ } diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a7298e4..004d91c 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -15,17 +15,21 @@ class WhisperEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - if not self._get_s3_storage(): + storages = self.config['steps']['storages'] + if not "s3_storage" in storages: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return + self.s3 = get_module("s3_storage", self.config) url = to_enrich.get_url() logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich, storages=self.storages) + # TODO: this used to pass all storage items to store now + # Now only passing S3, the rest will get added later in the usual order (?) 
+ m.store(url=url, metadata=to_enrich, storages=[self.s3]) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -53,8 +57,8 @@ class WhisperEnricher(Enricher): to_enrich.set_content(f"\n[automatic video transcript]: {v}") def submit_job(self, media: Media): - s3 = get_module("s3_storage", self.config) - s3_url = s3.get_cdn_url(media) + + s3_url = self.s3.get_cdn_url(media) assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls " payload = { "url": s3_url, @@ -107,10 +111,3 @@ class WhisperEnricher(Enricher): logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}") return result return False - - def _get_s3_storage(self) -> S3Storage: - try: - return next(s for s in self.config['steps']['storages'] if s == 's3_storage') - except: - logger.warning("No S3Storage instance found in storages") - return From 63aba6ad3994a27b7e95116dd9d6b8c4fd40e452 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Feb 2025 21:54:49 +0100 Subject: [PATCH 074/110] Fix sphinx-autoapi imports --- src/auto_archiver/core/extractor.py | 2 +- src/auto_archiver/core/orchestrator.py | 2 +- .../modules/generic_extractor/generic_extractor.py | 2 +- src/auto_archiver/utils/gsheet.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 57320df..794c06c 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -17,7 +17,7 @@ from loguru import logger from retrying import retry import re -from ..core import Metadata, BaseModule +from auto_archiver.core import Metadata, BaseModule class Extractor(BaseModule): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 5ac091c..641f099 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,7 +19,7 @@ from rich_argparse import RichHelpFormatter from .metadata import 
Metadata, Media -from ..version import __version__ +from auto_archiver.version import __version__ from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index d1b1fb6..86e978f 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger from auto_archiver.core.extractor import Extractor -from ...core import Metadata, Media +from auto_archiver.core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 7a8862f..c36a032 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -1,6 +1,6 @@ import json, gspread -from ..core import BaseModule +from auto_archiver.core import BaseModule class Gsheets(BaseModule): From 1fad37fd934ba26835c9eb20222d95210a1e513a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Feb 2025 23:08:30 +0100 Subject: [PATCH 075/110] Remove blank file --- src/auto_archiver/core/authentication.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/auto_archiver/core/authentication.py diff --git a/src/auto_archiver/core/authentication.py b/src/auto_archiver/core/authentication.py deleted file mode 100644 index e69de29..0000000 From e9dd321dcd548cc02d7fa2a0d0171feed1226c51 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 13:06:24 +0100 Subject: [PATCH 076/110] Fix setting cli_feeder as default feeder on clean install --- 
src/auto_archiver/core/config.py | 3 ++- src/auto_archiver/core/orchestrator.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 2d462e4..8f36c54 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -36,6 +36,7 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES # a dictionary of authentication information that can be used by extractors to login to website. # you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com) # Common login 'types' are username/password, cookie, api key/token. +# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. # Some Examples: # facebook.com: # username: "my_username" @@ -163,6 +164,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap: def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save = deepcopy(config) - config.pop('urls', None) + config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 641f099..20212ce 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -128,6 +128,10 @@ class ArchivingOrchestrator: elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) + + # for simple mode, we use the cli_feeder and any modules that don't require setup + yaml_config['steps']['feeders'] = ['cli_feeder'] + # add them to the config for module in simple_modules: for module_type in module.type: @@ -237,18 +241,18 @@ class ArchivingOrchestrator: if log_file := logging_config['file']: 
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) - - def install_modules(self): + def install_modules(self, modules_by_type): """ - Swaps out the previous 'strings' in the config with the actual modules and loads them + Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the + orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type + are loaded, the program will exit with an error message. """ invalid_modules = [] for module_type in BaseModule.MODULE_TYPES: step_items = [] - modules_to_load = self.config['steps'][f"{module_type}s"] - + modules_to_load = modules_by_type[f"{module_type}s"] assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" def check_steps_ok(): @@ -264,9 +268,11 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': + # pseudo module, don't load it + breakpoint() urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.") + logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") exit() # cli_feeder is a pseudo module, it just takes the command line args def feed(self) -> Generator[Metadata]: @@ -330,7 +336,7 @@ class ArchivingOrchestrator: self.setup_complete_parser(basic_config, yaml_config, unused_args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - self.install_modules() + self.install_modules(self.config['steps']) # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: From 74207d7821e0306de8e3b6da00cf263edfe0293c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 13:27:11 +0100 Subject: [PATCH 077/110] Implementation tests for auto-archiver --- src/auto_archiver/core/orchestrator.py | 2 -- tests/test_implementation.py | 35 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 tests/test_implementation.py diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 20212ce..a451443 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -111,7 +111,6 @@ class ArchivingOrchestrator: # if full, we'll load all modules # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' # but should we add them? Or should we just add them to the 'complete' parser? - if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? @@ -269,7 +268,6 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': # pseudo module, don't load it - breakpoint() urls = self.config['urls'] if not urls: logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") diff --git a/tests/test_implementation.py b/tests/test_implementation.py new file mode 100644 index 0000000..82d5d0f --- /dev/null +++ b/tests/test_implementation.py @@ -0,0 +1,35 @@ +import sys +import pytest + +from auto_archiver.__main__ import main + +@pytest.fixture +def orchestration_file(tmp_path): + return (tmp_path / "example_orch.yaml").as_posix() + +@pytest.fixture +def autoarchiver(tmp_path, monkeypatch): + + def _autoarchiver(args=["--config", "example_orch.yaml"]): + # change dir to tmp_path + monkeypatch.chdir(tmp_path) + with monkeypatch.context() as m: + m.setattr(sys, "argv", ["auto-archiver"] + args) + return main() + + return _autoarchiver + + +def test_run_auto_archiver_no_args(caplog, autoarchiver): + with pytest.raises(SystemExit): + autoarchiver([]) + + assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text + + +def test_run_auto_archiver_invalid_file(caplog, autoarchiver, monkeypatch): + # exec 'auto-archiver' on the command lin + with pytest.raises(SystemExit): + autoarchiver() + + assert "Make sure the file exists and try again, or run without th" in caplog.text \ No newline at end of file From f3f6b928172fe597c772e2c677a3f3d118f02bef Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 12:43:21 +0000 Subject: [PATCH 078/110] Implementation test cleanup --- tests/test_implementation.py | 45 ++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/tests/test_implementation.py b/tests/test_implementation.py index 82d5d0f..7e33651 100644 --- a/tests/test_implementation.py +++ b/tests/test_implementation.py @@ -3,33 +3,60 @@ import pytest from auto_archiver.__main__ import main + @pytest.fixture -def orchestration_file(tmp_path): +def orchestration_file_path(tmp_path): return (tmp_path / "example_orch.yaml").as_posix() @pytest.fixture -def autoarchiver(tmp_path, monkeypatch): +def 
orchestration_file(orchestration_file_path): + def _orchestration_file(content=''): + with open(orchestration_file_path, "w") as f: + f.write(content) + return orchestration_file_path + + return _orchestration_file + +@pytest.fixture +def autoarchiver(tmp_path, monkeypatch, request): + def _autoarchiver(args=[]): + + def cleanup(): + from loguru import logger + if not logger._core.handlers.get(0): + logger._core.handlers_count = 0 + logger.add(sys.stderr) + + request.addfinalizer(cleanup) - def _autoarchiver(args=["--config", "example_orch.yaml"]): # change dir to tmp_path monkeypatch.chdir(tmp_path) with monkeypatch.context() as m: m.setattr(sys, "argv", ["auto-archiver"] + args) return main() - + return _autoarchiver def test_run_auto_archiver_no_args(caplog, autoarchiver): with pytest.raises(SystemExit): - autoarchiver([]) + autoarchiver() assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text - -def test_run_auto_archiver_invalid_file(caplog, autoarchiver, monkeypatch): +def test_run_auto_archiver_invalid_file(caplog, autoarchiver): # exec 'auto-archiver' on the command lin with pytest.raises(SystemExit): - autoarchiver() + autoarchiver(["--config", "nonexistent_file.yaml"]) - assert "Make sure the file exists and try again, or run without th" in caplog.text \ No newline at end of file + assert "Make sure the file exists and try again, or run without th" in caplog.text + +def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file): + # create a valid (empty) orchestration file + path = orchestration_file(content="") + # exec 'auto-archiver' on the command lin + with pytest.raises(SystemExit): + autoarchiver(["--config", path]) + + # should treat an empty file as if there is no file at all + assert " No URLs provided. 
Please provide at least one URL via the com" in caplog.text From 7c848046e88a12d6b9ea89c7b6b34ab76ef009e8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:59:32 +0000 Subject: [PATCH 079/110] adds better info about wrong/missing modules --- src/auto_archiver/core/module.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index dec67e1..f3fbec5 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -13,7 +13,7 @@ import copy import sys from importlib.util import find_spec import os -from os.path import join, dirname +from os.path import join from loguru import logger import auto_archiver from .base_module import BaseModule @@ -64,8 +64,10 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa if module_name in _LAZY_LOADED_MODULES: return _LAZY_LOADED_MODULES[module_name] - module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0] - return module + available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) + if not available: + raise IndexError(f"Module '{module_name}' not found. 
Are you sure it's installed/exists?") + return available[0] def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: From 8fb3dc754b14b76833a12daa091ef608edf6a61c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:59:51 +0000 Subject: [PATCH 080/110] fixing telethon extractor to use default entrypoint --- src/auto_archiver/modules/telethon_extractor/__init__.py | 2 +- .../modules/telethon_extractor/telethon_extractor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index a837fdf..2eaa57c 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_extractor import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3e952e8..21fc4dc 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -6,14 +6,14 @@ from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError from loguru import logger from tqdm import tqdm -import re, time, json, os +import re, time, os from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str -class TelethonArchiver(Extractor): +class TelethonExtractor(Extractor): valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") 
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") From 15abf686b1315b3a35a628df12f687b9aec431d5 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:48:54 +0000 Subject: [PATCH 081/110] decouples s3_storage from hash_enricher --- src/auto_archiver/core/base_module.py | 2 +- .../modules/hash_enricher/hash_enricher.py | 8 ++------ src/auto_archiver/modules/s3_storage/s3_storage.py | 8 +++----- src/auto_archiver/utils/misc.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index fcfe9ea..5c6ecbb 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -63,7 +63,7 @@ class BaseModule(ABC): def setup(self, config: dict): authentication = config.get('authentication', {}) - # extract out contatenated sites + # extract out concatenated sites for key, val in copy(authentication).items(): if "," in key: for site in key.split(","): diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 58c6abe..b3ca8be 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -12,6 +12,7 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata +from auto_archiver.utils.misc import calculate_file_hash class HashEnricher(Enricher): @@ -35,9 +36,4 @@ class HashEnricher(Enricher): elif self.algorithm == "SHA3-512": hash = hashlib.sha3_512() else: return "" - with open(filename, "rb") as f: - while True: - buf = f.read(self.chunksize) - if not buf: break - hash.update(buf) - return hash.hexdigest() + return calculate_file_hash(filename, hash, self.chunksize) diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py 
b/src/auto_archiver/modules/s3_storage/s3_storage.py index f324d5c..2f85164 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -7,12 +7,11 @@ from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage -from auto_archiver.modules.hash_enricher import HashEnricher -from auto_archiver.utils.misc import random_str +from auto_archiver.utils.misc import calculate_file_hash, random_str NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage, HashEnricher): +class S3Storage(Storage): def setup(self, config: dict) -> None: super().setup(config) @@ -42,14 +41,13 @@ class S3Storage(Storage, HashEnricher): extra_args['ContentType'] = media.mimetype except Exception as e: logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") - self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) return True def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - hd = self.calculate_hash(media.filename) + hd = calculate_file_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 300a710..3af5a54 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -5,6 +5,7 @@ import json import uuid from datetime import datetime import requests +import hashlib from loguru import logger @@ -54,9 +55,20 @@ def update_nested_dict(dictionary, update_dict): else: dictionary[key] = value + def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" return str(uuid.uuid4()).replace("-", "")[:length] + def json_loader(cli_val): return json.loads(cli_val) + + +def calculate_file_hash(filename: str, hash_algo = hashlib.sha256(), 
chunksize: int = 16000000) -> str: + with open(filename, "rb") as f: + while True: + buf = f.read(chunksize) + if not buf: break + hash_algo.update(buf) + return hash_algo.hexdigest() From f311621e58446983fb95d9e510249855a7687f61 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 15:57:42 +0000 Subject: [PATCH 082/110] Small fixes. Add timestamp helper method. --- .../modules/gdrive_storage/gdrive_storage.py | 7 +- .../modules/gsheet_db/gsheet_db.py | 70 ++++++++++--------- .../telethon_extractor/telethon_extractor.py | 4 +- .../modules/whisper_enricher/__manifest__.py | 2 +- .../whisper_enricher/whisper_enricher.py | 13 ++-- src/auto_archiver/utils/misc.py | 36 +++++++++- tests/databases/test_gsheet_db.py | 8 ++- .../test_instagram_api_extractor.py | 3 +- .../test_instagram_tbot_extractor.py | 1 - tests/feeders/test_gsheet_feeder.py | 9 +-- tests/storages/test_gdrive_storage.py | 41 ++++++++--- tests/test_metadata.py | 4 ++ 12 files changed, 129 insertions(+), 69 deletions(-) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index cc9cf3d..910f48b 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -70,12 +70,15 @@ class GDriveStorage(Storage): filename = path_parts[-1] logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) parent_id = folder_id - # get id of file inside folder (or sub folder) # TODO: supressing the error as being checked before first upload file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) + if not file_id: + # + logger.info(f"file 
{filename} not found in folder {folder_id}") + return None return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 3bb27b7..682eb94 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -1,6 +1,4 @@ from typing import Union, Tuple - -import datetime from urllib.parse import quote from loguru import logger @@ -8,33 +6,33 @@ from loguru import logger from auto_archiver.core import Database from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.utils.misc import get_current_timestamp class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. + could be updated in the future to support non-GsheetFeeder metadata """ - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') + gw.set_cell(row, "status", "Archive in progress") - def failed(self, item: Metadata, reason:str) -> None: + def failed(self, item: Metadata, reason: str) -> None: logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') + self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') + self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check if the given item has been archived already""" return False - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved 
to DB""" logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) @@ -46,23 +44,25 @@ class GsheetsDb(Database): def batch_if_valid(col, val, final_value=None): final_value = final_value or val try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: logger.error(f"Unable to batch {col}={final_value} due to {e}") + status_message = item.status if cached: status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) + cell_updates.append((row, "status", status_message)) media: Media = item.get_final_media() if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, self._get_current_datetime_iso()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if media: batch_if_valid('hash', media.get("hash", "not-calculated")) + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) # merge all pdq hashes into a single string, if present pdq_hashes = [] @@ -71,31 +71,35 @@ class GsheetsDb(Database): if pdq := m.get("pdq_hash"): pdq_hashes.append(pdq) if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", 
"\n".join(screenshot.urls)) - if (thumbnail := item.get_first_image("thumbnail")): + if thumbnail := item.get_first_image("thumbnail"): if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) gw.batch_set_cell(cell_updates) - @staticmethod - def _get_current_datetime_iso() -> str: - """Helper method to generate the current datetime in ISO format.""" - return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) + gw.set_cell(row, "status", new_status) except Exception as e: logger.debug(f"Unable to update sheet: {e}") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 0147ff2..947db9e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,12 +18,14 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self) -> None: + def setup(self, config: dict) -> None: + """ 1. makes a copy of session_file that is removed in cleanup 2. 
trigger login process for telegram or proceed if already saved in a session file 3. joins channel_invites where needed """ + super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 1539df6..98e743e 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -a={ +{ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 004d91c..a51ffc1 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): @@ -14,13 +13,17 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def enrich(self, to_enrich: Metadata) -> None: - storages = self.config['steps']['storages'] - if not "s3_storage" in storages: + def setup(self, config: dict) -> None: + super().setup(config) + self.stores = self.config['steps']['storages'] + self.s3 = get_module("s3_storage", self.config) + if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return - self.s3 = get_module("s3_storage", self.config) + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() 
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 300a710..e4c214c 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -1,9 +1,7 @@ - - import os import json import uuid -from datetime import datetime +from datetime import datetime, timezone import requests from loguru import logger @@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" return str(uuid.uuid4()).replace("-", "")[:length] + def json_loader(cli_val): return json.loads(cli_val) + +def get_current_datetime_iso() -> str: + return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat() + + +def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None: + # parse a datetime string with option of passing a specific format + try: + return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str) + except ValueError as e: + logger.error(f"Unable to parse datestring {dt_str}: {e}") + return None + + +def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None: + # Consistent parsing of timestamps + # If utc=True, the timezone is set to UTC, + # if iso=True, the output is an iso string + if not ts: return + try: + if isinstance(ts, str): ts = datetime.fromisoformat(ts) + if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts) + if utc: ts = ts.replace(tzinfo=timezone.utc) + if iso: return ts.isoformat() + return ts + except Exception as e: + logger.error(f"Unable to parse timestamp {ts}: {e}") + return None + +def get_current_timestamp() -> str: + return get_timestamp(datetime.now()) \ No newline at end of file diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index bdc2811..0a655a8 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -103,19 +103,20 @@ def 
test_failed(gsheets_db, mock_metadata, mock_gworksheet): gsheets_db.failed(mock_metadata, reason) mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}') + def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): gsheets_db.aborted(mock_metadata) mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) def test_done_cached(gsheets_db, metadata, mock_gworksheet): - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" @@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet): def test_done_missing_media(gsheets_db, metadata, mock_gworksheet): # clear media from metadata metadata.media = [] - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", + return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index d3f7bd6..c119e3f 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -185,5 +185,4 @@ class 
TestInstagramAPIExtractor(TestExtractorBase): result = self.extractor.download_profile(metadata, "test_user") assert result.is_success() - assert "Error downloading stories for test_user" in result.metadata["errors"] - # assert "Error downloading posts for test_user" in result.metadata["errors"] \ No newline at end of file + assert "Error downloading stories for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index b82641d..d7a1e53 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -1,5 +1,4 @@ import os -import pickle from typing import Type from unittest.mock import patch, MagicMock diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 103610e..ecf57f1 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder from auto_archiver.core import Metadata, Feeder -def test_initialise_without_sheet_and_sheet_id(setup_module): - """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. - (shouldn't really be asserting in there) - """ +def test_setup_without_sheet_and_sheet_id(setup_module): + # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set. 
with patch("gspread.service_account"): with pytest.raises(AssertionError): setup_module( @@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id( "gsheet_feeder", {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, ) - feeder.initialise() sheet_result = feeder.open_sheet() # Validate the correct method was called getattr(mock_client, expected_method).assert_called_once_with( @@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module): "gsheet_feeder", {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, ) - feeder.initialise() sheet = feeder.open_sheet() mock_client.open_by_key.assert_called_once_with("ABC123") assert sheet == "MockSheet" @@ -263,7 +259,6 @@ class TestGSheetsFeederReal: ["https://example.com", "done"], ] worksheet.append_rows(test_rows) - self.feeder.initialise() metadata_list = list(self.feeder) # Validate that only the first row is processed diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index b7417ad..4259cb2 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase): 'service_account': 'fake_service_account.json' } - @pytest.mark.skip(reason="Requires real credentials") - @pytest.mark.download - def test_initialize_with_real_credentials(self): - """ - Test that the Google Drive service can be initialized with real credentials. 
- """ - self.storage.service_account = 'secrets/service_account.json' # Path to real credentials - self.storage.initialise() - assert self.storage.service is not None - def test_initialize_fails_with_non_existent_creds(self): """ @@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.initialise() + self.storage.setup(self.config) assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): + media = Media(filename="test.jpg") + media.key = "folder1/folder2/test.jpg" + +# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.download +class TestGDriveStorageConnected(TestStorageBase): + """ + 'Real' tests for GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + # TODO: replace with real root folder id + 'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk", + 'oauth_token': None, + 'service_account': 'secrets/service_account.json' + } + + + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. 
+ """ + assert self.storage.service is not None + + diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7270c80..b07e107 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -159,3 +159,7 @@ def test_get_context(): assert m.get_context("somekey") == "somevalue" assert m.get_context("anotherkey") == "anothervalue" assert len(m._context) == 2 + + +def test_choose_most_complete(): + pass \ No newline at end of file From ab6cf52533c29b7c5815c94c8c27ca60b32f8ad7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:45:28 +0000 Subject: [PATCH 083/110] fixes bad hash initialization --- src/auto_archiver/modules/hash_enricher/hash_enricher.py | 8 ++++---- src/auto_archiver/utils/misc.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index b3ca8be..7a0587c 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -30,10 +30,10 @@ class HashEnricher(Enricher): to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}") def calculate_hash(self, filename) -> str: - hash = None + hash_algo = None if self.algorithm == "SHA-256": - hash = hashlib.sha256() + hash_algo = hashlib.sha256 elif self.algorithm == "SHA3-512": - hash = hashlib.sha3_512() + hash_algo = hashlib.sha3_512 else: return "" - return calculate_file_hash(filename, hash, self.chunksize) + return calculate_file_hash(filename, hash_algo, self.chunksize) diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 3af5a54..4d48372 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -65,10 +65,11 @@ def json_loader(cli_val): return json.loads(cli_val) -def calculate_file_hash(filename: str, hash_algo = hashlib.sha256(), chunksize: int = 16000000) -> str: +def 
calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str: + hash = hash_algo() with open(filename, "rb") as f: while True: buf = f.read(chunksize) if not buf: break - hash_algo.update(buf) - return hash_algo.hexdigest() + hash.update(buf) + return hash.hexdigest() From 12f14cccc933d44b02906b2b9e239d1dd98af036 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:58:35 +0000 Subject: [PATCH 084/110] fixes gsheet feeder<->db connection via context. --- src/auto_archiver/core/storage.py | 2 +- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 9f355f6..5dfa39d 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -1,6 +1,6 @@ from __future__ import annotations from abc import abstractmethod -from typing import IO, Optional +from typing import IO import os from loguru import logger diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index d129182..1b81385 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -68,7 +68,7 @@ class GsheetsFeeder(Feeder): folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) m.set_context('folder', folder) - m.set_context('worksheet', {"row": row, "worksheet": gw}) + m.set_context('gsheet', {"row": row, "worksheet": gw}) yield m logger.success(f'Finished worksheet {wks.title}') From 2c3d1f591f4a721597e2cd9906c1cdc05db8a78e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 17:25:15 +0000 Subject: [PATCH 085/110] Separate setup() and module_setup(). 
--- src/auto_archiver/core/base_module.py | 4 ++++ src/auto_archiver/core/module.py | 1 + src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 4 +--- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 3 +-- src/auto_archiver/modules/html_formatter/html_formatter.py | 3 +-- .../instagram_api_extractor/instagram_api_extractor.py | 3 +-- .../modules/instagram_extractor/instagram_extractor.py | 3 +-- .../instagram_tbot_extractor/instagram_tbot_extractor.py | 3 +-- src/auto_archiver/modules/s3_storage/s3_storage.py | 3 +-- .../modules/telethon_extractor/telethon_extractor.py | 3 +-- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 +--- src/auto_archiver/modules/vk_extractor/vk_extractor.py | 3 +-- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 3 +-- .../modules/whisper_enricher/whisper_enricher.py | 3 +-- 14 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 5c6ecbb..95575e3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -80,6 +80,10 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) + def module_setup(self): + # For any additional setup required by modules, e.g. autehntication + pass + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: """ Returns the authentication information for a given site. 
This is used to authenticate diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index f3fbec5..69f9fcc 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -242,6 +242,7 @@ class LazyBaseModule: default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) config[self.name] = default_config | config.get(self.name, {}) instance.setup(config) + instance.module_setup() return instance def __repr__(self): diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 910f48b..51c13c2 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -19,9 +19,7 @@ from auto_archiver.core import Storage class GDriveStorage(Storage): - def setup(self, config: dict) -> None: - # Step 1: Call the BaseModule setup to dynamically assign configs - super().setup(config) + def module_setup(self) -> None: self.scopes = ['https://www.googleapis.com/auth/drive'] # Initialize Google Drive service self._setup_google_drive_service() diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 50bf430..dd98032 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -21,8 +21,7 @@ from . 
import GWorksheet class GsheetsFeeder(Feeder): - def setup(self, config: dict): - super().setup(config) + def module_setup(self) -> None: self.gsheets_client = gspread.service_account(filename=self.service_account) # TODO mv to validators assert self.sheet or self.sheet_id, ( diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 4da82c8..bbba097 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -17,9 +17,8 @@ class HtmlFormatter(Formatter): environment: Environment = None template: any = None - def setup(self, config: dict) -> None: + def module_setup(self) -> None: """Sets up the Jinja2 environment and loads the template.""" - super().setup(config) # Ensure the base class logic is executed template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/") self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True) diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 5dad0ba..367cc75 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -32,8 +32,7 @@ class InstagramAPIExtractor(Extractor): r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
) - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 3cf0362..e4e210f 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -25,8 +25,7 @@ class InstagramExtractor(Extractor): profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url)) # TODO: links to stories - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.insta = instaloader.Instaloader( download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 5660cd2..707dcc3 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -27,12 +27,11 @@ class InstagramTbotExtractor(Extractor): https://t.me/instagram_load_bot """ - def setup(self, configs) -> None: + def module_setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. 
checks if the session file is valid """ - super().setup(configs) logger.info(f"SETUP {self.name} checking login...") self._prepare_session_file() self._initialize_telegram_client() diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index 2f85164..c77bbc3 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -13,8 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/" class S3Storage(Storage): - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.s3 = boto3.client( 's3', region_name=self.region, diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 97d3e94..3762f01 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,14 +18,13 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self, config: dict) -> None: + def module_setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. 
joins channel_invites where needed """ - super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 6573475..0b27e22 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -15,9 +15,7 @@ class TwitterApiExtractor(Extractor): valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - def setup(self, config: dict) -> None: - super().setup(config) - + def module_setup(self) -> None: self.api_index = 0 self.apis = [] if len(self.bearer_tokens): diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index 2d09138..0d1fc04 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -12,8 +12,7 @@ class VkExtractor(Extractor): Currently only works for /wall posts """ - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.vks = VkScraper(self.username, self.password, session_file=self.session_file) def download(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 1586b75..7d91f43 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -18,8 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor): When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. 
""" - def setup(self, configs) -> None: - super().setup(configs) + def module_setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a51ffc1..d83319e 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -13,8 +13,7 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.stores = self.config['steps']['storages'] self.s3 = get_module("s3_storage", self.config) if not "s3_storage" in self.stores: From e97ccf8a736fc6bd01a0efdf9a54c8cca16d5d97 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 18:07:47 +0000 Subject: [PATCH 086/110] Separate setup() and module_setup(). 
--- src/auto_archiver/core/base_module.py | 6 +++--- src/auto_archiver/core/module.py | 6 +++--- src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 2 +- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 2 +- src/auto_archiver/modules/html_formatter/html_formatter.py | 2 +- .../instagram_api_extractor/instagram_api_extractor.py | 2 +- .../modules/instagram_extractor/instagram_extractor.py | 2 +- .../instagram_tbot_extractor/instagram_tbot_extractor.py | 2 +- src/auto_archiver/modules/s3_storage/s3_storage.py | 2 +- .../modules/telethon_extractor/telethon_extractor.py | 2 +- .../modules/twitter_api_extractor/twitter_api_extractor.py | 2 +- src/auto_archiver/modules/vk_extractor/vk_extractor.py | 2 +- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 2 +- .../modules/whisper_enricher/whisper_enricher.py | 2 +- 14 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 95575e3..ece4719 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -14,7 +14,7 @@ class BaseModule(ABC): Base module class. All modules should inherit from this class. The exact methods a class implements will depend on the type of module it is, - however all modules have a .setup(config: dict) method to run any setup code + however modules can have a .setup() method to run any setup code (e.g. logging in to a site, spinning up a browser etc.) 
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that @@ -60,7 +60,7 @@ class BaseModule(ABC): def storages(self) -> list: return self.config.get('storages', []) - def setup(self, config: dict): + def config_setup(self, config: dict): authentication = config.get('authentication', {}) # extract out concatenated sites @@ -80,7 +80,7 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) - def module_setup(self): + def setup(self): # For any additional setup required by modules, e.g. autehntication pass diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 69f9fcc..c81e26a 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -58,7 +58,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa This has all the information about the module, but does not load the module itself or its dependencies - To load an actual module, call .setup() on a laz module + To load an actual module, call .setup() on a lazy module """ if module_name in _LAZY_LOADED_MODULES: @@ -241,8 +241,8 @@ class LazyBaseModule: # merge the default config with the user config default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) config[self.name] = default_config | config.get(self.name, {}) - instance.setup(config) - instance.module_setup() + instance.config_setup(config) + instance.setup() return instance def __repr__(self): diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 51c13c2..f38feb6 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -19,7 +19,7 @@ from auto_archiver.core import Storage class GDriveStorage(Storage): - def module_setup(self) -> None: + def setup(self) -> None: self.scopes = 
['https://www.googleapis.com/auth/drive'] # Initialize Google Drive service self._setup_google_drive_service() diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index dd98032..8612d02 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -21,7 +21,7 @@ from . import GWorksheet class GsheetsFeeder(Feeder): - def module_setup(self) -> None: + def setup(self) -> None: self.gsheets_client = gspread.service_account(filename=self.service_account) # TODO mv to validators assert self.sheet or self.sheet_id, ( diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index bbba097..3691735 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -17,7 +17,7 @@ class HtmlFormatter(Formatter): environment: Environment = None template: any = None - def module_setup(self) -> None: + def setup(self) -> None: """Sets up the Jinja2 environment and loads the template.""" template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/") self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True) diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 367cc75..a75e065 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -32,7 +32,7 @@ class InstagramAPIExtractor(Extractor): r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
) - def module_setup(self) -> None: + def setup(self) -> None: if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index e4e210f..0af2c32 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -25,7 +25,7 @@ class InstagramExtractor(Extractor): profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url)) # TODO: links to stories - def module_setup(self) -> None: + def setup(self) -> None: self.insta = instaloader.Instaloader( download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 707dcc3..d4b7a8e 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -27,7 +27,7 @@ class InstagramTbotExtractor(Extractor): https://t.me/instagram_load_bot """ - def module_setup(self) -> None: + def setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. 
checks if the session file is valid diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index c77bbc3..6590ac9 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -13,7 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/" class S3Storage(Storage): - def module_setup(self) -> None: + def setup(self) -> None: self.s3 = boto3.client( 's3', region_name=self.region, diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3762f01..65ea8cd 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,7 +18,7 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def module_setup(self) -> None: + def setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 0b27e22..72fd2f2 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -15,7 +15,7 @@ class TwitterApiExtractor(Extractor): valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - def module_setup(self) -> None: + def setup(self) -> None: self.api_index = 0 self.apis = [] if len(self.bearer_tokens): diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index 0d1fc04..99527c4 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -12,7 +12,7 @@ class 
VkExtractor(Extractor): Currently only works for /wall posts """ - def module_setup(self) -> None: + def setup(self) -> None: self.vks = VkScraper(self.username, self.password, session_file=self.session_file) def download(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 7d91f43..c324c62 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -18,7 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor): When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. """ - def module_setup(self) -> None: + def setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index d83319e..89579f9 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -13,7 +13,7 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def module_setup(self) -> None: + def setup(self) -> None: self.stores = self.config['steps']['storages'] self.s3 = get_module("s3_storage", self.config) if not "s3_storage" in self.stores: From 3dae2337a1e3a97b913780b58e45adbc1d0aff5a Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 18:56:46 +0000 Subject: [PATCH 087/110] remove cdn_url check before storage. 
--- src/auto_archiver/core/media.py | 2 +- src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 952a025..b6820ab 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -65,7 +65,7 @@ class Media: def is_stored(self, in_storage) -> bool: # checks if the media is already stored in the given storage - return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u]) + return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"]) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index f38feb6..4971030 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -68,11 +68,10 @@ class GDriveStorage(Storage): filename = path_parts[-1] logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) parent_id = folder_id # get id of file inside folder (or sub folder) - # TODO: supressing the error as being checked before first upload - file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) + file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True) if not file_id: # logger.info(f"file {filename} not found in folder {folder_id}") From ed81dcdaf081613b44035fae9d2b9de9d6fbc5b1 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 23:07:03 +0000 
Subject: [PATCH 088/110] Remove dangling 'b = ' from config.py --- src/auto_archiver/core/config.py | 14 ++++---------- src/auto_archiver/core/orchestrator.py | 4 ++-- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 8f36c54..9bb080f 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -15,15 +15,9 @@ from .module import BaseModule from typing import Any, List, Type, Tuple -yaml: YAML = YAML() +_yaml: YAML = YAML() -b = yaml.load(""" - # This is a comment - site.com,site2.com: - key: value - key2: value2 - """) -EMPTY_CONFIG = yaml.load(""" +EMPTY_CONFIG = _yaml.load(""" # Auto Archiver Configuration # Steps are the modules that will be run in the order they are defined @@ -149,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap: config = None try: with open(yaml_filename, "r", encoding="utf-8") as inf: - config = yaml.load(inf) + config = _yaml.load(inf) except FileNotFoundError: pass @@ -166,4 +160,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: - yaml.dump(config_to_save, outf) \ No newline at end of file + _yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a451443..473f882 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media from auto_archiver.version import __version__ -from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser +from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, 
setup_paths from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from .module import BaseModule @@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction): auth_dict = json.load(f) except json.JSONDecodeError: # maybe it's yaml, try that - auth_dict = yaml.load(f) + auth_dict = _yaml.load(f) except: pass From a69ac3e509eed60f1801aca605531b6bc8f3e506 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 09:46:22 +0000 Subject: [PATCH 089/110] Fix file hash reference in S3 tests --- tests/storages/test_S3_storage.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 2594e73..e532a18 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -2,13 +2,12 @@ from typing import Type import pytest from unittest.mock import MagicMock, patch from auto_archiver.core import Media -from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -class TestGDriveStorage: +class TestS3Storage: """ - Test suite for GDriveStorage. + Test suite for S3Storage. 
""" module_name: str = "s3_storage" storage: Type[s3_storage] @@ -66,7 +65,7 @@ class TestGDriveStorage: # Set duplicate checking config to true: self.storage.random_no_duplicate = True - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calc_hash, \ patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' mock_file_in_folder.return_value = 'existing_key.txt' @@ -87,8 +86,7 @@ class TestGDriveStorage: # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calculate_hash: mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False From 18666ff027526b99114d2b4ffb6304f9b3a83461 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:28:24 +0000 Subject: [PATCH 090/110] skip authenticated tests in test_gsheet_feeder.py --- tests/feeders/test_gsheet_feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index ecf57f1..bdf3e70 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -185,7 +185,7 @@ def test_should_process_sheet(setup_module): assert gdb.should_process_sheet("AnotherSheet") == False -# @pytest.mark.skip(reason="Requires a real connection") +@pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: """Testing GSheetsFeeder class""" From 1792e02d1d32c99ca1a59aeb0cab33a74d3a783e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:34:36 +0000 Subject: 
[PATCH 091/110] skip authenticated tests in test_gdrive_storage.py --- tests/storages/test_gdrive_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 4259cb2..57480d0 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -35,7 +35,7 @@ class TestGDriveStorage(TestStorageBase): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" -# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): """ From 89d9140d15eb9e4261abf27f9c71df47ef8efb07 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:47:11 +0000 Subject: [PATCH 092/110] Fixed setup/ config_setup reference --- tests/storages/test_gdrive_storage.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 57480d0..aba0a25 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -7,7 +7,7 @@ from auto_archiver.core.metadata import Metadata from tests.storages.test_storage_base import TestStorageBase -class TestGDriveStorage(TestStorageBase): +class TestGDriveStorage: """ Test suite for GDriveStorage. 
""" @@ -21,6 +21,10 @@ class TestGDriveStorage(TestStorageBase): 'service_account': 'fake_service_account.json' } + @pytest.fixture(autouse=True) + def gdrive(self, setup_module): + with patch('google.oauth2.service_account.Credentials.from_service_account_file') as mock_creds: + self.storage = setup_module(self.module_name, self.config) def test_initialize_fails_with_non_existent_creds(self): """ @@ -28,13 +32,15 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.setup(self.config) + self.storage.setup() assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" + @pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): From f97ec6a9e0ac20268f045b661f2e080ff1eb8574 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:58:28 +0000 Subject: [PATCH 093/110] Fixed S3 module import --- tests/storages/test_S3_storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index e532a18..2a5d026 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -2,7 +2,7 @@ from typing import Type import pytest from unittest.mock import MagicMock, patch from auto_archiver.core import Media -from auto_archiver.modules.s3_storage import s3_storage +from auto_archiver.modules.s3_storage import S3Storage class TestS3Storage: @@ -10,7 +10,7 @@ class TestS3Storage: Test suite for S3Storage. 
""" module_name: str = "s3_storage" - storage: Type[s3_storage] + storage: Type[S3Storage] s3: MagicMock config: dict = { "path_generator": "flat", @@ -78,7 +78,7 @@ class TestS3Storage: ) - @patch.object(s3_storage.S3Storage, 'file_in_folder') + @patch.object(S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True @@ -97,7 +97,7 @@ class TestS3Storage: mock_upload.assert_not_called() assert result is True - @patch.object(s3_storage.S3Storage, 'is_upload_needed') + @patch.object(S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") media.key = "original_key.txt" From 5e2e93382ffc47893183aae83ff138055b0edeb8 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 12:17:42 +0000 Subject: [PATCH 094/110] Test fixes for 3.10 compliance. --- tests/databases/test_gsheet_db.py | 2 +- tests/feeders/test_gsheet_feeder.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 0a655a8..32e8403 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -24,7 +24,7 @@ def mock_metadata(): metadata.status = "done" metadata.get_title.return_value = "Example Title" metadata.get.return_value = "Example Content" - metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z" + metadata.get_timestamp.return_value = "2025-01-01T00:00:00" metadata.get_final_media.return_value = MagicMock(spec=Media) metadata.get_all_media.return_value = [] metadata.get_media_by_id.return_value = None diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index bdf3e70..b86e329 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -52,7 +52,7 @@ def gsheet_feeder(setup_module) 
-> GsheetsFeeder: return feeder -class TestWorksheet: +class MockWorksheet: """ mimics the bits we need from gworksheet """ @@ -91,7 +91,7 @@ class TestWorksheet: def test__process_rows(gsheet_feeder: GsheetsFeeder): - testworksheet = TestWorksheet() + testworksheet = MockWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) assert len(metadata_items) == 3 assert isinstance(metadata_items[0], Metadata) @@ -99,7 +99,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): def test__set_metadata(gsheet_feeder: GsheetsFeeder): - worksheet = TestWorksheet() + worksheet = MockWorksheet() metadata = Metadata() gsheet_feeder._set_context(metadata, worksheet, 1) assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @@ -112,7 +112,7 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): - testworksheet = TestWorksheet() + testworksheet = MockWorksheet() metadata = Metadata() testworksheet.wks.title = "TestSheet" gsheet_feeder._set_context(metadata, testworksheet, 6) From d1d6cde008861f508b8689ff6fd30cdde2fccd3a Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 12:27:48 +0000 Subject: [PATCH 095/110] Set mock timestamp without z format --- tests/databases/test_gsheet_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 32e8403..18a22f1 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -41,7 +41,7 @@ def metadata(): metadata.set_title("Example Title") metadata.set_content("Example Content") metadata.success("my-archiver") - metadata.set("timestamp", "2025-01-01T00:00:00Z") + metadata.set("timestamp", "2025-01-01T00:00:00") metadata.set("date", "2025-02-04T18:22:24.909112+00:00") return metadata From 7309cd32e7df6ebf21b32ed0cba288ba8ecea297 Mon Sep 17 00:00:00 2001 From: msramalho 
<19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:51:17 +0000 Subject: [PATCH 096/110] fix: context to be updated on Metadata.merge --- src/auto_archiver/core/metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index d20ea5e..a8d2ad4 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -44,6 +44,7 @@ class Metadata: if overwrite_left: if right.status and len(right.status): self.status = right.status + self._context.update(right._context) for k, v in right.metadata.items(): assert k not in self.metadata or type(v) == type(self.get(k)) if type(v) not in [dict, list, set] or k not in self.metadata: From e6594ad3dcb1f8e95919b1ef8a632ea321f7be7a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:52:42 +0000 Subject: [PATCH 097/110] merge result into cached results for context preservation --- src/auto_archiver/core/orchestrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 473f882..bb5f9e3 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -424,8 +424,8 @@ class ArchivingOrchestrator: cached_result = None for d in self.databases: d.started(result) - if (local_result := d.fetch(result)): - cached_result = (cached_result or Metadata()).merge(local_result) + if local_result := d.fetch(result): + cached_result = (cached_result or Metadata()).merge(local_result).merge(result) if cached_result: logger.debug("Found previously archived entry") for d in self.databases: From 6fdd5f0e662293731ffe435d41b1d5e93d094cec Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:53:12 +0000 Subject: [PATCH 098/110] fix cases of single : vs :: in entrypoint --- 
src/auto_archiver/modules/api_db/__manifest__.py | 2 +- src/auto_archiver/modules/atlos_db/__manifest__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 698c2e4..19129a4 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Auto-Archiver API Database", "type": ["database"], - "entry_point": "api_db:AAApiDb", + "entry_point": "api_db::AAApiDb", "requires_setup": True, "dependencies": { "python": ["requests", "loguru"], diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 8f9473f..b9cabf2 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Atlos Database", "type": ["database"], - "entry_point": "atlos_db:AtlosDb", + "entry_point": "atlos_db::AtlosDb", "requires_setup": True, "dependencies": {"python": ["loguru", From 4eeb39477c4b3cf81be680fddbaa3ce91bfad8a1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:53:46 +0000 Subject: [PATCH 099/110] improves gsheetdb feedback on retrieve sheet failure --- src/auto_archiver/modules/gsheet_db/gsheet_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 5e1ed1e..5b270bf 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -97,6 +97,6 @@ class GsheetsDb(Database): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: - print(self.sheet_id) + logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") return gw, row 
From 5c590292212ffb3aeec56b11b0d854ad993be8e7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:53:58 +0000 Subject: [PATCH 100/110] updates api_db for new API endpoint --- src/auto_archiver/modules/api_db/api_db.py | 23 +++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index e1f67ce..374e755 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -16,10 +16,10 @@ class AAApiDb(Database): Helps avoid re-archiving the same URL multiple times. """ if not self.allow_rearchive: return - + params = {"url": item.get_url(), "limit": 15} headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} - response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) + response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers) if response.status_code == 200: if len(response.json()): @@ -30,21 +30,26 @@ class AAApiDb(Database): logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") return False - - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" if not self.store_results: return - if cached: + if cached: logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") return logger.debug(f"saving archive of {item.get_url()} to the AA API.") - payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} + payload = { + 'author_id': self.author_id, + 'url': item.get_url(), + 'public': self.public, + 'group_id': self.group_id, + 'tags': list(self.tags), + 'result': item.to_json(), + 
} headers = {"Authorization": f"Bearer {self.api_token}"} - response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers) + response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers) - if response.status_code == 200: + if response.status_code == 201: logger.success(f"AA API: {response.json()}") else: logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - From 977f06c37a159a9557170409f726530b0903f0e9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:56:33 +0000 Subject: [PATCH 101/110] renames api_db property for clarity --- src/auto_archiver/modules/api_db/__manifest__.py | 4 ++-- src/auto_archiver/modules/api_db/api_db.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 19129a4..8359174 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -23,7 +23,7 @@ "default": None, "help": "which group of users have access to the archive in case public=false as author", }, - "allow_rearchive": { + "use_api_cache": { "default": True, "type": "bool", "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", @@ -43,7 +43,7 @@ ### Features - **API Integration**: Supports querying for existing archives and submitting results. -- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled. +- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled. - **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions. - **Tagging and Metadata**: Adds tags and manages metadata for archives. 
- **Optional Storage**: Archives results conditionally based on configuration. diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index 374e755..753ff3f 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -15,7 +15,7 @@ class AAApiDb(Database): """ query the database for the existence of this item. Helps avoid re-archiving the same URL multiple times. """ - if not self.allow_rearchive: return + if not self.use_api_cache: return params = {"url": item.get_url(), "limit": 15} headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} From d90d3cec28d2424a7370d232f6445965507b5d92 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 13:03:18 +0000 Subject: [PATCH 102/110] fix telethon_extractor setup --- .../modules/telethon_extractor/telethon_extractor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 21fc4dc..8088364 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,12 +18,13 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self) -> None: + def setup(self, config: dict) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. 
joins channel_invites where needed """ + super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance From 977618b4ceeb8c02d5a561905cf37f3391c3db3e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 13:04:59 +0000 Subject: [PATCH 103/110] doc: adds note about telethon vs telegram extractors --- src/auto_archiver/modules/telegram_extractor/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/telegram_extractor/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py index e1c49c2..cb0ee1e 100644 --- a/src/auto_archiver/modules/telegram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py @@ -13,7 +13,7 @@ The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` - is advised for more comprehensive functionality. + is advised for more comprehensive functionality, and higher quality media extraction. ### Features - Extracts images and videos from public Telegram message links (`t.me`). 
From 47d1dc9d476db7f452d5aada273c1e90b9e08ac4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:01:37 +0000 Subject: [PATCH 104/110] typing warnings fixed --- .../modules/generic_extractor/generic_extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 86e978f..afbe91b 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -1,6 +1,6 @@ import datetime, os, yt_dlp, pysubs2 import importlib -from typing import Type +from typing import Generator, Type from yt_dlp.extractor.common import InfoExtractor from loguru import logger @@ -11,7 +11,7 @@ from auto_archiver.core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} - def suitable_extractors(self, url: str) -> list[str]: + def suitable_extractors(self, url: str) -> Generator[str, None, None]: """ Returns a list of valid extractors for the given URL""" for info_extractor in yt_dlp.YoutubeDL()._ies.values(): @@ -116,7 +116,7 @@ class GenericExtractor(Extractor): def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata: """ - Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata. + Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata. 
""" ie_instance = info_extractor(downloader=ydl) From 5478ed3860396b9e4b1862495cd17444399e01b7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:02:00 +0000 Subject: [PATCH 105/110] bsky fix media fetching --- src/auto_archiver/modules/generic_extractor/bluesky.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index f2086b0..5eef520 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -39,11 +39,11 @@ class Bluesky(GenericDropin): for image_media in image_medias: url = media_url.format(image_media['image']['ref']['$link'], post['author']['did']) image_media = archiver.download_from_url(url) - media.append(image_media) + media.append(Media(image_media)) for video_media in video_medias: url = media_url.format(video_media['ref']['$link'], post['author']['did']) video_media = archiver.download_from_url(url) - media.append(video_media) + media.append(Media(video_media)) return media From e507fc81d2ed8c175a79891308e7a78bd397d213 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:02:49 +0000 Subject: [PATCH 106/110] improves mimetype guessing, previously file.sub.something would not have an extension --- src/auto_archiver/core/extractor.py | 4 ++-- src/auto_archiver/modules/html_formatter/html_formatter.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 794c06c..2792184 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -80,8 +80,8 @@ class Extractor(BaseModule): d.raise_for_status() # get mimetype from the response headers - if not Path(to_filename).suffix: - content_type = 
d.headers.get('Content-Type') + if not mimetypes.guess_type(to_filename)[0]: + content_type = d.headers.get('Content-Type') or self._guess_file_type(url) extension = mimetypes.guess_extension(content_type) if extension: to_filename += extension diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 3691735..ce4e67b 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -9,7 +9,6 @@ import base64 from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media from auto_archiver.core import Formatter -from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str from auto_archiver.core.module import get_module From c720541de20a6b045c9f26a453e900d86bd41607 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:22:06 +0000 Subject: [PATCH 107/110] merge conflicts --- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index c324c62..ff7314a 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor): to_enrich.add_media(m, warc_fn) counter += 1 seen_urls.add(record_url) - logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)") + logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)") \ No newline at end of file From 91f1ebf7b39556e9f070657332a9436af300989d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 11 
Feb 2025 15:23:16 +0000 Subject: [PATCH 108/110] fix temp for yandex new shortlink --- .../modules/generic_extractor/generic_extractor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index afbe91b..6bcb249 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -266,6 +266,11 @@ class GenericExtractor(Extractor): def download(self, item: Metadata) -> Metadata: url = item.get_url() + #TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025 + if url.startswith("https://ya.ru"): + url = url.replace("https://ya.ru", "https://yandex.ru") + item.set("replaced_url", url) + ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , From ea728a7a97e8ec6b70e904a4054a2439c2d11385 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Feb 2025 15:55:19 +0000 Subject: [PATCH 109/110] TODO on facebook dropin not working --- src/auto_archiver/modules/generic_extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/generic_extractor/facebook.py b/src/auto_archiver/modules/generic_extractor/facebook.py index 352d44e..fed8e09 100644 --- a/src/auto_archiver/modules/generic_extractor/facebook.py +++ b/src/auto_archiver/modules/generic_extractor/facebook.py @@ -8,7 +8,8 @@ class Facebook(GenericDropin): url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id')) - post_data = ie_instance._extract_from_url.extract_metadata(webpage) + # TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged + post_data = ie_instance._extract_metadata(webpage) return post_data 
def create_metadata(self, post: dict, ie_instance, archiver, url): From 3787577a961a6dbac9d1145b165413348471b5ec Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Feb 2025 18:18:52 +0000 Subject: [PATCH 110/110] Screenshot enricher depends on geckodriver not chromedriver --- src/auto_archiver/modules/screenshot_enricher/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py index 52842c9..831959e 100644 --- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py +++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py @@ -4,7 +4,7 @@ "requires_setup": True, "dependencies": { "python": ["loguru", "selenium"], - "bin": ["chromedriver"] + "bin": ["geckodriver"] }, "configs": { "width": {"default": 1280, "help": "width of the screenshots"},