diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index fcfe9ea..5c6ecbb 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -63,7 +63,7 @@ class BaseModule(ABC): def setup(self, config: dict): authentication = config.get('authentication', {}) - # extract out contatenated sites + # extract out concatenated sites for key, val in copy(authentication).items(): if "," in key: for site in key.split(","): diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 2d462e4..8f36c54 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -36,6 +36,7 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES # a dictionary of authentication information that can be used by extractors to login to website. # you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com) # Common login 'types' are username/password, cookie, api key/token. +# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. 
# Some Examples: # facebook.com: # username: "my_username" @@ -163,6 +164,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap: def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save = deepcopy(config) - config.pop('urls', None) + config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 57320df..794c06c 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -17,7 +17,7 @@ from loguru import logger from retrying import retry import re -from ..core import Metadata, BaseModule +from auto_archiver.core import Metadata, BaseModule class Extractor(BaseModule): diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index dec67e1..f3fbec5 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -13,7 +13,7 @@ import copy import sys from importlib.util import find_spec import os -from os.path import join, dirname +from os.path import join from loguru import logger import auto_archiver from .base_module import BaseModule @@ -64,8 +64,10 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa if module_name in _LAZY_LOADED_MODULES: return _LAZY_LOADED_MODULES[module_name] - module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0] - return module + available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) + if not available: + raise IndexError(f"Module '{module_name}' not found. 
Are you sure it's installed/exists?") + return available[0] def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 5ac091c..a451443 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,7 +19,7 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media -from ..version import __version__ +from auto_archiver.version import __version__ from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher @@ -111,7 +111,6 @@ class ArchivingOrchestrator: # if full, we'll load all modules # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' # but should we add them? Or should we just add them to the 'complete' parser? - if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? 
@@ -128,6 +127,10 @@ class ArchivingOrchestrator: elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) + + # for simple mode, we use the cli_feeder and any modules that don't require setup + yaml_config['steps']['feeders'] = ['cli_feeder'] + # add them to the config for module in simple_modules: for module_type in module.type: @@ -237,18 +240,18 @@ class ArchivingOrchestrator: if log_file := logging_config['file']: logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) - - def install_modules(self): + def install_modules(self, modules_by_type): """ - Swaps out the previous 'strings' in the config with the actual modules and loads them + Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the + orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type + are loaded, the program will exit with an error message. """ invalid_modules = [] for module_type in BaseModule.MODULE_TYPES: step_items = [] - modules_to_load = self.config['steps'][f"{module_type}s"] - + modules_to_load = modules_by_type[f"{module_type}s"] assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" def check_steps_ok(): @@ -264,9 +267,10 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': + # pseudo module, don't load it urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.") + logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") exit() # cli_feeder is a pseudo module, it just takes the command line args def feed(self) -> Generator[Metadata]: @@ -330,7 +334,7 @@ class ArchivingOrchestrator: self.setup_complete_parser(basic_config, yaml_config, unused_args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - self.install_modules() + self.install_modules(self.config['steps']) # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index d1b1fb6..86e978f 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger from auto_archiver.core.extractor import Extractor -from ...core import Metadata, Media +from auto_archiver.core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 58c6abe..b3ca8be 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -12,6 +12,7 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata +from auto_archiver.utils.misc import calculate_file_hash class HashEnricher(Enricher): @@ -35,9 +36,4 @@ class HashEnricher(Enricher): elif self.algorithm == "SHA3-512": hash = hashlib.sha3_512() else: return "" - with open(filename, "rb") as f: - while True: - buf = f.read(self.chunksize) - if not buf: break - hash.update(buf) - return hash.hexdigest() + return calculate_file_hash(filename, hash, self.chunksize) diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py 
b/src/auto_archiver/modules/s3_storage/s3_storage.py index 0c0e275..2f85164 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -7,9 +7,7 @@ from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage -from auto_archiver.modules.hash_enricher import HashEnricher -from auto_archiver.utils.misc import random_str -from auto_archiver.core.module import get_module +from auto_archiver.utils.misc import calculate_file_hash, random_str NO_DUPLICATES_FOLDER = "no-dups/" @@ -43,15 +41,13 @@ class S3Storage(Storage): extra_args['ContentType'] = media.mimetype except Exception as e: logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") - self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) return True def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - he = get_module('hash_enricher', self.config) - hd = he.calculate_hash(media.filename) + hd = calculate_file_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 947db9e..97d3e94 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -6,7 +6,7 @@ from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError from loguru import logger from tqdm import tqdm -import re, time, json, os +import re, time, os from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git 
def calculate_file_hash(filename: str, hash_algo=None, chunksize: int = 16000000) -> str:
    """Return the hexadecimal digest of a file's contents.

    The file is opened in binary mode and read in pieces of at most
    ``chunksize`` bytes, so the whole file is never held in memory at once.

    Args:
        filename: Path of the file to hash.
        hash_algo: A hashlib-style object exposing ``update()`` and
            ``hexdigest()``. It is updated in place, so pass a fresh object
            for every file. When omitted (``None``), a new
            ``hashlib.sha256()`` is created for this call.
        chunksize: Maximum number of bytes read per iteration.

    Returns:
        The value of ``hash_algo.hexdigest()`` after the whole file has been
        fed to it.

    Raises:
        OSError: Propagated from ``open()``/``read()`` if the file cannot be
            opened or read.
    """
    # A hash object placed directly in the signature would be built once, at
    # function-definition time, and then shared by every call that relies on
    # the default: each call would keep appending to the same running digest
    # and return a hash of *all* files seen so far. Create it per call instead.
    if hash_algo is None:
        hash_algo = hashlib.sha256()

    with open(filename, "rb") as f:
        # read() returns b"" at end of file, which ends the loop
        while buf := f.read(chunksize):
            hash_algo.update(buf)
    return hash_algo.hexdigest()
loguru import logger + if not logger._core.handlers.get(0): + logger._core.handlers_count = 0 + logger.add(sys.stderr) + + request.addfinalizer(cleanup) + + # change dir to tmp_path + monkeypatch.chdir(tmp_path) + with monkeypatch.context() as m: + m.setattr(sys, "argv", ["auto-archiver"] + args) + return main() + + return _autoarchiver + + +def test_run_auto_archiver_no_args(caplog, autoarchiver): + with pytest.raises(SystemExit): + autoarchiver() + + assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text + +def test_run_auto_archiver_invalid_file(caplog, autoarchiver): + # exec 'auto-archiver' on the command line + with pytest.raises(SystemExit): + autoarchiver(["--config", "nonexistent_file.yaml"]) + + assert "Make sure the file exists and try again, or run without th" in caplog.text + +def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file): + # create a valid (empty) orchestration file + path = orchestration_file(content="") + # exec 'auto-archiver' on the command line + with pytest.raises(SystemExit): + autoarchiver(["--config", path]) + + # should treat an empty file as if there is no file at all + assert " No URLs provided. Please provide at least one URL via the com" in caplog.text