From 63aba6ad3994a27b7e95116dd9d6b8c4fd40e452 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Feb 2025 21:54:49 +0100 Subject: [PATCH 1/8] Fix sphinx-autoapi imports --- src/auto_archiver/core/extractor.py | 2 +- src/auto_archiver/core/orchestrator.py | 2 +- .../modules/generic_extractor/generic_extractor.py | 2 +- src/auto_archiver/utils/gsheet.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 57320df..794c06c 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -17,7 +17,7 @@ from loguru import logger from retrying import retry import re -from ..core import Metadata, BaseModule +from auto_archiver.core import Metadata, BaseModule class Extractor(BaseModule): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 5ac091c..641f099 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,7 +19,7 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media -from ..version import __version__ +from auto_archiver.version import __version__ from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths from . 
import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index d1b1fb6..86e978f 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger from auto_archiver.core.extractor import Extractor -from ...core import Metadata, Media +from auto_archiver.core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 7a8862f..c36a032 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -1,6 +1,6 @@ import json, gspread -from ..core import BaseModule +from auto_archiver.core import BaseModule class Gsheets(BaseModule): From 1fad37fd934ba26835c9eb20222d95210a1e513a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Feb 2025 23:08:30 +0100 Subject: [PATCH 2/8] Remove blank file --- src/auto_archiver/core/authentication.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/auto_archiver/core/authentication.py diff --git a/src/auto_archiver/core/authentication.py b/src/auto_archiver/core/authentication.py deleted file mode 100644 index e69de29..0000000 From e9dd321dcd548cc02d7fa2a0d0171feed1226c51 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 13:06:24 +0100 Subject: [PATCH 3/8] Fix setting cli_feeder as default feeder on clean install --- src/auto_archiver/core/config.py | 3 ++- src/auto_archiver/core/orchestrator.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 2d462e4..8f36c54 100644 --- a/src/auto_archiver/core/config.py 
+++ b/src/auto_archiver/core/config.py @@ -36,6 +36,7 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES # a dictionary of authentication information that can be used by extractors to login to website. # you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com) # Common login 'types' are username/password, cookie, api key/token. +# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. # Some Examples: # facebook.com: # username: "my_username" @@ -163,6 +164,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap: def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save = deepcopy(config) - config.pop('urls', None) + config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 641f099..20212ce 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -128,6 +128,10 @@ class ArchivingOrchestrator: elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) + + # for simple mode, we use the cli_feeder and any modules that don't require setup + yaml_config['steps']['feeders'] = ['cli_feeder'] + # add them to the config for module in simple_modules: for module_type in module.type: @@ -237,18 +241,18 @@ class ArchivingOrchestrator: if log_file := logging_config['file']: logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) - - def install_modules(self): + def install_modules(self, modules_by_type): """ - Swaps out the previous 'strings' in the config with the actual modules and loads them + Traverses all modules 
in 'steps' and loads them into the orchestrator, storing them in the + orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type + are loaded, the program will exit with an error message. """ invalid_modules = [] for module_type in BaseModule.MODULE_TYPES: step_items = [] - modules_to_load = self.config['steps'][f"{module_type}s"] - + modules_to_load = modules_by_type[f"{module_type}s"] assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" def check_steps_ok(): @@ -264,9 +268,11 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': + # pseudo module, don't load it + breakpoint() urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.") + logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") exit() # cli_feeder is a pseudo module, it just takes the command line args def feed(self) -> Generator[Metadata]: @@ -330,7 +336,7 @@ class ArchivingOrchestrator: self.setup_complete_parser(basic_config, yaml_config, unused_args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - self.install_modules() + self.install_modules(self.config['steps']) # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: From 74207d7821e0306de8e3b6da00cf263edfe0293c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 13:27:11 +0100 Subject: [PATCH 4/8] Implementation tests for auto-archiver --- src/auto_archiver/core/orchestrator.py | 2 -- tests/test_implementation.py | 35 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 tests/test_implementation.py diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 20212ce..a451443 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -111,7 +111,6 @@ class ArchivingOrchestrator: # if full, we'll load all modules # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' # but should we add them? Or should we just add them to the 'complete' parser? - if yaml_config != EMPTY_CONFIG: # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? @@ -269,7 +268,6 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': # pseudo module, don't load it - breakpoint() urls = self.config['urls'] if not urls: logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") diff --git a/tests/test_implementation.py b/tests/test_implementation.py new file mode 100644 index 0000000..82d5d0f --- /dev/null +++ b/tests/test_implementation.py @@ -0,0 +1,35 @@ +import sys +import pytest + +from auto_archiver.__main__ import main + +@pytest.fixture +def orchestration_file(tmp_path): + return (tmp_path / "example_orch.yaml").as_posix() + +@pytest.fixture +def autoarchiver(tmp_path, monkeypatch): + + def _autoarchiver(args=["--config", "example_orch.yaml"]): + # change dir to tmp_path + monkeypatch.chdir(tmp_path) + with monkeypatch.context() as m: + m.setattr(sys, "argv", ["auto-archiver"] + args) + return main() + + return _autoarchiver + + +def test_run_auto_archiver_no_args(caplog, autoarchiver): + with pytest.raises(SystemExit): + autoarchiver([]) + + assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text + + +def test_run_auto_archiver_invalid_file(caplog, autoarchiver, monkeypatch): + # exec 'auto-archiver' on the command lin + with pytest.raises(SystemExit): + autoarchiver() + + assert "Make sure the file exists and try again, or run without th" in caplog.text \ No newline at end of file From f3f6b928172fe597c772e2c677a3f3d118f02bef Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 12:43:21 +0000 Subject: [PATCH 5/8] Implementation test cleanup --- tests/test_implementation.py | 45 ++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/tests/test_implementation.py b/tests/test_implementation.py index 82d5d0f..7e33651 100644 --- a/tests/test_implementation.py +++ b/tests/test_implementation.py @@ -3,33 +3,60 @@ import pytest from auto_archiver.__main__ import main + @pytest.fixture -def orchestration_file(tmp_path): +def orchestration_file_path(tmp_path): return (tmp_path / "example_orch.yaml").as_posix() @pytest.fixture -def autoarchiver(tmp_path, monkeypatch): +def 
orchestration_file(orchestration_file_path): + def _orchestration_file(content=''): + with open(orchestration_file_path, "w") as f: + f.write(content) + return orchestration_file_path + + return _orchestration_file + +@pytest.fixture +def autoarchiver(tmp_path, monkeypatch, request): + def _autoarchiver(args=[]): + + def cleanup(): + from loguru import logger + if not logger._core.handlers.get(0): + logger._core.handlers_count = 0 + logger.add(sys.stderr) + + request.addfinalizer(cleanup) - def _autoarchiver(args=["--config", "example_orch.yaml"]): # change dir to tmp_path monkeypatch.chdir(tmp_path) with monkeypatch.context() as m: m.setattr(sys, "argv", ["auto-archiver"] + args) return main() - + return _autoarchiver def test_run_auto_archiver_no_args(caplog, autoarchiver): with pytest.raises(SystemExit): - autoarchiver([]) + autoarchiver() assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text - -def test_run_auto_archiver_invalid_file(caplog, autoarchiver, monkeypatch): +def test_run_auto_archiver_invalid_file(caplog, autoarchiver): # exec 'auto-archiver' on the command lin with pytest.raises(SystemExit): - autoarchiver() + autoarchiver(["--config", "nonexistent_file.yaml"]) - assert "Make sure the file exists and try again, or run without th" in caplog.text \ No newline at end of file + assert "Make sure the file exists and try again, or run without th" in caplog.text + +def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file): + # create a valid (empty) orchestration file + path = orchestration_file(content="") + # exec 'auto-archiver' on the command lin + with pytest.raises(SystemExit): + autoarchiver(["--config", path]) + + # should treat an empty file as if there is no file at all + assert " No URLs provided. 
Please provide at least one URL via the com" in caplog.text From 7c848046e88a12d6b9ea89c7b6b34ab76ef009e8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:59:32 +0000 Subject: [PATCH 6/8] adds better info about wrong/missing modules --- src/auto_archiver/core/module.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index dec67e1..f3fbec5 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -13,7 +13,7 @@ import copy import sys from importlib.util import find_spec import os -from os.path import join, dirname +from os.path import join from loguru import logger import auto_archiver from .base_module import BaseModule @@ -64,8 +64,10 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa if module_name in _LAZY_LOADED_MODULES: return _LAZY_LOADED_MODULES[module_name] - module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0] - return module + available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) + if not available: + raise IndexError(f"Module '{module_name}' not found. 
Are you sure it's installed/exists?") + return available[0] def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: From 8fb3dc754b14b76833a12daa091ef608edf6a61c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:59:51 +0000 Subject: [PATCH 7/8] fixing telethon extractor to use default entrypoint --- src/auto_archiver/modules/telethon_extractor/__init__.py | 2 +- .../modules/telethon_extractor/telethon_extractor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index a837fdf..2eaa57c 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_extractor import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3e952e8..21fc4dc 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -6,14 +6,14 @@ from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError from loguru import logger from tqdm import tqdm -import re, time, json, os +import re, time, os from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str -class TelethonArchiver(Extractor): +class TelethonExtractor(Extractor): valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") 
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") From 15abf686b1315b3a35a628df12f687b9aec431d5 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:48:54 +0000 Subject: [PATCH 8/8] decouples s3_storage from hash_enricher --- src/auto_archiver/core/base_module.py | 2 +- .../modules/hash_enricher/hash_enricher.py | 8 ++------ src/auto_archiver/modules/s3_storage/s3_storage.py | 8 +++----- src/auto_archiver/utils/misc.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index fcfe9ea..5c6ecbb 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -63,7 +63,7 @@ class BaseModule(ABC): def setup(self, config: dict): authentication = config.get('authentication', {}) - # extract out contatenated sites + # extract out concatenated sites for key, val in copy(authentication).items(): if "," in key: for site in key.split(","): diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 58c6abe..b3ca8be 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -12,6 +12,7 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata +from auto_archiver.utils.misc import calculate_file_hash class HashEnricher(Enricher): @@ -35,9 +36,4 @@ class HashEnricher(Enricher): elif self.algorithm == "SHA3-512": hash = hashlib.sha3_512() else: return "" - with open(filename, "rb") as f: - while True: - buf = f.read(self.chunksize) - if not buf: break - hash.update(buf) - return hash.hexdigest() + return calculate_file_hash(filename, hash, self.chunksize) diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py 
def calculate_file_hash(filename: str, hash_algo=None, chunksize: int = 16000000) -> str:
    """Return the hexadecimal digest of the contents of ``filename``.

    The file is opened in binary mode and read in pieces of at most
    ``chunksize`` bytes, so the whole file is never held in memory at once.

    Args:
        filename: path of the file to hash.
        hash_algo: what to hash with. May be
            * ``None`` (default) - a fresh SHA-256 object is created for this
              call;
            * a hashlib-style hash object (has ``update``/``hexdigest``) - it
              is updated in place, so any data it already holds becomes part
              of the digest;
            * a zero-argument callable returning such an object, e.g.
              ``hashlib.sha3_512``.
        chunksize: maximum number of bytes read from the file per iteration.

    Returns:
        The hex digest string produced by the hash object.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # NOTE: the default must not be a hash *instance* in the signature. Default
    # values are evaluated once, at definition time, so a shared instance would
    # accumulate the bytes of every file hashed through the default and return
    # wrong digests from the second call onwards.
    if hash_algo is None:
        hash_algo = hashlib.sha256()
    elif callable(hash_algo):
        # hash objects themselves are not callable, so this only matches
        # constructors such as hashlib.sha256
        hash_algo = hash_algo()

    with open(filename, "rb") as f:
        # iter(callable, sentinel) stops at the first empty read, i.e. at EOF
        for buf in iter(lambda: f.read(chunksize), b""):
            hash_algo.update(buf)
    return hash_algo.hexdigest()