From 20bc80b9efa92b2f8f58cf9523cb2ce27ce4436c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 15:57:11 +0400 Subject: [PATCH 1/7] Slightly more consistent/tidier naming for the session files Don't add/remove .session from name, keep the file name without .session at all times --- .../telethon_extractor/telethon_extractor.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index b06962e..018b62b 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -31,10 +31,14 @@ class TelethonExtractor(Extractor): """ logger.info(f"SETUP {self.name} checking login...") + # in case the user already added '.session' to the session_file + self.session_file = self.session_file.removesuffix(".session") + # make a copy of the session that is used exclusively with this archiver instance - new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") - shutil.copy(self.session_file + ".session", new_session_file) - self.session_file = new_session_file.replace(".session", "") + new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}") + logger.debug(f"Making a copy of the session file {self.session_file}.session to {new_session_file}.session") + shutil.copy(f"{self.session_file}.session", f"{new_session_file}.session") + self.session_file = new_session_file # initiate the client self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) @@ -87,8 +91,8 @@ class TelethonExtractor(Extractor): pbar.update() def cleanup(self) -> None: - logger.info(f"CLEANUP {self.name}.") - session_file_name = self.session_file + ".session" + logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session") + session_file_name = f"{self.session_file}.session" if os.path.exists(session_file_name): os.remove(session_file_name) From 580de8836629ea8ec0a7b11f9542ca957ced4c50 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 21:32:23 +0400 Subject: [PATCH 2/7] Set the new session filename *before* copying Fixes a potential bug whereby if the copy fails for some reason, the 'cleanup' command would remove the original session file --- .../modules/telethon_extractor/telethon_extractor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 018b62b..434d97c 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -35,10 +35,10 @@ class TelethonExtractor(Extractor): self.session_file = self.session_file.removesuffix(".session") # make a copy of the session that is used exclusively with this archiver instance - new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}") - logger.debug(f"Making a copy of the session file {self.session_file}.session to {new_session_file}.session") - shutil.copy(f"{self.session_file}.session", f"{new_session_file}.session") - self.session_file = new_session_file + old_session_file = f"{self.session_file}.session" + self.session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}") + logger.debug(f"Making a copy of the session file {old_session_file} to {self.session_file}.session") + shutil.copy(old_session_file, f"{self.session_file}.session") # initiate the client self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) From 17d2d14680d51edd5bc91a2012b4fb03e18cb4b9 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 22:52:52 +0400 Subject: [PATCH 3/7] Fix running 'cleanup' method on extractors that fail to start --- src/auto_archiver/core/module.py | 5 +++-- src/auto_archiver/core/orchestrator.py | 6 ++++-- .../example_extractor/example_extractor.py | 7 ++++++- .../example_module/example_module.py | 14 +++++++------ tests/test_orchestrator.py | 20 +++++++++++++++++++ 5 files changed, 41 insertions(+), 11 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 3fdc3ae..168550d 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -277,6 +277,9 @@ class LazyBaseModule: # finally, get the class instance instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() + # save the instance for future easy loading + self._instance = instance + # set the name, display name and module factory instance.name = self.name instance.display_name = self.display_name @@ -289,8 +292,6 @@ class LazyBaseModule: instance.config_setup(config) instance.setup() - # save the instance for future easy loading - self._instance = instance return instance def __repr__(self): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index cbd1af5..b637878 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ except (KeyboardInterrupt, Exception) as e: if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError): logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") - if loaded_module and module_type == "extractor": - loaded_module.cleanup() + + # access the _instance here because loaded_module may not return if there's an error + if lazy_module._instance and module_type == "extractor": + lazy_module._instance.cleanup() raise e if not loaded_module: diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py index 1c63383..ade26e4 100644 --- a/tests/data/test_modules/example_extractor/example_extractor.py +++ b/tests/data/test_modules/example_extractor/example_extractor.py @@ -1,6 +1,11 @@ from auto_archiver.core import Extractor +from loguru import logger + class ExampleExtractor(Extractor): def download(self, item): - print("download") + logger.info("download") + + def cleanup(self): + logger.info("cleanup") diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py index 392abe0..898df96 100644 --- a/tests/data/test_modules/example_module/example_module.py +++ b/tests/data/test_modules/example_module/example_module.py @@ -1,27 +1,29 @@ from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata +from loguru import logger + class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): def download(self, item): - print("download") + logger.info("download") def __iter__(self): yield Metadata().set_url("https://example.com") def done(self, result): - print("done") + logger.info("done") def enrich(self, to_enrich): - print("enrich") + logger.info("enrich") def get_cdn_url(self, media): return "nice_url" def save(self, item): - print("save") + logger.info("save") def uploadf(self, file, key, **kwargs): - print("uploadf") + logger.info("uploadf") def format(self, item): - print("format") + logger.info("format") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 3367ce0..86e3125 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog): with pytest.raises(SetupError) as err: orchestrator.setup(args) assert "Module 'example_extractor' is not a feeder" in str(err.value) + + +def test_load_failed_extractor_cleanup(test_args, mocker, caplog): + orchestrator = ArchivingOrchestrator() + + # hack to set up the paths so we can patch properly + orchestrator.module_factory.setup_paths([TEST_MODULES]) + + # patch example_module.setup to throw an exception + mocker.patch( + "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup", + side_effect=Exception("Test exception"), + ) + + with pytest.raises(Exception): + orchestrator.setup(test_args + ["--extractors", "example_extractor"]) + + assert "Error during setup of modules: Test exception" in caplog.text + # make sure the 'cleanup' is called + assert "cleanup" in caplog.text From 95ea9fb23184f2628f4798cf8383baac8a86bdc5 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 22:53:27 +0400 Subject: [PATCH 4/7] Telethon unit tests + tidyup --- src/auto_archiver/core/validators.py | 6 --- .../telethon_extractor/__manifest__.py | 2 +- .../telethon_extractor/telethon_extractor.py | 30 ++++++++++----- tests/conftest.py | 8 ++++ tests/extractors/test_telethon_extractor.py | 38 +++++++++++++++++++ 5 files changed, 68 insertions(+), 16 deletions(-) create mode 100644 tests/extractors/test_telethon_extractor.py diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py index 765a02b..decd9f1 100644 --- a/src/auto_archiver/core/validators.py +++ b/src/auto_archiver/core/validators.py @@ -4,12 +4,6 @@ import argparse import json -def example_validator(value): - if "example" not in value: - raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument") - return value - - def positive_number(value): if value < 0: raise argparse.ArgumentTypeError(f"{value} is not a positive number") diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 150b62c..e29a9b0 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -19,7 +19,7 @@ }, "session_file": { "default": "secrets/anon", - "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.", + "help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.", }, "join_channels": { "default": True, diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 434d97c..9acec30 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -1,4 +1,9 @@ +import os import shutil +import re +import time +from datetime import date + from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError from telethon.tl.functions.messages import ImportChatInviteRequest @@ -8,11 +13,9 @@ from telethon.errors.rpcerrorlist import ( InviteRequestSentError, InviteHashExpiredError, ) -from loguru import logger + from tqdm import tqdm -import re -import time -import os +from loguru import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media @@ -32,13 +35,22 @@ class TelethonExtractor(Extractor): logger.info(f"SETUP {self.name} checking login...") # in case the user already added '.session' to the session_file - self.session_file = self.session_file.removesuffix(".session") + base_session_name = self.session_file.removesuffix(".session") + base_session_filepath = f"{base_session_name}.session" + + if self.session_file and not os.path.exists(base_session_filepath): + logger.warning( + f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one." + ) + with open(base_session_filepath, "w") as f: + f.write("") # make a copy of the session that is used exclusively with this archiver instance - old_session_file = f"{self.session_file}.session" - self.session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}") - logger.debug(f"Making a copy of the session file {old_session_file} to {self.session_file}.session") - shutil.copy(old_session_file, f"{self.session_file}.session") + self.session_file = os.path.join( + os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}" + ) + logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session") + shutil.copy(base_session_filepath, f"{self.session_file}.session") # initiate the client self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) diff --git a/tests/conftest.py b/tests/conftest.py index 44d8058..b686319 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,14 @@ from auto_archiver.core.module import ModuleFactory TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] +@pytest.fixture +def get_lazy_module(): + def _get_lazy_module(module_name): + return ModuleFactory().get_module_lazy(module_name) + + return _get_lazy_module + + @pytest.fixture def setup_module(request): def _setup_module(module_name, config=None): diff --git a/tests/extractors/test_telethon_extractor.py b/tests/extractors/test_telethon_extractor.py new file mode 100644 index 0000000..23ba338 --- /dev/null +++ b/tests/extractors/test_telethon_extractor.py @@ -0,0 +1,38 @@ +import os +from datetime import date + +import pytest + +from .test_extractor_base import TestExtractorBase +from auto_archiver.modules.telethon_extractor import TelethonExtractor + + +class TestTelethonExtractor(TestExtractorBase): + extractor_module = "telethon_extractor" + extractor: TelethonExtractor + config = { + "api_id": 123, + "api_hash": "ABC", + } + + @pytest.fixture(autouse=True) + def mock_client_setup(self, mocker): + mocker.patch("telethon.client.auth.AuthMethods.start") + + def test_setup_fails_clear_session_file(self, get_lazy_module, tmp_path, mocker): + start = mocker.patch("telethon.client.auth.AuthMethods.start") + start.side_effect = Exception("Test exception") + + # make sure the default setup file is created + session_file = tmp_path / "test.session" + + lazy_module = get_lazy_module("telethon_extractor") + + with pytest.raises(Exception): + lazy_module.load( + {"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}} + ) + + assert session_file.exists() + assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file + assert os.path.exists(lazy_module._instance.session_file + ".session") From e06b0c058584d5e77699a41befca70b9c4a2b8b7 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 23:03:48 +0400 Subject: [PATCH 5/7] Skip checking if docker is running for tests + more graceful test for filename --- tests/conftest.py | 1 + tests/extractors/test_generic_extractor.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b686319..b831e65 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -142,6 +142,7 @@ def unpickle(): @pytest.fixture def mock_binary_dependencies(mocker): + mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0) mock_shutil_which = mocker.patch("shutil.which") # Mock all binary dependencies as available mock_shutil_which.return_value = "/usr/bin/fake_binary" diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index 85a0364..2fb5c35 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -36,7 +36,7 @@ class TestGenericExtractor(TestExtractorBase): package = "auto_archiver.modules.generic_extractor" assert self.extractor.dropin_for_name("bluesky", package=package) - # test loading dropings via filepath + # test loading dropins via filepath path = os.path.join(dirname(dirname(__file__)), "data/") assert self.extractor.dropin_for_name("dropin", additional_paths=[path]) @@ -121,7 +121,7 @@ class TestGenericExtractor(TestExtractorBase): == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/" ) assert len(result.media) == 2 - assert Path(result.media[0].filename).name == "J---aiyznGQ.webm" + assert "J---aiyznGQ" in Path(result.media[0].filename).name assert Path(result.media[1].filename).name == "hqdefault.jpg" @pytest.mark.download From e0e9f9306584cc5e965a87aaac5818a71ea9e385 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 23:28:46 +0400 Subject: [PATCH 6/7] Skip update checks for ytdlp when running tests --- tests/conftest.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index b831e65..6377646 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,16 @@ from auto_archiver.core.module import ModuleFactory # that you only want to run if everything else succeeds (e.g. API calls). The order here is important # what comes first will be run first (at the end of all other tests not mentioned) # format is the name of the module (python file) without the .py extension -TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] +TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"] + + +# don't check for ytdlp updates in tests +@pytest.fixture(autouse=True) +def skip_check_for_update(mocker): + update_ytdlp = mocker.patch( + "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp" + ) + update_ytdlp.return_value = False @pytest.fixture From b7949a489f1343fbee0565e589b3cc0bb988c2e6 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 26 Mar 2025 23:51:21 +0400 Subject: [PATCH 7/7] Simplify telethon unit tests for CI (don't use TestExtractorBase - it causes loading issues) --- .../telethon_extractor/telethon_extractor.py | 4 +- tests/extractors/test_telethon_extractor.py | 42 +++++++------------ 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 9acec30..73fb4e8 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -2,6 +2,7 @@ import os import shutil import re import time +from pathlib import Path from datetime import date from telethon.sync import TelegramClient @@ -42,8 +43,7 @@ class TelethonExtractor(Extractor): logger.warning( f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one." ) - with open(base_session_filepath, "w") as f: - f.write("") + Path(base_session_filepath).touch() # make a copy of the session that is used exclusively with this archiver instance self.session_file = os.path.join( diff --git a/tests/extractors/test_telethon_extractor.py b/tests/extractors/test_telethon_extractor.py index 23ba338..0a2d5c8 100644 --- a/tests/extractors/test_telethon_extractor.py +++ b/tests/extractors/test_telethon_extractor.py @@ -3,36 +3,24 @@ from datetime import date import pytest -from .test_extractor_base import TestExtractorBase -from auto_archiver.modules.telethon_extractor import TelethonExtractor + +@pytest.fixture(autouse=True) +def mock_client_setup(mocker): + mocker.patch("telethon.client.auth.AuthMethods.start") -class TestTelethonExtractor(TestExtractorBase): - extractor_module = "telethon_extractor" - extractor: TelethonExtractor - config = { - "api_id": 123, - "api_hash": "ABC", - } +def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker): + start = mocker.patch("telethon.client.auth.AuthMethods.start") + start.side_effect = Exception("Test exception") - @pytest.fixture(autouse=True) - def mock_client_setup(self, mocker): - mocker.patch("telethon.client.auth.AuthMethods.start") + # make sure the default setup file is created + session_file = tmp_path / "test.session" - def test_setup_fails_clear_session_file(self, get_lazy_module, tmp_path, mocker): - start = mocker.patch("telethon.client.auth.AuthMethods.start") - start.side_effect = Exception("Test exception") + lazy_module = get_lazy_module("telethon_extractor") - # make sure the default setup file is created - session_file = tmp_path / "test.session" + with pytest.raises(Exception): + lazy_module.load({"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}}) - lazy_module = get_lazy_module("telethon_extractor") - - with pytest.raises(Exception): - lazy_module.load( - {"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}} - ) - - assert session_file.exists() - assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file - assert os.path.exists(lazy_module._instance.session_file + ".session") + assert session_file.exists() + assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file + assert os.path.exists(lazy_module._instance.session_file + ".session")