diff --git a/.github/workflows/docker-publish.yaml b/.github/workflows/docker-publish.yaml index 2031ad6..06b66a0 100644 --- a/.github/workflows/docker-publish.yaml +++ b/.github/workflows/docker-publish.yaml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -33,14 +33,14 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Log in to Docker Hub - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96 + uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 with: images: bellingcat/auto-archiver diff --git a/.github/workflows/tests-download.yaml b/.github/workflows/tests-download.yaml index a8337cf..0aafdd6 100644 --- a/.github/workflows/tests-download.yaml +++ b/.github/workflows/tests-download.yaml @@ -10,9 +10,6 @@ on: workflows: ["Core Tests"] types: - completed - branches: [main] - paths: - - src/** jobs: tests: @@ -30,7 +27,10 @@ jobs: steps: - uses: actions/checkout@v4 with: - ref: ${{ github.event.workflow_run.head_branch || github.ref }} + # For PRs, use the head commit SHA from the triggering workflow + ref: ${{ github.event.workflow_run.head_sha || github.ref }} + # If PR is from a fork, we need fetch-depth: 0 + fetch-depth: ${{ github.event.workflow_run.head_repository.fork && '0' || '1' }} - name: Install poetry run: pipx install poetry diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 1494fd6..f620500 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -214,7 +214,7 @@ class LazyBaseModule: # check external dependencies are installed def check_deps(deps, check): - for dep in filter(lambda d: len(d.strip()), deps): + for dep in filter(lambda d: len(d.strip()) > 0, deps): if not check(dep.strip()): logger.error( f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ @@ -274,6 +274,9 @@ class LazyBaseModule: # finally, get the class instance instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() + # save the instance for future easy loading + self._instance = instance + # set the name, display name and module factory instance.name = self.name instance.display_name = self.display_name @@ -286,8 +289,6 @@ class LazyBaseModule: instance.config_setup(config) instance.setup() - # save the instance for future easy loading - self._instance = instance return instance def __repr__(self): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index cbd1af5..b637878 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ except (KeyboardInterrupt, Exception) as e: if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError): logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") - if loaded_module and module_type == "extractor": - loaded_module.cleanup() + + # access the _instance here because loaded_module may not return if there's an error + if lazy_module._instance and module_type == "extractor": + lazy_module._instance.cleanup() raise e if not loaded_module: diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py index 765a02b..decd9f1 100644 --- a/src/auto_archiver/core/validators.py +++ b/src/auto_archiver/core/validators.py @@ -4,12 +4,6 @@ import argparse import json -def example_validator(value): - if "example" not in value: - raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument") - return value - - def positive_number(value): if value < 0: raise argparse.ArgumentTypeError(f"{value} is not a positive number") diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 150b62c..e29a9b0 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -19,7 +19,7 @@ }, "session_file": { "default": "secrets/anon", - "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.", + "help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.", }, "join_channels": { "default": True, diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index b06962e..73fb4e8 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -1,4 +1,10 @@ +import os import shutil +import re +import time +from pathlib import Path +from datetime import date + from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError from telethon.tl.functions.messages import ImportChatInviteRequest @@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import ( InviteRequestSentError, InviteHashExpiredError, ) -from loguru import logger + from tqdm import tqdm -import re -import time -import os +from loguru import logger from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media @@ -31,10 +35,22 @@ class TelethonExtractor(Extractor): """ logger.info(f"SETUP {self.name} checking login...") + # in case the user already added '.session' to the session_file + base_session_name = self.session_file.removesuffix(".session") + base_session_filepath = f"{base_session_name}.session" + + if self.session_file and not os.path.exists(base_session_filepath): + logger.warning( + f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one." + ) + Path(base_session_filepath).touch() + # make a copy of the session that is used exclusively with this archiver instance - new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") - shutil.copy(self.session_file + ".session", new_session_file) - self.session_file = new_session_file.replace(".session", "") + self.session_file = os.path.join( + os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}" + ) + logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session") + shutil.copy(base_session_filepath, f"{self.session_file}.session") # initiate the client self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) @@ -87,8 +103,8 @@ class TelethonExtractor(Extractor): pbar.update() def cleanup(self) -> None: - logger.info(f"CLEANUP {self.name}.") - session_file_name = self.session_file + ".session" + logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session") + session_file_name = f"{self.session_file}.session" if os.path.exists(session_file_name): os.remove(session_file_name) diff --git a/tests/conftest.py b/tests/conftest.py index 44d8058..6377646 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,24 @@ from auto_archiver.core.module import ModuleFactory # that you only want to run if everything else succeeds (e.g. API calls). The order here is important # what comes first will be run first (at the end of all other tests not mentioned) # format is the name of the module (python file) without the .py extension -TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] +TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"] + + +# don't check for ytdlp updates in tests +@pytest.fixture(autouse=True) +def skip_check_for_update(mocker): + update_ytdlp = mocker.patch( + "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp" + ) + update_ytdlp.return_value = False + + +@pytest.fixture +def get_lazy_module(): + def _get_lazy_module(module_name): + return ModuleFactory().get_module_lazy(module_name) + + return _get_lazy_module @pytest.fixture @@ -134,6 +151,7 @@ def unpickle(): @pytest.fixture def mock_binary_dependencies(mocker): + mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0) mock_shutil_which = mocker.patch("shutil.which") # Mock all binary dependencies as available mock_shutil_which.return_value = "/usr/bin/fake_binary" diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py index 1c63383..ade26e4 100644 --- a/tests/data/test_modules/example_extractor/example_extractor.py +++ b/tests/data/test_modules/example_extractor/example_extractor.py @@ -1,6 +1,11 @@ from auto_archiver.core import Extractor +from loguru import logger + class ExampleExtractor(Extractor): def download(self, item): - print("download") + logger.info("download") + + def cleanup(self): + logger.info("cleanup") diff --git a/tests/data/test_modules/example_module/example_module.py b/tests/data/test_modules/example_module/example_module.py index 392abe0..898df96 100644 --- a/tests/data/test_modules/example_module/example_module.py +++ b/tests/data/test_modules/example_module/example_module.py @@ -1,27 +1,29 @@ from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata +from loguru import logger + class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): def download(self, item): - print("download") + logger.info("download") def __iter__(self): yield Metadata().set_url("https://example.com") def done(self, result): - print("done") + logger.info("done") def enrich(self, to_enrich): - print("enrich") + logger.info("enrich") def get_cdn_url(self, media): return "nice_url" def save(self, item): - print("save") + logger.info("save") def uploadf(self, file, key, **kwargs): - print("uploadf") + logger.info("uploadf") def format(self, item): - print("format") + logger.info("format") diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index 6becbea..b221f3e 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -37,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase): package = "auto_archiver.modules.generic_extractor" assert self.extractor.dropin_for_name("bluesky", package=package) - # test loading dropings via filepath + # test loading dropins via filepath path = os.path.join(dirname(dirname(__file__)), "data/") assert self.extractor.dropin_for_name("dropin", additional_paths=[path]) @@ -122,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase): == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/" ) assert len(result.media) == 2 - assert Path(result.media[0].filename).name == "J---aiyznGQ.webm" + assert "J---aiyznGQ" in Path(result.media[0].filename).name assert Path(result.media[1].filename).name == "hqdefault.jpg" @pytest.mark.download diff --git a/tests/extractors/test_telethon_extractor.py b/tests/extractors/test_telethon_extractor.py new file mode 100644 index 0000000..0a2d5c8 --- /dev/null +++ b/tests/extractors/test_telethon_extractor.py @@ -0,0 +1,26 @@ +import os +from datetime import date + +import pytest + + +@pytest.fixture(autouse=True) +def mock_client_setup(mocker): + mocker.patch("telethon.client.auth.AuthMethods.start") + + +def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker): + start = mocker.patch("telethon.client.auth.AuthMethods.start") + start.side_effect = Exception("Test exception") + + # make sure the default setup file is created + session_file = tmp_path / "test.session" + + lazy_module = get_lazy_module("telethon_extractor") + + with pytest.raises(Exception): + lazy_module.load({"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}}) + + assert session_file.exists() + assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file + assert os.path.exists(lazy_module._instance.session_file + ".session") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 3367ce0..86e3125 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog): with pytest.raises(SetupError) as err: orchestrator.setup(args) assert "Module 'example_extractor' is not a feeder" in str(err.value) + + +def test_load_failed_extractor_cleanup(test_args, mocker, caplog): + orchestrator = ArchivingOrchestrator() + + # hack to set up the paths so we can patch properly + orchestrator.module_factory.setup_paths([TEST_MODULES]) + + # patch example_module.setup to throw an exception + mocker.patch( + "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup", + side_effect=Exception("Test exception"), + ) + + with pytest.raises(Exception): + orchestrator.setup(test_args + ["--extractors", "example_extractor"]) + + assert "Error during setup of modules: Test exception" in caplog.text + # make sure the 'cleanup' is called + assert "cleanup" in caplog.text