Merge branch 'main' into feat/yt-dlp-pots

This commit is contained in:
erinhmclark
2025-03-28 10:42:24 +00:00
13 changed files with 122 additions and 38 deletions

View File

@@ -22,7 +22,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v3 uses: actions/checkout@v4
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v3 uses: docker/setup-qemu-action@v3
@@ -33,14 +33,14 @@ jobs:
uses: docker/setup-buildx-action@v3 uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub - name: Log in to Docker Hub
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
with: with:
username: ${{ secrets.DOCKER_USERNAME }} username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }} password: ${{ secrets.DOCKER_PASSWORD }}
- name: Extract metadata (tags, labels) for Docker - name: Extract metadata (tags, labels) for Docker
id: meta id: meta
uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96 uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
with: with:
images: bellingcat/auto-archiver images: bellingcat/auto-archiver

View File

@@ -10,9 +10,6 @@ on:
workflows: ["Core Tests"] workflows: ["Core Tests"]
types: types:
- completed - completed
branches: [main]
paths:
- src/**
jobs: jobs:
tests: tests:
@@ -30,7 +27,10 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
ref: ${{ github.event.workflow_run.head_branch || github.ref }} # For PRs, use the head commit SHA from the triggering workflow
ref: ${{ github.event.workflow_run.head_sha || github.ref }}
# If PR is from a fork, we need fetch-depth: 0
fetch-depth: ${{ github.event.workflow_run.head_repository.fork && '0' || '1' }}
- name: Install poetry - name: Install poetry
run: pipx install poetry run: pipx install poetry

View File

@@ -214,7 +214,7 @@ class LazyBaseModule:
# check external dependencies are installed # check external dependencies are installed
def check_deps(deps, check): def check_deps(deps, check):
for dep in filter(lambda d: len(d.strip()), deps): for dep in filter(lambda d: len(d.strip()) > 0, deps):
if not check(dep.strip()): if not check(dep.strip()):
logger.error( logger.error(
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
@@ -274,6 +274,9 @@ class LazyBaseModule:
# finally, get the class instance # finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
# save the instance for future easy loading
self._instance = instance
# set the name, display name and module factory # set the name, display name and module factory
instance.name = self.name instance.name = self.name
instance.display_name = self.display_name instance.display_name = self.display_name
@@ -286,8 +289,6 @@ class LazyBaseModule:
instance.config_setup(config) instance.config_setup(config)
instance.setup() instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance return instance
def __repr__(self): def __repr__(self):

View File

@@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
except (KeyboardInterrupt, Exception) as e: except (KeyboardInterrupt, Exception) as e:
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError): if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if loaded_module and module_type == "extractor":
loaded_module.cleanup() # access the _instance here because load() does not return loaded_module if there's an error
if lazy_module._instance and module_type == "extractor":
lazy_module._instance.cleanup()
raise e raise e
if not loaded_module: if not loaded_module:

View File

@@ -4,12 +4,6 @@ import argparse
import json import json
def example_validator(value):
if "example" not in value:
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
return value
def positive_number(value): def positive_number(value):
if value < 0: if value < 0:
raise argparse.ArgumentTypeError(f"{value} is not a positive number") raise argparse.ArgumentTypeError(f"{value} is not a positive number")

View File

@@ -19,7 +19,7 @@
}, },
"session_file": { "session_file": {
"default": "secrets/anon", "default": "secrets/anon",
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.", "help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
}, },
"join_channels": { "join_channels": {
"default": True, "default": True,

View File

@@ -1,4 +1,10 @@
import os
import shutil import shutil
import re
import time
from pathlib import Path
from datetime import date
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError from telethon.errors import ChannelInvalidError
from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.tl.functions.messages import ImportChatInviteRequest
@@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
InviteRequestSentError, InviteRequestSentError,
InviteHashExpiredError, InviteHashExpiredError,
) )
from loguru import logger
from tqdm import tqdm from tqdm import tqdm
import re from loguru import logger
import time
import os
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
@@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
""" """
logger.info(f"SETUP {self.name} checking login...") logger.info(f"SETUP {self.name} checking login...")
# in case the user already added '.session' to the session_file
base_session_name = self.session_file.removesuffix(".session")
base_session_filepath = f"{base_session_name}.session"
if self.session_file and not os.path.exists(base_session_filepath):
logger.warning(
f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
)
Path(base_session_filepath).touch()
# make a copy of the session that is used exclusively with this archiver instance # make a copy of the session that is used exclusively with this archiver instance
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") self.session_file = os.path.join(
shutil.copy(self.session_file + ".session", new_session_file) os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
self.session_file = new_session_file.replace(".session", "") )
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
shutil.copy(base_session_filepath, f"{self.session_file}.session")
# initiate the client # initiate the client
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
pbar.update() pbar.update()
def cleanup(self) -> None: def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.") logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
session_file_name = self.session_file + ".session" session_file_name = f"{self.session_file}.session"
if os.path.exists(session_file_name): if os.path.exists(session_file_name):
os.remove(session_file_name) os.remove(session_file_name)

View File

@@ -17,7 +17,24 @@ from auto_archiver.core.module import ModuleFactory
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
# what comes first will be run first (at the end of all other tests not mentioned) # what comes first will be run first (at the end of all other tests not mentioned)
# format is the name of the module (python file) without the .py extension # format is the name of the module (python file) without the .py extension
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
    """Replace ``GenericExtractor.update_ytdlp`` with a mock returning ``False``.

    Declared autouse, so the patch is active for every test without being requested.
    """
    mocker.patch(
        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp",
        return_value=False,
    )
@pytest.fixture
def get_lazy_module():
    """Provide a callable that maps a module name to its lazy module.

    The returned callable builds a ``ModuleFactory`` and asks it for the
    lazy module registered under the given name.
    """

    def _lookup(module_name):
        factory = ModuleFactory()
        return factory.get_module_lazy(module_name)

    return _lookup
@pytest.fixture @pytest.fixture
@@ -134,6 +151,7 @@ def unpickle():
@pytest.fixture @pytest.fixture
def mock_binary_dependencies(mocker): def mock_binary_dependencies(mocker):
mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
mock_shutil_which = mocker.patch("shutil.which") mock_shutil_which = mocker.patch("shutil.which")
# Mock all binary dependencies as available # Mock all binary dependencies as available
mock_shutil_which.return_value = "/usr/bin/fake_binary" mock_shutil_which.return_value = "/usr/bin/fake_binary"

View File

@@ -1,6 +1,11 @@
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from loguru import logger
class ExampleExtractor(Extractor): class ExampleExtractor(Extractor):
def download(self, item): def download(self, item):
print("download") logger.info("download")
def cleanup(self):
logger.info("cleanup")

View File

@@ -1,27 +1,29 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
from loguru import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def download(self, item): def download(self, item):
print("download") logger.info("download")
def __iter__(self): def __iter__(self):
yield Metadata().set_url("https://example.com") yield Metadata().set_url("https://example.com")
def done(self, result): def done(self, result):
print("done") logger.info("done")
def enrich(self, to_enrich): def enrich(self, to_enrich):
print("enrich") logger.info("enrich")
def get_cdn_url(self, media): def get_cdn_url(self, media):
return "nice_url" return "nice_url"
def save(self, item): def save(self, item):
print("save") logger.info("save")
def uploadf(self, file, key, **kwargs): def uploadf(self, file, key, **kwargs):
print("uploadf") logger.info("uploadf")
def format(self, item): def format(self, item):
print("format") logger.info("format")

View File

@@ -37,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase):
package = "auto_archiver.modules.generic_extractor" package = "auto_archiver.modules.generic_extractor"
assert self.extractor.dropin_for_name("bluesky", package=package) assert self.extractor.dropin_for_name("bluesky", package=package)
# test loading dropings via filepath # test loading dropins via filepath
path = os.path.join(dirname(dirname(__file__)), "data/") path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path]) assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@@ -122,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase):
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/" == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
) )
assert len(result.media) == 2 assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm" assert "J---aiyznGQ" in Path(result.media[0].filename).name
assert Path(result.media[1].filename).name == "hqdefault.jpg" assert Path(result.media[1].filename).name == "hqdefault.jpg"
@pytest.mark.download @pytest.mark.download

View File

@@ -0,0 +1,26 @@
import os
from datetime import date
import pytest
@pytest.fixture(autouse=True)
def mock_client_setup(mocker):
    """Patch telethon's ``AuthMethods.start`` with a mock for every test in this module."""
    start_target = "telethon.client.auth.AuthMethods.start"
    mocker.patch(start_target)
def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
    """Check the session files left on disk when the telethon client fails to start.

    ``AuthMethods.start`` is patched to raise, so ``load`` is expected to fail.
    Afterwards the configured session file must exist, and the instance kept on
    the lazy module must point at a dated ``telethon-<YYYY-MM-DD>...`` session
    whose ``.session`` file exists.
    """
    mocker.patch("telethon.client.auth.AuthMethods.start", side_effect=Exception("Test exception"))

    # this session file does not exist yet; the assertion below checks that it does after load
    session_file = tmp_path / "test.session"
    module_config = {"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}}

    lazy_module = get_lazy_module("telethon_extractor")
    with pytest.raises(Exception):
        lazy_module.load(module_config)

    assert session_file.exists()

    # the instance is reachable through the lazy module even though load() raised
    instance_session = lazy_module._instance.session_file
    assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in instance_session
    assert os.path.exists(f"{instance_session}.session")

View File

@@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
with pytest.raises(SetupError) as err: with pytest.raises(SetupError) as err:
orchestrator.setup(args) orchestrator.setup(args)
assert "Module 'example_extractor' is not a feeder" in str(err.value) assert "Module 'example_extractor' is not a feeder" in str(err.value)
def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
    """An extractor whose ``setup`` raises should still have its ``cleanup`` invoked.

    ``ExampleExtractor.setup`` is patched to raise; the orchestrator's setup is
    expected to log the failure, and the log must also contain "cleanup".
    """
    orchestrator = ArchivingOrchestrator()

    # hack to set up the paths so we can patch properly
    orchestrator.module_factory.setup_paths([TEST_MODULES])

    # patch ExampleExtractor.setup to throw an exception
    setup_target = "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup"
    mocker.patch(setup_target, side_effect=Exception("Test exception"))

    with pytest.raises(Exception):
        orchestrator.setup(test_args + ["--extractors", "example_extractor"])

    captured_log = caplog.text
    assert "Error during setup of modules: Test exception" in captured_log
    # make sure the 'cleanup' is called
    assert "cleanup" in captured_log