Ruff format with defaults.

This commit is contained in:
erinhmclark
2025-03-10 18:44:54 +00:00
parent cbb0414e5f
commit 85abe1837a
155 changed files with 2539 additions and 1908 deletions

View File

@@ -1,6 +1,7 @@
"""
pytest conftest file, for shared fixtures and configuration
"""
import os
import pickle
from datetime import datetime, timezone
@@ -16,32 +17,34 @@ from auto_archiver.core.module import ModuleFactory
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
# what comes first will be run first (at the end of all other tests not mentioned)
# format is the name of the module (python file) without the .py extension
TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
@pytest.fixture
def setup_module(request):
    """Provide a factory that loads a module for the requesting test.

    The returned callable loads a module through ``ModuleFactory``, gives it a
    temporary directory (exposed as ``tmp_dir``) and registers a finalizer on
    the requesting test so that directory is removed at teardown.
    """

    def _setup_module(module_name, config=None):
        """Load ``module_name`` with ``config`` and attach a temp dir to it.

        Args:
            module_name: the module's name, or a class; for a class the name
                is taken from the second-to-last component of its dotted
                ``__module__`` path (the parent folder).
            config: configuration passed to the factory under the module's
                name. Defaults to a fresh empty dict on every call.

        Returns:
            The object returned by ``ModuleFactory.get_module`` with its
            ``tmp_dir`` attribute set to the temporary directory's path.
        """
        # Build the default per call: a shared ``{}`` default would carry any
        # mutation made while one test runs into every later test.
        if config is None:
            config = {}

        module_factory = ModuleFactory()

        if isinstance(module_name, type):
            # get the module name:
            # if the class does not have a .name, use the name of the parent folder
            module_name = module_name.__module__.rsplit(".", 2)[-2]

        m = module_factory.get_module(module_name, {module_name: config})

        # add the tmp_dir to the module and remove it when the test finishes
        tmp_dir = TemporaryDirectory()
        m.tmp_dir = tmp_dir.name
        request.addfinalizer(tmp_dir.cleanup)

        return m

    return _setup_module
@pytest.fixture
def check_hash():
def _check_hash(filename: str, hash: str):
@@ -51,6 +54,7 @@ def check_hash():
return _check_hash
@pytest.fixture
def make_item():
def _make_item(url: str, **kwargs) -> Metadata:
@@ -62,7 +66,6 @@ def make_item():
return _make_item
def pytest_collection_modifyitems(items):
module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
@@ -78,13 +81,13 @@ def pytest_collection_modifyitems(items):
items[:] = sorted_items
# Incremental testing - fail tests in a class if any previous test fails
# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
# store history of failures per test class name and per index in parametrize (if parametrize used)
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
def pytest_runtest_makereport(item, call):
if "incremental" in item.keywords:
# incremental marker is used
@@ -93,17 +96,11 @@ def pytest_runtest_makereport(item, call):
# retrieve the class name of the test
cls_name = str(item.cls)
# retrieve the index of the test (if parametrize is used in combination with incremental)
parametrize_index = (
tuple(item.callspec.indices.values())
if hasattr(item, "callspec")
else ()
)
parametrize_index = tuple(item.callspec.indices.values()) if hasattr(item, "callspec") else ()
# retrieve the name of the test function
test_name = item.originalname or item.name
# store in _test_failed_incremental the original name of the failed test
_test_failed_incremental.setdefault(cls_name, {}).setdefault(
parametrize_index, test_name
)
_test_failed_incremental.setdefault(cls_name, {}).setdefault(parametrize_index, test_name)
def pytest_runtest_setup(item):
@@ -119,16 +116,17 @@ def pytest_runtest_setup(item):
pytest.xfail(f"previous test failed ({test_name})")
@pytest.fixture()
def unpickle():
    """Provide a loader for pickled test data.

    The returned callable takes a path relative to ``tests/data`` and returns
    the object unpickled from that file.
    """

    def _unpickle(path):
        pickled_file = os.path.join("tests/data", path)
        with open(pickled_file, "rb") as handle:
            loaded = pickle.load(handle)
        return loaded

    return _unpickle
@@ -156,4 +154,4 @@ def metadata():
metadata = Metadata()
metadata.set("_processed_at", "2021-01-01T00:00:00")
metadata.set_url("https://example.com")
return metadata
return metadata

View File

@@ -1,5 +1,6 @@
# this is a dummy class used to test importing a dropin in the
# generic extractor by filename/path
class Dropin:
pass
pass

View File

@@ -1 +1 @@
from .example_module import ExampleModule
from .example_module import ExampleModule

View File

@@ -16,14 +16,14 @@
"dependencies": {
"python": ["loguru"],
"bin": ["bash"],
},
# configurations that this module takes. These are argparse-compliant dictionaries that are
},
# configurations that this module takes. These are argparse-compliant dictionaries that are
# used to create command line arguments when the programme is run.
# The full name of the config option will become: `module_name.config_name`
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"},
"required_field": {"required": True, "help": "required field in the CSV file"},
},
"csv_file": {"default": "db.csv", "help": "CSV file name"},
"required_field": {"required": True, "help": "required field in the CSV file"},
},
# A description of the module, used for documentation
"description": "This is an example module",
}
}

View File

@@ -1,5 +1,6 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def download(self, item):
print("download")
@@ -7,7 +8,6 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def __iter__(self):
yield Metadata().set_url("https://example.com")
def done(self, result):
print("done")
@@ -16,13 +16,12 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def get_cdn_url(self, media):
return "nice_url"
def save(self, item):
print("save")
def uploadf(self, file, key, **kwargs):
print("uploadf")
def format(self, item):
print("format")

View File

@@ -41,9 +41,16 @@ def test_fetch(api_db, metadata, mocker):
mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime")
mock_datetime.now.return_value = "2021-01-01T00:00:00"
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = [{"result": {}}, {"result":
{'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'},
'status': 'no archiver'}}]
mock_get.return_value.json.return_value = [
{"result": {}},
{
"result": {
"media": [],
"metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"},
"status": "no archiver",
}
},
]
assert api_db.fetch(metadata) == metadata
@@ -52,8 +59,15 @@ def test_done_success(api_db, metadata, mocker):
mock_post.return_value.status_code = 201
api_db.done(metadata)
mock_post.assert_called_once()
mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive",
json={'author_id': 'Someone', 'url': 'https://example.com',
'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'},
headers={'Authorization': 'Bearer test-token'})
mock_post.assert_called_once_with(
"https://api.example.com/interop/submit-archive",
json={
"author_id": "Someone",
"url": "https://example.com",
"public": False,
"group_id": "123",
"tags": ["[", "]"],
"result": '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}',
},
headers={"Authorization": "Bearer test-token"},
)

View File

@@ -50,9 +50,7 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker):
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
atlos_db.failed(metadata, "failure reason")
expected_endpoint = f"/api/v2/source_material/metadata/42/auto_archiver"
expected_json = {
"metadata": {"processed": True, "status": "error", "error": "failure reason"}
}
expected_json = {"metadata": {"processed": True, "status": "error", "error": "failure reason"}}
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)

View File

@@ -1,4 +1,3 @@
from auto_archiver.modules.csv_db import CSVDb
from auto_archiver.core import Metadata
@@ -9,12 +8,21 @@ def test_store_item(tmp_path, setup_module):
temp_db = tmp_path / "temp_db.csv"
db = setup_module(CSVDb, {"csv_file": temp_db.as_posix()})
item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
item = (
Metadata()
.set_url("http://example.com")
.set_title("Example")
.set_content("Example content")
.success("my-archiver")
)
db.done(item)
with open(temp_db, "r", encoding="utf-8") as f:
assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
assert (
f.read().strip()
== f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
)
# TODO: csv db doesn't have a fetch method - need to add it (?)
# assert db.fetch(item) == item
# assert db.fetch(item) == item

View File

@@ -28,6 +28,7 @@ def mock_metadata(mocker):
metadata.get_first_image.return_value = None
return metadata
@pytest.fixture
def metadata():
metadata = Metadata()
@@ -51,6 +52,7 @@ def mock_media(mocker):
mock_media.get.return_value = "not-calculated"
return mock_media
@pytest.fixture
def gsheets_db(mock_gworksheet, setup_module, mocker):
mocker.patch("gspread.service_account")
@@ -59,7 +61,22 @@ def gsheets_db(mock_gworksheet, setup_module, mocker):
"sheet_id": None,
"header": 1,
"service_account": "test/service_account.json",
"columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 'replaywebpage': 'replaywebpage'},
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
@@ -78,20 +95,21 @@ def fixed_timestamp():
@pytest.fixture
def expected_calls(mock_media, fixed_timestamp):
"""Fixture for the expected cell updates."""
return [
(1, 'status', 'my-archiver: success'),
(1, 'archive', 'http://example.com/screenshot.png'),
(1, 'date', '2025-02-01T00:00:00+00:00'),
(1, 'title', 'Example Title'),
(1, 'text', 'Example Content'),
(1, 'timestamp', '2025-01-01T00:00:00+00:00'),
(1, 'hash', 'not-calculated'),
return [
(1, "status", "my-archiver: success"),
(1, "archive", "http://example.com/screenshot.png"),
(1, "date", "2025-02-01T00:00:00+00:00"),
(1, "title", "Example Title"),
(1, "text", "Example Content"),
(1, "timestamp", "2025-01-01T00:00:00+00:00"),
(1, "hash", "not-calculated"),
# (1, 'screenshot', 'http://example.com/screenshot.png'),
# (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
# (1, 'wacz', 'http://example.com/browsertrix.wacz'),
# (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
]
def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
gw, row = gsheets_db._retrieve_gsheet(metadata)
assert gw == mock_gworksheet
@@ -100,27 +118,34 @@ def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
def test_started(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.started(mock_metadata)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Archive in progress")
def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
reason = "Test failure"
gsheets_db.failed(mock_metadata, reason)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", f"Archive failed {reason}")
def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.aborted(mock_metadata)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "")
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata)
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata, cached=True)
# Verify the status message includes "[cached]"
@@ -131,15 +156,17 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
# clear media from metadata
metadata.media = []
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata)
# Verify nothing media-related gets updated
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'}
media_fields = {"archive", "screenshot", "thumbnail", "wacz", "replaywebpage"}
assert all(call[1] not in media_fields for call in call_args)
def test_safe_status_update(gsheets_db, metadata, mock_gworksheet):
gsheets_db._safe_status_update(metadata, "Test status")
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Test status")

View File

@@ -4,34 +4,50 @@ from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
])
@pytest.mark.parametrize(
"algorithm, filename, expected_hash",
[
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
(
"SHA3-512",
"tests/data/testfile_1.txt",
"d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e",
),
(
"SHA3-512",
"tests/data/testfile_2.txt",
"e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6",
),
],
)
def test_calculate_hash(algorithm, filename, expected_hash, setup_module):
# test SHA-256
he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 100})
assert he.calculate_hash(filename) == expected_hash
def test_default_config_values(setup_module):
    """A HashEnricher set up without overrides exposes the default settings."""
    enricher = setup_module(HashEnricher)
    expected_defaults = {"algorithm": "SHA-256", "chunksize": 16000000}
    # Checked in the same order as before: algorithm first, then chunksize.
    for attribute, default in expected_defaults.items():
        assert getattr(enricher, attribute) == default
def test_config():
# test default config
c = ModuleFactory().get_module_lazy('hash_enricher').configs
c = ModuleFactory().get_module_lazy("hash_enricher").configs
assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
assert c["algorithm"]["help"] == "hash algorithm to use"
assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
assert (
c["chunksize"]["help"]
== "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
)
def test_hash_media(setup_module):
he = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 1})
# generate metadata with two test files
@@ -46,4 +62,4 @@ def test_hash_media(setup_module):
he.enrich(m)
assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"

View File

@@ -16,6 +16,7 @@ def mock_metadata(mocker):
mock.get_all_media.return_value = []
return mock
@pytest.fixture
def mock_media(mocker):
"""Creates a mock Media object."""
@@ -59,6 +60,7 @@ def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
assert metadata.get("total_bytes") == 3000
assert metadata.get("total_size") == "2.9 KB"
@pytest.mark.parametrize(
"size, expected",
[
@@ -74,6 +76,7 @@ def test_human_readable_bytes(size, expected):
enricher = MetaEnricher()
assert enricher.human_readable_bytes(size) == expected
def test_enrich_file_sizes_no_media(meta_enricher, metadata):
"""Test that enrich_file_sizes() handles empty media list gracefully."""
meta_enricher.enrich_file_sizes(metadata)
@@ -91,4 +94,4 @@ def test_enrich_archive_duration(meta_enricher, metadata, mocker):
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
assert metadata.get("archive_duration_seconds") == 630
assert metadata.get("archive_duration_seconds") == 630

View File

@@ -1,4 +1,3 @@
import pytest
from auto_archiver.core import Media
@@ -33,9 +32,7 @@ def test_get_metadata(enricher, output, expected, mocker):
result = enricher.get_metadata("test.jpg")
assert result == expected
mock_run.assert_called_once_with(
["exiftool", "test.jpg"], capture_output=True, text=True
)
mock_run.assert_called_once_with(["exiftool", "test.jpg"], capture_output=True, text=True)
def test_get_metadata_exiftool_not_found(enricher, mocker):
@@ -85,4 +82,3 @@ def test_metadata_pickle(enricher, unpickle, mocker):
actual_media = metadata.media
assert len(expected_media) == len(actual_media)
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")

View File

@@ -57,7 +57,7 @@ def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
("screenshot", False),
("warc-file-123", False),
("regular-image", True),
]
],
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
metadata = Metadata()
@@ -73,4 +73,3 @@ def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
media_item = metadata.media[0]
assert (media_item.get("pdq_hash") is not None) == should_have_hash

View File

@@ -19,9 +19,11 @@ def mock_selenium_env(mocker):
mock_popen = mocker.patch("subprocess.Popen")
mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
# Define side effect for `shutil.which`
def mock_which_side_effect(dep):
return "/mock/geckodriver" if dep == "geckodriver" else None
mock_which.side_effect = mock_which_side_effect
# Mock binary paths
@@ -104,13 +106,7 @@ def test_enrich_adds_screenshot(
],
)
def test_enrich_auth_wall(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
url,
is_auth,
mocker
screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
):
# Testing with and without is_auth_wall
mock_driver, mock_driver_class, _ = mock_selenium_env
@@ -128,9 +124,7 @@ def test_enrich_auth_wall(
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
def test_handle_timeout_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
mock_driver.get.side_effect = TimeoutException
@@ -140,9 +134,7 @@ def test_handle_timeout_exception(
assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
"""Test proper handling of unexpected general exceptions"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Simulate a generic exception when save_screenshot is called
@@ -152,9 +144,7 @@ def test_handle_general_exception(
mock_log = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify that the exception was logged with the log
mock_log.assert_called_once_with(
"Got error while loading webdriver for screenshot enricher: Unexpected Error"
)
mock_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
# And no new media was added due to the error
assert len(metadata_with_video.media) == 1

View File

@@ -51,4 +51,3 @@ def test_ssl_error_handling(enricher, metadata, mocker):
mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
with pytest.raises(ssl.SSLError, match="SSL error"):
enricher.enrich(metadata)

View File

@@ -25,7 +25,7 @@ def mock_ffmpeg_environment(mocker):
# Mocking all the ffmpeg calls in one place
mock_ffmpeg_input = mocker.patch("ffmpeg.input")
mock_makedirs = mocker.patch("os.makedirs")
mocker.patch.object(Media, "is_video", return_value=True),
(mocker.patch.object(Media, "is_video", return_value=True),)
mock_probe = mocker.patch(
"ffmpeg.probe",
return_value={
@@ -35,9 +35,7 @@ def mock_ffmpeg_environment(mocker):
},
)
mock_output = mocker.MagicMock()
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
mock_output
)
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = mock_output
return {
"mock_ffmpeg_input": mock_ffmpeg_input,
@@ -47,14 +45,21 @@ def mock_ffmpeg_environment(mocker):
}
@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
])
@pytest.mark.parametrize(
"thumbnails_per_minute, max_thumbnails, expected_count",
[
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
],
)
def test_enrich_thumbnail_limits(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
thumbnails_per_minute, max_thumbnails, expected_count
thumbnail_enricher,
metadata_with_video,
mock_ffmpeg_environment,
thumbnails_per_minute,
max_thumbnails,
expected_count,
):
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -65,8 +70,8 @@ def test_enrich_thumbnail_limits(
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
mocker.patch("os.makedirs")
mock_logger = mocker.patch("loguru.logger.error")
@@ -74,36 +79,43 @@ def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, m
thumbnail_enricher.enrich(metadata_with_video)
# Ensure error was logged
mock_logger.assert_called_with(
f"error getting duration of video video.mp4: Probe error"
)
mock_logger.assert_called_with(f"error getting duration of video video.mp4: Probe error")
# Ensure no thumbnails were created
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert thumbnails is None
def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch.object(Media, "is_video", return_value=False)
mock_ffmpeg = mocker.patch("ffmpeg.input")
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg.assert_not_called()
mocker.patch.object(Media, "is_video", return_value=False)
mock_ffmpeg = mocker.patch("ffmpeg.input")
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg.assert_not_called()
@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
])
@pytest.mark.parametrize(
"thumbnails_per_minute,max_thumbnails,expected_count",
[
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
],
)
def test_enrich_handles_short_video(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
thumbnail_enricher,
metadata_with_video,
mock_ffmpeg_environment,
thumbnails_per_minute,
max_thumbnails,
expected_count,
mocker,
):
# override mock duration
fake_duration = 10
mocker.patch(
"ffmpeg.probe",
return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
)
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -114,9 +126,7 @@ def test_enrich_handles_short_video(
assert len(thumbnails) == expected_count
def test_uses_existing_duration(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
):
def test_uses_existing_duration(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment):
metadata_with_video.media[0].set("duration", 60)
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg_environment["mock_probe"].assert_not_called()
@@ -125,7 +135,7 @@ def test_uses_existing_duration(
def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
fake_duration = 120
mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
mocker.patch("ffmpeg.probe", return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]})
thumbnail_enricher.thumbnails_per_minute = 2
thumbnail_enricher.max_thumbnails = 4

View File

@@ -8,34 +8,43 @@ from auto_archiver.core import Metadata
@pytest.fixture
def mock_is_auth_wall(mocker):
    """Provide a helper that patches ``is_auth_wall`` to return a fixed answer."""

    def _mock_is_auth_wall(return_value: bool):
        patch_target = "auto_archiver.utils.url.is_auth_wall"
        patched = mocker.patch(patch_target, return_value=return_value)
        return patched

    return _mock_is_auth_wall
@pytest.fixture
def mock_post_success(mocker):
    """Provide a helper that patches ``requests.post`` with a canned response.

    Called without a payload, the mocked response's ``json()`` returns
    ``{"job_id": "job123"}``; the status code defaults to 200.
    """

    def _mock_post(json_data: dict = None, status_code: int = 200):
        # Only substitute the default when nothing was passed, so an explicit
        # empty payload is kept as-is.
        if json_data is None:
            json_data = {"job_id": "job123"}
        response = mocker.Mock(status_code=status_code)
        response.json.return_value = json_data
        return mocker.patch("requests.post", return_value=response)

    return _mock_post
@pytest.fixture
def mock_get_success(mocker):
    """Fixture to mock GET requests returning a completed archive status."""

    def _mock_get(json_data: dict = None, status_code: int = 200):
        """Patch ``requests.get`` so it returns a canned response.

        Args:
            json_data: payload returned by the mocked response's ``json()``.
                When omitted, a successful archive status is used.
            status_code: status code set on the mocked response.

        Returns:
            The mock installed in place of ``requests.get``.
        """
        # Test against None rather than truthiness so an explicitly passed
        # empty payload ({}) is honoured instead of being silently replaced by
        # the success default; mock_post_success handles its default this way.
        if json_data is None:
            json_data = {
                "status": "success",
                "timestamp": "20250101010101",
                "original_url": "https://example.com",
            }
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.get", return_value=resp)

    return _mock_get
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
configs: dict = {
@@ -49,12 +58,7 @@ def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
return setup_module("wayback_extractor_enricher", configs)
def test_download_success(
wayback_extractor_enricher,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
def test_download_success(wayback_extractor_enricher, mock_is_auth_wall, mock_post_success, mock_get_success):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
@@ -63,34 +67,28 @@ def test_download_success(
result = wayback_extractor_enricher.download(metadata)
assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    """enrich() returns None when is_auth_wall is patched to report True."""
    mock_is_auth_wall(True)
    assert wayback_extractor_enricher.enrich(metadata) is None
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    """enrich() returns True when the metadata already carries a wayback value."""
    metadata.set("wayback", "existing")
    outcome = wayback_extractor_enricher.enrich(metadata)
    assert outcome is True
def test_enrich_post_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
def test_enrich_post_failure(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
mock_is_auth_wall(False)
mock_post_success(json_data={"error": "server error"}, status_code=500)
result = wayback_extractor_enricher.enrich(metadata)
assert result is False
assert "Internet archive failed with status of 500" in metadata.get("wayback")
def test_enrich_post_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mocker
):
def test_enrich_post_json_decode_error(wayback_extractor_enricher, metadata, mock_is_auth_wall, mocker):
mock_is_auth_wall(False)
resp = mocker.Mock(status_code=200)
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
@@ -98,22 +96,15 @@ def test_enrich_post_json_decode_error(
mocker.patch("requests.post", return_value=resp)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_no_job_id(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
def test_enrich_no_job_id(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
mock_is_auth_wall(False)
mock_post_success(json_data={})
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_success(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
@@ -122,24 +113,18 @@ def test_enrich_get_success(
assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
def test_enrich_get_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success(json_data={"status": "failed"}, status_code=400)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_request_exception(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
):
mock_is_auth_wall(False)
mock_post_success()
@@ -149,12 +134,9 @@ def test_enrich_get_request_exception(
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"
def test_enrich_get_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
):
mock_is_auth_wall(False)
mock_post_success()

View File

@@ -16,7 +16,7 @@ def enricher(mocker):
"include_srt": False,
"timeout": 5,
"action": "translate",
"steps": {"storages": ["s3_storage"]}
"steps": {"storages": ["s3_storage"]},
}
mock_s3 = mocker.MagicMock(spec=S3Storage)
mock_s3.get_cdn_url.return_value = TEST_S3_URL
@@ -25,7 +25,7 @@ def enricher(mocker):
instance.display_name = "Whisper Enricher"
instance.config_setup({instance.name: config})
# bypassing the setup method and mocking S3 setup
instance.stores = config['steps']['storages']
instance.stores = config["steps"]["storages"]
instance.s3 = mock_s3
yield instance, mock_s3
@@ -63,19 +63,14 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
# Mock the complete API interaction chain
mock_status_response = mocker.MagicMock()
mock_status_response.status_code = 200
mock_status_response.json.return_value = {
"status": "success",
"meta": {}
}
mock_status_response.json.return_value = {"status": "success", "meta": {}}
mock_artifacts_response = mocker.MagicMock()
mock_artifacts_response.status_code = 200
mock_artifacts_response.json.return_value = [{
"data": [{"start": 0, "end": 5, "text": "test transcript"}]
}]
mock_artifacts_response.json.return_value = [{"data": [{"start": 0, "end": 5, "text": "test transcript"}]}]
# Set up mock response sequence
mock_requests.get.side_effect = [
mock_status_response, # First call: status check
mock_artifacts_response # Second call: artifacts check
mock_artifacts_response, # Second call: artifacts check
]
# Run enrichment (without opening file)
@@ -84,15 +79,17 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
mock_requests.post.assert_called_once_with(
"http://testapi/jobs",
json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
headers={"Authorization": "Bearer whisper-key"}
headers={"Authorization": "Bearer whisper-key"},
)
# Verify job status checks
assert mock_requests.get.call_count == 2
assert "artifact_0_text" in metadata.media[0].get("whisper_model")
assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
'job_id': 'job123',
'job_status_check': 'http://testapi/jobs/job123'}
assert metadata.media[0].get("whisper_model") == {
"artifact_0_text": "test transcript",
"job_artifacts_check": "http://testapi/jobs/job123/artifacts",
"job_id": "job123",
"job_status_check": "http://testapi/jobs/job123",
}
def test_submit_job(enricher, mocker):

View File

@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor
class TestExtractorBase(object):
extractor_module: str = None
config: dict = None
@@ -17,7 +16,7 @@ class TestExtractorBase(object):
assert self.config is not None, "self.config must be a dict set on the subclass"
self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False

View File

@@ -9,26 +9,28 @@ import pytest
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
# True only when the GITHUB_ACTIONS environment variable is exactly "true";
# used below in skipif markers for tests that cannot run on CI.
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
class TestGenericExtractor(TestExtractorBase):
"""Tests Generic Extractor
"""
extractor_module = 'generic_extractor'
"""Tests Generic Extractor"""
extractor_module = "generic_extractor"
extractor: GenericExtractor
config = {
'subtitles': False,
'comments': False,
'livestreams': False,
'live_from_start': False,
'end_means_success': True,
'allow_playlist': False,
'max_downloads': "inf",
'proxy': None,
'cookies_from_browser': False,
'cookie_file': None,
}
"subtitles": False,
"comments": False,
"livestreams": False,
"live_from_start": False,
"end_means_success": True,
"allow_playlist": False,
"max_downloads": "inf",
"proxy": None,
"cookies_from_browser": False,
"cookie_file": None,
}
def test_load_dropin(self):
# test loading dropins that are in the generic_archiver package
package = "auto_archiver.modules.generic_extractor"
@@ -38,21 +40,26 @@ class TestGenericExtractor(TestExtractorBase):
path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@pytest.mark.parametrize(
    "url, is_suitable",
    [
        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
        ("https://www.twitch.tv/videos/1167226570", True),
        (
            "https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
            True,
        ),
        ("https://google.com", True),
    ],
)
def test_suitable_urls(self, make_item, url, is_suitable):
    """
    Check that `suitable()` returns the expected value for each parametrized URL.

    Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs.
    This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has
    extractors for, and then, if and only if all archivers fail, fall back to the generic archiver).
    """
    assert self.extractor.suitable(url) == is_suitable
@@ -63,11 +70,14 @@ class TestGenericExtractor(TestExtractorBase):
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
@pytest.mark.download
@pytest.mark.parametrize("url", [
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1"
])
@pytest.mark.parametrize(
"url",
[
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1",
],
)
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existent posts/media
@@ -78,7 +88,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert not result
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
@pytest.mark.skipif(
CI,
reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
)
@pytest.mark.download
def test_youtube_download(self, make_item):
# url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -87,7 +100,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
assert (
result.get("description")
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
)
assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
assert Path(result.media[1].filename).name == "hqdefault.jpg"
@@ -103,7 +119,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.download
def test_bluesky_download_no_media(self, make_item):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
@@ -115,7 +131,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_video(self, make_item):
@@ -130,14 +146,14 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_poll(self, make_item):
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_single_image(self, make_item):
@@ -159,7 +175,7 @@ class TestGenericExtractor(TestExtractorBase):
url = "https://x.com/Bellingcat/status/17197025860711058"
response = self.extractor.download(make_item(url))
assert not response
@pytest.mark.download
def test_twitter_download_malformed_tweetid(self, make_item):
# this tweet does not exist
@@ -169,7 +185,6 @@ class TestGenericExtractor(TestExtractorBase):
@pytest.mark.download
def test_twitter_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
@@ -177,9 +192,9 @@ class TestGenericExtractor(TestExtractorBase):
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"yt-dlp_Twitter: success"
"yt-dlp_Twitter: success",
)
@pytest.mark.download
def test_twitter_download_video(self, make_item):
url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -187,26 +202,46 @@ class TestGenericExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.xfail(
    reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
)
@pytest.mark.download
@pytest.mark.parametrize(
    "url, title, timestamp, image_hash",
    [
        (
            "https://x.com/SozinhoRamalho/status/1876710769913450647",
            "ignore tweet, testing sensitivity warning nudity",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
            "image_hash",
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876710875475681357",
            "ignore tweet, testing sensitivity warning violence",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
            "image_hash",
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876711053813227618",
            "ignore tweet, testing sensitivity warning sensitive",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
            "image_hash",
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876711141314801937",
            "ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
            "image_hash",
        ),
    ],
)
def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
    """Download tweets with sensitive media.

    Marked xfail: per the marker's reason, sensitive content needs a logged-in
    session/cookies, which is not implemented yet.
    """
    post = self.extractor.download(make_item(url))
    self.assertValidResponseMetadata(post, title, timestamp)
    # Each of these tweets is expected to yield exactly one media item.
    assert len(post.media) == 1
    assert post.media[0].hash == image_hash

View File

@@ -15,10 +15,11 @@ def mock_user_response():
"username": "test_user",
"full_name": "Test User",
"profile_pic_url_hd": "http://example.com/profile.jpg",
"profile_pic_url": "http://example.com/profile_lowres.jpg"
"profile_pic_url": "http://example.com/profile_lowres.jpg",
}
}
@pytest.fixture
def mock_post_response():
return {
@@ -27,16 +28,14 @@ def mock_post_response():
"caption_text": "Test Caption",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/video.mp4",
"thumbnail_url": "http://example.com/thumbnail.jpg"
"thumbnail_url": "http://example.com/thumbnail.jpg",
}
@pytest.fixture
def mock_story_response():
    """Return a one-element list with a fake story payload: id, current timestamp and video URL."""
    return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
@pytest.fixture
def mock_highlight_response():
@@ -46,11 +45,13 @@ def mock_highlight_response():
"highlight:123": {
"id": "123",
"title": "Test Highlight",
"items": [{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4"
}]
"items": [
{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4",
}
],
}
}
}
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
m.set("netloc", "instagram.com")
return m
@pytest.mark.parametrize(
    "url,expected",
    [
        ("https://instagram.com/user", [("", "user", "")]),
        ("https://instagr.am/p/post_id", []),
        ("https://youtube.com", []),
        ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
        ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
        ("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
    ],
)
def test_url_parsing(self, url, expected):
    """Check the groups that `valid_url.findall` extracts from each URL (empty list when nothing matches)."""
    assert self.extractor.valid_url.findall(url) == expected
def test_initialize(self):
    """The configured extractor's `api_endpoint` must not end with a trailing slash."""
    assert self.extractor.api_endpoint[-1] != "/"
@pytest.mark.parametrize(
    "input_dict,expected",
    [
        ({"x": 0, "valid": "data"}, {"valid": "data"}),
        ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
    ],
)
def test_cleanup_dict(self, input_dict, expected):
    """Check `cleanup_dict` output: in these cases the `0` and `None` entries are dropped, including nested ones."""
    assert self.extractor.cleanup_dict(input_dict) == expected
@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
"""Test basic profile download without full_profile"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_download = mocker.patch.object(self.extractor, "download_from_url")
# Mock API responses
mock_call.return_value = mock_user_response
mock_download.return_value = "profile.jpg"
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
"""Test full profile download with stories/posts"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
mock_story_response
]
mock_call.side_effect = [mock_user_response, mock_story_response]
mock_highlights.return_value = None
mock_stories.return_value = mock_story_response
mock_posts.return_value = None
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_not_found(self, metadata, mocker):
"""Test profile not found error"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_call.return_value = {"user": None}
with pytest.raises(AssertionError) as exc_info:
self.extractor.download_profile(metadata, "invalid_user")
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
"""Test error handling in full profile mode"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
Exception("Stories API failed"),
Exception("Posts API failed")
]
mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
mock_highlights.return_value = None
mock_tagged.return_value = None
stories_tagged.return_value = None
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
result = self.extractor.download_profile(metadata, "test_user")
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]
assert "Error downloading stories for test_user" in result.metadata["errors"]

View File

@@ -5,8 +5,7 @@ from auto_archiver.modules.instagram_extractor import InstagramExtractor
@pytest.fixture
def instagram_extractor(setup_module, mocker):
extractor_module: str = 'instagram_extractor'
extractor_module: str = "instagram_extractor"
config: dict = {
"username": "user_name",
"password": "password123",
@@ -17,20 +16,26 @@ def instagram_extractor(setup_module, mocker):
fake_loader.load_session_from_file.return_value = None
fake_loader.login.return_value = None
fake_loader.save_session_to_file.return_value = None
mocker.patch("instaloader.Instaloader", return_value=fake_loader,)
mocker.patch(
"instaloader.Instaloader",
return_value=fake_loader,
)
return setup_module(extractor_module, config)
@pytest.mark.parametrize(
    "url",
    [
        "https://www.instagram.com/p/",
        "https://www.instagram.com/p/1234567890/",
        "https://www.instagram.com/reel/1234567890/",
        "https://www.instagram.com/username/",
        "https://www.instagram.com/username/stories/",
        "https://www.instagram.com/username/highlights/",
    ],
)
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
    """
    Ensure that the valid_url regex matches all provided Instagram URLs.
    """
    assert instagram_extractor.valid_url.match(url)

View File

@@ -9,8 +9,8 @@ from tests.extractors.test_extractor_base import TestExtractorBase
@pytest.fixture
def patch_extractor_methods(request, setup_module, mocker):
    """Replace InstagramTbotExtractor's session-file and Telegram-client setup methods with no-op mocks."""
    mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
    mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
    yield
@@ -35,12 +35,7 @@ def mock_telegram_client(mocker):
@pytest.fixture
def extractor(setup_module, patch_extractor_methods, mocker):
extractor_module = "instagram_tbot_extractor"
config = {
"api_id": 12345,
"api_hash": "test_api_hash",
"session_file": "test_session",
"timeout": 4
}
config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
extractor = setup_module(extractor_module, config)
extractor.client = mocker.MagicMock()
extractor.session_file = "test_session"
@@ -79,21 +74,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
"session_file": "secrets/anon-insta",
}
@pytest.mark.parametrize("url, expected_status, message, len_media", [
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6),
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
])
@pytest.mark.parametrize(
"url, expected_status, message, len_media",
[
(
"https://www.instagram.com/p/C4QgLbrIKXG",
"insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6,
),
(
"https://www.instagram.com/reel/DEVLK8qoIbg/",
"insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3,
),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
],
)
def test_download(self, url, expected_status, message, len_media, metadata_sample):
"""Test the `download()` method with various Instagram URLs."""
metadata_sample.set_url(url)

View File

@@ -10,8 +10,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor
@pytest.mark.incremental
class TestTwitterApiExtractor(TestExtractorBase):
extractor_module = 'twitter_api_extractor'
extractor_module = "twitter_api_extractor"
config = {
"bearer_tokens": [],
@@ -22,41 +21,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
}
@pytest.mark.parametrize(
    "url, expected",
    [
        (
            "https://x.com/bellingcat/status/1874097816571961839",
            "https://x.com/bellingcat/status/1874097816571961839",
        ),  # x.com urls unchanged
        (
            "https://twitter.com/bellingcat/status/1874097816571961839",
            "https://twitter.com/bellingcat/status/1874097816571961839",
        ),  # twitter urls unchanged
        (
            "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
            "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
        ),  # don't strip params from twitter urls (changed Jan 2025)
        (
            "https://www.bellingcat.com/category/resources/",
            "https://www.bellingcat.com/category/resources/",
        ),  # non-twitter/x urls unchanged
        (
            "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
            "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
        ),  # shouldn't strip params from non-twitter/x URLs
    ],
)
def test_sanitize_url(self, url, expected):
    """`sanitize_url` must return each of these URLs unchanged, query parameters included."""
    assert expected == self.extractor.sanitize_url(url)
@pytest.mark.download
def test_sanitize_url_download(self):
    """`sanitize_url` is expected to turn the t.co short link into the Bellingcat resources URL."""
    assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
        "https://t.co/yl3oOJatFp"
    )
@pytest.mark.parametrize(
    "url, exptected_username, exptected_tweetid",
    [
        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://www.bellingcat.com/category/resources/", False, False),
    ],
)
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
    """Check the (username, tweet_id) pair from `get_username_tweet_id`; a non-tweet URL yields (False, False)."""
    username, tweet_id = self.extractor.get_username_tweet_id(url)
    assert exptected_username == username
    assert exptected_tweetid == tweet_id
def test_choose_variants(self):
    """`choose_variant` must pick the last entry here, the variant with the largest bit_rate in the list."""
    # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
    variant_list = [
        MediaVariant(
            content_type="application/x-mpegURL",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
        ),
        MediaVariant(
            bit_rate=256000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
        ),
        MediaVariant(
            bit_rate=832000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
        ),
        MediaVariant(
            bit_rate=2176000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
        ),
    ]
    chosen_variant = self.extractor.choose_variant(variant_list)
    assert chosen_variant == variant_list[3]
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_nonexistent_tweet(self, make_item):
@@ -76,7 +113,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
@@ -84,7 +120,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"twitter-api: success"
"twitter-api: success",
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@@ -95,27 +131,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.parametrize("url, title, timestamp", [
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
])
@pytest.mark.parametrize(
"url, title, timestamp",
[
(
"https://x.com/SozinhoRamalho/status/1876710769913450647",
"ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876710875475681357",
"ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876711053813227618",
"ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876711141314801937",
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
],
)
@pytest.mark.download
def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
"""Download tweets with sensitive media"""
post = self.extractor.download(make_item(url))
self.assertValidResponseMetadata(
post,
title,
timestamp
)
self.assertValidResponseMetadata(post, title, timestamp)
assert len(post.media) == 1
# check the SHA1 hash (quick) of the media, to make sure it's valid
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")

View File

@@ -9,6 +9,7 @@ def mock_vk_scraper(mocker):
"""Fixture to mock VkScraper."""
return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
"""Fixture to initialize VkExtractor with mocked VkScraper."""
@@ -39,7 +40,7 @@ def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
mock_scrapes = [
{"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
{"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2}
{"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
]
mock_filenames = ["image1.jpg", "image2.png"]
vk_extractor.vks.scrape.return_value = mock_scrapes
@@ -56,16 +57,16 @@ def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
assert len(result.media) == 2
assert result.media[0].filename == "image1.jpg"
assert result.media[1].filename == "image2.png"
vk_extractor.vks.download_media.assert_called_once_with(
mock_scrapes, vk_extractor.tmp_dir
)
vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
def test_adds_first_title_and_timestamp(vk_extractor):
metadata = Metadata().set_url("https://vk.com/no-metadata")
metadata.set_url("https://vk.com/no-metadata")
mock_scrapes = [{"text": "value", "datetime": "2023-01-01T00:00:00"},
{"text": "value2", "datetime": "2023-01-02T00:00:00"}]
mock_scrapes = [
{"text": "value", "datetime": "2023-01-01T00:00:00"},
{"text": "value2", "datetime": "2023-01-02T00:00:00"},
]
vk_extractor.vks.scrape.return_value = mock_scrapes
vk_extractor.vks.download_media.return_value = []
result = vk_extractor.download(metadata)
@@ -73,4 +74,4 @@ def test_adds_first_title_and_timestamp(vk_extractor):
assert result.get_title() == "value"
# formatted timestamp
assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
assert result.is_success()
assert result.is_success()

View File

@@ -36,29 +36,45 @@ def atlos_feeder(setup_module, mocker) -> AtlosFeeder:
@pytest.fixture
def mock_atlos_api(atlos_feeder):
"""Fixture to update the atlos_feeder.session.get side_effect."""
def _mock_responses(responses):
atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses]
return _mock_responses
def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
"""Test valid items are yielded and invalid ones ignored."""
mock_atlos_api([
{
"next": None,
"results": [
{"source_url": "http://example.com", "id": 1,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "", "id": 2,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "http://example.org", "id": 3,
"metadata": {"auto_archiver": {"processed": True}},
"visibility": "visible", "status": "complete"},
],
}
])
mock_atlos_api(
[
{
"next": None,
"results": [
{
"source_url": "http://example.com",
"id": 1,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
{
"source_url": "",
"id": 2,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
{
"source_url": "http://example.org",
"id": 3,
"metadata": {"auto_archiver": {"processed": True}},
"visibility": "visible",
"status": "complete",
},
],
}
]
)
items = list(atlos_feeder)
assert len(items) == 1
@@ -68,24 +84,34 @@ def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api):
"""Test iteration over multiple pages with valid items."""
mock_atlos_api([
{
"next": "cursor2",
"results": [
{"source_url": "http://example1.com", "id": 10,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
{
"next": None,
"results": [
{"source_url": "http://example2.com", "id": 20,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
])
mock_atlos_api(
[
{
"next": "cursor2",
"results": [
{
"source_url": "http://example1.com",
"id": 10,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
],
},
{
"next": None,
"results": [
{
"source_url": "http://example2.com",
"id": 20,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
],
},
]
)
items = list(atlos_feeder)
assert len(items) == 2

View File

@@ -1,13 +1,16 @@
import pytest
@pytest.fixture
def headerless_csv_file():
return "tests/data/csv_no_headers.csv"
@pytest.fixture
def header_csv_file():
return "tests/data/csv_with_headers.csv"
@pytest.fixture
def header_csv_file_non_default_column():
return "tests/data/csv_with_headers_non_default_column.csv"
@@ -23,6 +26,7 @@ def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
def test_csv_feeder_with_headers(header_csv_file, setup_module):
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
@@ -33,10 +37,10 @@ def test_csv_feeder_with_headers(header_csv_file, setup_module):
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
with caplog.at_level("WARNING"):
feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": 1})
urls = list(feeder)
@@ -54,4 +58,4 @@ def test_csv_feeder_column_by_name(header_csv_file, setup_module):
urls = list(feeder)
assert len(urls) == 2
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
assert urls[1].get_url() == "https://example.com/2/"

View File

@@ -19,35 +19,32 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
@pytest.fixture
def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
config: dict = {
"service_account": "dummy.json",
"sheet": "test-auto-archiver",
"sheet_id": None,
"header": 1,
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
}
"service_account": "dummy.json",
"sheet": "test-auto-archiver",
"sheet_id": None,
"header": 1,
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
}
mocker.patch("gspread.service_account")
feeder = setup_module(
"gsheet_feeder_db",
config
)
feeder = setup_module("gsheet_feeder_db", config)
feeder.gsheets_client = mocker.MagicMock()
return feeder
@@ -128,9 +125,7 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
(None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"),
],
)
def test_open_sheet_with_name_or_id(
setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker
):
def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker):
"""Ensure open_sheet() correctly opens by name or ID based on configuration."""
mock_service_account = mocker.patch("gspread.service_account")
mock_client = mocker.MagicMock()
@@ -145,9 +140,7 @@ def test_open_sheet_with_name_or_id(
)
sheet_result = feeder.open_sheet()
# Validate the correct method was called
getattr(mock_client, expected_method).assert_called_once_with(
expected_arg
), f"Failed: {description}"
getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}"
assert sheet_result == "MockSheet", f"Failed: {description}"
@@ -220,9 +213,7 @@ class TestGSheetsFeederReal:
@pytest.fixture(autouse=True)
def setup_feeder(self, setup_module):
assert (
self.module_name is not None
), "self.module_name must be set on the subclass"
assert self.module_name is not None, "self.module_name must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.feeder: Type[Feeder] = setup_module(self.module_name, self.config)
@@ -241,9 +232,7 @@ class TestGSheetsFeederReal:
"""Ensure open_sheet() connects to a real Google Sheets instance."""
sheet = self.feeder.open_sheet()
assert sheet is not None, "open_sheet() should return a valid sheet instance"
assert hasattr(
sheet, "worksheets"
), "Returned object should have worksheets method"
assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method"
def test_iter_yields_metadata_real_data(self):
"""Ensure __iter__() yields Metadata objects for real test sheet data."""

View File

@@ -81,40 +81,27 @@ class TestGWorksheet:
(False, ""),
],
)
def test_get_cell_or_default_handles_empty_values(
self, mock_worksheet, when_empty, expected
):
def test_get_cell_or_default_handles_empty_values(self, mock_worksheet, when_empty, expected):
mock_worksheet.get_values.return_value[1][0] = "" # Empty URL cell
g = GWorksheet(mock_worksheet)
assert (
g.get_cell_or_default(
2, "url", default="default", when_empty_use_default=when_empty
)
== expected
)
assert g.get_cell_or_default(2, "url", default="default", when_empty_use_default=when_empty) == expected
def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
assert (
gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
)
assert gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
# Test write operations
def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
gworksheet.set_cell(2, "url", "new_url")
mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")
def test_batch_set_cell_formats_requests_correctly(
self, mock_worksheet, gworksheet
):
def test_batch_set_cell_formats_requests_correctly(self, mock_worksheet, gworksheet):
updates = [(2, "url", "new_url"), (3, "status", "processed")]
gworksheet.batch_set_cell(updates)
expected_batch = [
{"range": "A2", "values": [["new_url"]]},
{"range": "B3", "values": [["processed"]]},
]
mock_worksheet.batch_update.assert_called_once_with(
expected_batch, value_input_option="USER_ENTERED"
)
mock_worksheet.batch_update.assert_called_once_with(expected_batch, value_input_option="USER_ENTERED")
def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
long_value = "x" * 50000

View File

@@ -5,13 +5,13 @@ from auto_archiver.core import Metadata, Media
def test_format(setup_module):
formatter = setup_module(HtmlFormatter)
metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')
metadata = Metadata().set("content", "Hello, world!").set_url("https://example.com")
final_media = formatter.format(metadata)
assert isinstance(final_media, Media)
assert ".html" in final_media.filename
with open (final_media.filename, "r", encoding="utf-8") as f:
with open(final_media.filename, "r", encoding="utf-8") as f:
content = f.read()
assert "Hello, world!" in content
assert final_media.mimetype == "text/html"
assert "SHA-256:" in final_media.get('hash')
assert "SHA-256:" in final_media.get("hash")

View File

@@ -8,6 +8,7 @@ class TestS3Storage:
"""
Test suite for S3Storage.
"""
module_name: str = "s3_storage"
storage: Type[S3Storage]
config: dict = {
@@ -32,10 +33,10 @@ class TestS3Storage:
"""Test that S3 client is initialized with correct parameters"""
assert self.storage.s3 is not None
assert self.storage.s3.meta.region_name == 'test-region'
assert self.storage.s3.meta.region_name == "test-region"
def test_get_cdn_url_generation(self):
"""Test CDN URL formatting """
"""Test CDN URL formatting"""
media = Media("test.txt")
media.key = "path/to/file.txt"
url = self.storage.get_cdn_url(media)
@@ -46,14 +47,14 @@ class TestS3Storage:
def test_uploadf_sets_acl_public(self, mocker):
media = Media("test.txt")
mock_file = mocker.MagicMock()
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
self.storage.uploadf(mock_file, media)
mock_s3_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Bucket="test-bucket",
Key=media.key,
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
)
def test_upload_decision_logic(self, mocker):
@@ -61,23 +62,31 @@ class TestS3Storage:
media = Media("test.txt")
assert self.storage.is_upload_needed(media) is True
self.storage.random_no_duplicate = True
mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
mock_calc_hash = mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
assert self.storage.is_upload_needed(media) is False
assert media.key == 'existing_key.txt'
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
assert media.key == "existing_key.txt"
mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
def test_skips_upload_when_duplicate_exists(self, mocker):
"""Test that upload skips when file_in_folder finds existing object"""
self.storage.random_no_duplicate = True
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
mock_file_in_folder = mocker.patch.object(
S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt"
)
media = Media("test.txt")
media.key = "original_path.txt"
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
mock_calculate_hash = mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt"
assert media.get("previously archived") is True
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
result = self.storage.uploadf(None, media)
mock_upload.assert_not_called()
assert result is True
@@ -85,21 +94,20 @@ class TestS3Storage:
def test_uploads_with_correct_parameters(self, mocker):
media = Media("test.txt")
media.key = "original_key.txt"
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
media.mimetype = 'image/png'
mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
media.mimetype = "image/png"
mock_file = mocker.MagicMock()
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
self.storage.uploadf(mock_file, media)
mock_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key='original_key.txt',
ExtraArgs={
'ACL': 'public-read',
'ContentType': 'image/png'
}
Bucket="test-bucket",
Key="original_key.txt",
ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
)
def test_file_in_folder_exists(self, mocker):
mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
mock_list_objects = mocker.patch.object(
self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]}
)
assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"

View File

@@ -101,7 +101,9 @@ def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Me
assert file_tuple[0] == os.path.basename(media.filename)
def test_upload_post_http_error(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
def test_upload_post_http_error(
tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker
) -> None:
"""Test upload() propagates HTTP error during POST."""
metadata.set("atlos_id", 303)
fake_get_response = {"result": {"artifacts": []}}
@@ -109,4 +111,3 @@ def test_upload_post_http_error(tmp_path, atlos_storage: AtlosStorage, metadata:
mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error"))
with pytest.raises(Exception, match="HTTP error"):
atlos_storage.upload(media, metadata)

View File

@@ -12,26 +12,28 @@ from tests.storages.test_storage_base import TestStorageBase
def gdrive_storage(setup_module, mocker):
module_name: str = "gdrive_storage"
storage: GDriveStorage
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file')
config: dict = {
"path_generator": "url",
"filename_generator": "static",
"root_folder_id": "fake_root_folder_id",
"oauth_token": None,
"service_account": "fake_service_account.json",
}
mocker.patch("google.oauth2.service_account.Credentials.from_service_account_file")
return setup_module(module_name, config)
def test_initialize_fails_with_non_existent_creds(setup_module):
"""Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
(and isn't mocked)
(and isn't mocked)
"""
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
config: dict = {
"path_generator": "url",
"filename_generator": "static",
"root_folder_id": "fake_root_folder_id",
"oauth_token": None,
"service_account": "fake_service_account.json",
}
with pytest.raises(FileNotFoundError) as exc_info:
setup_module("gdrive_storage", config)
assert "No such file or directory" in str(exc_info.value)
@@ -48,12 +50,12 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False)
assert result == "123"
def test_path_parts():
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
@pytest.mark.skip(reason="Requires real credentials")
@pytest.mark.download
class TestGDriveStorageConnected(TestStorageBase):
@@ -63,19 +65,17 @@ class TestGDriveStorageConnected(TestStorageBase):
module_name: str = "gdrive_storage"
storage: Type[GDriveStorage]
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
# TODO: replace with real root folder id
'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
'oauth_token': None,
'service_account': 'secrets/service_account.json'
}
config: dict = {
"path_generator": "url",
"filename_generator": "static",
# TODO: replace with real root folder id
"root_folder_id": "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
"oauth_token": None,
"service_account": "secrets/service_account.json",
}
def test_initialize_with_real_credentials(self):
"""
Test that the Google Drive service can be initialized with real credentials.
"""
assert self.storage.service is not None

View File

@@ -1,4 +1,3 @@
import os
from pathlib import Path
@@ -34,13 +33,13 @@ def test_get_cdn_url_relative(local_storage):
assert local_storage.get_cdn_url(media) == expected
def test_get_cdn_url_absolute(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
local_storage.save_absolute = True
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
assert local_storage.get_cdn_url(media) == expected
def test_upload_file_contents_and_metadata(local_storage, sample_media):
dest = os.path.join(local_storage.save_to, sample_media.key)
assert local_storage.upload(sample_media) is True
@@ -51,5 +50,3 @@ def test_upload_nonexistent_source(local_storage):
media = Media(key="missing.txt", filename="nonexistent.txt")
with pytest.raises(FileNotFoundError):
local_storage.upload(media)

View File

@@ -7,16 +7,11 @@ from auto_archiver.core.storage import Storage
class TestStorageBase(object):
module_name: str = None
config: dict = None
@pytest.fixture(autouse=True)
def setup_storage(self, setup_module):
assert (
self.module_name is not None
), "self.module_name must be set on the subclass"
assert self.module_name is not None, "self.module_name must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.storage: Type[Storage] = setup_module(
self.module_name, self.config
)
self.storage: Type[Storage] = setup_module(self.module_name, self.config)

View File

@@ -3,39 +3,46 @@ from auto_archiver.core import config
from ruamel.yaml.scanner import ScannerError
from ruamel.yaml.comments import CommentedMap
def test_return_default_config_for_nonexistent_file():
assert config.read_yaml("nonexistent_file.yaml") == config.EMPTY_CONFIG
def test_return_default_config_for_empty_file(tmp_path):
empty_file = tmp_path / "empty_file.yaml"
empty_file.write_text("")
assert config.read_yaml(empty_file) == config.EMPTY_CONFIG
def test_raise_error_on_invalid_yaml(tmp_path):
invalid_yaml = tmp_path / "invalid_yaml.yaml"
invalid_yaml.write_text("key: \"value_without_end_quote")
invalid_yaml.write_text('key: "value_without_end_quote')
# make sure it raises ScannerError
with pytest.raises(ScannerError):
config.read_yaml(invalid_yaml)
def test_write_yaml(tmp_path):
yaml_file = tmp_path / "write_yaml.yaml"
config.store_yaml(config.EMPTY_CONFIG, yaml_file.as_posix())
assert "steps:\n" in yaml_file.read_text()
def test_round_trip_comments(tmp_path):
yaml_file = tmp_path / "round_trip_comments.yaml"
with open(yaml_file, "w") as f:
f.write("generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2")
f.write(
"generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2"
)
loaded = config.read_yaml(yaml_file)
# check the comments are preserved
assert loaded['generic_extractor']['facebook_cookie'] == "abc"
assert loaded['generic_extractor'].ca.items['facebook_cookie'][2].value == "# end of line comment\n"
assert loaded["generic_extractor"]["facebook_cookie"] == "abc"
assert loaded["generic_extractor"].ca.items["facebook_cookie"][2].value == "# end of line comment\n"
# add some more items to my_settings
loaded['generic_extractor']['list_type'].append("bellingcat")
loaded["generic_extractor"]["list_type"].append("bellingcat")
config.store_yaml(loaded, yaml_file.as_posix())
assert "# comments: false" in yaml_file.read_text()
@@ -43,14 +50,17 @@ def test_round_trip_comments(tmp_path):
assert "abc # end of line comment" in yaml_file.read_text()
assert "- value2\n - bellingcat" in yaml_file.read_text()
def test_merge_dicts():
yaml_dict = config.EMPTY_CONFIG
yaml_dict['settings'] = CommentedMap(**{
yaml_dict["settings"] = CommentedMap(
**{
"key1": ["a"],
"key2": "old_value",
"key3": ["a", "b", "c"],
"key5": "value5",
})
}
)
dotdict = {
"settings.key1": ["b", "c"],
@@ -77,6 +87,7 @@ def test_check_types():
assert config.is_dict_type([]) == False
assert config.is_dict_type("") == False
def test_from_dot_notation():
dotdict = {
"settings.key1": ["a", "b", "c"],
@@ -88,16 +99,17 @@ def test_from_dot_notation():
assert normal_dict["settings"]["key2"] == "new_value"
assert normal_dict["settings"]["key3"]["key4"] == "value"
def test_to_dot_notation():
yaml_dict = config.EMPTY_CONFIG
yaml_dict['settings'] = {
yaml_dict["settings"] = {
"key1": ["a", "b", "c"],
"key2": "new_value",
"key3": {
"key4": "value",
}
},
}
dotdict = config.to_dot_notation(yaml_dict)
assert dotdict["settings.key1"] == ["a", "b", "c"]
assert dotdict["settings.key2"] == "new_value"
assert dotdict["settings.key3.key4"] == "value"
assert dotdict["settings.key3.key4"] == "value"

View File

@@ -10,21 +10,23 @@ def orchestration_file_path(tmp_path):
folder.mkdir(exist_ok=True)
return (folder / "example_orch.yaml").as_posix()
@pytest.fixture
def orchestration_file(orchestration_file_path):
def _orchestration_file(content=''):
def _orchestration_file(content=""):
with open(orchestration_file_path, "w") as f:
f.write(content)
return orchestration_file_path
return _orchestration_file
@pytest.fixture
def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]):
def cleanup():
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
@@ -47,6 +49,7 @@ def test_run_auto_archiver_no_args(caplog, autoarchiver):
assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text
def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
# exec 'auto-archiver' on the command lin
with pytest.raises(SystemExit):
@@ -54,6 +57,7 @@ def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
assert "Make sure the file exists and try again, or run without th" in caplog.text
def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# create a valid (empty) orchestration file
path = orchestration_file(content="")
@@ -64,6 +68,7 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# should treat an empty file as if there is no file at all
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
from auto_archiver.__main__ import main
@@ -75,4 +80,4 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
with pytest.raises(SystemExit):
main()
assert "No URLs provided. Please provide at least one" in caplog.text
assert "No URLs provided. Please provide at least one" in caplog.text

View File

@@ -62,18 +62,8 @@ def test_simple_merge(basic_metadata):
def test_left_merge():
left = (
Metadata()
.set("tags", ["a"])
.set("stats", {"views": 10})
.set("status", "success")
)
right = (
Metadata()
.set("tags", ["b"])
.set("stats", {"likes": 5})
.set("status", "no archiver")
)
left = Metadata().set("tags", ["a"]).set("stats", {"views": 10}).set("status", "success")
right = Metadata().set("tags", ["b"]).set("stats", {"likes": 5}).set("status", "no archiver")
left.merge(right, overwrite_left=True)
assert left.get("status") == "no archiver"
@@ -120,6 +110,7 @@ def test_is_empty():
def test_store():
pass
# Test Media operations
@@ -176,6 +167,7 @@ def test_choose_most_complete():
res = Metadata.choose_most_complete([m_more, m_less])
assert res.metadata.get("title") == "Title 1"
def test_choose_most_complete_from_pickles(unpickle):
# test most complete from pickles before and after an enricher has run
# Only compares length of media, not the actual media

View File

@@ -3,6 +3,7 @@ import pytest
from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule
@pytest.fixture
def example_module():
import auto_archiver
@@ -14,12 +15,14 @@ def example_module():
return module_factory.get_module_lazy("example_module")
def test_get_module_lazy(example_module):
assert example_module.name == "example_module"
assert example_module.display_name == "Example Module"
assert example_module.manifest is not None
def test_python_dependency_check(example_module):
# example_module requires loguru, which is not installed
# monkey patch the manifest to include a nonexistnet dependency
@@ -30,11 +33,13 @@ def test_python_dependency_check(example_module):
assert load_error.value.code == 1
def test_binary_dependency_check(example_module):
# example_module requires ffmpeg, which is not installed
# monkey patch the manifest to include a nonexistnet dependency
example_module.manifest["dependencies"]["binary"] = ["does_not_exist"]
def test_module_dependency_check_loads_module(example_module):
# example_module requires cli_feeder, which is not installed
# monkey patch the manifest to include a nonexistnet dependency
@@ -49,19 +54,20 @@ def test_module_dependency_check_loads_module(example_module):
assert module_factory._lazy_modules["hash_enricher"] is not None
assert module_factory._lazy_modules["hash_enricher"]._instance is not None
def test_load_module(example_module):
    """Load the example module with an empty config and verify its defaults."""
    # set up the module without overrides and check the config holds the default values
    module = example_module.load({})
    assert module is not None
    assert isinstance(module, BaseModule)
    assert module.name == "example_module"
    assert module.display_name == "Example Module"

    module_config = module.config["example_module"]
    assert module_config == {"csv_file": "db.csv"}

    # check that the value is also set as an attribute on the module itself
    assert module.csv_file == "db.csv"
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_load_modules(module_name):
# test that specific modules can be loaded
@@ -96,5 +102,3 @@ def test_lazy_base_module(module_name):
assert len(lazy_module.configs) > 0
assert len(lazy_module.description) > 0
assert len(lazy_module.version) > 0

View File

@@ -9,49 +9,63 @@ from auto_archiver.core import Metadata
# Orchestration config file that the tests in this module pass via ``--config``.
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
# Directory of test-only modules that the tests pass via ``--module_paths``.
TEST_MODULES = "tests/data/test_modules/"
@pytest.fixture
def test_args():
    """Return the baseline command-line arguments shared by the orchestrator tests."""
    cli_args = ["--config", TEST_ORCHESTRATION]
    cli_args += ["--module_paths", TEST_MODULES]
    # just set this for normal testing, we will remove it later
    cli_args += ["--example_module.required_field", "some_value"]
    return cli_args
@pytest.fixture
def orchestrator():
    """Provide a newly constructed ``ArchivingOrchestrator``."""
    instance = ArchivingOrchestrator()
    return instance
@pytest.fixture
def basic_parser(orchestrator) -> ArgumentParser:
    """Provide the parser returned by the orchestrator's ``setup_basic_parser``."""
    parser = orchestrator.setup_basic_parser()
    return parser
def test_setup_orchestrator(orchestrator):
    """Smoke test: the ``orchestrator`` fixture must yield an object, not ``None``."""
    fixture_value = orchestrator
    assert fixture_value is not None
def test_parse_config():
    """Placeholder for a future config-parsing test; it currently asserts nothing."""
    return None
def test_parse_basic(basic_parser):
    """``--config`` must be stored on the parsed namespace as ``config_file``."""
    cli = ["--config", TEST_ORCHESTRATION]
    namespace = basic_parser.parse_args(cli)
    assert namespace.config_file == TEST_ORCHESTRATION
@pytest.mark.parametrize("mode", ["simple", "full"])
def test_mode(basic_parser, mode):
    """Each supported ``--mode`` value must round-trip through the basic parser."""
    cli = ["--mode", mode]
    parsed = basic_parser.parse_args(cli)
    assert parsed.mode == mode
def test_mode_invalid(basic_parser, capsys):
    """An unknown ``--mode`` must exit with status 2 and report an invalid choice."""
    with pytest.raises(SystemExit) as raised:
        basic_parser.parse_args(["--mode", "invalid"])

    exit_status = raised.value.code
    assert exit_status == 2
    # the parser's complaint is expected on stderr
    captured = capsys.readouterr()
    assert "invalid choice" in captured.err
def test_version(basic_parser, capsys):
    """``--version`` must print the package version to stdout and exit with status 0."""
    with pytest.raises(SystemExit) as raised:
        basic_parser.parse_args(["--version"])

    exit_status = raised.value.code
    assert exit_status == 0
    captured = capsys.readouterr()
    assert captured.out == f"{__version__}\n"
def test_help(orchestrator, basic_parser, capsys):
def test_help(orchestrator, basic_parser, capsys):
args = basic_parser.parse_args(["--help"])
assert args.help == True
@@ -83,14 +97,17 @@ def test_help(orchestrator, basic_parser, capsys):
def test_add_custom_modules_path(orchestrator, test_args):
    """After ``setup_config`` the custom module directory must be on the modules path."""
    orchestrator.setup_config(test_args)

    import auto_archiver

    search_paths = auto_archiver.modules.__path__
    assert "tests/data/test_modules/" in search_paths
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
    """A missing ``--module_paths`` directory must be logged as skipped."""
    # test_args keeps the real module path, which is still needed to get the example_module
    missing_path_args = ["--module_paths", "tests/data/invalid_test_modules/"]
    orchestrator.setup_config(test_args + missing_path_args)

    first_record = caplog.records[0]
    assert first_record.message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@@ -104,11 +121,11 @@ def test_check_required_values(orchestrator, caplog, test_args):
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
# load the default example yaml, add a required field, then run the orchestrator
test_yaml = read_yaml(TEST_ORCHESTRATION)
test_yaml['example_module'] = {'required_field': 'some_value'}
test_yaml["example_module"] = {"required_field": "some_value"}
# write it to a temp file
tmp_file = (tmp_path / "temp_config.yaml").as_posix()
store_yaml(test_yaml, tmp_file)
@@ -117,27 +134,42 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert config is not None
def test_load_authentication_string(orchestrator, test_args):
    """A JSON string passed to ``--authentication`` must end up in the config as a dict."""
    auth_json = '{"facebook.com": {"username": "my_username", "password": "my_password"}}'
    config = orchestrator.setup_config(test_args + ["--authentication", auth_json])

    expected = {"facebook.com": {"username": "my_username", "password": "my_password"}}
    assert config["authentication"] == expected
def test_load_authentication_string_concat_site(orchestrator, test_args):
    """A comma-joined site key must yield one authentication entry per site."""
    auth_json = '{"x.com,twitter.com": {"api_key": "my_key"}}'
    config = orchestrator.setup_config(test_args + ["--authentication", auth_json])

    # both sites are expected to carry the same credentials
    expected = {"x.com": {"api_key": "my_key"}, "twitter.com": {"api_key": "my_key"}}
    assert config["authentication"] == expected
def test_load_invalid_authentication_string(orchestrator, test_args):
    """Malformed JSON passed to ``--authentication`` must raise ``ArgumentTypeError``."""
    malformed = "{''invalid_json"
    cli = test_args + ["--authentication", malformed]
    with pytest.raises(ArgumentTypeError):
        orchestrator.setup_config(cli)
def test_load_authentication_invalid_dict(orchestrator, test_args):
    """Valid JSON that is a list rather than an object must raise ``ArgumentTypeError``."""
    json_list = "[true, false]"
    cli = test_args + ["--authentication", json_list]
    with pytest.raises(ArgumentTypeError):
        orchestrator.setup_config(cli)
def test_load_modules_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
args = test_args + [
"--feeders",
"example_module",
"--extractors",
"example_module",
"--databases",
"example_module",
"--enrichers",
"example_module",
"--formatters",
"example_module",
]
orchestrator.setup(args)
@@ -153,27 +185,37 @@ def test_load_modules_from_commandline(orchestrator, test_args):
assert orchestrator.enrichers[0].name == "example_module"
assert orchestrator.formatters[0].name == "example_module"
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
    """Module settings given as ``--<module>.<option>`` flags must land in the config."""
    cli_overrides = [
        "--feeders",
        "gsheet_feeder_db",
        "--gsheet_feeder_db.sheet_id",
        "123",
        "--gsheet_feeder_db.service_account",
        "tests/data/test_service_account.json",
    ]
    orchestrator.setup(test_args + cli_overrides)

    # exactly the requested feeder is loaded
    assert len(orchestrator.feeders) == 1
    sole_feeder = orchestrator.feeders[0]
    assert sole_feeder.name == "gsheet_feeder_db"

    feeder_config = orchestrator.config["gsheet_feeder_db"]
    assert feeder_config["sheet_id"] == "123"
def test_multiple_orchestrator(test_args):
o1_args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"]
o1_args = test_args + [
"--feeders",
"gsheet_feeder_db",
"--gsheet_feeder_db.service_account",
"tests/data/test_service_account.json",
]
o1 = ArchivingOrchestrator()
with pytest.raises(ValueError) as exit_error:
# this should fail because the gsheet_feeder_db requires a sheet_id / sheet
o1.setup(o1_args)
o2_args = test_args + ["--feeders", "example_module"]
o2 = ArchivingOrchestrator()
o2.setup(o2_args)
@@ -182,4 +224,4 @@ def test_multiple_orchestrator(test_args):
output: Metadata = list(o2.feed())
assert len(output) == 1
assert output[0].get_url() == "https://example.com"
assert output[0].get_url() == "https://example.com"

View File

@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import (
update_nested_dict,
calculate_file_hash,
random_str,
get_timestamp
get_timestamp,
)
@@ -38,40 +38,46 @@ class TestDirectoryUtils:
mkdir_if_not_exists(existing_dir)
assert existing_dir.exists()
class TestURLExpansion:
    """Tests for ``expand_url`` with ``requests.get`` patched out."""

    @pytest.mark.parametrize(
        "input_url,expected",
        [
            ("https://example.com", "https://example.com"),
            ("https://t.co/test", "https://expanded.url"),
        ],
    )
    def test_expand_url(self, input_url, expected, mocker):
        """The example.com URL comes back as given; the t.co URL becomes the patched response URL."""
        # every patched request reports the same final URL
        fake_response = mocker.Mock()
        fake_response.url = "https://expanded.url"
        mocker.patch("requests.get", return_value=fake_response)

        assert expand_url(input_url) == expected

    def test_expand_url_handles_errors(self, caplog, mocker):
        """When the request raises, the input URL is returned and the failure is logged."""
        mocker.patch("requests.get", side_effect=Exception("Connection error"))
        url = "https://t.co/error"

        expanded = expand_url(url)

        assert expanded == url
        assert f"Failed to expand url {url}" in caplog.text
class TestAttributeHandling:
    """Tests for ``getattr_or``."""

    class Sample:
        # one attribute with a real value, one explicitly set to None
        exists = "value"
        none = None

    @pytest.mark.parametrize(
        "obj,attr,default,expected",
        [
            (Sample(), "exists", "default", "value"),
            (Sample(), "none", "default", "default"),
            (Sample(), "missing", "default", "default"),
            (None, "anything", "fallback", "fallback"),
        ],
    )
    def test_getattr_or(self, obj, attr, default, expected):
        """The attribute value is returned when set; otherwise the default is returned."""
        outcome = getattr_or(obj, attr, default)
        assert outcome == expected
class TestDateTimeHandling:
def test_datetime_encoder(self, sample_datetime):
result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder)
@@ -83,11 +89,14 @@ class TestDateTimeHandling:
result = dump_payload(payload)
assert str(sample_datetime) in result
@pytest.mark.parametrize("dt_str,fmt,expected", [
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
("invalid", None, None),
])
@pytest.mark.parametrize(
"dt_str,fmt,expected",
[
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
("invalid", None, None),
],
)
def test_datetime_from_string(self, dt_str, fmt, expected):
result = get_datetime_from_str(dt_str, fmt)
if expected is None:
@@ -95,16 +104,21 @@ class TestDateTimeHandling:
else:
assert result == expected.replace(tzinfo=result.tzinfo)
class TestDictUtils:
    """Tests for ``update_nested_dict``."""

    @pytest.mark.parametrize(
        "original,update,expected",
        [
            ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
            ({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
            ({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
        ],
    )
    def test_update_nested_dict(self, original, update, expected):
        """``original`` itself must hold the merged result after the call."""
        # the return value is ignored: the check is on the dict that was passed in
        update_nested_dict(original, update)
        merged = original
        assert merged == expected
class TestHashingUtils:
def test_file_hashing(self, sample_file):
expected = hashlib.sha256(b"test content").hexdigest()
@@ -118,6 +132,7 @@ class TestHashingUtils:
expected = hashlib.sha256(content).hexdigest()
assert calculate_file_hash(str(file_path)) == expected
class TestMiscUtils:
def test_random_str_length(self):
for length in [8, 16, 32]:
@@ -131,14 +146,17 @@ class TestMiscUtils:
def test_random_str_uniqueness(self):
assert random_str() != random_str()
@pytest.mark.parametrize("ts_input,utc,iso,expected_type", [
(datetime.now(), True, True, str),
("2023-01-01T12:00:00+00:00", False, False, datetime),
(1672574400, True, True, str),
])
@pytest.mark.parametrize(
"ts_input,utc,iso,expected_type",
[
(datetime.now(), True, True, str),
("2023-01-01T12:00:00+00:00", False, False, datetime),
(1672574400, True, True, str),
],
)
def test_timestamp_parsing(self, ts_input, utc, iso, expected_type):
result = get_timestamp(ts_input, utc=utc, iso=iso)
assert isinstance(result, expected_type)
def test_invalid_timestamp_returns_none(self):
assert get_timestamp("invalid-date") is None
assert get_timestamp("invalid-date") is None