Merge branch 'main' into timestamping_rewrite

This commit is contained in:
Patrick Robertson
2025-02-24 12:03:14 +00:00
125 changed files with 4277 additions and 2162 deletions

View File

@@ -2,7 +2,7 @@ import pytest
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module_lazy
from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
@@ -22,7 +22,7 @@ def test_default_config_values(setup_module):
def test_config():
# test default config
c = get_module_lazy('hash_enricher').configs
c = ModuleFactory().get_module_lazy('hash_enricher').configs
assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]

View File

@@ -1,6 +1,5 @@
import datetime
from datetime import datetime, timedelta, timezone
from unittest.mock import MagicMock, patch
import pytest
@@ -9,29 +8,21 @@ from auto_archiver.modules.meta_enricher import MetaEnricher
@pytest.fixture
def mock_metadata():
def mock_metadata(mocker):
"""Creates a mock Metadata object."""
mock: Metadata = MagicMock(spec=Metadata)
mock: Metadata = mocker.MagicMock(spec=Metadata)
mock.get_url.return_value = "https://example.com"
mock.is_empty.return_value = False # Default to not empty
mock.get_all_media.return_value = []
return mock
@pytest.fixture
def mock_media():
def mock_media(mocker):
"""Creates a mock Media object."""
mock: Media = MagicMock(spec=Media)
mock: Media = mocker.MagicMock(spec=Media)
mock.filename = "mock_file.txt"
return mock
@pytest.fixture
def metadata():
m = Metadata()
m.set_url("https://example.com")
m.set_title("Test Title")
m.set_content("Test Content")
return m
@pytest.fixture(autouse=True)
def meta_enricher(setup_module):
@@ -90,14 +81,14 @@ def test_enrich_file_sizes_no_media(meta_enricher, metadata):
assert metadata.get("total_size") == "0.0 bytes"
def test_enrich_archive_duration(meta_enricher, metadata):
def test_enrich_archive_duration(meta_enricher, metadata, mocker):
# Set fixed "processed at" time in the past
processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30)
metadata.set("_processed_at", processed_at)
# patch datetime
with patch("datetime.datetime") as mock_datetime:
mock_now = datetime.now(timezone.utc)
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
mock_datetime = mocker.patch("datetime.datetime")
mock_now = datetime.now(timezone.utc)
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
assert metadata.get("archive_duration_seconds") == 630

View File

@@ -0,0 +1,88 @@
import pytest
from auto_archiver.core import Media
@pytest.fixture
def mock_media(mocker):
"""Creates a mock Media object."""
mock: Media = mocker.MagicMock(spec=Media)
mock.filename = "mock_file.txt"
return mock
@pytest.fixture
def enricher(setup_module, mock_binary_dependencies):
return setup_module("metadata_enricher", {})
@pytest.mark.parametrize(
"output,expected",
[
("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}),
("InvalidLine", {}),
("", {}),
],
)
def test_get_metadata(enricher, output, expected, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.return_value.stdout = output
mock_run.return_value.stderr = ""
mock_run.return_value.returncode = 0
result = enricher.get_metadata("test.jpg")
assert result == expected
mock_run.assert_called_once_with(
["exiftool", "test.jpg"], capture_output=True, text=True
)
def test_get_metadata_exiftool_not_found(enricher, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.side_effect = FileNotFoundError
result = enricher.get_metadata("test.jpg")
assert result == {}
def test_enrich_sets_metadata(enricher, mocker):
media1 = mocker.Mock(filename="img1.jpg")
media2 = mocker.Mock(filename="img2.jpg")
metadata = mocker.Mock()
metadata.media = [media1, media2]
enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {}
enricher.enrich(metadata)
media1.set.assert_called_once_with("metadata", {"key": "value"})
media2.set.assert_not_called()
assert metadata.media == [media1, media2]
def test_enrich_empty_media(enricher, mocker):
metadata = mocker.Mock()
metadata.media = []
# Should not raise errors
enricher.enrich(metadata)
def test_get_metadata_error_handling(enricher, mocker):
mocker.patch("subprocess.run", side_effect=Exception("Test error"))
mock_log = mocker.patch("loguru.logger.error")
result = enricher.get_metadata("test.jpg")
assert result == {}
assert "Error occurred: " in mock_log.call_args[0][0]
def test_metadata_pickle(enricher, unpickle, mocker):
mock_run = mocker.patch("subprocess.run")
# Uses pickled values
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
metadata = unpickle("metadata_enricher_ytshort_input.pickle")
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
enricher.enrich(metadata)
expected_media = expected.media
actual_media = metadata.media
assert len(expected_media) == len(actual_media)
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")

View File

@@ -0,0 +1,78 @@
import pytest
from PIL import UnidentifiedImageError
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher
@pytest.fixture
def enricher(setup_module):
return setup_module("pdq_hash_enricher", {})
@pytest.fixture
def metadata_with_images():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="image1.jpg", key="image1"))
m.add_media(Media(filename="image2.jpg", key="image2"))
return m
def test_successful_enrich(metadata_with_images, mocker):
with (
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
mocker.patch("PIL.Image.open"),
mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image,
):
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
# Ensure the hash is set for image media
for media in metadata_with_images.media:
assert media.get("pdq_hash") is not None
def test_enrich_skip_non_image(metadata_with_images, mocker):
mocker.patch.object(Media, "is_image", return_value=False)
mock_pdq = mocker.patch("pdqhash.compute")
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
mock_pdq.assert_not_called()
def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image"))
mock_pdq = mocker.patch("pdqhash.compute")
mock_logger = mocker.patch("loguru.logger.error")
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
assert mock_logger.call_count == len(metadata_with_images.media)
mock_pdq.assert_not_called()
@pytest.mark.parametrize(
"media_id, should_have_hash",
[
("screenshot", False),
("warc-file-123", False),
("regular-image", True),
]
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
metadata = Metadata()
metadata.set_url("https://example.com")
metadata.add_media(Media(filename="image.jpg").set("id", media_id))
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100))
mocker.patch("PIL.Image.open")
mocker.patch.object(Media, "is_image", return_value=True)
enricher = PdqHashEnricher()
enricher.enrich(metadata)
media_item = metadata.media[0]
assert (media_item.get("pdq_hash") is not None) == should_have_hash

View File

@@ -0,0 +1,195 @@
import base64
import pytest
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
@pytest.fixture
def mock_selenium_env(mocker):
"""Patches Selenium calls and driver checks in one place."""
# Patch external dependencies
mock_which = mocker.patch("shutil.which")
mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths")
mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True)
mock_popen = mocker.patch("subprocess.Popen")
mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
# Define side effect for `shutil.which`
def mock_which_side_effect(dep):
return "/mock/geckodriver" if dep == "geckodriver" else None
mock_which.side_effect = mock_which_side_effect
# Mock binary paths
mock_binary_paths.return_value = {
"driver_path": "/mock/driver",
"browser_path": "/mock/browser",
}
# Mock `subprocess.Popen`
mock_proc = mocker.MagicMock()
mock_proc.poll.return_value = None
mock_popen.return_value = mock_proc
# Mock `CookieSettingDriver`
mock_driver = mocker.MagicMock()
mock_driver_class.return_value = mock_driver
# Mock `FirefoxOptions`
mock_options_instance = mocker.MagicMock()
mock_firefox_options.return_value = mock_options_instance
yield mock_driver, mock_driver_class, mock_options_instance
@pytest.fixture
def common_patches(tmp_path, mocker):
"""Patches common utilities used across multiple tests."""
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=False)
mocker.patch("os.path.join", return_value=str(tmp_path / "test.png"))
mocker.patch("time.sleep")
yield
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
configs: dict = {
"width": 1280,
"height": 720,
"timeout": 60,
"sleep_before_screenshot": 4,
"http_proxy": "",
"save_to_pdf": "False",
"print_options": {},
}
return setup_module("screenshot_enricher", configs)
@pytest.fixture
def metadata_with_video():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="video.mp4").set("id", "video1"))
return m
def test_enrich_adds_screenshot(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
tmp_path,
):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
screenshot_enricher.enrich(metadata_with_video)
mock_driver_class.assert_called_once_with(
cookies=None,
cookiejar=None,
facebook_accept_cookies=False,
options=mock_options_instance,
)
# Verify the actual calls on the returned mock_driver
mock_driver.get.assert_called_once_with("https://example.com")
mock_driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png"))
# Check that the media was added (2 = original video + screenshot)
assert len(metadata_with_video.media) == 2
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
@pytest.mark.parametrize(
"url,is_auth",
[
("https://example.com", False),
("https://private.com", True),
],
)
def test_enrich_auth_wall(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
url,
is_auth,
mocker
):
# Testing with and without is_auth_wall
mock_driver, mock_driver_class, _ = mock_selenium_env
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth)
metadata_with_video.set_url(url)
screenshot_enricher.enrich(metadata_with_video)
if is_auth:
mock_driver.get.assert_not_called()
assert len(metadata_with_video.media) == 1
assert metadata_with_video.media[0].properties.get("id") == "video1"
else:
mock_driver.get.assert_called_once_with(url)
assert len(metadata_with_video.media) == 2
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
def test_handle_timeout_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
mock_driver.get.side_effect = TimeoutException
mock_log = mocker.patch("loguru.logger.info")
screenshot_enricher.enrich(metadata_with_video)
mock_log.assert_called_once_with("TimeoutException loading page for screenshot")
assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
"""Test proper handling of unexpected general exceptions"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Simulate a generic exception when save_screenshot is called
mock_driver.get.return_value = None
mock_driver.save_screenshot.side_effect = Exception("Unexpected Error")
mock_log = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify that the exception was logged with the log
mock_log.assert_called_once_with(
"Got error while loading webdriver for screenshot enricher: Unexpected Error"
)
# And no new media was added due to the error
assert len(metadata_with_video.media) == 1
def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env):
"""Test PDF creation when save_to_pdf is enabled"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Override the save_to_pdf option
screenshot_enricher.save_to_pdf = True
# Mock the print_page method to return base64-encoded content
mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8")
# Patch functions with mocker
mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
mock_random_str = mocker.patch(
"auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
return_value="fixed123",
)
mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open)
mock_log_error = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify screenshot and PDF creation
mock_driver.save_screenshot.assert_called_once()
mock_driver.print_page.assert_called_once_with(mock_driver.print_options)
# Check that PDF file was opened and written
mock_open.assert_any_call("pdf_fixed123.pdf", "wb")
# Ensure both screenshot and PDF were added as media
assert len(metadata_with_video.media) == 3
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
assert metadata_with_video.media[2].properties.get("id") == "pdf"
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
yield
for file in tmp_path.iterdir():
file.unlink()

View File

@@ -0,0 +1,54 @@
import ssl
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def enricher(setup_module):
configs: dict = {
"skip_when_nothing_archived": "True",
}
return setup_module("ssl_enricher", configs)
@pytest.fixture
def metadata():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media("tests/data/testfile_1.txt"))
m.add_media(Media("tests/data/testfile_2.txt"))
return m
def test_http_raises(metadata, enricher):
metadata.set_url("http://example.com")
with pytest.raises(AssertionError) as exc_info:
enricher.enrich(metadata)
assert "Invalid URL scheme" in str(exc_info.value)
def test_empty_metadata(metadata, enricher):
metadata.media = []
assert enricher.enrich(metadata) is None
def test_ssl_enrich(metadata, enricher, mocker):
mocker.patch("ssl.get_server_certificate", return_value="TEST_CERT")
mock_file = mocker.patch("builtins.open", mocker.mock_open())
media_len_before = len(metadata.media)
enricher.enrich(metadata)
ssl.get_server_certificate.assert_called_once_with(("example.com", 443))
mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w")
mock_file().write.assert_called_once_with("TEST_CERT")
assert len(metadata.media) == media_len_before + 1
# Ensure the certificate is added to metadata
assert any(media.filename.endswith("example-com.pem") for media in metadata.media)
def test_ssl_error_handling(enricher, metadata, mocker):
mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
with pytest.raises(ssl.SSLError, match="SSL error"):
enricher.enrich(metadata)

View File

@@ -0,0 +1,148 @@
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.thumbnail_enricher import ThumbnailEnricher
@pytest.fixture
def thumbnail_enricher(setup_module, mock_binary_dependencies) -> ThumbnailEnricher:
config: dict = {
"thumbnails_per_minute": 60,
"max_thumbnails": 4,
}
return setup_module("thumbnail_enricher", config)
@pytest.fixture
def metadata_with_video():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="video.mp4").set("id", "video1"))
return m
@pytest.fixture
def mock_ffmpeg_environment(mocker):
# Mocking all the ffmpeg calls in one place
mock_ffmpeg_input = mocker.patch("ffmpeg.input")
mock_makedirs = mocker.patch("os.makedirs")
mocker.patch.object(Media, "is_video", return_value=True),
mock_probe = mocker.patch(
"ffmpeg.probe",
return_value={
"streams": [
{"codec_type": "video", "duration": "120"}
] # Default 2-minute duration, but can override in tests
},
)
mock_output = mocker.MagicMock()
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
mock_output
)
return {
"mock_ffmpeg_input": mock_ffmpeg_input,
"mock_makedirs": mock_makedirs,
"mock_output": mock_output,
"mock_probe": mock_probe,
}
@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
])
def test_enrich_thumbnail_limits(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
thumbnails_per_minute, max_thumbnails, expected_count
):
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
thumbnail_enricher.enrich(metadata_with_video)
assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
mocker.patch("os.makedirs")
mock_logger = mocker.patch("loguru.logger.error")
mocker.patch.object(Media, "is_video", return_value=True)
thumbnail_enricher.enrich(metadata_with_video)
# Ensure error was logged
mock_logger.assert_called_with(
f"error getting duration of video video.mp4: Probe error"
)
# Ensure no thumbnails were created
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert thumbnails is None
def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch.object(Media, "is_video", return_value=False)
mock_ffmpeg = mocker.patch("ffmpeg.input")
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg.assert_not_called()
@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
])
def test_enrich_handles_short_video(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
):
# override mock duration
fake_duration = 10
mocker.patch(
"ffmpeg.probe",
return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
)
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
thumbnail_enricher.enrich(metadata_with_video)
assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_uses_existing_duration(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
):
metadata_with_video.media[0].set("duration", 60)
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg_environment["mock_probe"].assert_not_called()
assert mock_ffmpeg_environment["mock_output"].run.call_count == 4
def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
fake_duration = 120
mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
thumbnail_enricher.thumbnails_per_minute = 2
thumbnail_enricher.max_thumbnails = 4
thumbnail_enricher.enrich(metadata_with_video)
media_item = metadata_with_video.media[0]
thumbnails = media_item.get("thumbnails")
# Assert normal metadata
assert media_item.get("id") == "video1"
assert media_item.get("duration") == fake_duration
# Evenly spaced timestamps
expected_timestamps = ["24.000s", "48.000s", "72.000s", "96.000s"]
assert thumbnails is not None
assert len(thumbnails) == 4
for index, thumbnail in enumerate(thumbnails):
assert thumbnail.filename is not None
assert thumbnail.properties.get("id") == f"thumbnail_{index}"
assert thumbnail.properties.get("timestamp") == expected_timestamps[index]

View File

@@ -0,0 +1,112 @@
import os
from zipfile import ZipFile
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
configs: dict = {
"profile": None,
"docker_commands": None,
"timeout": 120,
"extract_media": False,
"extract_screenshot": True,
"socks_proxy_host": None,
"socks_proxy_port": None,
"proxy_server": None,
}
wacz = setup_module("wacz_enricher", configs)
return wacz
def test_setup_without_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
wacz_enricher.setup()
assert not wacz_enricher.docker_in_docker
def test_setup_with_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"WACZ_ENABLE_DOCKER": "1"}, clear=True)
wacz_enricher.setup()
assert wacz_enricher.use_docker
def test_already_ran(wacz_enricher, metadata, mocker):
metadata.add_media(Media("test.wacz"), id="browsertrix")
mock_log = mocker.patch("loguru.logger.info")
assert wacz_enricher.enrich(metadata) is True
assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
def test_basic_call_execution(wacz_enricher, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.return_value = mocker.Mock(returncode=0)
metadata = Metadata().set_url("https://example.com")
wacz_enricher.enrich(metadata)
assert mock_run.called
# Checks that the url is passed to the cmd
assert "--url https://example.com" in " ".join(mock_run.call_args[0][0])
def test_download_success(wacz_enricher, mocker) -> None:
"""Test download returns metadata on successful enrichment."""
basic_metadata = Metadata().set_url("https://example.com")
mocker.patch.object(wacz_enricher, "enrich", return_value=True)
result = wacz_enricher.download(basic_metadata)
assert result is not None
assert isinstance(result, Metadata)
assert result.status == "wacz: success"
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
"""Test enrich if already executed."""
mock_log = mocker.patch("loguru.logger.info")
metadata = Metadata().set_url("https://example.com")
media = Media(filename="some_file.wacz")
metadata.add_media(media, id="browsertrix")
result = wacz_enricher.enrich(metadata)
assert result is True
assert "WACZ enricher had already been executed:" in mock_log.call_args[0][0]
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
"""Test enrich returns False when subprocess fails."""
wacz_enricher.tmp_dir = str(tmp_path)
wacz_enricher.extract_media = False
wacz_enricher.extract_screenshot = True
mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
mocker.patch("subprocess.run", side_effect=Exception("fail"))
basic_metadata = Metadata().set_url("https://example.com")
result = wacz_enricher.enrich(basic_metadata)
assert result is False
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
"""Test extract_media_from_wacz extracts screenshot media."""
wacz_enricher.tmp_dir = str(tmp_path)
# Create a *real* zip file so ZipFile won't fail.
wacz_file = tmp_path / "dummy.wacz"
with ZipFile(wacz_file, "w") as zf:
zf.writestr("dummy.txt", "test content")
mocker.patch("os.listdir", return_value=[])
warc_data = (
b"WARC/1.0\r\n"
b"WARC-Type: resource\r\n"
b"Content-Type: image/png\r\n"
b"WARC-Target-URI: http://example.com/image.png\r\n"
b"Content-Length: 12\r\n"
b"\r\n"
b"image-bytes"
b"\r\n\r\nWARC/1.0\r\n\r\n"
)
mock_file = mocker.mock_open(read_data=warc_data)
mocker.patch("builtins.open", mock_file)
metadata.add_media(Media("something.wacz"), "browsertrix")
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
assert len(metadata.media) == 2
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"

View File

@@ -0,0 +1,168 @@
import json
import requests
import pytest
from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher
from auto_archiver.core import Metadata
@pytest.fixture
def mock_is_auth_wall(mocker):
"""Fixture to mock is_auth_wall behavior."""
def _mock_is_auth_wall(return_value: bool):
return mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value)
return _mock_is_auth_wall
@pytest.fixture
def mock_post_success(mocker):
"""Fixture to mock POST requests with a successful response."""
def _mock_post(json_data: dict = None, status_code: int = 200):
json_data = json_data or {"job_id": "job123"}
resp = mocker.Mock(status_code=status_code)
resp.json.return_value = json_data
return mocker.patch("requests.post", return_value=resp)
return _mock_post
@pytest.fixture
def mock_get_success(mocker):
"""Fixture to mock GET requests returning a completed archive status."""
def _mock_get(json_data: dict = None, status_code: int = 200):
json_data = json_data or {
"status": "success",
"timestamp": "20250101010101",
"original_url": "https://example.com"
}
resp = mocker.Mock(status_code=status_code)
resp.json.return_value = json_data
return mocker.patch("requests.get", return_value=resp)
return _mock_get
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
configs: dict = {
"timeout": 5,
"if_not_archived_within": None,
"key": "somekey",
"secret": "secret",
"proxy_http": None,
"proxy_https": None,
}
return setup_module("wayback_extractor_enricher", configs)
def test_download_success(
wayback_extractor_enricher,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
# Basic metadata to allow merge
metadata = Metadata().set_url("https://example.com")
result = wayback_extractor_enricher.download(metadata)
assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
mock_is_auth_wall(True)
result = wayback_extractor_enricher.enrich(metadata)
assert result is None
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
metadata.set("wayback", "existing")
result = wayback_extractor_enricher.enrich(metadata)
assert result is True
def test_enrich_post_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
mock_is_auth_wall(False)
mock_post_success(json_data={"error": "server error"}, status_code=500)
result = wayback_extractor_enricher.enrich(metadata)
assert result is False
assert "Internet archive failed with status of 500" in metadata.get("wayback")
def test_enrich_post_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mocker
):
mock_is_auth_wall(False)
resp = mocker.Mock(status_code=200)
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
resp.text = "invalid json"
mocker.patch("requests.post", return_value=resp)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_no_job_id(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
mock_is_auth_wall(False)
mock_post_success(json_data={})
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_success(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
def test_enrich_get_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success(json_data={"status": "failed"}, status_code=400)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_request_exception(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
):
mock_is_auth_wall(False)
mock_post_success()
mocker.patch("requests.get", side_effect=requests.exceptions.RequestException("error"))
mocker.patch("time.sleep", return_value=None)
# check it still enriches the job_id information
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"
def test_enrich_get_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
):
mock_is_auth_wall(False)
mock_post_success()
resp = mocker.Mock()
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
resp.text = "invalid json"
mocker.patch("requests.get", return_value=resp)
mocker.patch("time.sleep", return_value=None)
# check it still enriches the job_id information
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"

View File

@@ -0,0 +1,133 @@
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.modules.whisper_enricher import WhisperEnricher
TEST_S3_URL = "http://cdn.example.com/test.mp4"
@pytest.fixture
def enricher(mocker):
"""Fixture with mocked S3 and API dependencies"""
config = {
"api_endpoint": "http://testapi",
"api_key": "whisper-key",
"include_srt": False,
"timeout": 5,
"action": "translate",
"steps": {"storages": ["s3_storage"]}
}
mock_s3 = mocker.MagicMock(spec=S3Storage)
mock_s3.get_cdn_url.return_value = TEST_S3_URL
instance = WhisperEnricher()
instance.name = "whisper_enricher"
instance.display_name = "Whisper Enricher"
instance.config_setup({instance.name: config})
# bypassing the setup method and mocking S3 setup
instance.stores = config['steps']['storages']
instance.s3 = mock_s3
yield instance, mock_s3
@pytest.fixture
def metadata():
metadata = Metadata()
metadata.set_url("http://test.url")
metadata.set_title("test title")
return metadata
@pytest.fixture
def mock_requests(mocker):
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 201
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
yield mock_requests
def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
"""Test successful media processing with S3 configured"""
whisper, mock_s3 = enricher
# Configure mock S3 URL to match test expectation
mock_s3.get_cdn_url.return_value = TEST_S3_URL
# Create test media with matching CDN URL
m = Media("test.mp4")
m.mimetype = "video/mp4"
m.add_url(mock_s3.get_cdn_url.return_value)
metadata.media = [m]
# Mock the complete API interaction chain
mock_status_response = mocker.MagicMock()
mock_status_response.status_code = 200
mock_status_response.json.return_value = {
"status": "success",
"meta": {}
}
mock_artifacts_response = mocker.MagicMock()
mock_artifacts_response.status_code = 200
mock_artifacts_response.json.return_value = [{
"data": [{"start": 0, "end": 5, "text": "test transcript"}]
}]
# Set up mock response sequence
mock_requests.get.side_effect = [
mock_status_response, # First call: status check
mock_artifacts_response # Second call: artifacts check
]
# Run enrichment (without opening file)
whisper.enrich(metadata)
# Check API interactions
mock_requests.post.assert_called_once_with(
"http://testapi/jobs",
json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
headers={"Authorization": "Bearer whisper-key"}
)
# Verify job status checks
assert mock_requests.get.call_count == 2
assert "artifact_0_text" in metadata.media[0].get("whisper_model")
assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
'job_id': 'job123',
'job_status_check': 'http://testapi/jobs/job123'}
def test_submit_job(enricher, mocker):
"""Test job submission method"""
whisper, _ = enricher
m = Media("test.mp4")
m.add_url(TEST_S3_URL)
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 201
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
job_id = whisper.submit_job(m)
assert job_id == "job123"
def test_submit_raises_status(enricher, mocker):
whisper, _ = enricher
m = Media("test.mp4")
m.add_url(TEST_S3_URL)
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 400
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
with pytest.raises(AssertionError) as exc_info:
whisper.submit_job(m)
assert str(exc_info.value) == "calling the whisper api http://testapi returned a non-success code: 400"
# @pytest.mark.parametrize("test_url, status", ["http://cdn.example.com/test.mp4",])
def test_submit_job_fails(enricher):
"""Test assertion fails with non-S3 URL"""
whisper, mock_s3 = enricher
m = Media("test.mp4")
m.add_url("http://cdn.wrongurl.com/test.mp4")
with pytest.raises(AssertionError):
whisper.submit_job(m)