Add WACZ, Wayback and local storage tests.

This commit is contained in:
erinhmclark
2025-02-19 13:14:08 +00:00
parent a8ffb19325
commit 47a634fc63
4 changed files with 334 additions and 2 deletions

View File

@@ -0,0 +1,112 @@
import os
from zipfile import ZipFile
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
    """Return the module built by ``setup_module("wacz_enricher", ...)``.

    The configuration disables media extraction, enables screenshot
    extraction, uses a 120 timeout and leaves every profile, docker and
    proxy option unset. ``mock_binary_dependencies`` is requested but not
    referenced in the body (presumably for its side effects -- confirm
    against its definition).
    """
    return setup_module(
        "wacz_enricher",
        {
            "profile": None,
            "docker_commands": None,
            "timeout": 120,
            "extract_media": False,
            "extract_screenshot": True,
            "socks_proxy_host": None,
            "socks_proxy_port": None,
            "proxy_server": None,
        },
    )
def test_setup_without_docker(wacz_enricher, mocker):
    """With only RUNNING_IN_DOCKER in the environment, docker_in_docker is falsy."""
    env_only_running_in_docker = {"RUNNING_IN_DOCKER": "1"}
    # clear=True removes every other variable for the duration of the test.
    mocker.patch.dict(os.environ, env_only_running_in_docker, clear=True)

    wacz_enricher.setup()

    assert not wacz_enricher.docker_in_docker
def test_setup_with_docker(wacz_enricher, mocker):
    """With only WACZ_ENABLE_DOCKER in the environment, use_docker is truthy."""
    env_only_enable_docker = {"WACZ_ENABLE_DOCKER": "1"}
    # clear=True removes every other variable for the duration of the test.
    mocker.patch.dict(os.environ, env_only_enable_docker, clear=True)

    wacz_enricher.setup()

    assert wacz_enricher.use_docker
def test_already_ran(wacz_enricher, metadata, mocker):
    """enrich() returns True and logs when a "browsertrix" media already exists."""
    existing_wacz = Media("test.wacz")
    metadata.add_media(existing_wacz, id="browsertrix")
    info_spy = mocker.patch("loguru.logger.info")

    outcome = wacz_enricher.enrich(metadata)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_message = info_spy.call_args[0][0]
    assert "WACZ enricher had already been executed" in logged_message
def test_basic_call_execution(wacz_enricher, mocker):
    """enrich() invokes subprocess.run with the target URL in the command."""
    completed = mocker.Mock(returncode=0)
    run_spy = mocker.patch("subprocess.run", return_value=completed)
    target = Metadata().set_url("https://example.com")

    wacz_enricher.enrich(target)

    assert run_spy.called
    # The command is the first positional argument of the last run() call;
    # joining it lets us look for the "--url <value>" pair as a substring.
    command = run_spy.call_args[0][0]
    assert "--url https://example.com" in " ".join(command)
def test_download_success(wacz_enricher, mocker) -> None:
    """download() yields a Metadata with a success status when enrich() is True."""
    # Replace enrich() so that only download()'s own handling is exercised.
    mocker.patch.object(wacz_enricher, "enrich", return_value=True)
    item = Metadata().set_url("https://example.com")

    downloaded = wacz_enricher.download(item)

    assert downloaded is not None
    assert isinstance(downloaded, Metadata)
    assert downloaded.status == "wacz: success"
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
    """enrich() short-circuits with True when "browsertrix" media is present."""
    info_spy = mocker.patch("loguru.logger.info")
    item = Metadata().set_url("https://example.com")
    item.add_media(Media(filename="some_file.wacz"), id="browsertrix")

    outcome = wacz_enricher.enrich(item)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_message = info_spy.call_args[0][0]
    assert "WACZ enricher had already been executed:" in logged_message
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
    """enrich() returns False when subprocess.run raises."""
    # Make every subprocess.run call blow up and pin the patched random_str.
    mocker.patch("subprocess.run", side_effect=Exception("fail"))
    mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")

    wacz_enricher.tmp_dir = str(tmp_path)
    wacz_enricher.extract_screenshot = True
    wacz_enricher.extract_media = False

    item = Metadata().set_url("https://example.com")
    outcome = wacz_enricher.enrich(item)

    assert outcome is False
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
    """extract_media_from_wacz adds a "browsertrix-screenshot" media entry."""
    wacz_enricher.tmp_dir = str(tmp_path)

    # A genuine zip archive must exist on disk so ZipFile can open it; this
    # has to happen before builtins.open is replaced below.
    archive_path = tmp_path / "dummy.wacz"
    with ZipFile(archive_path, "w") as archive:
        archive.writestr("dummy.txt", "test content")

    mocker.patch("os.listdir", return_value=[])

    # Hand-written WARC record served by the mocked open().
    # NOTE(review): the header declares Content-Length 12 while the payload
    # literal b"image-bytes" is 11 bytes -- confirm this is intentional.
    warc_record = b"".join(
        (
            b"WARC/1.0\r\n",
            b"WARC-Type: resource\r\n",
            b"Content-Type: image/png\r\n",
            b"WARC-Target-URI: http://example.com/image.png\r\n",
            b"Content-Length: 12\r\n",
            b"\r\n",
            b"image-bytes",
            b"\r\n\r\nWARC/1.0\r\n\r\n",
        )
    )
    mocker.patch("builtins.open", mocker.mock_open(read_data=warc_record))

    metadata.add_media(Media("something.wacz"), "browsertrix")
    wacz_enricher.extract_media_from_wacz(metadata, str(archive_path))

    # The pre-existing wacz media plus the extracted screenshot.
    assert len(metadata.media) == 2
    screenshot = metadata.media[1]
    assert screenshot.properties.get("id") == "browsertrix-screenshot"

View File

@@ -0,0 +1,168 @@
import json
import requests
import pytest
from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher
from auto_archiver.core import Metadata
@pytest.fixture
def mock_is_auth_wall(mocker):
    """Provide a callable that patches ``is_auth_wall`` with a fixed result.

    The returned callable takes the boolean that the patched
    ``auto_archiver.utils.url.is_auth_wall`` should return and hands back
    the mock created by ``mocker.patch``.
    """

    def _patch_auth_wall(return_value: bool):
        target = "auto_archiver.utils.url.is_auth_wall"
        return mocker.patch(target, return_value=return_value)

    return _patch_auth_wall
@pytest.fixture
def mock_post_success(mocker):
    """Provide a callable that patches ``requests.post`` with a canned response.

    The returned callable accepts:

    * ``json_data`` -- the value ``response.json()`` returns. When omitted
      (``None``) it defaults to ``{"job_id": "job123"}``.
    * ``status_code`` -- the response's ``status_code`` (default ``200``).

    It returns the mock installed in place of ``requests.post``.
    """

    def _mock_post(json_data: dict = None, status_code: int = 200):
        # Compare against None explicitly: an empty dict is falsy, so
        # ``json_data or default`` would silently swap a deliberately empty
        # payload for the default one containing a job_id.
        if json_data is None:
            json_data = {"job_id": "job123"}
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.post", return_value=resp)

    return _mock_post
@pytest.fixture
def mock_get_success(mocker):
    """Provide a callable that patches ``requests.get`` with a canned response.

    The returned callable accepts:

    * ``json_data`` -- the value ``response.json()`` returns. When omitted
      (``None``) it defaults to a "success" status payload with a fixed
      timestamp and ``original_url``.
    * ``status_code`` -- the response's ``status_code`` (default ``200``).

    It returns the mock installed in place of ``requests.get``.
    """

    def _mock_get(json_data: dict = None, status_code: int = 200):
        # Compare against None explicitly so that a caller passing an empty
        # dict really gets an empty JSON body instead of the default payload.
        if json_data is None:
            json_data = {
                "status": "success",
                "timestamp": "20250101010101",
                "original_url": "https://example.com",
            }
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.get", return_value=resp)

    return _mock_get
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
    """Return the module built by ``setup_module("wayback_extractor_enricher", ...)``.

    The configuration supplies placeholder key/secret values and a timeout
    of 5, and leaves ``if_not_archived_within`` and both proxies unset.
    """
    return setup_module(
        "wayback_extractor_enricher",
        {
            "timeout": 5,
            "if_not_archived_within": None,
            "key": "somekey",
            "secret": "secret",
            "proxy_http": None,
            "proxy_https": None,
        },
    )
def test_download_success(
    wayback_extractor_enricher,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """download() returns metadata carrying the archived snapshot URL."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success()

    # A bare Metadata with only a URL is enough for the result to be merged.
    item = Metadata().set_url("https://example.com")
    downloaded = wayback_extractor_enricher.download(item)

    expected_snapshot = "https://web.archive.org/web/20250101010101/https://example.com"
    assert downloaded.get("wayback") == expected_snapshot
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    """enrich() returns None when is_auth_wall is patched to report True."""
    mock_is_auth_wall(True)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is None
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    """enrich() returns True when the metadata already has a "wayback" value."""
    metadata.set("wayback", "existing")

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is True
def test_enrich_post_failure(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
):
    """A 500 POST response makes enrich() return False and record the failure."""
    mock_is_auth_wall(False)
    mock_post_success(json_data={"error": "server error"}, status_code=500)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
    recorded = metadata.get("wayback")
    assert "Internet archive failed with status of 500" in recorded
def test_enrich_post_json_decode_error(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mocker,
):
    """enrich() returns False when the POST response body is not valid JSON."""
    mock_is_auth_wall(False)

    # 200 response whose .json() raises and whose .text is the raw body.
    broken_response = mocker.Mock(status_code=200, text="invalid json")
    broken_response.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
    mocker.patch("requests.post", return_value=broken_response)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
def test_enrich_no_job_id(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
):
    """enrich() is expected to return False when the POST mock is given ``{}``."""
    mock_is_auth_wall(False)
    mock_post_success(json_data={})

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
def test_enrich_get_success(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """A successful POST + GET stores both the snapshot and the lookup URLs."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success()

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is True
    snapshot_url = "https://web.archive.org/web/20250101010101/https://example.com"
    lookup_url = "https://web.archive.org/web/*/https://example.com"
    assert metadata.get("wayback") == snapshot_url
    assert metadata.get("check wayback") == lookup_url
def test_enrich_get_failure(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """enrich() returns False when the status GET answers 400 with "failed"."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success(json_data={"status": "failed"}, status_code=400)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
def test_enrich_get_request_exception(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mocker,
):
    """enrich() still records the job_id when every status GET raises."""
    mock_is_auth_wall(False)
    mock_post_success()

    # Replace sleep with a no-op and make each GET raise.
    mocker.patch("time.sleep", return_value=None)
    get_error = requests.exceptions.RequestException("error")
    mocker.patch("requests.get", side_effect=get_error)

    outcome = wayback_extractor_enricher.enrich(metadata)

    # The job_id information from the POST step must survive the GET errors.
    assert outcome is True
    recorded = metadata.get("wayback")
    assert recorded.get("job_id") == "job123"
def test_enrich_get_json_decode_error(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mocker,
):
    """enrich() still records the job_id when the status GET body is not JSON."""
    mock_is_auth_wall(False)
    mock_post_success()

    # GET response whose .json() raises and whose .text is the raw body.
    broken_response = mocker.Mock(text="invalid json")
    broken_response.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
    mocker.patch("requests.get", return_value=broken_response)
    mocker.patch("time.sleep", return_value=None)

    outcome = wayback_extractor_enricher.enrich(metadata)

    # The job_id information from the POST step must survive the decode error.
    assert outcome is True
    recorded = metadata.get("wayback")
    assert recorded.get("job_id") == "job123"