mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Add WACZ, Wayback and local storage tests.
This commit is contained in:
112
tests/enrichers/test_wacz_enricher.py
Normal file
112
tests/enrichers/test_wacz_enricher.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import os
|
||||
from zipfile import ZipFile
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
    """Build the ``wacz_enricher`` module through ``setup_module``.

    ``mock_binary_dependencies`` is requested as a fixture but its value is
    not referenced in this body.
    """
    return setup_module(
        "wacz_enricher",
        {
            "profile": None,
            "docker_commands": None,
            "timeout": 120,
            "extract_media": False,
            "extract_screenshot": True,
            "socks_proxy_host": None,
            "socks_proxy_port": None,
            "proxy_server": None,
        },
    )
|
||||
|
||||
|
||||
def test_setup_without_docker(wacz_enricher, mocker):
    """With only RUNNING_IN_DOCKER in the environment, docker_in_docker is falsy after setup()."""
    # clear=True drops every other environment variable for the duration of the test.
    environment = {"RUNNING_IN_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    assert not wacz_enricher.docker_in_docker
|
||||
|
||||
|
||||
def test_setup_with_docker(wacz_enricher, mocker):
    """With only WACZ_ENABLE_DOCKER in the environment, use_docker is truthy after setup()."""
    # clear=True drops every other environment variable for the duration of the test.
    environment = {"WACZ_ENABLE_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    assert wacz_enricher.use_docker
|
||||
|
||||
|
||||
def test_already_ran(wacz_enricher, metadata, mocker):
    """enrich() returns True and logs a notice when a "browsertrix" media is already attached."""
    previous_capture = Media("test.wacz")
    metadata.add_media(previous_capture, id="browsertrix")
    info_spy = mocker.patch("loguru.logger.info")

    outcome = wacz_enricher.enrich(metadata)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_message = info_spy.call_args[0][0]
    assert "WACZ enricher had already been executed" in logged_message
|
||||
|
||||
|
||||
def test_basic_call_execution(wacz_enricher, mocker):
    """enrich() calls subprocess.run with a command that contains the target URL."""
    finished_process = mocker.Mock(returncode=0)
    run_spy = mocker.patch("subprocess.run", return_value=finished_process)
    target = Metadata().set_url("https://example.com")

    wacz_enricher.enrich(target)

    assert run_spy.called
    # The command is the first positional argument handed to subprocess.run.
    command = run_spy.call_args[0][0]
    assert "--url https://example.com" in " ".join(command)
|
||||
|
||||
|
||||
def test_download_success(wacz_enricher, mocker) -> None:
    """download() returns a Metadata with status "wacz: success" when enrich() reports True."""
    item = Metadata().set_url("https://example.com")
    # Replace enrich() on this instance so only download()'s own logic runs.
    mocker.patch.object(wacz_enricher, "enrich", return_value=True)

    outcome = wacz_enricher.download(item)

    assert outcome is not None
    assert isinstance(outcome, Metadata)
    assert outcome.status == "wacz: success"
|
||||
|
||||
|
||||
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
    """enrich() returns True and logs a notice when the item already carries a "browsertrix" media."""
    info_spy = mocker.patch("loguru.logger.info")
    item = Metadata().set_url("https://example.com")
    item.add_media(Media(filename="some_file.wacz"), id="browsertrix")

    outcome = wacz_enricher.enrich(item)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_message = info_spy.call_args[0][0]
    assert "WACZ enricher had already been executed:" in logged_message
|
||||
|
||||
|
||||
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
    """enrich() returns False when subprocess.run raises."""
    # Point the enricher at pytest's temporary directory and pin its extraction flags.
    wacz_enricher.tmp_dir = str(tmp_path)
    wacz_enricher.extract_media = False
    wacz_enricher.extract_screenshot = True

    mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
    mocker.patch("subprocess.run", side_effect=Exception("fail"))

    item = Metadata().set_url("https://example.com")
    outcome = wacz_enricher.enrich(item)

    assert outcome is False
|
||||
|
||||
|
||||
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
    """extract_media_from_wacz adds a "browsertrix-screenshot" media next to the existing one."""
    wacz_enricher.tmp_dir = str(tmp_path)

    # A genuine (if trivial) zip archive, so ZipFile can open the path.
    archive_path = tmp_path / "dummy.wacz"
    with ZipFile(archive_path, "w") as archive:
        archive.writestr("dummy.txt", "test content")

    mocker.patch("os.listdir", return_value=[])

    # Bytes served by the mocked open(): one image/png resource record
    # followed by a bare trailing WARC header.
    warc_parts = [
        b"WARC/1.0\r\n",
        b"WARC-Type: resource\r\n",
        b"Content-Type: image/png\r\n",
        b"WARC-Target-URI: http://example.com/image.png\r\n",
        b"Content-Length: 12\r\n",
        b"\r\n",
        b"image-bytes",
        b"\r\n\r\nWARC/1.0\r\n\r\n",
    ]
    fake_open = mocker.mock_open(read_data=b"".join(warc_parts))
    mocker.patch("builtins.open", fake_open)

    metadata.add_media(Media("something.wacz"), "browsertrix")
    wacz_enricher.extract_media_from_wacz(metadata, str(archive_path))

    assert len(metadata.media) == 2
    extracted = metadata.media[1]
    assert extracted.properties.get("id") == "browsertrix-screenshot"
|
||||
168
tests/enrichers/test_wayback_enricher.py
Normal file
168
tests/enrichers/test_wayback_enricher.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import json
|
||||
import requests
|
||||
import pytest
|
||||
from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
@pytest.fixture
def mock_is_auth_wall(mocker):
    """Return a factory that patches ``auto_archiver.utils.url.is_auth_wall``.

    The factory takes the boolean the patched function should return and
    hands back the patch object created by ``mocker.patch``.
    """
    patch_target = "auto_archiver.utils.url.is_auth_wall"

    def _patch_auth_wall(return_value: bool):
        return mocker.patch(patch_target, return_value=return_value)

    return _patch_auth_wall
|
||||
|
||||
@pytest.fixture
def mock_post_success(mocker):
    """Return a factory that patches ``requests.post`` with a canned response.

    The factory accepts:
        json_data: payload returned by the response's ``json()``. When omitted
            (``None``) it defaults to ``{"job_id": "job123"}``. An explicitly
            passed empty dict is kept as-is, so callers can simulate a
            response that carries no ``job_id``.
        status_code: value of the response's ``status_code`` (default 200).

    It returns the patch object created by ``mocker.patch``.
    """
    def _mock_post(json_data: dict = None, status_code: int = 200):
        # Compare against None rather than relying on truthiness: ``{}`` is
        # falsy and would otherwise be silently replaced by the default.
        if json_data is None:
            json_data = {"job_id": "job123"}
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.post", return_value=resp)

    return _mock_post
|
||||
|
||||
@pytest.fixture
def mock_get_success(mocker):
    """Return a factory that patches ``requests.get`` with a canned response.

    The factory accepts:
        json_data: payload returned by the response's ``json()``. When omitted
            (``None``) it defaults to a "success" status with a fixed
            timestamp and ``https://example.com`` as the original URL. An
            explicitly passed empty dict is kept as-is.
        status_code: value of the response's ``status_code`` (default 200).

    It returns the patch object created by ``mocker.patch``.
    """
    def _mock_get(json_data: dict = None, status_code: int = 200):
        # Compare against None rather than relying on truthiness: ``{}`` is
        # falsy and would otherwise be silently replaced by the default.
        if json_data is None:
            json_data = {
                "status": "success",
                "timestamp": "20250101010101",
                "original_url": "https://example.com",
            }
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.get", return_value=resp)

    return _mock_get
|
||||
|
||||
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
    """Build the ``wayback_extractor_enricher`` module through ``setup_module``."""
    settings: dict = dict(
        timeout=5,
        if_not_archived_within=None,
        key="somekey",
        secret="secret",
        proxy_http=None,
        proxy_https=None,
    )
    return setup_module("wayback_extractor_enricher", settings)
|
||||
|
||||
|
||||
def test_download_success(
    wayback_extractor_enricher,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """download() stores the archived snapshot URL under "wayback" when POST and GET both succeed."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success()

    # A URL-only item is enough for download() to work with.
    item = Metadata().set_url("https://example.com")
    outcome = wayback_extractor_enricher.download(item)

    expected_snapshot = "https://web.archive.org/web/20250101010101/https://example.com"
    assert outcome.get("wayback") == expected_snapshot
|
||||
|
||||
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    """enrich() returns None when is_auth_wall is patched to report True."""
    mock_is_auth_wall(True)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is None
|
||||
|
||||
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    """enrich() returns True when the item already has a "wayback" value."""
    metadata.set("wayback", "existing")

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is True
|
||||
|
||||
def test_enrich_post_failure(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
):
    """enrich() returns False and records the failure text when the POST answers with status 500."""
    mock_is_auth_wall(False)
    mock_post_success(json_data={"error": "server error"}, status_code=500)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
    recorded = metadata.get("wayback")
    assert "Internet archive failed with status of 500" in recorded
|
||||
|
||||
def test_enrich_post_json_decode_error(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mocker,
):
    """enrich() returns False when the POST response body cannot be decoded as JSON."""
    mock_is_auth_wall(False)

    # A 200 response whose json() raises, with a text body available instead.
    broken_response = mocker.Mock(status_code=200)
    broken_response.text = "invalid json"
    broken_response.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
    mocker.patch("requests.post", return_value=broken_response)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
|
||||
|
||||
def test_enrich_no_job_id(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
):
    """enrich() is expected to return False when the POST is mocked with ``json_data={}``."""
    mock_is_auth_wall(False)
    mock_post_success(json_data={})

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
|
||||
|
||||
def test_enrich_get_success(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """enrich() returns True and stores both the snapshot URL and the lookup URL on success."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success()

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is True
    snapshot_url = "https://web.archive.org/web/20250101010101/https://example.com"
    lookup_url = "https://web.archive.org/web/*/https://example.com"
    assert metadata.get("wayback") == snapshot_url
    assert metadata.get("check wayback") == lookup_url
|
||||
|
||||
def test_enrich_get_failure(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mock_get_success,
):
    """enrich() returns False when the status GET answers 400 with a "failed" status."""
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success(json_data={"status": "failed"}, status_code=400)

    outcome = wayback_extractor_enricher.enrich(metadata)

    assert outcome is False
|
||||
|
||||
def test_enrich_get_request_exception(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mocker,
):
    """enrich() returns True and keeps the job id when the status GET raises RequestException."""
    mock_is_auth_wall(False)
    mock_post_success()
    get_error = requests.exceptions.RequestException("error")
    mocker.patch("requests.get", side_effect=get_error)
    # Replace time.sleep so any waiting in the code under test returns immediately.
    mocker.patch("time.sleep", return_value=None)

    outcome = wayback_extractor_enricher.enrich(metadata)

    # The job id from the mocked POST response must still be recorded.
    assert outcome is True
    recorded = metadata.get("wayback")
    assert recorded.get("job_id") == "job123"
|
||||
|
||||
def test_enrich_get_json_decode_error(
    wayback_extractor_enricher,
    metadata,
    mock_is_auth_wall,
    mock_post_success,
    mocker,
):
    """enrich() returns True and keeps the job id when the status GET body is not valid JSON."""
    mock_is_auth_wall(False)
    mock_post_success()

    # A GET response whose json() raises, with a text body available instead.
    broken_response = mocker.Mock()
    broken_response.text = "invalid json"
    broken_response.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
    mocker.patch("requests.get", return_value=broken_response)
    # Replace time.sleep so any waiting in the code under test returns immediately.
    mocker.patch("time.sleep", return_value=None)

    outcome = wayback_extractor_enricher.enrich(metadata)

    # The job id from the mocked POST response must still be recorded.
    assert outcome is True
    recorded = metadata.get("wayback")
    assert recorded.get("job_id") == "job123"
|
||||
Reference in New Issue
Block a user