diff --git a/tests/enrichers/test_wacz_enricher.py b/tests/enrichers/test_wacz_enricher.py
new file mode 100644
index 0000000..d55733d
--- /dev/null
+++ b/tests/enrichers/test_wacz_enricher.py
@@ -0,0 +1,112 @@
+import os
+from zipfile import ZipFile
+
+import pytest
+
+from auto_archiver.core import Metadata, Media
+
+
+@pytest.fixture
+def wacz_enricher(setup_module, mock_binary_dependencies):
+    configs: dict = {
+        "profile": None,
+        "docker_commands": None,
+        "timeout": 120,
+        "extract_media": False,
+        "extract_screenshot": True,
+        "socks_proxy_host": None,
+        "socks_proxy_port": None,
+        "proxy_server": None,
+    }
+    wacz = setup_module("wacz_enricher", configs)
+    return wacz
+
+
+def test_setup_without_docker(wacz_enricher, mocker):
+    mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
+    wacz_enricher.setup()
+    assert not wacz_enricher.docker_in_docker
+
+
+def test_setup_with_docker(wacz_enricher, mocker):
+    mocker.patch.dict(os.environ, {"WACZ_ENABLE_DOCKER": "1"}, clear=True)
+    wacz_enricher.setup()
+    assert wacz_enricher.use_docker
+
+
+def test_already_ran(wacz_enricher, metadata, mocker):
+    metadata.add_media(Media("test.wacz"), id="browsertrix")
+    mock_log = mocker.patch("loguru.logger.info")
+    assert wacz_enricher.enrich(metadata) is True
+    assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
+
+
+def test_basic_call_execution(wacz_enricher, mocker):
+    mock_run = mocker.patch("subprocess.run")
+    mock_run.return_value = mocker.Mock(returncode=0)
+    metadata = Metadata().set_url("https://example.com")
+    wacz_enricher.enrich(metadata)
+    assert mock_run.called
+    # Checks that the url is passed to the cmd
+    assert "--url https://example.com" in " ".join(mock_run.call_args[0][0])
+
+
+def test_download_success(wacz_enricher, mocker) -> None:
+    """Test download returns metadata on successful enrichment."""
+    basic_metadata = Metadata().set_url("https://example.com")
+    mocker.patch.object(wacz_enricher, "enrich", return_value=True)
+    result = wacz_enricher.download(basic_metadata)
+    assert result is not None
+    assert isinstance(result, Metadata)
+    assert result.status == "wacz: success"
+
+
+def test_enrich_already_executed(wacz_enricher, mocker) -> None:
+    """Test enrich if already executed."""
+    mock_log = mocker.patch("loguru.logger.info")
+    metadata = Metadata().set_url("https://example.com")
+    media = Media(filename="some_file.wacz")
+    metadata.add_media(media, id="browsertrix")
+    result = wacz_enricher.enrich(metadata)
+    assert result is True
+    assert "WACZ enricher had already been executed:" in mock_log.call_args[0][0]
+
+
+def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
+    """Test enrich returns False when subprocess fails."""
+    wacz_enricher.tmp_dir = str(tmp_path)
+    wacz_enricher.extract_media = False
+    wacz_enricher.extract_screenshot = True
+    mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
+    mocker.patch("subprocess.run", side_effect=Exception("fail"))
+    basic_metadata = Metadata().set_url("https://example.com")
+    result = wacz_enricher.enrich(basic_metadata)
+    assert result is False
+
+
+def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
+    """Test extract_media_from_wacz extracts screenshot media."""
+    wacz_enricher.tmp_dir = str(tmp_path)
+
+    # Create a *real* zip file so ZipFile won't fail.
+    wacz_file = tmp_path / "dummy.wacz"
+    with ZipFile(wacz_file, "w") as zf:
+        zf.writestr("dummy.txt", "test content")
+
+    mocker.patch("os.listdir", return_value=[])
+    warc_data = (
+        b"WARC/1.0\r\n"
+        b"WARC-Type: resource\r\n"
+        b"Content-Type: image/png\r\n"
+        b"WARC-Target-URI: http://example.com/image.png\r\n"
+        b"Content-Length: 12\r\n"
+        b"\r\n"
+        b"image-bytes"
+        b"\r\n\r\nWARC/1.0\r\n\r\n"
+    )
+    mock_file = mocker.mock_open(read_data=warc_data)
+    mocker.patch("builtins.open", mock_file)
+    metadata.add_media(Media("something.wacz"), "browsertrix")
+    wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
+    assert len(metadata.media) == 2
+    assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
diff --git a/tests/enrichers/test_wayback_enricher.py b/tests/enrichers/test_wayback_enricher.py
new file mode 100644
index 0000000..88f4662
--- /dev/null
+++ b/tests/enrichers/test_wayback_enricher.py
@@ -0,0 +1,174 @@
+import json
+import requests
+import pytest
+from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher
+from auto_archiver.core import Metadata
+
+
+@pytest.fixture
+def mock_is_auth_wall(mocker):
+    """Fixture to mock is_auth_wall behavior."""
+    def _mock_is_auth_wall(return_value: bool):
+        return mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value)
+    return _mock_is_auth_wall
+
+@pytest.fixture
+def mock_post_success(mocker):
+    """Fixture to mock POST requests with a successful response."""
+    def _mock_post(json_data: dict = None, status_code: int = 200):
+        # Use the default payload only when none was passed: an explicit empty
+        # dict (a response without "job_id") is falsy and must not be replaced.
+        if json_data is None:
+            json_data = {"job_id": "job123"}
+        resp = mocker.Mock(status_code=status_code)
+        resp.json.return_value = json_data
+        return mocker.patch("requests.post", return_value=resp)
+    return _mock_post
+
+@pytest.fixture
+def mock_get_success(mocker):
+    """Fixture to mock GET requests returning a completed archive status."""
+    def _mock_get(json_data: dict = None, status_code: int = 200):
+        # Use the default payload only when none was passed, so that an
+        # explicitly supplied (even empty) dict is returned unchanged.
+        if json_data is None:
+            json_data = {
+                "status": "success",
+                "timestamp": "20250101010101",
+                "original_url": "https://example.com"
+            }
+        resp = mocker.Mock(status_code=status_code)
+        resp.json.return_value = json_data
+        return mocker.patch("requests.get", return_value=resp)
+    return _mock_get
+
+@pytest.fixture
+def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
+    configs: dict = {
+        "timeout": 5,
+        "if_not_archived_within": None,
+        "key": "somekey",
+        "secret": "secret",
+        "proxy_http": None,
+        "proxy_https": None,
+    }
+    return setup_module("wayback_extractor_enricher", configs)
+
+
+def test_download_success(
+    wayback_extractor_enricher,
+    mock_is_auth_wall,
+    mock_post_success,
+    mock_get_success
+):
+    mock_is_auth_wall(False)
+    mock_post_success()
+    mock_get_success()
+    # Basic metadata to allow merge
+    metadata = Metadata().set_url("https://example.com")
+    result = wayback_extractor_enricher.download(metadata)
+    assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
+
+def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
+    mock_is_auth_wall(True)
+    result = wayback_extractor_enricher.enrich(metadata)
+    assert result is None
+
+def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
+    metadata.set("wayback", "existing")
+    result = wayback_extractor_enricher.enrich(metadata)
+    assert result is True
+
+def test_enrich_post_failure(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success
+):
+    mock_is_auth_wall(False)
+    mock_post_success(json_data={"error": "server error"}, status_code=500)
+    result = wayback_extractor_enricher.enrich(metadata)
+    assert result is False
+    assert "Internet archive failed with status of 500" in metadata.get("wayback")
+
+def test_enrich_post_json_decode_error(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mocker
+):
+    mock_is_auth_wall(False)
+    resp = mocker.Mock(status_code=200)
+    resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
+    resp.text = "invalid json"
+    mocker.patch("requests.post", return_value=resp)
+    assert wayback_extractor_enricher.enrich(metadata) is False
+
+def test_enrich_no_job_id(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success
+):
+    mock_is_auth_wall(False)
+    mock_post_success(json_data={})
+    assert wayback_extractor_enricher.enrich(metadata) is False
+
+def test_enrich_get_success(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success,
+    mock_get_success
+):
+    mock_is_auth_wall(False)
+    mock_post_success()
+    mock_get_success()
+    assert wayback_extractor_enricher.enrich(metadata) is True
+    assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
+    assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
+
+def test_enrich_get_failure(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success,
+    mock_get_success
+):
+    mock_is_auth_wall(False)
+    mock_post_success()
+    mock_get_success(json_data={"status": "failed"}, status_code=400)
+    assert wayback_extractor_enricher.enrich(metadata) is False
+
+def test_enrich_get_request_exception(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success,
+    mocker
+):
+    mock_is_auth_wall(False)
+    mock_post_success()
+    mocker.patch("requests.get", side_effect=requests.exceptions.RequestException("error"))
+    mocker.patch("time.sleep", return_value=None)
+    # check it still enriches the job_id information
+    assert wayback_extractor_enricher.enrich(metadata) is True
+    assert metadata.get("wayback").get("job_id") == "job123"
+
+def test_enrich_get_json_decode_error(
+    wayback_extractor_enricher,
+    metadata,
+    mock_is_auth_wall,
+    mock_post_success,
+    mocker
+):
+    mock_is_auth_wall(False)
+    mock_post_success()
+    resp = mocker.Mock()
+    resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
+    resp.text = "invalid json"
+    mocker.patch("requests.get", return_value=resp)
+    mocker.patch("time.sleep", return_value=None)
+    # check it still enriches the job_id information
+    assert wayback_extractor_enricher.enrich(metadata) is True
+    assert metadata.get("wayback").get("job_id") == "job123"
diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py
index 9238f89..f274728 100644
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -6,8 +6,6 @@ from auto_archiver.core import Metadata
 from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor
 from tests.extractors.test_extractor_base import TestExtractorBase
 
-TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles")
-
 
 @pytest.fixture
 def patch_extractor_methods(request, setup_module, mocker):
diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py
new file mode 100644
index 0000000..85f97c6
--- /dev/null
+++ b/tests/storages/test_local_storage.py
@@ -0,0 +1,54 @@
+
+import os
+from pathlib import Path
+
+import pytest
+
+from auto_archiver.core import Media
+from auto_archiver.modules.local_storage import LocalStorage
+
+
+@pytest.fixture
+def local_storage(setup_module) -> LocalStorage:
+    configs: dict = {
+        "path_generator": "flat",
+        "filename_generator": "static",
+        "save_to": "./local_archive",
+        "save_absolute": False,
+    }
+    return setup_module("local_storage", configs)
+
+
+@pytest.fixture
+def sample_media(tmp_path) -> Media:
+    """Fixture creating a Media object with temporary source file"""
+    src_file = tmp_path / "source.txt"
+    src_file.write_text("test content")
+    return Media(key="subdir/test.txt", filename=str(src_file))
+
+
+def test_get_cdn_url_relative(local_storage):
+    media = Media(key="test.txt", filename="dummy.txt")
+    expected = os.path.join(local_storage.save_to, media.key)
+    assert local_storage.get_cdn_url(media) == expected
+
+
+
+def test_get_cdn_url_absolute(local_storage):
+    media = Media(key="test.txt", filename="dummy.txt")
+    local_storage.save_absolute = True
+    expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
+    assert local_storage.get_cdn_url(media) == expected
+
+def test_upload_file_contents_and_metadata(local_storage, sample_media):
+    dest = os.path.join(local_storage.save_to, sample_media.key)
+    assert local_storage.upload(sample_media) is True
+    assert Path(sample_media.filename).read_text() == Path(dest).read_text()
+
+
+def test_upload_nonexistent_source(local_storage):
+    media = Media(key="missing.txt", filename="nonexistent.txt")
+    with pytest.raises(FileNotFoundError):
+        local_storage.upload(media)
+
+