From ae0e53e434d6e5d7e5860e1c6acb9bc57b1952b3 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:15:32 +0100
Subject: [PATCH] adds tests for new ghostarchive enricher feature
---
tests/enrichers/test_ghostarchive_enricher.py | 277 ++++++++++++++++++
1 file changed, 277 insertions(+)
create mode 100644 tests/enrichers/test_ghostarchive_enricher.py
diff --git a/tests/enrichers/test_ghostarchive_enricher.py b/tests/enrichers/test_ghostarchive_enricher.py
new file mode 100644
index 0000000..a476fa4
--- /dev/null
+++ b/tests/enrichers/test_ghostarchive_enricher.py
@@ -0,0 +1,277 @@
+import pytest
+import requests
+import os
+from unittest.mock import MagicMock
+
+from auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher import GhostarchiveEnricher
+
+# True when the GITHUB_ACTIONS environment variable equals "true"; used below to
+# skip the integration test that submits a real archive request.
+CI = os.getenv("GITHUB_ACTIONS", "") == "true"
+
+# Sample HTML responses for mocking.
+# NOTE(review): the markup below is reconstructed; the hrefs are assumed to be
+# site-relative "/archive/<id>" links. Confirm against _parse_archive_url.
+SEARCH_HTML_FOUND = """
+<html><body>
+<h1>Archives for https://example.com</h1>
+<table><tr><td><a href="/archive/Abc12">https://example.com</a></td></tr></table>
+</body></html>
+"""
+
+SEARCH_HTML_NOT_FOUND = """
+<html><body>
+<h1>Archives for https://example.com</h1>
+<p>Page 0 out of 0</p>
+<p>No archives for that site.</p>
+</body></html>
+"""
+
+SAVE_RESPONSE_HTML_WITH_LINK = """
+<html><body>
+<h1>Archive saved</h1>
+<a href="/archive/Xyz99">View archive</a>
+</body></html>
+"""
+
+# Configuration handed to setup_module() whenever a test builds the enricher.
+ENRICHER_CONFIG = {
+ "timeout": 120,
+ "check_existing": True,
+ "proxy_http": None,
+ "proxy_https": None,
+}
+
+
+class TestGhostarchiveEnricher:
+ """Tests for Ghost Archive Enricher"""
+
+ @pytest.fixture(autouse=True)
+ def setup_enricher(self, setup_module):
+     """Build the enricher from ENRICHER_CONFIG and expose it as self.enricher."""
+     enricher: GhostarchiveEnricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
+     self.enricher = enricher
+
+ def test_search_existing_found(self, mocker):
+     """When the search page lists an existing archive, its URL should be returned."""
+     search_page = mocker.Mock(status_code=200, text=SEARCH_HTML_FOUND)
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
+         return_value=search_page,
+     )
+
+     archive_url = self.enricher._search_existing("https://example.com")
+     assert archive_url == "https://ghostarchive.org/archive/Abc12"
+
+ def test_search_existing_not_found(self, mocker):
+     """A 200 search page that reports no archives should produce None."""
+     empty_page = mocker.Mock(status_code=200, text=SEARCH_HTML_NOT_FOUND)
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
+         return_value=empty_page,
+     )
+
+     assert self.enricher._search_existing("https://example.com") is None
+
+ def test_search_existing_request_error(self, mocker):
+     """A ConnectionError raised by requests.get should be reported as None."""
+     network_failure = requests.exceptions.ConnectionError("connection failed")
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
+         side_effect=network_failure,
+     )
+
+     assert self.enricher._search_existing("https://example.com") is None
+
+ def test_search_existing_non_200(self, mocker):
+     """A non-200 search response (here 503) should produce None."""
+     unavailable = mocker.Mock(status_code=503)
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
+         return_value=unavailable,
+     )
+
+     assert self.enricher._search_existing("https://example.com") is None
+
+ def test_submit_url_success_redirect(self, mocker):
+     """Submitting through the mocked SB browser should return the redirected archive URL."""
+     browser = MagicMock()
+     # SB(...) is used as a context manager, so entering it must hand back the mock.
+     browser.__enter__.return_value = browser
+     browser.__exit__.return_value = False
+     browser.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1"
+     mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)
+
+     archived = self.enricher._submit_url("https://example.com")
+
+     assert archived == "https://ghostarchive.org/archive/NewId1"
+     browser.type.assert_called_once()
+     browser.click.assert_called_once()
+
+ def test_submit_url_success_redirect_strips_query(self, mocker):
+     """Query parameters on the redirect URL should be absent from the returned URL."""
+     browser = MagicMock()
+     browser.__enter__.return_value = browser
+     browser.__exit__.return_value = False
+     browser.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1?wr=false"
+     mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)
+
+     archived = self.enricher._submit_url("https://example.com")
+
+     assert archived == "https://ghostarchive.org/archive/NewId1"
+
+ def test_submit_url_success_html_fallback(self, mocker):
+     """Without a redirect, the archive link should be taken from the page source."""
+     # timeout=0 makes the polling loop exit immediately and fall through to HTML parsing
+     self.enricher.timeout = 0
+
+     browser = MagicMock()
+     browser.__enter__.return_value = browser
+     browser.__exit__.return_value = False
+     browser.get_current_url.return_value = "https://ghostarchive.org/archive2"
+     browser.get_page_source.return_value = SAVE_RESPONSE_HTML_WITH_LINK
+     mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)
+
+     archived = self.enricher._submit_url("https://example.com")
+
+     assert archived == "https://ghostarchive.org/archive/Xyz99"
+
+ def test_submit_url_browser_error(self, mocker):
+     """An exception raised while creating the SB browser should result in None."""
+     startup_failure = Exception("browser failed to start")
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB",
+         side_effect=startup_failure,
+     )
+
+     assert self.enricher._submit_url("https://example.com") is None
+
+ def test_proxy_configuration(self, mocker):
+     """Configured proxies should be passed to requests.get as the 'proxies' keyword."""
+     self.enricher.proxy_http = "http://proxy:8080"
+     self.enricher.proxy_https = "https://proxy:8443"
+     expected_proxies = {"http": "http://proxy:8080", "https": "https://proxy:8443"}
+
+     search_page = mocker.Mock(status_code=200, text=SEARCH_HTML_FOUND)
+     patched_get = mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
+         return_value=search_page,
+     )
+
+     found = self.enricher._search_existing("https://example.com")
+
+     assert patched_get.call_args.kwargs.get("proxies") == expected_proxies
+     assert found is not None
+
+ def test_parse_archive_url_with_replay_links(self):
+     """Parser should ignore /replay/ links and only return /archive/ links."""
+     # NOTE(review): anchors reconstructed; hrefs are assumed to be site-relative.
+     # Confirm the exact form _parse_archive_url expects.
+     html = """
+     <html><body>
+     <a href="/replay/Skip1">replay</a>
+     <a href="/archive/Valid1">valid</a>
+     </body></html>
+     """
+     result = self.enricher._parse_archive_url(html)
+     assert result == "https://ghostarchive.org/archive/Valid1"
+
+ def test_parse_archive_url_no_links(self):
+     """Parser should return None when the HTML contains no archive links."""
+     # Keep this literal on a single line; any markup without an /archive/ link
+     # is sufficient for this test.
+     html = "<html><body><p>No archive here</p></body></html>"
+     result = self.enricher._parse_archive_url(html)
+     assert result is None
+
+ def test_enrich_sets_ghostarchive_on_metadata(self, mocker, make_item):
+     """enrich() should store the found archive URL under the 'ghostarchive' key."""
+     existing_url = "https://ghostarchive.org/archive/Enr1"
+     mocker.patch.object(self.enricher, "_search_existing", return_value=existing_url)
+     metadata = make_item("https://example.com")
+
+     assert self.enricher.enrich(metadata) is True
+     assert metadata.get("ghostarchive") == existing_url
+
+ def test_enrich_skips_if_already_enriched(self, mocker, make_item):
+     """enrich() should not search again when 'ghostarchive' is already set on the item."""
+     search_spy = mocker.patch.object(self.enricher, "_search_existing")
+     metadata = make_item("https://example.com", ghostarchive="https://ghostarchive.org/archive/Old1")
+
+     outcome = self.enricher.enrich(metadata)
+
+     assert outcome is True
+     search_spy.assert_not_called()
+
+ def test_enrich_returns_false_on_failure(self, mocker, make_item):
+     """enrich() should return False when neither search nor submission yields a URL."""
+     for helper_name in ("_search_existing", "_submit_url"):
+         mocker.patch.object(self.enricher, helper_name, return_value=None)
+     metadata = make_item("https://example.com")
+
+     assert self.enricher.enrich(metadata) is False
+
+ def test_enrich_skips_auth_wall(self, mocker, make_item):
+     """enrich() should return False when UrlUtil.is_auth_wall reports an auth wall."""
+     mocker.patch(
+         "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.UrlUtil.is_auth_wall",
+         return_value=True,
+     )
+     login_page = make_item("https://example.com/login")
+
+     assert self.enricher.enrich(login_page) is False
+
+ def test_enrich_with_existing_archive(self, mocker, make_item):
+     """enrich() should reuse an existing archive and never call _submit_url()."""
+     existing_url = "https://ghostarchive.org/archive/Exist1"
+     mocker.patch.object(self.enricher, "_search_existing", return_value=existing_url)
+     submit_spy = mocker.patch.object(self.enricher, "_submit_url")
+     metadata = make_item("https://example.com")
+
+     outcome = self.enricher.enrich(metadata)
+
+     assert outcome is True
+     assert metadata.get("ghostarchive") == existing_url
+     submit_spy.assert_not_called()
+
+ def test_enrich_submits_when_no_existing(self, mocker, make_item):
+     """enrich() should fall back to submitting the URL when the search finds nothing."""
+     submitted_url = "https://ghostarchive.org/archive/New42"
+     mocker.patch.object(self.enricher, "_search_existing", return_value=None)
+     mocker.patch.object(self.enricher, "_submit_url", return_value=submitted_url)
+     metadata = make_item("https://example.com")
+
+     assert self.enricher.enrich(metadata) is True
+     assert metadata.get("ghostarchive") == submitted_url
+
+ def test_enrich_skips_check_existing_when_disabled(self, mocker, make_item):
+     """With check_existing set to False, enrich() should not call _search_existing()."""
+     search_spy = mocker.patch.object(self.enricher, "_search_existing")
+     mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/Direct1")
+     self.enricher.check_existing = False
+     metadata = make_item("https://example.com")
+
+     outcome = self.enricher.enrich(metadata)
+
+     assert outcome is True
+     search_spy.assert_not_called()
+
+ @pytest.mark.download
+ def test_real_search_existing(self, setup_module):
+     """Integration test: run _search_existing() for example.com with nothing mocked."""
+     live_enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
+     # example.com is commonly archived, but either outcome is acceptable:
+     # the point is only that the lookup does not crash.
+     found = live_enricher._search_existing("https://example.com")
+     if found is not None:
+         assert found.startswith("https://ghostarchive.org/archive/")
+
+ @pytest.mark.download
+ @pytest.mark.skipif(CI, reason="Avoid submitting a real task on every CI run")
+ def test_real_submit_example_com(self, setup_module, make_item):
+     """Integration test: enrich example.com for real and check the stored archive URL."""
+     live_enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
+     metadata = make_item("https://example.com")
+
+     assert live_enricher.enrich(metadata) is True
+
+     ghost_url = metadata.get("ghostarchive")
+     assert ghost_url is not None
+     assert ghost_url.startswith("https://ghostarchive.org/archive/")