diff --git a/src/auto_archiver/modules/ghostarchive_enricher/__init__.py b/src/auto_archiver/modules/ghostarchive_enricher/__init__.py new file mode 100644 index 0000000..8aabf41 --- /dev/null +++ b/src/auto_archiver/modules/ghostarchive_enricher/__init__.py @@ -0,0 +1 @@ +from .ghostarchive_enricher import GhostarchiveEnricher diff --git a/src/auto_archiver/modules/ghostarchive_enricher/__manifest__.py b/src/auto_archiver/modules/ghostarchive_enricher/__manifest__.py new file mode 100644 index 0000000..ce7de1c --- /dev/null +++ b/src/auto_archiver/modules/ghostarchive_enricher/__manifest__.py @@ -0,0 +1,58 @@ +{ + "name": "Ghost Archive Enricher", + "type": ["enricher"], + "entry_point": "ghostarchive_enricher::GhostarchiveEnricher", + "requires_setup": False, + "dependencies": { + "python": ["loguru", "requests", "bs4", "seleniumbase"], + }, + "configs": { + "timeout": { + "default": 120, + "type": "int", + "help": "seconds to wait for successful archive confirmation from Ghost Archive.", + }, + "check_existing": { + "default": True, + "type": "bool", + "help": "whether to search for an existing archive before submitting a new one.", + }, + "proxy_http": { + "default": None, + "help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port", + }, + "proxy_https": { + "default": None, + "help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port", + }, + }, + "description": """ + Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL. + + Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules. + + ### Features + - Archives any public URL using the Ghost Archive service. + - Optionally checks for existing archives before submitting a new one. + - Supports HTTP and HTTPS proxies for requests. + - Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API). 
class GhostarchiveEnricher(Enricher):
    """
    Submit the current URL to Ghost Archive (ghostarchive.org) and store the
    resulting archive page URL on the metadata under the ``ghostarchive`` key.

    Ghost Archive has no official API — this module interacts with the web form
    and parses HTML responses. The submission endpoint is protected by Cloudflare,
    so a headless browser (SeleniumBase) is used for archival submissions, while
    plain HTTP requests are used for searching existing archives.

    Note: this module only confirms that Ghost Archive accepted the submission
    and returned an archive URL. It does not verify that the archived page
    content is complete or correctly rendered.

    Configuration attributes read by this class: ``timeout``, ``check_existing``,
    ``proxy_http`` and ``proxy_https``.
    """

    GHOSTARCHIVE_BASE = "https://ghostarchive.org"
    ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
    SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
    ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")

    # Anchored matcher for a link to an archive page. The host part is optional
    # (relative links) but, when present, must be exactly ghostarchive.org; the
    # path must be exactly /archive/<id>. Group 1 captures that path; a trailing
    # slash, query string or fragment is tolerated and discarded.
    ARCHIVE_HREF_PATTERN = re.compile(
        r"^(?:(?:https?:)?//ghostarchive\.org)?(/archive/[A-Za-z0-9]+)/?(?:[?#].*)?$"
    )

    # seconds allowed for the plain-HTTP search request
    SEARCH_TIMEOUT_SECONDS = 30
    # seconds between checks of the browser URL while waiting for the redirect
    POLL_INTERVAL_SECONDS = 2

    def _get_proxies(self) -> dict[str, str]:
        """Return the ``proxies`` mapping for ``requests``, omitting unset entries."""
        proxies = {}
        if self.proxy_http:
            proxies["http"] = self.proxy_http
        if self.proxy_https:
            proxies["https"] = self.proxy_https
        return proxies

    def _get_headers(self) -> dict[str, str]:
        """Return the HTTP headers sent with search requests (a desktop browser User-Agent)."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

    def _normalize_archive_href(self, href: str) -> str | None:
        """
        Turn a link target into a canonical ``https://ghostarchive.org/archive/<id>`` URL.

        Accepts relative (``/archive/<id>``), protocol-relative and absolute
        http/https links on the ghostarchive.org host. Any query string or
        fragment is dropped. Returns None for replay links, for links on any
        other host, and for anything whose path is not exactly ``/archive/<id>``.
        """
        if not href or "/replay/" in href:
            return None
        # Match the whole href instead of testing substrings/prefixes, so that
        # look-alike hosts (ghostarchive.org.example) and links that merely
        # carry "/archive/" inside a query string are rejected.
        match = self.ARCHIVE_HREF_PATTERN.match(href.strip())
        if match is None:
            return None
        # Rebuild from the canonical base rather than rewriting the scheme in place.
        return f"{self.GHOSTARCHIVE_BASE}{match.group(1)}"

    def _parse_archive_url(self, html: str) -> str | None:
        """Return the first valid archive link found in ``html``, or None if there is none."""
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all("a", href=True):
            archive_url = self._normalize_archive_href(link["href"])
            if archive_url:
                return archive_url
        return None

    def _search_existing(self, url: str) -> str | None:
        """
        Search Ghost Archive for an existing archive of the given URL.

        Returns the first archive link present on the search results page, or
        None when the request fails, the status is not 200, or no archive link
        is found.

        NOTE(review): the result is not checked against ``url`` — whatever
        archive link appears first on the results page is returned.
        """
        try:
            r = requests.get(
                self.SEARCH_ENDPOINT,
                params={"term": url},
                headers=self._get_headers(),
                proxies=self._get_proxies(),
                timeout=self.SEARCH_TIMEOUT_SECONDS,
            )
        except requests.exceptions.RequestException as e:
            logger.warning(f"Ghost Archive search failed: {e}")
            return None

        if r.status_code != 200:
            logger.warning(f"Ghost Archive search returned status {r.status_code}")
            return None

        archive_url = self._parse_archive_url(r.text)
        if archive_url:
            logger.info(f"Found existing Ghost Archive: {archive_url}")
        return archive_url

    def _submit_url(self, url: str) -> str | None:
        """
        Submit a URL to Ghost Archive for archiving using a headless browser.

        The /archive2 endpoint is Cloudflare-protected, requiring JS execution.
        After submitting the form, the browser URL is polled until it becomes an
        archive page URL or ``self.timeout`` seconds have passed; on timeout the
        current page source is scanned for an archive link instead.

        Returns the archive URL if successful, otherwise None. Any exception
        raised by the browser is logged and turned into None.

        NOTE: ``proxy_http`` / ``proxy_https`` are not passed to the browser
        here; they only apply to the search request.
        """
        try:
            with SB(uc=True, headless=True) as sb:
                logger.debug("Opening Ghost Archive homepage in headless browser")
                sb.open(self.GHOSTARCHIVE_BASE)

                # fill in the archive form and submit
                sb.type('input[name="archive"]', url)
                sb.click('input[type="submit"][value="Submit for archival"]')

                # Wait for navigation to /archive/{id}. A monotonic clock keeps
                # the deadline immune to wall-clock adjustments.
                deadline = time.monotonic() + self.timeout
                while (remaining := deadline - time.monotonic()) > 0:
                    # Same validation as for scraped links: right host, exact
                    # /archive/<id> path, query string stripped.
                    archive_url = self._normalize_archive_href(sb.get_current_url())
                    if archive_url:
                        logger.info(f"Ghost Archive saved: {archive_url}")
                        return archive_url
                    # never sleep past the deadline
                    time.sleep(min(self.POLL_INTERVAL_SECONDS, remaining))

                # if we didn't redirect, try parsing the page source
                return self._parse_archive_url(sb.get_page_source())

        except Exception as e:
            logger.warning(f"Ghost Archive submission failed: {e}")
            return None

    def enrich(self, to_enrich: Metadata) -> bool:
        """
        Add a Ghost Archive URL to ``to_enrich`` under the ``ghostarchive`` key.

        Returns True when the key is already set or was set by this call.
        Returns False when the URL is behind an auth wall (nothing is attempted)
        or when neither the search nor the submission produced an archive URL.
        """
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
            return False

        if to_enrich.get("ghostarchive"):
            logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
            return True

        # optionally check for existing archive first
        archive_url = None
        if self.check_existing:
            logger.debug(f"Searching Ghost Archive for existing archive of {url}")
            archive_url = self._search_existing(url)

        if not archive_url:
            logger.debug(f"Submitting {url} to Ghost Archive")
            archive_url = self._submit_url(url)

        if archive_url:
            to_enrich.set("ghostarchive", archive_url)
            return True

        logger.warning(f"Ghost Archive failed to archive {url}")
        return False

Archives for https://example.com

+ + +
https://example.com
+ +""" + +SEARCH_HTML_NOT_FOUND = """ + +

Archives for https://example.com

+

Page 0 out of 0

+

No archives for that site.

+ +""" + +SAVE_RESPONSE_HTML_WITH_LINK = """ + +

Archive saved

+View archive + +""" + +ENRICHER_CONFIG = { + "timeout": 120, + "check_existing": True, + "proxy_http": None, + "proxy_https": None, +} + + +class TestGhostarchiveEnricher: + """Tests for Ghost Archive Enricher""" + + @pytest.fixture(autouse=True) + def setup_enricher(self, setup_module): + self.enricher: GhostarchiveEnricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG) + + def test_search_existing_found(self, mocker): + """When an existing archive is found, it should be returned.""" + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = SEARCH_HTML_FOUND + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get", return_value=mock_response + ) + + result = self.enricher._search_existing("https://example.com") + assert result == "https://ghostarchive.org/archive/Abc12" + + def test_search_existing_not_found(self, mocker): + """When no existing archive is found, None should be returned.""" + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = SEARCH_HTML_NOT_FOUND + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get", return_value=mock_response + ) + + result = self.enricher._search_existing("https://example.com") + assert result is None + + def test_search_existing_request_error(self, mocker): + """When search request fails, None should be returned.""" + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get", + side_effect=requests.exceptions.ConnectionError("connection failed"), + ) + + result = self.enricher._search_existing("https://example.com") + assert result is None + + def test_search_existing_non_200(self, mocker): + """When search returns non-200, None should be returned.""" + mock_response = mocker.Mock() + mock_response.status_code = 503 + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get", 
return_value=mock_response + ) + + result = self.enricher._search_existing("https://example.com") + assert result is None + + def test_submit_url_success_redirect(self, mocker): + """Successful submission via headless browser should return archive URL.""" + mock_sb = MagicMock() + mock_sb.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1" + mock_sb.__enter__ = MagicMock(return_value=mock_sb) + mock_sb.__exit__ = MagicMock(return_value=False) + + mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=mock_sb) + + result = self.enricher._submit_url("https://example.com") + assert result == "https://ghostarchive.org/archive/NewId1" + mock_sb.type.assert_called_once() + mock_sb.click.assert_called_once() + + def test_submit_url_success_redirect_strips_query(self, mocker): + """Redirect URL query params should be stripped.""" + mock_sb = MagicMock() + mock_sb.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1?wr=false" + mock_sb.__enter__ = MagicMock(return_value=mock_sb) + mock_sb.__exit__ = MagicMock(return_value=False) + + mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=mock_sb) + + result = self.enricher._submit_url("https://example.com") + assert result == "https://ghostarchive.org/archive/NewId1" + + def test_submit_url_success_html_fallback(self, mocker): + """When browser doesn't redirect, should parse page source for archive link.""" + mock_sb = MagicMock() + mock_sb.get_current_url.return_value = "https://ghostarchive.org/archive2" + mock_sb.get_page_source.return_value = SAVE_RESPONSE_HTML_WITH_LINK + mock_sb.__enter__ = MagicMock(return_value=mock_sb) + mock_sb.__exit__ = MagicMock(return_value=False) + + # make timeout=0 so the polling loop exits immediately and falls through to HTML parsing + self.enricher.timeout = 0 + mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", 
return_value=mock_sb) + + result = self.enricher._submit_url("https://example.com") + assert result == "https://ghostarchive.org/archive/Xyz99" + + def test_submit_url_browser_error(self, mocker): + """Browser error during submission should return None.""" + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", + side_effect=Exception("browser failed to start"), + ) + + result = self.enricher._submit_url("https://example.com") + assert result is None + + def test_proxy_configuration(self, mocker): + """Proxies should be passed to search requests when configured.""" + self.enricher.proxy_http = "http://proxy:8080" + self.enricher.proxy_https = "https://proxy:8443" + + mock_get = mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get", + ) + mock_response = mocker.Mock() + mock_response.status_code = 200 + mock_response.text = SEARCH_HTML_FOUND + mock_get.return_value = mock_response + + result = self.enricher._search_existing("https://example.com") + + call_kwargs = mock_get.call_args + assert call_kwargs.kwargs.get("proxies") == {"http": "http://proxy:8080", "https": "https://proxy:8443"} + assert result is not None + + def test_parse_archive_url_with_replay_links(self): + """Parser should ignore /replay/ links and only return /archive/ links.""" + html = """ + + replay + valid + + """ + result = self.enricher._parse_archive_url(html) + assert result == "https://ghostarchive.org/archive/Valid1" + + def test_parse_archive_url_no_links(self): + """Parser should return None when no archive links found.""" + html = "

No archive here

" + result = self.enricher._parse_archive_url(html) + assert result is None + + def test_enrich_sets_ghostarchive_on_metadata(self, mocker, make_item): + """enrich() should set 'ghostarchive' key on the metadata object.""" + mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Enr1") + + item = make_item("https://example.com") + result = self.enricher.enrich(item) + + assert result is True + assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Enr1" + + def test_enrich_skips_if_already_enriched(self, mocker, make_item): + """enrich() should skip if ghostarchive key is already set.""" + mock_search = mocker.patch.object(self.enricher, "_search_existing") + + item = make_item("https://example.com", ghostarchive="https://ghostarchive.org/archive/Old1") + result = self.enricher.enrich(item) + + assert result is True + mock_search.assert_not_called() + + def test_enrich_returns_false_on_failure(self, mocker, make_item): + """enrich() should return False when both search and submit fail.""" + mocker.patch.object(self.enricher, "_search_existing", return_value=None) + mocker.patch.object(self.enricher, "_submit_url", return_value=None) + + item = make_item("https://example.com") + result = self.enricher.enrich(item) + + assert result is False + + def test_enrich_skips_auth_wall(self, mocker, make_item): + """enrich() should skip URLs behind auth walls.""" + mocker.patch( + "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.UrlUtil.is_auth_wall", return_value=True + ) + + item = make_item("https://example.com/login") + result = self.enricher.enrich(item) + assert result is False + + def test_enrich_with_existing_archive(self, mocker, make_item): + """enrich() should use existing archive when check_existing is True.""" + mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Exist1") + mock_submit = mocker.patch.object(self.enricher, "_submit_url") 
+ + item = make_item("https://example.com") + result = self.enricher.enrich(item) + + assert result is True + assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Exist1" + mock_submit.assert_not_called() + + def test_enrich_submits_when_no_existing(self, mocker, make_item): + """enrich() should submit URL when no existing archive found.""" + mocker.patch.object(self.enricher, "_search_existing", return_value=None) + mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/New42") + + item = make_item("https://example.com") + result = self.enricher.enrich(item) + + assert result is True + assert item.get("ghostarchive") == "https://ghostarchive.org/archive/New42" + + def test_enrich_skips_check_existing_when_disabled(self, mocker, make_item): + """enrich() should skip search when check_existing is False.""" + self.enricher.check_existing = False + mock_search = mocker.patch.object(self.enricher, "_search_existing") + mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/Direct1") + + item = make_item("https://example.com") + result = self.enricher.enrich(item) + + assert result is True + mock_search.assert_not_called() + + @pytest.mark.download + def test_real_search_existing(self, setup_module): + """Integration test: search for an existing archive on Ghost Archive.""" + enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG) + # example.com is commonly archived + result = enricher._search_existing("https://example.com") + # we just check it doesn't crash; result may or may not be found + assert result is None or result.startswith("https://ghostarchive.org/archive/") + + @pytest.mark.download + @pytest.mark.skipif(CI, reason="Avoid submitting a real task on every CI run") + def test_real_submit_example_com(self, setup_module, make_item): + """Integration test: submit example.com to Ghost Archive and verify enrichment.""" + enricher = 
setup_module("ghostarchive_enricher", ENRICHER_CONFIG) + item = make_item("https://example.com") + result = enricher.enrich(item) + + assert result is True + archive_url = item.get("ghostarchive") + assert archive_url is not None + assert archive_url.startswith("https://ghostarchive.org/archive/")