Merge pull request #418 from bellingcat/feat/ghostarchive

Feat/ghostarchive
This commit is contained in:
Miguel Sozinho Ramalho
2026-04-06 18:33:15 +01:00
committed by GitHub
4 changed files with 489 additions and 0 deletions

View File

@@ -0,0 +1 @@
from .ghostarchive_enricher import GhostarchiveEnricher

View File

@@ -0,0 +1,58 @@
{
    # Manifest for the Ghost Archive enricher module: display name, module type,
    # entry point, dependencies, configuration options and user-facing description.
    "name": "Ghost Archive Enricher",
    "type": ["enricher"],
    # Points at the GhostarchiveEnricher class in ghostarchive_enricher.py
    # (presumably "<module file>::<class name>" — confirm against the module loader).
    "entry_point": "ghostarchive_enricher::GhostarchiveEnricher",
    "requires_setup": False,
    "dependencies": {
        "python": ["loguru", "requests", "bs4", "seleniumbase"],
    },
    "configs": {
        # Upper bound (seconds) for the enricher's polling loop that waits for the
        # headless browser to land on an /archive/<id> page after submission.
        # NOTE(review): the description below says archival may take up to 5 minutes,
        # which is longer than this default — confirm the default is intentional.
        "timeout": {
            "default": 120,
            "type": "int",
            "help": "seconds to wait for successful archive confirmation from Ghost Archive.",
        },
        # When true, the enricher queries the search endpoint first and only submits
        # a new archive request if no existing archive link is found.
        "check_existing": {
            "default": True,
            "type": "bool",
            "help": "whether to search for an existing archive before submitting a new one.",
        },
        # Proxies are passed to the `requests` search call only; the headless browser
        # used for submission is started without proxy settings.
        "proxy_http": {
            "default": None,
            "help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port",
        },
        "proxy_https": {
            "default": None,
            "help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port",
        },
    },
    # Markdown shown to users; the text is a runtime string and is left untouched.
    "description": """
Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL.
Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules.
### Features
- Archives any public URL using the Ghost Archive service.
- Optionally checks for existing archives before submitting a new one.
- Supports HTTP and HTTPS proxies for requests.
- Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API).
### Important
- This module confirms that Ghost Archive accepted the URL submission and returned an archive link.
It does **not** verify the contents or completeness of the archived page.
### Notes
- Ghost Archive is a free service with no authentication required.
- Archived pages must be smaller than 50 MB (including CSS, fonts, images, etc.).
- Videos are archived up to 360p and must be under 100 MB and shorter than 30 minutes.
- Archival may take up to 5 minutes depending on the queue and page complexity.
- Archived content is stored indefinitely.
- Ghost Archive does not archive pages that require authentication or form submission.
### Limitations
- No official API — this module interacts with the Ghost Archive web interface.
- The submission endpoint is protected by Cloudflare, so a headless browser (SeleniumBase) is used for new submissions.
- Searching for existing archives uses plain HTTP requests and does not require a browser.
- Rate limiting may apply; consider using a delay between requests if archiving many URLs.
""",
}

View File

@@ -0,0 +1,153 @@
import time
import re
import requests
from bs4 import BeautifulSoup
from seleniumbase import SB
from auto_archiver.utils.custom_logger import logger
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core import Enricher, Metadata
class GhostarchiveEnricher(Enricher):
    """
    Submits the current URL to Ghost Archive (ghostarchive.org) for archiving
    and stores the archived page URL as enrichment metadata.

    Ghost Archive has no official API — this module interacts with the web form
    and parses HTML responses. The submission endpoint is protected by Cloudflare,
    so a headless browser (SeleniumBase) is used for archival submissions, while
    plain HTTP requests are used for searching existing archives.

    Note: this module only confirms that Ghost Archive accepted the submission
    and returned an archive URL. It does not verify that the archived page
    content is complete or correctly rendered.

    Instance attributes read by this class (``timeout``, ``check_existing``,
    ``proxy_http``, ``proxy_https``) are expected to be set from the module
    configuration before ``enrich`` is called.
    """

    GHOSTARCHIVE_BASE = "https://ghostarchive.org"
    ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
    SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
    # matches the path of an archived page, e.g. /archive/Abc12
    ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")

    # seconds allowed for the plain-HTTP search request
    _SEARCH_TIMEOUT = 30
    # seconds between checks of the browser's current URL after submission
    _POLL_INTERVAL = 2

    def _get_proxies(self) -> dict:
        """Build the ``proxies`` mapping for ``requests`` from the configured proxies.

        Returns an empty dict when neither proxy is configured.
        """
        proxies = {}
        if self.proxy_http:
            proxies["http"] = self.proxy_http
        if self.proxy_https:
            proxies["https"] = self.proxy_https
        return proxies

    def _get_headers(self) -> dict:
        """Return the browser-like request headers used for the search request."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

    def _normalize_archive_href(self, href: str) -> str | None:
        """Normalize an archive link href to a full HTTPS URL, filtering out replay links.

        Accepts site-relative paths (``/archive/<id>``), protocol-relative links
        (``//ghostarchive.org/archive/<id>``) and absolute ``http``/``https`` links
        on the ghostarchive.org host. Returns None for anything else, including
        look-alike hosts such as ``ghostarchive.org.example.com``.
        """
        if "/archive/" not in href or "/replay/" in href:
            return None
        if href.startswith("//"):
            # protocol-relative link: must be checked before the single-slash case,
            # otherwise it would be glued onto the base as if it were a path
            href = f"https:{href}"
        elif href.startswith("/"):
            return f"{self.GHOSTARCHIVE_BASE}{href}"
        elif href.startswith("http://"):
            # upgrade only the scheme; a later "http://" inside the path is left alone
            href = "https://" + href.removeprefix("http://")
        # the trailing slash anchors the host so that only ghostarchive.org itself matches
        if href.startswith(f"{self.GHOSTARCHIVE_BASE}/"):
            return href
        return None

    def _search_existing(self, url: str) -> str | None:
        """
        Search Ghost Archive for an existing archive of the given URL.

        Returns the first archive URL found on the search results page,
        or None when the request fails, the status is not 200, or the page
        contains no archive link.
        """
        try:
            r = requests.get(
                self.SEARCH_ENDPOINT,
                params={"term": url},
                headers=self._get_headers(),
                proxies=self._get_proxies(),
                timeout=self._SEARCH_TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            logger.warning(f"Ghost Archive search failed: {e}")
            return None

        if r.status_code != 200:
            logger.warning(f"Ghost Archive search returned status {r.status_code}")
            return None

        # same link-extraction rules as for the submission result page
        archive_url = self._parse_archive_url(r.text)
        if archive_url:
            logger.info(f"Found existing Ghost Archive: {archive_url}")
        return archive_url

    def _submit_url(self, url: str) -> str | None:
        """
        Submit a URL to Ghost Archive for archiving using a headless browser.

        The /archive2 endpoint is Cloudflare-protected, requiring JS execution.
        After submitting the form, the browser's current URL is polled until it
        points at an archived page or ``self.timeout`` seconds have elapsed; on
        timeout the page source is scanned for an archive link instead.

        Returns the archive URL if successful, otherwise None. Any exception
        raised by the browser session is logged and turned into None.
        """
        try:
            with SB(uc=True, headless=True) as sb:
                logger.debug("Opening Ghost Archive homepage in headless browser")
                sb.open(self.GHOSTARCHIVE_BASE)

                # fill in the archive form and submit
                sb.type('input[name="archive"]', url)
                sb.click('input[type="submit"][value="Submit for archival"]')

                # wait for navigation to /archive/{id} or timeout; a monotonic clock
                # keeps the wait correct even if the system time is adjusted
                deadline = time.monotonic() + self.timeout
                while time.monotonic() < deadline:
                    current_url = sb.get_current_url()
                    if self.ARCHIVE_URL_PATTERN.search(current_url):
                        archive_url = current_url.split("?")[0]
                        logger.info(f"Ghost Archive saved: {archive_url}")
                        return archive_url
                    # never sleep past the deadline
                    time.sleep(min(self._POLL_INTERVAL, max(0.0, deadline - time.monotonic())))

                # if we didn't redirect, try parsing the page source
                page_source = sb.get_page_source()
                return self._parse_archive_url(page_source)
        except Exception as e:
            # deliberate best-effort boundary: browser automation can fail in many
            # ways and a failed enrichment must not abort the caller
            logger.warning(f"Ghost Archive submission failed: {e}")
            return None

    def _parse_archive_url(self, html: str) -> str | None:
        """Parse HTML and return the first link that normalizes to an archive URL, else None."""
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all("a", href=True):
            archive_url = self._normalize_archive_href(link["href"])
            if archive_url:
                return archive_url
        return None

    def enrich(self, to_enrich: Metadata) -> bool:
        """Add a ``ghostarchive`` entry holding the Ghost Archive URL to ``to_enrich``.

        Skips URLs behind an auth wall (returns False) and items that already
        carry a ``ghostarchive`` value (returns True). Otherwise looks for an
        existing archive when ``check_existing`` is enabled and submits the URL
        if none was found.

        Returns True when an archive URL is stored or was already present, False otherwise.
        """
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
            return False

        if to_enrich.get("ghostarchive"):
            logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
            return True

        # optionally check for existing archive first
        archive_url = None
        if self.check_existing:
            logger.debug(f"Searching Ghost Archive for existing archive of {url}")
            archive_url = self._search_existing(url)

        if not archive_url:
            logger.debug(f"Submitting {url} to Ghost Archive")
            archive_url = self._submit_url(url)

        if archive_url:
            to_enrich.set("ghostarchive", archive_url)
            return True

        logger.warning(f"Ghost Archive failed to archive {url}")
        return False

View File

@@ -0,0 +1,277 @@
import pytest
import requests
import os
from unittest.mock import MagicMock
from auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher import GhostarchiveEnricher
# True when the tests run under GitHub Actions; used to skip the test that
# performs a real submission to Ghost Archive.
CI = os.getenv("GITHUB_ACTIONS", "") == "true"

# Sample HTML responses for mocking.

# Search results page containing one archive link (absolute http:// href).
SEARCH_HTML_FOUND = """
<html><body>
<h1>Archives for https://example.com</h1>
<table>
<tr><td><a href="http://ghostarchive.org/archive/Abc12">https://example.com</a></td></tr>
</table>
</body></html>
"""

# Search results page without any archive link.
SEARCH_HTML_NOT_FOUND = """
<html><body>
<h1>Archives for https://example.com</h1>
<p>Page 0 out of 0</p>
<p>No archives for that site.</p>
</body></html>
"""

# Page shown after a submission, carrying a site-relative archive link.
SAVE_RESPONSE_HTML_WITH_LINK = """
<html><body>
<h1>Archive saved</h1>
<a href="/archive/Xyz99">View archive</a>
</body></html>
"""

# Configuration handed to the module setup fixture for every enricher instance in these tests.
ENRICHER_CONFIG = {
    "timeout": 120,
    "check_existing": True,
    "proxy_http": None,
    "proxy_https": None,
}
class TestGhostarchiveEnricher:
    """Mock-based unit tests plus opt-in integration tests for the Ghost Archive enricher."""

    # dotted path of the module under test; prefix of every patch target below
    _MODULE = "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher"

    @pytest.fixture(autouse=True)
    def setup_enricher(self, setup_module):
        self.enricher: GhostarchiveEnricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)

    # ---- helpers -----------------------------------------------------------

    def _patch_search_response(self, mocker, **attrs):
        """Replace requests.get in the module under test with a mock whose response carries `attrs`."""
        return mocker.patch(f"{self._MODULE}.requests.get", return_value=mocker.Mock(**attrs))

    def _patch_browser(self, mocker, current_url, page_source=None):
        """Replace SB with a fake context-managed browser reporting `current_url` (and optionally a page source)."""
        browser = MagicMock()
        browser.get_current_url.return_value = current_url
        if page_source is not None:
            browser.get_page_source.return_value = page_source
        # `with SB(...) as sb` must hand back this very mock and must not swallow exceptions
        browser.__enter__.return_value = browser
        browser.__exit__.return_value = False
        mocker.patch(f"{self._MODULE}.SB", return_value=browser)
        return browser

    # ---- _search_existing --------------------------------------------------

    def test_search_existing_found(self, mocker):
        """An archive link on the search page is returned, upgraded to https."""
        self._patch_search_response(mocker, status_code=200, text=SEARCH_HTML_FOUND)

        found = self.enricher._search_existing("https://example.com")

        assert found == "https://ghostarchive.org/archive/Abc12"

    def test_search_existing_not_found(self, mocker):
        """A search page without archive links yields None."""
        self._patch_search_response(mocker, status_code=200, text=SEARCH_HTML_NOT_FOUND)

        found = self.enricher._search_existing("https://example.com")

        assert found is None

    def test_search_existing_request_error(self, mocker):
        """A failing search request yields None instead of raising."""
        mocker.patch(
            f"{self._MODULE}.requests.get",
            side_effect=requests.exceptions.ConnectionError("connection failed"),
        )

        found = self.enricher._search_existing("https://example.com")

        assert found is None

    def test_search_existing_non_200(self, mocker):
        """A non-200 search response yields None."""
        self._patch_search_response(mocker, status_code=503)

        found = self.enricher._search_existing("https://example.com")

        assert found is None

    # ---- _submit_url -------------------------------------------------------

    def test_submit_url_success_redirect(self, mocker):
        """A browser redirect to /archive/<id> is reported as the archive URL."""
        browser = self._patch_browser(mocker, "https://ghostarchive.org/archive/NewId1")

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/NewId1"
        browser.type.assert_called_once()
        browser.click.assert_called_once()

    def test_submit_url_success_redirect_strips_query(self, mocker):
        """Query parameters of the redirect URL are dropped."""
        self._patch_browser(mocker, "https://ghostarchive.org/archive/NewId1?wr=false")

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/NewId1"

    def test_submit_url_success_html_fallback(self, mocker):
        """Without a redirect, the page source is scanned for an archive link."""
        self._patch_browser(
            mocker,
            "https://ghostarchive.org/archive2",
            page_source=SAVE_RESPONSE_HTML_WITH_LINK,
        )
        # timeout=0 makes the polling loop exit immediately and fall through to HTML parsing
        self.enricher.timeout = 0

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/Xyz99"

    def test_submit_url_browser_error(self, mocker):
        """A browser that fails to start results in None."""
        mocker.patch(f"{self._MODULE}.SB", side_effect=Exception("browser failed to start"))

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted is None

    # ---- proxies -----------------------------------------------------------

    def test_proxy_configuration(self, mocker):
        """Configured proxies are forwarded to the search request."""
        self.enricher.proxy_http = "http://proxy:8080"
        self.enricher.proxy_https = "https://proxy:8443"
        mock_get = self._patch_search_response(mocker, status_code=200, text=SEARCH_HTML_FOUND)

        found = self.enricher._search_existing("https://example.com")

        last_call = mock_get.call_args
        assert last_call.kwargs.get("proxies") == {"http": "http://proxy:8080", "https": "https://proxy:8443"}
        assert found is not None

    # ---- _parse_archive_url ------------------------------------------------

    def test_parse_archive_url_with_replay_links(self):
        """/replay/ links are ignored; the plain /archive/ link wins."""
        html = """
<html><body>
<a href="/archive/replay/w/id-abc/mp_/https://example.com">replay</a>
<a href="/archive/Valid1">valid</a>
</body></html>
"""

        parsed = self.enricher._parse_archive_url(html)

        assert parsed == "https://ghostarchive.org/archive/Valid1"

    def test_parse_archive_url_no_links(self):
        """HTML without archive links parses to None."""
        html = "<html><body><p>No archive here</p></body></html>"

        parsed = self.enricher._parse_archive_url(html)

        assert parsed is None

    # ---- enrich ------------------------------------------------------------

    def test_enrich_sets_ghostarchive_on_metadata(self, mocker, make_item):
        """enrich() stores the archive URL under the 'ghostarchive' key."""
        mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Enr1")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Enr1"

    def test_enrich_skips_if_already_enriched(self, mocker, make_item):
        """enrich() does nothing when the 'ghostarchive' key is already set."""
        mock_search = mocker.patch.object(self.enricher, "_search_existing")
        item = make_item("https://example.com", ghostarchive="https://ghostarchive.org/archive/Old1")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        mock_search.assert_not_called()

    def test_enrich_returns_false_on_failure(self, mocker, make_item):
        """enrich() reports False when neither search nor submission yields a URL."""
        mocker.patch.object(self.enricher, "_search_existing", return_value=None)
        mocker.patch.object(self.enricher, "_submit_url", return_value=None)
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is False

    def test_enrich_skips_auth_wall(self, mocker, make_item):
        """enrich() refuses URLs that sit behind an auth wall."""
        mocker.patch(f"{self._MODULE}.UrlUtil.is_auth_wall", return_value=True)
        item = make_item("https://example.com/login")

        outcome = self.enricher.enrich(item)

        assert outcome is False

    def test_enrich_with_existing_archive(self, mocker, make_item):
        """enrich() reuses an existing archive and never submits a new one."""
        mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Exist1")
        mock_submit = mocker.patch.object(self.enricher, "_submit_url")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Exist1"
        mock_submit.assert_not_called()

    def test_enrich_submits_when_no_existing(self, mocker, make_item):
        """enrich() submits the URL when the search finds nothing."""
        mocker.patch.object(self.enricher, "_search_existing", return_value=None)
        mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/New42")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/New42"

    def test_enrich_skips_check_existing_when_disabled(self, mocker, make_item):
        """enrich() goes straight to submission when check_existing is False."""
        self.enricher.check_existing = False
        mock_search = mocker.patch.object(self.enricher, "_search_existing")
        mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/Direct1")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        mock_search.assert_not_called()

    # ---- integration (real network) -----------------------------------------

    @pytest.mark.download
    def test_real_search_existing(self, setup_module):
        """Integration test: query the live Ghost Archive search for example.com."""
        enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)

        # example.com is commonly archived, but a hit is not guaranteed:
        # only the absence of a crash and the shape of a hit are checked
        found = enricher._search_existing("https://example.com")

        assert found is None or found.startswith("https://ghostarchive.org/archive/")

    @pytest.mark.download
    @pytest.mark.skipif(CI, reason="Avoid submitting a real task on every CI run")
    def test_real_submit_example_com(self, setup_module, make_item):
        """Integration test: enrich example.com against the live Ghost Archive service."""
        enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
        item = make_item("https://example.com")

        outcome = enricher.enrich(item)

        assert outcome is True
        archive_url = item.get("ghostarchive")
        assert archive_url is not None
        assert archive_url.startswith("https://ghostarchive.org/archive/")