Merge pull request #419 from bellingcat/dev

Dependencies bump, new ghostarchive enricher
2026-06-07 19:08:30 +03:00 · 2026-04-07 14:44:35 +01:00
parent aa65299844 17c4ae15eb
commit 792838f1a1
7 changed files with 1048 additions and 560 deletions
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.11.4 AS base
+FROM webrecorder/browsertrix-crawler:1.12.4 AS base
 ENV RUNNING_IN_DOCKER=1 \
    LANG=C.UTF-8 \
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [project]
 name = "auto-archiver"
-version = "1.2.5"
+version = "1.2.6"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 requires-python = ">=3.10,<3.13"
--- a/src/auto_archiver/modules/ghostarchive_enricher/__init__.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/__init__.py
@@ -0,0 +1 @@
 from .ghostarchive_enricher import GhostarchiveEnricher
--- a/src/auto_archiver/modules/ghostarchive_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/__manifest__.py
@@ -0,0 +1,58 @@
 {
    # Module manifest for the Ghost Archive enricher: display name, module type,
    # entry point, dependencies, user-facing config options and a description.
    "name": "Ghost Archive Enricher",
    "type": ["enricher"],
    # NOTE(review): presumably "<module file>::<class name>", matching
    # ghostarchive_enricher.py / GhostarchiveEnricher - confirm against the module loader.
    "entry_point": "ghostarchive_enricher::GhostarchiveEnricher",
    "requires_setup": False,
    "dependencies": {
        "python": ["loguru", "requests", "bs4", "seleniumbase"],
    },
    "configs": {
        # The enricher reads `self.timeout` as the polling budget while it waits for the
        # browser to land on an archive URL.
        # NOTE(review): the description below says archival may take up to 5 minutes,
        # which is longer than this 120 second default - confirm the default is intended.
        "timeout": {
            "default": 120,
            "type": "int",
            "help": "seconds to wait for successful archive confirmation from Ghost Archive.",
        },
        # The enricher reads `self.check_existing` to decide whether to search before submitting.
        "check_existing": {
            "default": True,
            "type": "bool",
            "help": "whether to search for an existing archive before submitting a new one.",
        },
        # The enricher reads `self.proxy_http` / `self.proxy_https` when building the
        # proxies mapping for the plain-HTTP search request.
        "proxy_http": {
            "default": None,
            "help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port",
        },
        "proxy_https": {
            "default": None,
            "help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port",
        },
    },
    "description": """
    Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL.
    Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules.
    ### Features
    - Archives any public URL using the Ghost Archive service.
    - Optionally checks for existing archives before submitting a new one.
    - Supports HTTP and HTTPS proxies for requests.
    - Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API).
    ### Important
    - This module confirms that Ghost Archive accepted the URL submission and returned an archive link.
      It does **not** verify the contents or completeness of the archived page.
    ### Notes
    - Ghost Archive is a free service with no authentication required.
    - Archived pages must be smaller than 50 MB (including CSS, fonts, images, etc.).
    - Videos are archived up to 360p and must be under 100 MB and shorter than 30 minutes.
    - Archival may take up to 5 minutes depending on the queue and page complexity.
    - Archived content is stored indefinitely.
    - Ghost Archive does not archive pages that require authentication or form submission.
    ### Limitations
    - No official API — this module interacts with the Ghost Archive web interface.
    - The submission endpoint is protected by Cloudflare, so a headless browser (SeleniumBase) is used for new submissions.
    - Searching for existing archives uses plain HTTP requests and does not require a browser.
    - Rate limiting may apply; consider using a delay between requests if archiving many URLs.
    """,
 }
--- a/src/auto_archiver/modules/ghostarchive_enricher/ghostarchive_enricher.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/ghostarchive_enricher.py
@@ -0,0 +1,153 @@
 import time
 import re
 import requests
 from bs4 import BeautifulSoup
 from seleniumbase import SB
 from auto_archiver.utils.custom_logger import logger
 from auto_archiver.utils import url as UrlUtil
 from auto_archiver.core import Enricher, Metadata
 class GhostarchiveEnricher(Enricher):
    """
    Submits the current URL to Ghost Archive (ghostarchive.org) for archiving
    and stores the archived page URL as enrichment metadata.

    Ghost Archive has no official API — this module interacts with the web form
    and parses HTML responses. The submission endpoint is protected by Cloudflare,
    so a headless browser (SeleniumBase) is used for archival submissions, while
    plain HTTP requests are used for searching existing archives.

    Note: this module only confirms that Ghost Archive accepted the submission
    and returned an archive URL. It does not verify that the archived page
    content is complete or correctly rendered.
    """
    # Site root; opened in the browser for submissions and used to absolutise relative hrefs.
    GHOSTARCHIVE_BASE = "https://ghostarchive.org"
    # Submission endpoint URL.
    # NOTE(review): not referenced by the methods visible in this class, which submit
    # through the homepage form instead - confirm whether it is still needed.
    ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
    # Search endpoint queried with a `term` parameter to look for existing archives.
    SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
    # Matches an "/archive/<alphanumeric id>" path segment; note "/archive2" does not match.
    ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")
    def _get_proxies(self) -> dict:
        proxies = {}
        if self.proxy_http:
            proxies["http"] = self.proxy_http
        if self.proxy_https:
            proxies["https"] = self.proxy_https
        return proxies
    def _get_headers(self) -> dict:
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
    def _normalize_archive_href(self, href: str) -> str | None:
        """Normalize an archive link href to a full HTTPS URL, filtering out replay links."""
        if "/archive/" not in href or "/replay/" in href:
            return None
        if href.startswith("/"):
            return f"{self.GHOSTARCHIVE_BASE}{href}"
        if href.startswith("http://ghostarchive.org"):
            return href.replace("http://", "https://")
        if href.startswith("https://ghostarchive.org"):
            return href
        return None
    def _search_existing(self, url: str) -> str | None:
        """
        Look up `url` on the Ghost Archive search page.

        Returns the first archive link found in the response HTML, or None
        when the request fails, the status is not 200, or no link qualifies.
        """
        try:
            response = requests.get(
                self.SEARCH_ENDPOINT,
                params={"term": url},
                headers=self._get_headers(),
                proxies=self._get_proxies(),
                timeout=30,
            )
            if response.status_code == 200:
                anchors = BeautifulSoup(response.text, "html.parser").find_all("a", href=True)
                # lazily normalize so scanning stops at the first usable link
                normalized = (self._normalize_archive_href(anchor["href"]) for anchor in anchors)
                archive_url = next((candidate for candidate in normalized if candidate), None)
                if archive_url:
                    logger.info(f"Found existing Ghost Archive: {archive_url}")
                    return archive_url
            else:
                logger.warning(f"Ghost Archive search returned status {response.status_code}")
        except requests.exceptions.RequestException as err:
            logger.warning(f"Ghost Archive search failed: {err}")
        return None
    def _submit_url(self, url: str) -> str | None:
        """
        Submit a URL to Ghost Archive for archiving using a headless browser.

        The homepage form is filled in and submitted, then the browser location
        is polled until it matches an archive URL or `self.timeout` seconds
        elapse. If no redirect is observed, the final page source is scanned
        for an archive link instead.

        Args:
            url: the page to archive.

        Returns:
            The archive URL (query string stripped) if one was found, otherwise
            None. Any exception raised by the browser session is logged and
            reported as None.
        """
        try:
            with SB(uc=True, headless=True) as sb:
                logger.debug("Opening Ghost Archive homepage in headless browser")
                sb.open(self.GHOSTARCHIVE_BASE)
                # fill in the archive form and submit
                sb.type('input[name="archive"]', url)
                sb.click('input[type="submit"][value="Submit for archival"]')
                # Wait for navigation to /archive/{id} or timeout. A monotonic clock
                # keeps the deadline immune to wall-clock adjustments (NTP, DST, manual changes).
                deadline = time.monotonic() + self.timeout
                while (remaining := deadline - time.monotonic()) > 0:
                    current_url = sb.get_current_url()
                    if self.ARCHIVE_URL_PATTERN.search(current_url):
                        archive_url = current_url.split("?")[0]
                        logger.info(f"Ghost Archive saved: {archive_url}")
                        return archive_url
                    # never sleep past the deadline
                    time.sleep(min(2, remaining))
                # if we didn't redirect, try parsing the page source
                page_source = sb.get_page_source()
                return self._parse_archive_url(page_source)
        except Exception as e:
            # deliberate best-effort: a failed submission must not abort the enrichment pipeline
            logger.warning(f"Ghost Archive submission failed: {e}")
            return None
    def _parse_archive_url(self, html: str) -> str | None:
        """Return the first anchor href in `html` that normalizes to an archive URL, else None."""
        anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
        # generator keeps the scan lazy: stop at the first link that qualifies
        normalized = (self._normalize_archive_href(anchor["href"]) for anchor in anchors)
        return next((candidate for candidate in normalized if candidate), None)
    def enrich(self, to_enrich: Metadata) -> bool:
        """
        Attach a Ghost Archive URL to `to_enrich` under the "ghostarchive" key.

        Skips auth-walled URLs (returns False) and items that already carry a
        "ghostarchive" value (returns True). Otherwise optionally searches for
        an existing archive, falls back to a fresh submission, and returns
        whether an archive URL was obtained.
        """
        target = to_enrich.get_url()

        if UrlUtil.is_auth_wall(target):
            logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
            return False

        if to_enrich.get("ghostarchive"):
            logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
            return True

        # prefer an archive that already exists, when the lookup is enabled
        found = None
        if self.check_existing:
            logger.debug(f"Searching Ghost Archive for existing archive of {target}")
            found = self._search_existing(target)

        if not found:
            logger.debug(f"Submitting {target} to Ghost Archive")
            found = self._submit_url(target)

        if not found:
            logger.warning(f"Ghost Archive failed to archive {target}")
            return False

        to_enrich.set("ghostarchive", found)
        return True
--- a/tests/enrichers/test_ghostarchive_enricher.py
+++ b/tests/enrichers/test_ghostarchive_enricher.py
@@ -0,0 +1,277 @@
 import pytest
 import requests
 import os
 from unittest.mock import MagicMock
 from auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher import GhostarchiveEnricher
 # True when the GITHUB_ACTIONS environment variable is "true"; used below to skip
 # the live submission test.
 CI = os.getenv("GITHUB_ACTIONS", "") == "true"
 # sample HTML responses for mocking
 # Search results page listing a single archive link, given as a plain-http href.
 SEARCH_HTML_FOUND = """
 <html><body>
 <h1>Archives for https://example.com</h1>
 <table>
 <tr><td><a href="http://ghostarchive.org/archive/Abc12">https://example.com</a></td></tr>
 </table>
 </body></html>
 """
 # Search results page with no anchor tags at all.
 SEARCH_HTML_NOT_FOUND = """
 <html><body>
 <h1>Archives for https://example.com</h1>
 <p>Page 0 out of 0</p>
 <p>No archives for that site.</p>
 </body></html>
 """
 # Post-submission page whose only link is a site-relative archive href.
 SAVE_RESPONSE_HTML_WITH_LINK = """
 <html><body>
 <h1>Archive saved</h1>
 <a href="/archive/Xyz99">View archive</a>
 </body></html>
 """
 # Config passed to setup_module for every enricher built in this file; the values
 # equal the defaults declared in the module manifest.
 ENRICHER_CONFIG = {
    "timeout": 120,
    "check_existing": True,
    "proxy_http": None,
    "proxy_https": None,
 }
 class TestGhostarchiveEnricher:
    """Tests for Ghost Archive Enricher"""
    @pytest.fixture(autouse=True)
    def setup_enricher(self, setup_module):
        """Build an enricher from ENRICHER_CONFIG before each test and store it on `self.enricher`."""
        # setup_module is a fixture defined outside this file
        self.enricher: GhostarchiveEnricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
    def test_search_existing_found(self, mocker):
        """A search page that lists an archive yields that archive's HTTPS URL."""
        fake_response = mocker.Mock(status_code=200, text=SEARCH_HTML_FOUND)
        get_target = "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get"
        mocker.patch(get_target, return_value=fake_response)

        found = self.enricher._search_existing("https://example.com")

        assert found == "https://ghostarchive.org/archive/Abc12"
    def test_search_existing_not_found(self, mocker):
        """A search page without any archive link yields None."""
        fake_response = mocker.Mock(status_code=200, text=SEARCH_HTML_NOT_FOUND)
        get_target = "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get"
        mocker.patch(get_target, return_value=fake_response)

        found = self.enricher._search_existing("https://example.com")

        assert found is None
    def test_search_existing_request_error(self, mocker):
        """A connection failure during the search is reported as None, not raised."""
        failure = requests.exceptions.ConnectionError("connection failed")
        mocker.patch(
            "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
            side_effect=failure,
        )

        assert self.enricher._search_existing("https://example.com") is None
    def test_search_existing_non_200(self, mocker):
        """A non-200 search response is treated as 'nothing found'."""
        unavailable = mocker.Mock(status_code=503)
        get_target = "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get"
        mocker.patch(get_target, return_value=unavailable)

        assert self.enricher._search_existing("https://example.com") is None
    def test_submit_url_success_redirect(self, mocker):
        """When the browser lands on an archive URL, that URL is returned after one form submission."""
        browser = MagicMock()
        # make the mock usable as `with SB(...) as sb`
        browser.__enter__.return_value = browser
        browser.__exit__.return_value = False
        browser.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1"
        mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)

        archived = self.enricher._submit_url("https://example.com")

        assert archived == "https://ghostarchive.org/archive/NewId1"
        browser.type.assert_called_once()
        browser.click.assert_called_once()
    def test_submit_url_success_redirect_strips_query(self, mocker):
        """The query string of the redirect URL is dropped from the returned archive URL."""
        browser = MagicMock()
        # make the mock usable as `with SB(...) as sb`
        browser.__enter__.return_value = browser
        browser.__exit__.return_value = False
        browser.get_current_url.return_value = "https://ghostarchive.org/archive/NewId1?wr=false"
        mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)

        archived = self.enricher._submit_url("https://example.com")

        assert archived == "https://ghostarchive.org/archive/NewId1"
    def test_submit_url_success_html_fallback(self, mocker):
        """Without a redirect, the archive link is taken from the page source instead."""
        # a zero timeout makes the polling loop exit immediately and fall through to HTML parsing
        self.enricher.timeout = 0

        browser = MagicMock()
        # make the mock usable as `with SB(...) as sb`
        browser.__enter__.return_value = browser
        browser.__exit__.return_value = False
        browser.get_current_url.return_value = "https://ghostarchive.org/archive2"
        browser.get_page_source.return_value = SAVE_RESPONSE_HTML_WITH_LINK
        mocker.patch("auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB", return_value=browser)

        archived = self.enricher._submit_url("https://example.com")

        assert archived == "https://ghostarchive.org/archive/Xyz99"
    def test_submit_url_browser_error(self, mocker):
        """A browser that fails to start makes the submission return None."""
        startup_failure = Exception("browser failed to start")
        mocker.patch(
            "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.SB",
            side_effect=startup_failure,
        )

        assert self.enricher._submit_url("https://example.com") is None
    def test_proxy_configuration(self, mocker):
        """Configured proxies are forwarded to the search request via the `proxies` keyword."""
        self.enricher.proxy_http = "http://proxy:8080"
        self.enricher.proxy_https = "https://proxy:8443"
        fake_response = mocker.Mock(status_code=200, text=SEARCH_HTML_FOUND)
        mock_get = mocker.patch(
            "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.requests.get",
            return_value=fake_response,
        )

        found = self.enricher._search_existing("https://example.com")

        expected_proxies = {"http": "http://proxy:8080", "https": "https://proxy:8443"}
        assert mock_get.call_args.kwargs.get("proxies") == expected_proxies
        assert found is not None
    def test_parse_archive_url_with_replay_links(self):
        """Replay links are skipped; the first plain /archive/ link is returned."""
        page = """
        <html><body>
        <a href="/archive/replay/w/id-abc/mp_/https://example.com">replay</a>
        <a href="/archive/Valid1">valid</a>
        </body></html>
        """

        assert self.enricher._parse_archive_url(page) == "https://ghostarchive.org/archive/Valid1"
    def test_parse_archive_url_no_links(self):
        """A page without anchors produces None."""
        page = "<html><body><p>No archive here</p></body></html>"

        assert self.enricher._parse_archive_url(page) is None
    def test_enrich_sets_ghostarchive_on_metadata(self, mocker, make_item):
        """A successful enrich() stores the archive URL under the 'ghostarchive' key."""
        expected_url = "https://ghostarchive.org/archive/Enr1"
        mocker.patch.object(self.enricher, "_search_existing", return_value=expected_url)
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == expected_url
    def test_enrich_skips_if_already_enriched(self, mocker, make_item):
        """An item that already carries a 'ghostarchive' value is not searched again."""
        search_spy = mocker.patch.object(self.enricher, "_search_existing")
        already_done = make_item("https://example.com", ghostarchive="https://ghostarchive.org/archive/Old1")

        outcome = self.enricher.enrich(already_done)

        assert outcome is True
        search_spy.assert_not_called()
    def test_enrich_returns_false_on_failure(self, mocker, make_item):
        """enrich() reports False when neither the search nor the submission yields a URL."""
        for helper in ("_search_existing", "_submit_url"):
            mocker.patch.object(self.enricher, helper, return_value=None)
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is False
    def test_enrich_skips_auth_wall(self, mocker, make_item):
        """enrich() should skip URLs behind auth walls without searching or submitting."""
        mocker.patch(
            "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher.UrlUtil.is_auth_wall", return_value=True
        )
        # Stub both network paths: a False result alone cannot distinguish "skipped" from
        # "search and submit both failed", and unpatched helpers would reach the live
        # service if the auth-wall guard ever regressed.
        mock_search = mocker.patch.object(self.enricher, "_search_existing")
        mock_submit = mocker.patch.object(self.enricher, "_submit_url")
        item = make_item("https://example.com/login")
        result = self.enricher.enrich(item)
        assert result is False
        mock_search.assert_not_called()
        mock_submit.assert_not_called()
        assert not item.get("ghostarchive")
    def test_enrich_with_existing_archive(self, mocker, make_item):
        """An archive found by the search is used as-is and nothing is submitted."""
        existing_url = "https://ghostarchive.org/archive/Exist1"
        mocker.patch.object(self.enricher, "_search_existing", return_value=existing_url)
        submit_spy = mocker.patch.object(self.enricher, "_submit_url")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == existing_url
        submit_spy.assert_not_called()
    def test_enrich_submits_when_no_existing(self, mocker, make_item):
        """When the search finds nothing, the URL returned by the submission is stored."""
        submitted_url = "https://ghostarchive.org/archive/New42"
        mocker.patch.object(self.enricher, "_search_existing", return_value=None)
        mocker.patch.object(self.enricher, "_submit_url", return_value=submitted_url)
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == submitted_url
    def test_enrich_skips_check_existing_when_disabled(self, mocker, make_item):
        """With check_existing turned off, enrich() goes straight to submission."""
        self.enricher.check_existing = False
        search_spy = mocker.patch.object(self.enricher, "_search_existing")
        mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/Direct1")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        search_spy.assert_not_called()
    @pytest.mark.download
    def test_real_search_existing(self, setup_module):
        """Integration test: run a live search and check only the shape of the result."""
        live_enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)

        # example.com is commonly archived, but a hit is not guaranteed
        outcome = live_enricher._search_existing("https://example.com")

        # the call must not crash; when something is found it must be an archive URL
        if outcome is not None:
            assert outcome.startswith("https://ghostarchive.org/archive/")
    @pytest.mark.download
    @pytest.mark.skipif(CI, reason="Avoid submitting a real task on every CI run")
    def test_real_submit_example_com(self, setup_module, make_item):
        """Integration test: enrich example.com against the live service and check the stored URL."""
        live_enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
        item = make_item("https://example.com")

        outcome = live_enricher.enrich(item)

        assert outcome is True
        stored = item.get("ghostarchive")
        assert stored is not None
        assert stored.startswith("https://ghostarchive.org/archive/")
		`@@ -0,0 +1 @@`
							`from .ghostarchive_enricher import GhostarchiveEnricher`