implements new enricher to submit URLs to ghostarchive

2026-06-07 19:08:30 +03:00 · 2026-04-06 17:13:48 +01:00
parent 1b69ec1f00
commit 82fc786d56
3 changed files with 212 additions and 0 deletions
--- a/src/auto_archiver/modules/ghostarchive_enricher/init.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/init.py
@@ -0,0 +1 @@
+from .ghostarchive_enricher import GhostarchiveEnricher
--- a/src/auto_archiver/modules/ghostarchive_enricher/manifest.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/manifest.py
@@ -0,0 +1,58 @@
+{
+    "name": "Ghost Archive Enricher",
+    "type": ["enricher"],
+    "entry_point": "ghostarchive_enricher::GhostarchiveEnricher",
+    "requires_setup": False,
+    "dependencies": {
+        "python": ["loguru", "requests", "bs4", "seleniumbase"],
+    },
+    "configs": {
+        "timeout": {
+            "default": 120,
+            "type": "int",
+            "help": "seconds to wait for successful archive confirmation from Ghost Archive.",
+        },
+        "check_existing": {
+            "default": True,
+            "type": "bool",
+            "help": "whether to search for an existing archive before submitting a new one.",
+        },
+        "proxy_http": {
+            "default": None,
+            "help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port",
+        },
+        "proxy_https": {
+            "default": None,
+            "help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port",
+        },
+    },
+    "description": """
+    Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL.
+
+    Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules.
+
+    ### Features
+    - Archives any public URL using the Ghost Archive service.
+    - Optionally checks for existing archives before submitting a new one.
+    - Supports HTTP and HTTPS proxies for requests.
+    - Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API).
+
+    ### Important
+    - This module confirms that Ghost Archive accepted the URL submission and returned an archive link.
+      It does **not** verify the contents or completeness of the archived page.
+
+    ### Notes
+    - Ghost Archive is a free service with no authentication required.
+    - Archived pages must be smaller than 50 MB (including CSS, fonts, images, etc.).
+    - Videos are archived up to 360p and must be under 100 MB and shorter than 30 minutes.
+    - Archival may take up to 5 minutes depending on the queue and page complexity.
+    - Archived content is stored indefinitely.
+    - Ghost Archive does not archive pages that require authentication or form submission.
+
+    ### Limitations
+    - No official API — this module interacts with the Ghost Archive web interface.
+    - The submission endpoint is protected by Cloudflare, so a headless browser (SeleniumBase) is used for new submissions.
+    - Searching for existing archives uses plain HTTP requests and does not require a browser.
+    - Rate limiting may apply; consider using a delay between requests if archiving many URLs.
+    """,
+}
--- a/src/auto_archiver/modules/ghostarchive_enricher/ghostarchive_enricher.py
+++ b/src/auto_archiver/modules/ghostarchive_enricher/ghostarchive_enricher.py
@@ -0,0 +1,153 @@
+import time
+import re
+
+import requests
+from bs4 import BeautifulSoup
+from seleniumbase import SB
+from auto_archiver.utils.custom_logger import logger
+from auto_archiver.utils import url as UrlUtil
+from auto_archiver.core import Enricher, Metadata
+
+
+class GhostarchiveEnricher(Enricher):
+    """
+    Submits the current URL to Ghost Archive (ghostarchive.org) for archiving
+    and stores the archived page URL as enrichment metadata.
+
+    Ghost Archive has no official API — this module interacts with the web form
+    and parses HTML responses. The submission endpoint is protected by Cloudflare,
+    so a headless browser (SeleniumBase) is used for archival submissions, while
+    plain HTTP requests are used for searching existing archives.
+
+    Note: this module only confirms that Ghost Archive accepted the submission
+    and returned an archive URL. It does not verify that the archived page
+    content is complete or correctly rendered.
+    """
+
+    GHOSTARCHIVE_BASE = "https://ghostarchive.org"
+    ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
+    SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
+    ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")
+
+    def _get_proxies(self) -> dict:
+        proxies = {}
+        if self.proxy_http:
+            proxies["http"] = self.proxy_http
+        if self.proxy_https:
+            proxies["https"] = self.proxy_https
+        return proxies
+
+    def _get_headers(self) -> dict:
+        return {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        }
+
+    def _normalize_archive_href(self, href: str) -> str | None:
+        """Normalize an archive link href to a full HTTPS URL, filtering out replay links."""
+        if "/archive/" not in href or "/replay/" in href:
+            return None
+        if href.startswith("/"):
+            return f"{self.GHOSTARCHIVE_BASE}{href}"
+        if href.startswith("http://ghostarchive.org"):
+            return href.replace("http://", "https://")
+        if href.startswith("https://ghostarchive.org"):
+            return href
+        return None
+
+    def _search_existing(self, url: str) -> str | None:
+        """
+        Search Ghost Archive for an existing archive of the given URL.
+        Returns the archive URL if found, otherwise None.
+        """
+        try:
+            r = requests.get(
+                self.SEARCH_ENDPOINT,
+                params={"term": url},
+                headers=self._get_headers(),
+                proxies=self._get_proxies(),
+                timeout=30,
+            )
+            if r.status_code != 200:
+                logger.warning(f"Ghost Archive search returned status {r.status_code}")
+                return None
+
+            soup = BeautifulSoup(r.text, "html.parser")
+            for link in soup.find_all("a", href=True):
+                archive_url = self._normalize_archive_href(link["href"])
+                if archive_url:
+                    logger.info(f"Found existing Ghost Archive: {archive_url}")
+                    return archive_url
+
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Ghost Archive search failed: {e}")
+
+        return None
+
+    def _submit_url(self, url: str) -> str | None:
+        """
+        Submit a URL to Ghost Archive for archiving using a headless browser.
+        The /archive2 endpoint is Cloudflare-protected, requiring JS execution.
+        Returns the archive URL if successful, otherwise None.
+        """
+        try:
+            with SB(uc=True, headless=True) as sb:
+                logger.debug("Opening Ghost Archive homepage in headless browser")
+                sb.open(self.GHOSTARCHIVE_BASE)
+
+                # fill in the archive form and submit
+                sb.type('input[name="archive"]', url)
+                sb.click('input[type="submit"][value="Submit for archival"]')
+
+                # wait for navigation to /archive/{id} or timeout
+                start_time = time.time()
+                while time.time() - start_time < self.timeout:
+                    current_url = sb.get_current_url()
+                    if self.ARCHIVE_URL_PATTERN.search(current_url):
+                        archive_url = current_url.split("?")[0]
+                        logger.info(f"Ghost Archive saved: {archive_url}")
+                        return archive_url
+                    time.sleep(2)
+
+                # if we didn't redirect, try parsing the page source
+                page_source = sb.get_page_source()
+                return self._parse_archive_url(page_source)
+
+        except Exception as e:
+            logger.warning(f"Ghost Archive submission failed: {e}")
+            return None
+
+    def _parse_archive_url(self, html: str) -> str | None:
+        """Parse HTML response to find an archive URL."""
+        soup = BeautifulSoup(html, "html.parser")
+        for link in soup.find_all("a", href=True):
+            archive_url = self._normalize_archive_href(link["href"])
+            if archive_url:
+                return archive_url
+        return None
+
+    def enrich(self, to_enrich: Metadata) -> bool:
+        url = to_enrich.get_url()
+        if UrlUtil.is_auth_wall(url):
+            logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
+            return False
+
+        if to_enrich.get("ghostarchive"):
+            logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
+            return True
+
+        # optionally check for existing archive first
+        archive_url = None
+        if self.check_existing:
+            logger.debug(f"Searching Ghost Archive for existing archive of {url}")
+            archive_url = self._search_existing(url)
+
+        if not archive_url:
+            logger.debug(f"Submitting {url} to Ghost Archive")
+            archive_url = self._submit_url(url)
+
+        if archive_url:
+            to_enrich.set("ghostarchive", archive_url)
+            return True
+
+        logger.warning(f"Ghost Archive failed to archive {url}")
+        return False
				`@@ -0,0 +1 @@`
				`from .ghostarchive_enricher import GhostarchiveEnricher`