mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
implements new enricher to submit URLs to ghostarchive
This commit is contained in:
@@ -0,0 +1 @@
|
||||
from .ghostarchive_enricher import GhostarchiveEnricher
|
||||
@@ -0,0 +1,58 @@
|
||||
{
|
||||
"name": "Ghost Archive Enricher",
|
||||
"type": ["enricher"],
|
||||
"entry_point": "ghostarchive_enricher::GhostarchiveEnricher",
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests", "bs4", "seleniumbase"],
|
||||
},
|
||||
"configs": {
|
||||
"timeout": {
|
||||
"default": 120,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for successful archive confirmation from Ghost Archive.",
|
||||
},
|
||||
"check_existing": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "whether to search for an existing archive before submitting a new one.",
|
||||
},
|
||||
"proxy_http": {
|
||||
"default": None,
|
||||
"help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
"proxy_https": {
|
||||
"default": None,
|
||||
"help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL.
|
||||
|
||||
Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules.
|
||||
|
||||
### Features
|
||||
- Archives any public URL using the Ghost Archive service.
|
||||
- Optionally checks for existing archives before submitting a new one.
|
||||
- Supports HTTP and HTTPS proxies for requests.
|
||||
- Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API).
|
||||
|
||||
### Important
|
||||
- This module confirms that Ghost Archive accepted the URL submission and returned an archive link.
|
||||
It does **not** verify the contents or completeness of the archived page.
|
||||
|
||||
### Notes
|
||||
- Ghost Archive is a free service with no authentication required.
|
||||
- Archived pages must be smaller than 50 MB (including CSS, fonts, images, etc.).
|
||||
- Videos are archived up to 360p and must be under 100 MB and shorter than 30 minutes.
|
||||
- Archival may take up to 5 minutes depending on the queue and page complexity.
|
||||
- Archived content is stored indefinitely.
|
||||
- Ghost Archive does not archive pages that require authentication or form submission.
|
||||
|
||||
### Limitations
|
||||
- No official API — this module interacts with the Ghost Archive web interface.
|
||||
- The submission endpoint is protected by Cloudflare, so a headless browser (SeleniumBase) is used for new submissions.
|
||||
- Searching for existing archives uses plain HTTP requests and does not require a browser.
|
||||
- Rate limiting may apply; consider using a delay between requests if archiving many URLs.
|
||||
""",
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
import time
|
||||
import re
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from seleniumbase import SB
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core import Enricher, Metadata
|
||||
|
||||
|
||||
class GhostarchiveEnricher(Enricher):
|
||||
"""
|
||||
Submits the current URL to Ghost Archive (ghostarchive.org) for archiving
|
||||
and stores the archived page URL as enrichment metadata.
|
||||
|
||||
Ghost Archive has no official API — this module interacts with the web form
|
||||
and parses HTML responses. The submission endpoint is protected by Cloudflare,
|
||||
so a headless browser (SeleniumBase) is used for archival submissions, while
|
||||
plain HTTP requests are used for searching existing archives.
|
||||
|
||||
Note: this module only confirms that Ghost Archive accepted the submission
|
||||
and returned an archive URL. It does not verify that the archived page
|
||||
content is complete or correctly rendered.
|
||||
"""
|
||||
|
||||
GHOSTARCHIVE_BASE = "https://ghostarchive.org"
|
||||
ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
|
||||
SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
|
||||
ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")
|
||||
|
||||
def _get_proxies(self) -> dict:
|
||||
proxies = {}
|
||||
if self.proxy_http:
|
||||
proxies["http"] = self.proxy_http
|
||||
if self.proxy_https:
|
||||
proxies["https"] = self.proxy_https
|
||||
return proxies
|
||||
|
||||
def _get_headers(self) -> dict:
|
||||
return {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
}
|
||||
|
||||
def _normalize_archive_href(self, href: str) -> str | None:
|
||||
"""Normalize an archive link href to a full HTTPS URL, filtering out replay links."""
|
||||
if "/archive/" not in href or "/replay/" in href:
|
||||
return None
|
||||
if href.startswith("/"):
|
||||
return f"{self.GHOSTARCHIVE_BASE}{href}"
|
||||
if href.startswith("http://ghostarchive.org"):
|
||||
return href.replace("http://", "https://")
|
||||
if href.startswith("https://ghostarchive.org"):
|
||||
return href
|
||||
return None
|
||||
|
||||
def _search_existing(self, url: str) -> str | None:
|
||||
"""
|
||||
Search Ghost Archive for an existing archive of the given URL.
|
||||
Returns the archive URL if found, otherwise None.
|
||||
"""
|
||||
try:
|
||||
r = requests.get(
|
||||
self.SEARCH_ENDPOINT,
|
||||
params={"term": url},
|
||||
headers=self._get_headers(),
|
||||
proxies=self._get_proxies(),
|
||||
timeout=30,
|
||||
)
|
||||
if r.status_code != 200:
|
||||
logger.warning(f"Ghost Archive search returned status {r.status_code}")
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(r.text, "html.parser")
|
||||
for link in soup.find_all("a", href=True):
|
||||
archive_url = self._normalize_archive_href(link["href"])
|
||||
if archive_url:
|
||||
logger.info(f"Found existing Ghost Archive: {archive_url}")
|
||||
return archive_url
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f"Ghost Archive search failed: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _submit_url(self, url: str) -> str | None:
|
||||
"""
|
||||
Submit a URL to Ghost Archive for archiving using a headless browser.
|
||||
The /archive2 endpoint is Cloudflare-protected, requiring JS execution.
|
||||
Returns the archive URL if successful, otherwise None.
|
||||
"""
|
||||
try:
|
||||
with SB(uc=True, headless=True) as sb:
|
||||
logger.debug("Opening Ghost Archive homepage in headless browser")
|
||||
sb.open(self.GHOSTARCHIVE_BASE)
|
||||
|
||||
# fill in the archive form and submit
|
||||
sb.type('input[name="archive"]', url)
|
||||
sb.click('input[type="submit"][value="Submit for archival"]')
|
||||
|
||||
# wait for navigation to /archive/{id} or timeout
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < self.timeout:
|
||||
current_url = sb.get_current_url()
|
||||
if self.ARCHIVE_URL_PATTERN.search(current_url):
|
||||
archive_url = current_url.split("?")[0]
|
||||
logger.info(f"Ghost Archive saved: {archive_url}")
|
||||
return archive_url
|
||||
time.sleep(2)
|
||||
|
||||
# if we didn't redirect, try parsing the page source
|
||||
page_source = sb.get_page_source()
|
||||
return self._parse_archive_url(page_source)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Ghost Archive submission failed: {e}")
|
||||
return None
|
||||
|
||||
def _parse_archive_url(self, html: str) -> str | None:
|
||||
"""Parse HTML response to find an archive URL."""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
for link in soup.find_all("a", href=True):
|
||||
archive_url = self._normalize_archive_href(link["href"])
|
||||
if archive_url:
|
||||
return archive_url
|
||||
return None
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
|
||||
return False
|
||||
|
||||
if to_enrich.get("ghostarchive"):
|
||||
logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
|
||||
return True
|
||||
|
||||
# optionally check for existing archive first
|
||||
archive_url = None
|
||||
if self.check_existing:
|
||||
logger.debug(f"Searching Ghost Archive for existing archive of {url}")
|
||||
archive_url = self._search_existing(url)
|
||||
|
||||
if not archive_url:
|
||||
logger.debug(f"Submitting {url} to Ghost Archive")
|
||||
archive_url = self._submit_url(url)
|
||||
|
||||
if archive_url:
|
||||
to_enrich.set("ghostarchive", archive_url)
|
||||
return True
|
||||
|
||||
logger.warning(f"Ghost Archive failed to archive {url}")
|
||||
return False
|
||||
Reference in New Issue
Block a user