mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Merge pull request #419 from bellingcat/dev
Dependencies bump, new ghostarchive enricher
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:1.11.4 AS base
|
FROM webrecorder/browsertrix-crawler:1.12.4 AS base
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1 \
|
ENV RUNNING_IN_DOCKER=1 \
|
||||||
LANG=C.UTF-8 \
|
LANG=C.UTF-8 \
|
||||||
|
|||||||
1115
poetry.lock
generated
1115
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.2.5"
|
version = "1.2.6"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
from .ghostarchive_enricher import GhostarchiveEnricher
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
{
|
||||||
|
"name": "Ghost Archive Enricher",
|
||||||
|
"type": ["enricher"],
|
||||||
|
"entry_point": "ghostarchive_enricher::GhostarchiveEnricher",
|
||||||
|
"requires_setup": False,
|
||||||
|
"dependencies": {
|
||||||
|
"python": ["loguru", "requests", "bs4", "seleniumbase"],
|
||||||
|
},
|
||||||
|
"configs": {
|
||||||
|
"timeout": {
|
||||||
|
"default": 120,
|
||||||
|
"type": "int",
|
||||||
|
"help": "seconds to wait for successful archive confirmation from Ghost Archive.",
|
||||||
|
},
|
||||||
|
"check_existing": {
|
||||||
|
"default": True,
|
||||||
|
"type": "bool",
|
||||||
|
"help": "whether to search for an existing archive before submitting a new one.",
|
||||||
|
},
|
||||||
|
"proxy_http": {
|
||||||
|
"default": None,
|
||||||
|
"help": "http proxy to use for requests, eg http://proxy-user:password@proxy-ip:port",
|
||||||
|
},
|
||||||
|
"proxy_https": {
|
||||||
|
"default": None,
|
||||||
|
"help": "https proxy to use for requests, eg https://proxy-user:password@proxy-ip:port",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"description": """
|
||||||
|
Submits the current URL to [Ghost Archive](https://ghostarchive.org/) for archiving and returns the archived page URL.
|
||||||
|
|
||||||
|
Used as an **enricher** to add a Ghost Archive URL to items already extracted by other modules.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
- Archives any public URL using the Ghost Archive service.
|
||||||
|
- Optionally checks for existing archives before submitting a new one.
|
||||||
|
- Supports HTTP and HTTPS proxies for requests.
|
||||||
|
- Parses HTML responses to extract archive URLs (Ghost Archive has no JSON API).
|
||||||
|
|
||||||
|
### Important
|
||||||
|
- This module confirms that Ghost Archive accepted the URL submission and returned an archive link.
|
||||||
|
It does **not** verify the contents or completeness of the archived page.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
- Ghost Archive is a free service with no authentication required.
|
||||||
|
- Archived pages must be smaller than 50 MB (including CSS, fonts, images, etc.).
|
||||||
|
- Videos are archived up to 360p and must be under 100 MB and shorter than 30 minutes.
|
||||||
|
- Archival may take up to 5 minutes depending on the queue and page complexity.
|
||||||
|
- Archived content is stored indefinitely.
|
||||||
|
- Ghost Archive does not archive pages that require authentication or form submission.
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
- No official API — this module interacts with the Ghost Archive web interface.
|
||||||
|
- The submission endpoint is protected by Cloudflare, so a headless browser (SeleniumBase) is used for new submissions.
|
||||||
|
- Searching for existing archives uses plain HTTP requests and does not require a browser.
|
||||||
|
- Rate limiting may apply; consider using a delay between requests if archiving many URLs.
|
||||||
|
""",
|
||||||
|
}
|
||||||
@@ -0,0 +1,153 @@
|
|||||||
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from seleniumbase import SB
|
||||||
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
from auto_archiver.utils import url as UrlUtil
|
||||||
|
from auto_archiver.core import Enricher, Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class GhostarchiveEnricher(Enricher):
    """
    Submits the current URL to Ghost Archive (ghostarchive.org) for archiving
    and stores the archived page URL as enrichment metadata.

    Ghost Archive has no official API — this module interacts with the web form
    and parses HTML responses. The submission endpoint is protected by Cloudflare,
    so a headless browser (SeleniumBase) is used for archival submissions, while
    plain HTTP requests are used for searching existing archives.

    Note: this module only confirms that Ghost Archive accepted the submission
    and returned an archive URL. It does not verify that the archived page
    content is complete or correctly rendered.

    Configuration attributes read by this class: ``timeout``, ``check_existing``,
    ``proxy_http`` and ``proxy_https``.
    """

    GHOSTARCHIVE_BASE = "https://ghostarchive.org"
    ARCHIVE_ENDPOINT = f"{GHOSTARCHIVE_BASE}/archive2"
    SEARCH_ENDPOINT = f"{GHOSTARCHIVE_BASE}/search"
    ARCHIVE_URL_PATTERN = re.compile(r"/archive/([A-Za-z0-9]+)")
    # Seconds to wait between two checks of the browser's current URL.
    POLL_INTERVAL = 2

    def _get_proxies(self) -> dict:
        """Build the ``proxies`` mapping for ``requests`` from the configured proxies.

        Returns:
            A dict with an ``"http"`` and/or ``"https"`` entry for each proxy
            that is configured; empty when none is set.
        """
        proxies = {}
        if self.proxy_http:
            proxies["http"] = self.proxy_http
        if self.proxy_https:
            proxies["https"] = self.proxy_https
        return proxies

    def _get_headers(self) -> dict:
        """Return the HTTP headers (a desktop Chrome User-Agent) sent with search requests."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

    def _normalize_archive_href(self, href: str) -> str | None:
        """Normalize an archive link href to a full HTTPS URL, filtering out replay links.

        Accepted inputs are site-relative paths (``/archive/<id>``) and absolute
        or protocol-relative URLs whose host is exactly the Ghost Archive host.

        Args:
            href: raw ``href`` attribute value taken from an ``<a>`` element.

        Returns:
            The archive URL on the Ghost Archive host with the base URL's scheme,
            or ``None`` when the href is not an archive link, is a replay link,
            or points at another host.
        """
        # local import: keeps this method self-contained (stdlib only)
        from urllib.parse import urlsplit

        if "/archive/" not in href or "/replay/" in href:
            return None

        base = urlsplit(self.GHOSTARCHIVE_BASE)
        try:
            parts = urlsplit(href)
        except ValueError:
            # malformed authority (e.g. unbalanced IPv6 brackets)
            return None

        if parts.netloc:
            # Compare the parsed hostname rather than a string prefix so that
            # look-alikes such as "ghostarchive.org.example.com" or
            # "ghostarchive.org@example.com" are rejected.
            if parts.scheme not in ("", "http", "https") or parts.hostname != base.hostname:
                return None
            # only the scheme is upgraded; the rest of the URL is left untouched
            return parts._replace(scheme=base.scheme).geturl()

        # no host: only site-relative paths can be resolved against the base
        if parts.scheme or not parts.path.startswith("/"):
            return None
        return parts._replace(scheme=base.scheme, netloc=base.netloc).geturl()

    def _search_existing(self, url: str) -> str | None:
        """
        Search Ghost Archive for an existing archive of the given URL.

        The search page is fetched with a plain HTTP GET (30 second timeout),
        using the configured proxies and headers, and the first archive link
        found in the response is returned.

        Returns the archive URL if found, otherwise None (also on a non-200
        response or a ``requests`` error, which are logged as warnings).
        """
        try:
            r = requests.get(
                self.SEARCH_ENDPOINT,
                params={"term": url},
                headers=self._get_headers(),
                proxies=self._get_proxies(),
                timeout=30,
            )
        except requests.exceptions.RequestException as e:
            logger.warning(f"Ghost Archive search failed: {e}")
            return None

        if r.status_code != 200:
            logger.warning(f"Ghost Archive search returned status {r.status_code}")
            return None

        # same link scan as the submission fallback, so both stay in sync
        archive_url = self._parse_archive_url(r.text)
        if archive_url:
            logger.info(f"Found existing Ghost Archive: {archive_url}")
        return archive_url

    def _submit_url(self, url: str) -> str | None:
        """
        Submit a URL to Ghost Archive for archiving using a headless browser.
        The /archive2 endpoint is Cloudflare-protected, requiring JS execution.

        After submitting the form, the browser's current URL is polled until it
        looks like an archive page or ``self.timeout`` seconds have elapsed; on
        timeout the page source is scanned for an archive link instead.

        NOTE(review): the browser session is started without the configured
        proxies; they are only passed to the ``requests`` call used for searching.

        Returns the archive URL if successful, otherwise None. Any exception
        raised by the browser is logged as a warning and results in None.
        """
        try:
            with SB(uc=True, headless=True) as sb:
                logger.debug("Opening Ghost Archive homepage in headless browser")
                sb.open(self.GHOSTARCHIVE_BASE)

                # fill in the archive form and submit
                sb.type('input[name="archive"]', url)
                sb.click('input[type="submit"][value="Submit for archival"]')

                # wait for navigation to /archive/{id} or timeout; a monotonic
                # clock keeps the deadline immune to system clock adjustments
                deadline = time.monotonic() + self.timeout
                while time.monotonic() < deadline:
                    # drop query and fragment first so that only the path can
                    # satisfy the archive pattern
                    current_url = sb.get_current_url().split("?", 1)[0].split("#", 1)[0]
                    if self.ARCHIVE_URL_PATTERN.search(current_url):
                        logger.info(f"Ghost Archive saved: {current_url}")
                        return current_url
                    # never sleep past the deadline
                    time.sleep(min(self.POLL_INTERVAL, max(0.0, deadline - time.monotonic())))

                # if we didn't redirect, try parsing the page source
                return self._parse_archive_url(sb.get_page_source())

        except Exception as e:
            logger.warning(f"Ghost Archive submission failed: {e}")
        return None

    def _parse_archive_url(self, html: str) -> str | None:
        """Parse an HTML document and return the first valid archive URL, or None.

        Every ``<a href=...>`` is passed through ``_normalize_archive_href``;
        the first one that normalizes to an archive URL wins.
        """
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all("a", href=True):
            archive_url = self._normalize_archive_href(link["href"])
            if archive_url:
                return archive_url
        return None

    def enrich(self, to_enrich: Metadata) -> bool:
        """Add a ``ghostarchive`` entry holding the Ghost Archive URL to the item.

        Steps: skip URLs behind an auth wall, skip items that already carry a
        ``ghostarchive`` value, optionally look for an existing archive
        (``check_existing``), and otherwise submit the URL for archival.

        Args:
            to_enrich: the metadata item being enriched.

        Returns:
            True when the item has (or already had) a ``ghostarchive`` value,
            False when the URL was skipped or no archive URL could be obtained.
        """
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug("[SKIP] Ghost Archive since url is behind AUTH WALL")
            return False

        if to_enrich.get("ghostarchive"):
            logger.info(f"Ghost Archive enricher had already been executed: {to_enrich.get('ghostarchive')}")
            return True

        # optionally check for existing archive first
        archive_url = None
        if self.check_existing:
            logger.debug(f"Searching Ghost Archive for existing archive of {url}")
            archive_url = self._search_existing(url)

        if not archive_url:
            logger.debug(f"Submitting {url} to Ghost Archive")
            archive_url = self._submit_url(url)

        if archive_url:
            to_enrich.set("ghostarchive", archive_url)
            return True

        logger.warning(f"Ghost Archive failed to archive {url}")
        return False
|
||||||
277
tests/enrichers/test_ghostarchive_enricher.py
Normal file
277
tests/enrichers/test_ghostarchive_enricher.py
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher import GhostarchiveEnricher
|
||||||
|
|
||||||
|
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||||
|
|
||||||
|
# sample HTML responses for mocking
|
||||||
|
SEARCH_HTML_FOUND = """
|
||||||
|
<html><body>
|
||||||
|
<h1>Archives for https://example.com</h1>
|
||||||
|
<table>
|
||||||
|
<tr><td><a href="http://ghostarchive.org/archive/Abc12">https://example.com</a></td></tr>
|
||||||
|
</table>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
SEARCH_HTML_NOT_FOUND = """
|
||||||
|
<html><body>
|
||||||
|
<h1>Archives for https://example.com</h1>
|
||||||
|
<p>Page 0 out of 0</p>
|
||||||
|
<p>No archives for that site.</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
SAVE_RESPONSE_HTML_WITH_LINK = """
|
||||||
|
<html><body>
|
||||||
|
<h1>Archive saved</h1>
|
||||||
|
<a href="/archive/Xyz99">View archive</a>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
ENRICHER_CONFIG = {
|
||||||
|
"timeout": 120,
|
||||||
|
"check_existing": True,
|
||||||
|
"proxy_http": None,
|
||||||
|
"proxy_https": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestGhostarchiveEnricher:
    """Unit and integration tests covering the Ghost Archive enricher."""

    # Dotted path of the module under test; every patch target is derived from it.
    _MODULE = "auto_archiver.modules.ghostarchive_enricher.ghostarchive_enricher"

    @pytest.fixture(autouse=True)
    def setup_enricher(self, setup_module):
        # a fresh enricher for every test, so attribute overrides never leak
        self.enricher: GhostarchiveEnricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)

    def _stub_search(self, mocker, status, body=None):
        """Patch ``requests.get`` in the module under test with a canned response; return the patch."""
        response = mocker.Mock()
        response.status_code = status
        if body is not None:
            response.text = body
        return mocker.patch(f"{self._MODULE}.requests.get", return_value=response)

    def _stub_browser(self, mocker, landing_url, page_source=None):
        """Patch ``SB`` with a context-manager mock that reports ``landing_url``; return the mock."""
        browser = MagicMock()
        browser.get_current_url.return_value = landing_url
        if page_source is not None:
            browser.get_page_source.return_value = page_source
        browser.__enter__ = MagicMock(return_value=browser)
        browser.__exit__ = MagicMock(return_value=False)
        mocker.patch(f"{self._MODULE}.SB", return_value=browser)
        return browser

    def test_search_existing_found(self, mocker):
        """When an existing archive is found, it should be returned."""
        self._stub_search(mocker, 200, SEARCH_HTML_FOUND)

        found = self.enricher._search_existing("https://example.com")

        assert found == "https://ghostarchive.org/archive/Abc12"

    def test_search_existing_not_found(self, mocker):
        """When no existing archive is found, None should be returned."""
        self._stub_search(mocker, 200, SEARCH_HTML_NOT_FOUND)

        assert self.enricher._search_existing("https://example.com") is None

    def test_search_existing_request_error(self, mocker):
        """When search request fails, None should be returned."""
        failure = requests.exceptions.ConnectionError("connection failed")
        mocker.patch(f"{self._MODULE}.requests.get", side_effect=failure)

        assert self.enricher._search_existing("https://example.com") is None

    def test_search_existing_non_200(self, mocker):
        """When search returns non-200, None should be returned."""
        self._stub_search(mocker, 503)

        assert self.enricher._search_existing("https://example.com") is None

    def test_submit_url_success_redirect(self, mocker):
        """Successful submission via headless browser should return archive URL."""
        browser = self._stub_browser(mocker, "https://ghostarchive.org/archive/NewId1")

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/NewId1"
        browser.type.assert_called_once()
        browser.click.assert_called_once()

    def test_submit_url_success_redirect_strips_query(self, mocker):
        """Redirect URL query params should be stripped."""
        self._stub_browser(mocker, "https://ghostarchive.org/archive/NewId1?wr=false")

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/NewId1"

    def test_submit_url_success_html_fallback(self, mocker):
        """When browser doesn't redirect, should parse page source for archive link."""
        self._stub_browser(
            mocker,
            "https://ghostarchive.org/archive2",
            page_source=SAVE_RESPONSE_HTML_WITH_LINK,
        )
        # a zero timeout ends the polling loop at once, forcing the HTML fallback
        self.enricher.timeout = 0

        submitted = self.enricher._submit_url("https://example.com")

        assert submitted == "https://ghostarchive.org/archive/Xyz99"

    def test_submit_url_browser_error(self, mocker):
        """Browser error during submission should return None."""
        mocker.patch(f"{self._MODULE}.SB", side_effect=Exception("browser failed to start"))

        assert self.enricher._submit_url("https://example.com") is None

    def test_proxy_configuration(self, mocker):
        """Proxies should be passed to search requests when configured."""
        self.enricher.proxy_http = "http://proxy:8080"
        self.enricher.proxy_https = "https://proxy:8443"
        patched_get = self._stub_search(mocker, 200, SEARCH_HTML_FOUND)

        found = self.enricher._search_existing("https://example.com")

        expected_proxies = {"http": "http://proxy:8080", "https": "https://proxy:8443"}
        assert patched_get.call_args.kwargs.get("proxies") == expected_proxies
        assert found is not None

    def test_parse_archive_url_with_replay_links(self):
        """Parser should ignore /replay/ links and only return /archive/ links."""
        markup = """
        <html><body>
        <a href="/archive/replay/w/id-abc/mp_/https://example.com">replay</a>
        <a href="/archive/Valid1">valid</a>
        </body></html>
        """

        assert self.enricher._parse_archive_url(markup) == "https://ghostarchive.org/archive/Valid1"

    def test_parse_archive_url_no_links(self):
        """Parser should return None when no archive links found."""
        markup = "<html><body><p>No archive here</p></body></html>"

        assert self.enricher._parse_archive_url(markup) is None

    def test_enrich_sets_ghostarchive_on_metadata(self, mocker, make_item):
        """enrich() should set 'ghostarchive' key on the metadata object."""
        mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Enr1")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Enr1"

    def test_enrich_skips_if_already_enriched(self, mocker, make_item):
        """enrich() should skip if ghostarchive key is already set."""
        search_spy = mocker.patch.object(self.enricher, "_search_existing")
        item = make_item("https://example.com", ghostarchive="https://ghostarchive.org/archive/Old1")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        search_spy.assert_not_called()

    def test_enrich_returns_false_on_failure(self, mocker, make_item):
        """enrich() should return False when both search and submit fail."""
        for method_name in ("_search_existing", "_submit_url"):
            mocker.patch.object(self.enricher, method_name, return_value=None)
        item = make_item("https://example.com")

        assert self.enricher.enrich(item) is False

    def test_enrich_skips_auth_wall(self, mocker, make_item):
        """enrich() should skip URLs behind auth walls."""
        mocker.patch(f"{self._MODULE}.UrlUtil.is_auth_wall", return_value=True)
        item = make_item("https://example.com/login")

        assert self.enricher.enrich(item) is False

    def test_enrich_with_existing_archive(self, mocker, make_item):
        """enrich() should use existing archive when check_existing is True."""
        mocker.patch.object(self.enricher, "_search_existing", return_value="https://ghostarchive.org/archive/Exist1")
        submit_spy = mocker.patch.object(self.enricher, "_submit_url")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/Exist1"
        submit_spy.assert_not_called()

    def test_enrich_submits_when_no_existing(self, mocker, make_item):
        """enrich() should submit URL when no existing archive found."""
        mocker.patch.object(self.enricher, "_search_existing", return_value=None)
        mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/New42")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        assert item.get("ghostarchive") == "https://ghostarchive.org/archive/New42"

    def test_enrich_skips_check_existing_when_disabled(self, mocker, make_item):
        """enrich() should skip search when check_existing is False."""
        self.enricher.check_existing = False
        search_spy = mocker.patch.object(self.enricher, "_search_existing")
        mocker.patch.object(self.enricher, "_submit_url", return_value="https://ghostarchive.org/archive/Direct1")
        item = make_item("https://example.com")

        outcome = self.enricher.enrich(item)

        assert outcome is True
        search_spy.assert_not_called()

    @pytest.mark.download
    def test_real_search_existing(self, setup_module):
        """Integration test: search for an existing archive on Ghost Archive."""
        enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)

        # example.com is commonly archived
        found = enricher._search_existing("https://example.com")

        # we just check it doesn't crash; result may or may not be found
        assert found is None or found.startswith("https://ghostarchive.org/archive/")

    @pytest.mark.download
    @pytest.mark.skipif(CI, reason="Avoid submitting a real task on every CI run")
    def test_real_submit_example_com(self, setup_module, make_item):
        """Integration test: submit example.com to Ghost Archive and verify enrichment."""
        enricher = setup_module("ghostarchive_enricher", ENRICHER_CONFIG)
        item = make_item("https://example.com")

        outcome = enricher.enrich(item)

        assert outcome is True
        archive_url = item.get("ghostarchive")
        assert archive_url is not None
        assert archive_url.startswith("https://ghostarchive.org/archive/")
|
||||||
Reference in New Issue
Block a user