mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
The [browsertrix-crawler] utility is a browser-based crawler that can crawl one or more pages. browsertrix-crawler creates archives in the [WACZ] format, which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc.) that can then be replayed using the [ReplayWeb.page] web component, or unzipped to get the original WARC data (the ISO standard format used by the Internet Archive Wayback Machine). This PR adds browsertrix-crawler to the archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, DigitalOcean, etc.) using the client-side ReplayWeb.page component. You can see an example of the spreadsheet here: https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0 browsertrix-crawler requires Docker to be installed. If Docker is not installed, an error message is logged and processing continues as normal. [browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler [WACZ]: https://specs.webrecorder.net/wacz/latest/ [ReplayWeb.page]: https://replayweb.page
90 lines
3.8 KiB
Python
90 lines
3.8 KiB
Python
import time, requests
|
|
|
|
from loguru import logger
|
|
from bs4 import BeautifulSoup
|
|
|
|
from storages import Storage
|
|
from .base_archiver import Archiver, ArchiveResult
|
|
from configs import WaybackConfig
|
|
|
|
|
|
class WaybackArchiver(Archiver):
    """
    Archives a URL by POSTing it to https://web.archive.org/save/ and polling the resulting job.

    This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}"
    but that might not be desirable since the webpage might have been archived a long time ago and thus have changed
    """
    name = "wayback"

    def __init__(self, storage: Storage, driver, config: WaybackConfig):
        """
        Args:
            storage: forwarded to the Archiver base class.
            driver: forwarded to the Archiver base class.
            config: provides the `key` and `secret` used in the Authorization header;
                when it is None, download() logs an error and returns False.
        """
        super().__init__(storage, driver)
        self.config = config
        # Successful results keyed by URL; consulted only when download() is
        # called with check_if_exists=True.
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        """
        Submit `url` to web.archive.org for capture and wait for the job to finish.

        Args:
            url: address of the page to archive.
            check_if_exists: when True, return the result of a previous successful
                capture of `url` made by this instance, if there is one.

        Returns:
            False when no Wayback config is set. Otherwise an ArchiveResult carrying the
            screenshot and WACZ: with status 'success', `cdn_url` and `title` when the
            capture succeeded, or a failure status / the result of custom_retry() when
            web.archive.org rejected or did not complete the job.
        """
        if self.config is None:
            logger.error('Missing Wayback config')
            return False
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)

        logger.debug(f"POSTing {url=} to web.archive.org")
        ia_headers = {
            "Accept": "application/json",
            "Authorization": f"LOW {self.config.key}:{self.config.secret}"
        }
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            logger.warning(f"Internet archive failed with status of {r.status_code}")
            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)

        # Parse the response once. A reply without a job id cannot be polled, so it is
        # always routed through custom_retry instead of failing with a KeyError below.
        submit_json = r.json()
        if 'job_id' not in submit_json:
            return self.custom_retry(submit_json, screenshot=screenshot, wacz=wacz)

        job_id = submit_json['job_id']
        status_url = f'https://web.archive.org/save/status/{job_id}'
        logger.debug(f"GETting status for {job_id=} on {url=}")
        status_r = requests.get(status_url, headers=ia_headers)
        retries = 0

        # TODO: make the job queue parallel -> consider propagation of results back to sheet though
        # wait 90-120 seconds for the archive job to finish
        # (30 polls, 3s apart, plus 1s extra after each failed poll)
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)
            try:
                logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
                status_r = requests.get(status_url, headers=ia_headers)
            except Exception as e:
                # Best effort: keep the previous response and poll again. `Exception`
                # (not a bare except) so Ctrl-C / SystemExit still interrupt the wait.
                logger.debug(f"status request failed for {job_id=}: {e}")
                time.sleep(1)
            retries += 1

        if status_r.status_code != 200:
            # Keep the WACZ on this failure path too, like every other return here.
            return ArchiveResult(status=f"Internet archive failed: check {status_url}", screenshot=screenshot, wacz=wacz)

        status_json = status_r.json()
        if status_json['status'] != 'success':
            return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

        # The title is optional metadata: any failure to fetch or parse the archived
        # page falls back to a placeholder instead of failing the whole capture.
        try:
            req = requests.get(archive_url)
            parsed = BeautifulSoup(req.content, 'html.parser')
            title = parsed.find_all('title')[0].text
            if title == 'Wayback Machine':
                title = 'Could not get title'
        except Exception as e:
            logger.debug(f"could not read title from {archive_url}: {e}")
            title = "Could not get title"

        result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
        self.seen_urls[url] = result
        return result

    def custom_retry(self, json_data, **kwargs):
        """
        Turn a failure payload from web.archive.org into a retry signal or a failed result.

        Args:
            json_data: the decoded response body; it is matched case-insensitively
                through its string representation.
            **kwargs: forwarded unchanged to self.signal_retry_in() or ArchiveResult
                (download() passes `screenshot` and `wacz`).

        Returns:
            The value of self.signal_retry_in() when the payload contains
            "please try again" or "this host has been already captured" (the latter
            with a 24h-36h window); otherwise an ArchiveResult whose status embeds
            the payload.
        """
        logger.warning(f"Internet archive failed json \n {json_data}")
        message = str(json_data).lower()
        if "please try again" in message:
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in message:
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)  # 24h to 36h later
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
|