mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
clean up and wacz WIP
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from .enricher import Enricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackEnricher
|
||||
from .wayback_enricher import WaybackArchiverEnricher
|
||||
from .hash_enricher import HashEnricher
|
||||
from .thumbnail_enricher import ThumbnailEnricher
|
||||
from .thumbnail_enricher import ThumbnailEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
70
src/enrichers/wacz_enricher.py
Normal file
70
src/enrichers/wacz_enricher.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import uuid
|
||||
from archivers.archiver import Archiverv2
|
||||
from media import Media
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
|
||||
class WaczEnricher(Enricher):
|
||||
"""
|
||||
Submits the current URL to the webarchive and returns a job_id or completed archive
|
||||
"""
|
||||
name = "wacz_enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
subprocess.run(cmd, check=True)
|
||||
except Exception as e:
|
||||
logger.error(f"WACZ generation failed: {e}")
|
||||
return False
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
if not os.path.exists(filename):
|
||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||
return False
|
||||
|
||||
to_enrich.add_media(Media(filename), "browsertrix")
|
||||
@@ -1,16 +1,15 @@
|
||||
from utils import Webdriver
|
||||
from archivers.archiver import Archiverv2
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
|
||||
class WaybackEnricher(Enricher):
|
||||
class WaybackArchiverEnricher(Enricher, Archiverv2):
|
||||
"""
|
||||
Submits the current URL to the webarchive and returns a job_id or completed archive
|
||||
"""
|
||||
name = "wayback_enricher"
|
||||
name = "wayback_archiver_enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
@@ -26,9 +25,19 @@ class WaybackEnricher(Enricher):
|
||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
result = Metadata()
|
||||
result.merge(item)
|
||||
if self.enrich(result):
|
||||
return result.success("wayback")
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"Enriching wayback for {url=}")
|
||||
logger.debug(f"calling wayback for {url=}")
|
||||
|
||||
if to_enrich.get("wayback"):
|
||||
logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
|
||||
return True
|
||||
|
||||
ia_headers = {
|
||||
"Accept": "application/json",
|
||||
@@ -39,10 +48,13 @@ class WaybackEnricher(Enricher):
|
||||
if r.status_code != 200:
|
||||
logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
|
||||
to_enrich.set("wayback", em)
|
||||
return
|
||||
return False
|
||||
|
||||
# check job status
|
||||
job_id = r.json()['job_id']
|
||||
job_id = r.json().get('job_id')
|
||||
if not job_id:
|
||||
logger.error(f"Wayback failed with {r.json()}")
|
||||
return False
|
||||
|
||||
# waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
|
||||
start_time = time.time()
|
||||
@@ -50,12 +62,15 @@ class WaybackEnricher(Enricher):
|
||||
attempt = 1
|
||||
while not wayback_url and time.time() - start_time <= self.timeout:
|
||||
try:
|
||||
|
||||
logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
|
||||
r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
|
||||
r_json = r_status.json()
|
||||
if r_status.status_code == 200 and r_json['status'] == 'success':
|
||||
wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
|
||||
elif r_status.status_code != 200 or r_json['status'] != 'pending':
|
||||
logger.error(f"Wayback failed with {r_json}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"error fetching status for {url=} due to: {e}")
|
||||
if not wayback_url:
|
||||
@@ -66,4 +81,5 @@ class WaybackEnricher(Enricher):
|
||||
to_enrich.set("wayback", wayback_url)
|
||||
else:
|
||||
to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
|
||||
to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}")
|
||||
to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}")
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user