clean up and wacz WIP

2026-06-11 20:58:29 +03:00 · 2023-01-19 00:27:11 +00:00
parent 9bbc13e9be
commit ea2c266fa2
19 changed files with 141 additions and 910 deletions
--- a/src/enrichers/__init__.py
+++ b/src/enrichers/__init__.py
@@ -1,5 +1,6 @@
 from .enricher import Enricher
 from .screenshot_enricher import ScreenshotEnricher 
-from .wayback_enricher import WaybackEnricher
+from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
-from .thumbnail_enricher import ThumbnailEnricher
+from .thumbnail_enricher import ThumbnailEnricher
+from .wacz_enricher import WaczEnricher
--- a/src/enrichers/wacz_enricher.py
+++ b/src/enrichers/wacz_enricher.py
@@ -0,0 +1,70 @@
+import os
+import shutil
+import subprocess
+import uuid
+from archivers.archiver import Archiverv2
+from media import Media
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+import time, requests
+
+
+class WaczEnricher(Enricher):
+    """
+    Crawls the current URL with browsertrix-crawler (run through docker) and adds the generated WACZ file as media
+    """
+    name = "wacz_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
+            "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
+        }
+
+    def enrich(self, to_enrich: Metadata) -> bool:
+        """Crawl the item's URL into a WACZ file and add it as media; returns True on success, False on any failure."""
+        # TODO: figure out support for browsertrix in docker
+        url = to_enrich.get_url()
+        logger.debug(f"generating WACZ for {url=}")
+        collection = str(uuid.uuid4())[0:8]
+        browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
+        cmd = [
+            "docker", "run",
+            "--rm",  # delete container once it has completed running
+            "-v", f"{browsertrix_home}:/crawls/",
+            # "-it", # this leads to "the input device is not a TTY"
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", str(self.timeout),
+            "--timeout", str(self.timeout)
+        ]
+        if self.profile:
+            profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
+            shutil.copyfile(self.profile, profile_fn)
+            # the crawler runs inside the container, where browsertrix_home is mounted at /crawls/, so pass that path
+            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+
+        try:
+            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"WACZ generation failed: {e}")
+            return False
+        # the archive is expected at <browsertrix_home>/collections/<collection>/<collection>.wacz on the host
+        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
+        if not os.path.exists(filename):
+            logger.warning(f"Unable to locate and upload WACZ  {filename=}")
+            return False
+        to_enrich.add_media(Media(filename), "browsertrix")
+        return True
--- a/src/enrichers/wayback_enricher.py
+++ b/src/enrichers/wayback_enricher.py
@@ -1,16 +1,15 @@
-from utils import Webdriver
+from archivers.archiver import Archiverv2
 from . import Enricher
 from metadata import Metadata
 from loguru import logger
-from selenium.common.exceptions import TimeoutException
 import time, requests


-class WaybackEnricher(Enricher):
+class WaybackArchiverEnricher(Enricher, Archiverv2):
    """
    Submits the current URL to the webarchive and returns a job_id or completed archive
    """
-    name = "wayback_enricher"
+    name = "wayback_archiver_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
@@ -26,9 +25,19 @@ class WaybackEnricher(Enricher):
            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
        }

-    def enrich(self, to_enrich: Metadata) -> None:
+    def download(self, item: Metadata) -> Metadata:
+        """Run enrich() on a new Metadata merged from `item`; gives its success("wayback") result, or None if enrich() is falsy."""
+        archived = Metadata()
+        archived.merge(item)
+        return archived.success("wayback") if self.enrich(archived) else None
+
+    def enrich(self, to_enrich: Metadata) -> bool:
        url = to_enrich.get_url()
-        logger.debug(f"Enriching wayback for {url=}")
+        logger.debug(f"calling wayback for {url=}")
+
+        if to_enrich.get("wayback"):
+            logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}")
+            return True

        ia_headers = {
            "Accept": "application/json",
@@ -39,10 +48,13 @@ class WaybackEnricher(Enricher):
        if r.status_code != 200:
            logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
            to_enrich.set("wayback", em)
-            return
+            return False

        # check job status
-        job_id = r.json()['job_id']
+        job_id = r.json().get('job_id')
+        if not job_id:
+            logger.error(f"Wayback failed with {r.json()}")
+            return False

        # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
        start_time = time.time()
@@ -50,12 +62,15 @@ class WaybackEnricher(Enricher):
        attempt = 1
        while not wayback_url and time.time() - start_time <= self.timeout:
            try:
-
                logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
                r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
                r_json = r_status.json()
                if r_status.status_code == 200 and r_json['status'] == 'success':
                    wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
+                elif r_status.status_code != 200 or r_json['status'] != 'pending':
+                    logger.error(f"Wayback failed with {r_json}")
+                    return False
+
            except Exception as e:
                logger.warning(f"error fetching status for {url=} due to: {e}")
            if not wayback_url:
@@ -66,4 +81,5 @@ class WaybackEnricher(Enricher):
            to_enrich.set("wayback", wayback_url)
        else:
            to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
-        to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}")
+        to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}")
+        return True