Refactors edge-case handling in browsertrix and S3 upload; adds a browsertrix timeout parameter

2026-06-07 19:08:30 +03:00 · 2022-10-17 14:07:31 +01:00
parent dc0ca8bdd6
commit 57464f1506
7 changed files with 30 additions and 18 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ gd-token.json
 credentials.json
 secrets/*
 browsertrix/*
+browsertrix-tmp/*
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ You also need:
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
 6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
-   in addition to screenshots you will need to install Docker.
+   in addition to screenshots you will need to install [Docker](https://www.docker.com/).

 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -198,17 +198,16 @@ class Archiver(ABC):
            logger.info("TimeoutException loading page for screenshot")

        self.driver.save_screenshot(filename)
-        self.storage.upload(filename, key, extra_args={
-                            'ACL': 'public-read', 'ContentType': 'image/png'})
+        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

        return self.storage.get_cdn_url(key)

    def get_wacz(self, url):
        logger.debug(f"getting wacz for {url}")
        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
-        collection = key.replace(".wacz", "").replace("-", "")
+        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))

-        browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
        cmd = [
            "docker", "run",
            "-v", f"{browsertrix_home}:/crawls/",
@@ -220,7 +219,7 @@ class Archiver(ABC):
            "--text",
            "--collection", collection,
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
-            "--behaviorTimeout", "90"
+            "--behaviorTimeout", str(self.browsertrix.timeout_seconds)
        ]

        if not os.path.isdir(browsertrix_home):
--- a/configs/browsertrix_config.py
+++ b/configs/browsertrix_config.py
@@ -3,3 +3,4 @@ from dataclasses import dataclass
@dataclass
 class BrowsertrixConfig:
    profile: str
+    timeout_seconds: str
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,5 +1,4 @@
-
-import argparse, yaml, json
+import argparse, yaml, json, os
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -84,8 +83,11 @@ class Config:

        # browsertrix config
        browsertrix_configs = execution.get("browsertrix", {})
+        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
+            browsertrix_profile = os.path.abspath(browsertrix_profile)
        self.browsertrix_config = BrowsertrixConfig(
-            profile=browsertrix_configs.get("profile")
+            profile=browsertrix_profile,
+            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
        )

        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
@@ -271,6 +273,7 @@ class Config:
            "header": self.header,
            "check_if_exists": self.check_if_exists,
            "hash_algorithm": self.hash_algorithm,
+            "browsertrix_config": asdict(self.browsertrix_config),
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            "selenium_webdriver": self.webdriver != None,
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -8,7 +8,8 @@ secrets:
    key: "s3 API key"
    secret: "s3 API secret"
    # use region format like such
-    endpoint_url: "https://s3.{region}.amazonaws.com"
+    endpoint_url: "https://{region}.digitaloceanspaces.com"
+    # endpoint_url: "https://s3.{region}.amazonaws.com"
    #use bucket, region, and key (key is the archived file path generated when executing) format like such as:
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private:true S3 urls will not be readable online
@@ -101,6 +102,11 @@ execution:
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000
+
+  # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
+  browsertrix:
+    profile: "./browsertrix/crawls/profile.tar.gz"
+    timeout_seconds: 90 # seconds, passed to browsertrix-crawler as --behaviorTimeout; defaults to 90
  # puts execution logs into /logs folder, defaults to false
  save_logs: true
  # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
@@ -120,5 +126,6 @@ execution:
    screenshot: screenshot
    hash: hash
    wacz: wacz
+    # if you want replaywebpage to work, make sure CORS is allowed on your bucket
    replaywebpage: replaywebpage

--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -67,13 +67,14 @@ class S3Storage(Storage):
            return False

    def uploadf(self, file, key, **kwargs):
-        if self.private:
-            extra_args = kwargs.get("extra_args", {})
-        else:
-            extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        if key.endswith('.wacz'):
-            extra_args['ContentType'] = "application/zip"
-        else:
-            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        extra_args = kwargs.get("extra_args", {})
+        if not self.private and 'ACL' not in extra_args:
+            extra_args['ACL'] = 'public-read'
+
+        if 'ContentType' not in extra_args:
+            try:
+                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+            except Exception as e:
+                logger.error(f"Unable to get mimetype for {key=}, error: {e}")

        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)