From 3b87dffe6bdee04c2169c5603730d142b6baae4b Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Sun, 25 Sep 2022 19:40:20 +0000
Subject: [PATCH 1/5] Add browsertrix-crawler capture

The [browsertrix-crawler] utility is a browser-based crawler that can
crawl one or more pages. browsertrix-crawler creates archives in the
[WACZ] format, which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc.) that can then be replayed using the [ReplayWeb.page] web
component, or unzipped to get the original WARC data (the ISO standard
format used by the Internet Archive Wayback Machine).

This PR adds browsertrix-crawler to archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, DigitalOcean, etc.) using the client-side [ReplayWeb.page] component. You can see an example of the spreadsheet here:

https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0

browsertrix-crawler requires Docker to be installed. If Docker is not
installed an error message will be logged and things continue as normal.

[browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler
[WACZ]: https://specs.webrecorder.net/wacz/latest/
[ReplayWeb.page]: https://replayweb.page
---
 README.md                       |  2 ++
 archivers/base_archiver.py      | 38 ++++++++++++++++++++++++++++++++-
 archivers/telegram_archiver.py  |  3 ++-
 archivers/tiktok_archiver.py    |  3 ++-
 archivers/twitter_archiver.py   |  6 ++++--
 archivers/wayback_archiver.py   | 11 +++++-----
 archivers/youtubedl_archiver.py |  3 ++-
 auto_archive.py                 |  1 +
 example.config.yaml             |  1 +
 storages/s3_storage.py          |  5 ++++-
 utils/gworksheet.py             |  3 ++-
 11 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index ca5e06a..0b79cbb 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots using browsertrix-crawler
+   in addition to screenshots you will need to install Docker.
 
 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 902f626..91cc25a 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -24,6 +24,7 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None
 
 class Archiver(ABC):
@@ -200,6 +201,41 @@ class Archiver(ABC):
 
         return self.storage.get_cdn_url(key)
 
+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = key.replace(".wacz", "").replace("-", "")
+
+        cwd = os.getcwd()
+        cmd = [
+            "docker", "run",
+            "-v", f"{cwd}/browsertrix:/crawls/",
+            "-it",
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", "90"
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"wacz generation failed: {e}")
+            return
+
+        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+
+        self.storage.upload(filename, key, extra_args={
+                            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # TODO: remove wacz collection, waiting for resolution on: 
+        # https://github.com/webrecorder/browsertrix-crawler/issues/170
+
+        return self.storage.get_cdn_url(key)
+
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
         key_folder = key.split('.')[0] + os.path.sep
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 0b6e777..d98f761 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
             url += "?embed=1"
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
+            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
 
         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 8100bb1..bdaad52 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
 
             hash = self.get_hash(filename)
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
 
             try: os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):
 
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 8f646fd..81f20ab 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 
@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index f46d1cb..cf32874 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
             if url in self.seen_urls: return self.seen_urls[url]
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
 
         job_id = r.json()['job_id']
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -63,7 +65,7 @@ class WaybackArchiver(Archiver):
 
         status_json = status_r.json()
         if status_json['status'] != 'success':
-            return self.custom_retry(status_json, screenshot=screenshot)
+            return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)
 
         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
 
@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 7990131..c66378d 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -93,6 +93,7 @@ class YoutubeDLArchiver(Archiver):
 
         hash = self.get_hash(filename)
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         # get duration
         duration = info.get('duration')
@@ -113,4 +114,4 @@ class YoutubeDLArchiver(Archiver):
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/auto_archive.py b/auto_archive.py
index f12b9c4..86d951b 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -30,6 +30,7 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)
 
     if result.timestamp is not None:
         if type(result.timestamp) == int:
diff --git a/example.config.yaml b/example.config.yaml
index acbe52c..c9dd323 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -119,4 +119,5 @@ execution:
     duration: duration
     screenshot: screenshot
     hash: hash
+    wacz: wacz
 
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index b124aae..fa8e0b9 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -71,5 +71,8 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        if key.endswith('.wacz'):
+            extra_args['ContentType'] = "application/zip"
+        else:
+            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 0e05ab6..eda2cc6 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -20,7 +20,8 @@ class GWorksheet:
         'title': 'upload title',
         'duration': 'duration',
         'screenshot': 'screenshot',
-        'hash': 'hash'
+        'hash': 'hash',
+        'wacz': 'wacz'
     }
 
     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):

From c34fb9cf105648ba59aa40139a066c4fd8d5420d Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Tue, 11 Oct 2022 16:14:25 -0400
Subject: [PATCH 2/5] Add browsertrix profile config option

This commit adds a browsertrix profile option to the configuration. In
order to not require the passing of the browsertrix config to every
Archiver, the Archiver constructors (including the base) were modified to
accept a Storage and Config instance. Some of the constructors then pick
out the pieces they need from the Config, in addition to calling the
parent constructor. In order to avoid a circular import that this
created, the Config object now defines the default hash function to use,
rather than having it be a static property of the Archiver class.
---
 .gitignore                        |  3 ++-
 README.md                         |  2 +-
 archivers/base_archiver.py        | 31 +++++++++++++++++++++----------
 archivers/telethon_archiver.py    | 13 +++++++------
 archivers/twitter_api_archiver.py | 15 ++++++++-------
 archivers/vk_archiver.py          | 10 +++++-----
 archivers/wayback_archiver.py     |  8 ++++----
 archivers/youtubedl_archiver.py   |  7 ++++---
 auto_archive.py                   | 22 ++++++++++++----------
 configs/browsertrix_config.py     |  5 +++++
 configs/config.py                 | 13 ++++++++++---
 example.config.yaml               |  3 ++-
 storages/s3_storage.py            |  1 +
 utils/gworksheet.py               |  3 ++-
 14 files changed, 84 insertions(+), 52 deletions(-)
 create mode 100644 configs/browsertrix_config.py

diff --git a/.gitignore b/.gitignore
index 8da75c3..e525a6a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,5 @@ local_archive/
 vk_config*.json
 gd-token.json
 credentials.json
-secrets/*
\ No newline at end of file
+secrets/*
+browsertrix/*
diff --git a/README.md b/README.md
index 0b79cbb..b8f3c75 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
-6. If you would like to take archival WACZ snapshots using browsertrix-crawler
+6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
    in addition to screenshots you will need to install Docker.
 
 ### Configuration file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 91cc25a..4ee3433 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
 from slugify import slugify
 
+from configs import Config
 from storages import Storage
 from utils import mkdir_if_not_exists
 
@@ -28,13 +29,14 @@ class ArchiveResult:
     hash: str = None
 
 class Archiver(ABC):
-    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"
 
-    def __init__(self, storage: Storage, driver):
+    def __init__(self, storage: Storage, config: Config):
         self.storage = storage
-        self.driver = driver
+        self.driver = config.webdriver
+        self.hash_algorithm = config.hash_algorithm
+        self.browsertrix = config.browsertrix_config
 
     def __str__(self):
         return self.__class__.__name__
@@ -163,11 +165,11 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read()  # read entire file as bytes
-            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            logger.debug(f'Hash algorithm is {self.hash_algorithm}')
 
-            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
-            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
-            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+            if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
 
         return hash.hexdigest()
 
@@ -206,10 +208,10 @@ class Archiver(ABC):
         key = self._get_key_from_url(url, ".wacz", append_datetime=True)
         collection = key.replace(".wacz", "").replace("-", "")
 
-        cwd = os.getcwd()
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
         cmd = [
             "docker", "run",
-            "-v", f"{cwd}/browsertrix:/crawls/",
+            "-v", f"{browsertrix_home}:/crawls/",
             "-it",
             "webrecorder/browsertrix-crawler", "crawl",
             "--url", url,
@@ -220,13 +222,22 @@ class Archiver(ABC):
             "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
             "--behaviorTimeout", "90"
         ]
+
+        if not os.path.isdir(browsertrix_home):
+            os.mkdir(browsertrix_home)
+
+        if self.browsertrix.profile:
+            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
+            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+
         try:
+            logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
             subprocess.run(cmd, check=True)
         except Exception as e:
             logger.error(f"wacz generation failed: {e}")
             return
 
-        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
 
         self.storage.upload(filename, key, extra_args={
                             'ACL': 'public-read', 'ContentType': 'application/zip'})
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index f35e323..d47cdc5 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import TelethonConfig
+from configs import Config
 from utils import getattr_or
 
 
@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
     name = "telethon"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
 
-    def __init__(self, storage: Storage, driver, config: TelethonConfig):
-        super().__init__(storage, driver)
-        if config:
-            self.client = TelegramClient("./anon", config.api_id, config.api_hash)
-            self.bot_token = config.bot_token
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        if config.telegram_config:
+            c = config.telegram_config
+            self.client = TelegramClient("./anon", c.api_id, c.api_hash)
+            self.bot_token = c.bot_token
 
     def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
         """
diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py
index 6aa1742..852df12 100644
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -5,7 +5,7 @@ from loguru import logger
 from pytwitter import Api
 
 from storages.base_storage import Storage
-from configs import TwitterApiConfig
+from configs import Config
 from .base_archiver import ArchiveResult
 from .twitter_archiver import TwitterArchiver
 
@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
     name = "twitter_api"
 
-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        c = config.twitter_config
 
-        if config.bearer_token:
-            self.api = Api(bearer_token=config.bearer_token)
-        elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
+        if c.bearer_token:
+            self.api = Api(bearer_token=c.bearer_token)
+        elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
             self.api = Api(
-                consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
+                consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)
 
     def download(self, url, check_if_exists=False):
         if not hasattr(self, "api"):
diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py
index c448367..a3af9db 100644
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import VkConfig
+from configs import Config
 
 
 class VkArchiver(Archiver):
@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
     wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
     photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
 
-    def __init__(self, storage: Storage, driver, config: VkConfig):
-        super().__init__(storage, driver)
-        if config != None:
-            self.vks = VkScraper(config.username, config.password)
+    def __init__(self, storage: Storage, config: Config): 
+        super().__init__(storage, config)
+        if config.vk_config != None:
+            self.vks = VkScraper(config.vk_config.username, config.vk_config.password)
 
     def download(self, url, check_if_exists=False):
         if not hasattr(self, "vks") or self.vks is None:
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index cf32874..4de2fa8 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import WaybackConfig
+from configs import Config
 
 
 class WaybackArchiver(Archiver):
@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
     """
     name = "wayback"
 
-    def __init__(self, storage: Storage, driver, config: WaybackConfig):
-        super(WaybackArchiver, self).__init__(storage, driver)
-        self.config = config
+    def __init__(self, storage: Storage, config: Config):
+        super(WaybackArchiver, self).__init__(storage, config)
+        self.config = config.wayback_config
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index c66378d..5d09442 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -6,15 +6,16 @@ from loguru import logger
 
 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
+from configs import Config
 
 
 class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
 
-    def __init__(self, storage: Storage, driver, fb_cookie):
-        super().__init__(storage, driver)
-        self.fb_cookie = fb_cookie
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        self.fb_cookie = config.facebook_cookie
 
     def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
diff --git a/auto_archive.py b/auto_archive.py
index 86d951b..d657061 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile
 
 from loguru import logger
 from slugify import slugify
+from urllib.parse import quote
 
 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
@@ -11,7 +12,7 @@ from storages import Storage
 random.seed()
 
 
-def update_sheet(gw, row, result: ArchiveResult):
+def update_sheet(gw, row, url, result: ArchiveResult):
     cell_updates = []
     row_values = gw.get_row(row)
 
@@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
     batch_if_valid('wacz', result.wacz)
+    batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
 
     if result.timestamp is not None:
         if type(result.timestamp) == int:
@@ -105,14 +107,14 @@ def process_sheet(c: Config):
 
                 # order matters, first to succeed excludes remaining
                 active_archivers = [
-                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                    TiktokArchiver(storage, c.webdriver),
-                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
-                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
-                    TelegramArchiver(storage, c.webdriver),
-                    TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage, c.webdriver, c.vk_config),
-                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
+                    TelethonArchiver(storage, c),
+                    TiktokArchiver(storage, c),
+                    TwitterApiArchiver(storage, c),
+                    YoutubeDLArchiver(storage, c),
+                    TelegramArchiver(storage, c),
+                    TwitterArchiver(storage, c),
+                    VkArchiver(storage, c),
+                    WaybackArchiver(storage, c)
                 ]
 
                 for archiver in active_archivers:
@@ -137,7 +139,7 @@ def process_sheet(c: Config):
                         logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
 
                 if result:
-                    update_sheet(gw, row, result)
+                    update_sheet(gw, row, url, result)
                 else:
                     gw.set_cell(row, 'status', 'failed: no archiver')
             except KeyboardInterrupt:
diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py
new file mode 100644
index 0000000..8b30dac
--- /dev/null
+++ b/configs/browsertrix_config.py
@@ -0,0 +1,5 @@
+from dataclasses import dataclass
+
+@dataclass
+class BrowsertrixConfig:
+    profile: str
diff --git a/configs/config.py b/configs/config.py
index 0d11467..4124236 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,6 +1,5 @@
 
 import argparse, yaml, json
-from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig
 from .selenium_config import SeleniumConfig
 from .vk_config import VkConfig
 from .twitter_api_config import TwitterApiConfig
+from .browsertrix_config import BrowsertrixConfig
 from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
 
 
@@ -82,7 +82,13 @@ class Config:
         )
         self.webdriver = "not initialized"
 
-        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+        # browsertrix config
+        browsertrix_configs = execution.get("browsertrix", {})
+        self.browsertrix_config = BrowsertrixConfig(
+            profile=browsertrix_configs.get("profile")
+        )
+
+        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
 
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -208,6 +214,7 @@ class Config:
         update the folder in each of the storages
         """
         self.folder = folder
+        logger.info(f"setting folder to {folder}")
         # s3
         if hasattr(self, "s3_config"): self.s3_config.folder = folder
         if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
@@ -263,7 +270,7 @@ class Config:
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
-            "hash_algorithm": Archiver.HASH_ALGORITHM,
+            "hash_algorithm": self.hash_algorithm,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
diff --git a/example.config.yaml b/example.config.yaml
index c9dd323..b736eca 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -8,7 +8,7 @@ secrets:
     key: "s3 API key"
     secret: "s3 API secret"
     # use region format like such
-    endpoint_url: "https://{region}.digitaloceanspaces.com"
+    endpoint_url: "https://s3.{region}.amazonaws.com"
     #use bucket, region, and key (key is the archived file path generated when executing) format like such as:
     cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
     # if private:true S3 urls will not be readable online
@@ -120,4 +120,5 @@ execution:
     screenshot: screenshot
     hash: hash
     wacz: wacz
+    replaywebpage: replaywebpage
 
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index fa8e0b9..3dee2dc 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -75,4 +75,5 @@ class S3Storage(Storage):
             extra_args['ContentType'] = "application/zip"
         else:
             extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index eda2cc6..8fe640e 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -21,7 +21,8 @@ class GWorksheet:
         'duration': 'duration',
         'screenshot': 'screenshot',
         'hash': 'hash',
-        'wacz': 'wacz'
+        'wacz': 'wacz',
+        'replaywebpage': 'replaywebpage',
     }
 
     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):

From 20ca50dc90cbde60055d2ed3e7c643b5bc19c9af Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Tue, 11 Oct 2022 16:49:19 -0400
Subject: [PATCH 3/5] Clean up browsertrix-crawler files

Remove any local browsertrix-crawler files after the WACZ has been
copied to storage. Note: until the fix for the issue below is released
on DockerHub, the local files cannot be deleted, because Docker on Linux
creates them as root:

https://github.com/webrecorder/browsertrix-crawler/issues/170

The code catches the resulting PermissionError and logs a warning instead
of failing and losing the work that has been completed.
---
 archivers/base_archiver.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 4ee3433..ea172f8 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -231,10 +231,10 @@ class Archiver(ABC):
             cmd.extend(["--profile", "/crawls/profile.tar.gz"])
 
         try:
-            logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
+            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
             subprocess.run(cmd, check=True)
         except Exception as e:
-            logger.error(f"wacz generation failed: {e}")
+            logger.error(f"WACZ generation failed: {e}")
             return
 
         filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
@@ -242,8 +242,11 @@ class Archiver(ABC):
         self.storage.upload(filename, key, extra_args={
                             'ACL': 'public-read', 'ContentType': 'application/zip'})
 
-        # TODO: remove wacz collection, waiting for resolution on: 
-        # https://github.com/webrecorder/browsertrix-crawler/issues/170
+        # clean up the local browsertrix files
+        try:
+            shutil.rmtree(browsertrix_home)
+        except PermissionError:
+            logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
 
         return self.storage.get_cdn_url(key)
 

From dc0ca8bdd60e1480d54c826a63e34db85aeb3ffb Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 17 Oct 2022 14:06:50 +0100
Subject: [PATCH 4/5] adds browsertrix to all archivers flows

---
 archivers/telegram_archiver.py    | 2 +-
 archivers/telethon_archiver.py    | 7 ++++---
 archivers/twitter_api_archiver.py | 3 ++-
 archivers/twitter_archiver.py     | 3 ++-
 archivers/vk_archiver.py          | 3 ++-
 archivers/wayback_archiver.py     | 2 +-
 6 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index d98f761..026bdd0 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -86,4 +86,4 @@ class TelegramArchiver(Archiver):
 
         cdn_url = self.storage.get_cdn_url(key)
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
-                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
+                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index d47cdc5..9f9bbbf 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -74,6 +74,7 @@ class TelethonArchiver(Archiver):
             logger.debug(f'got {len(media_posts)=} for {url=}')
 
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
 
             if len(media_posts) > 0:
                 key = self.get_html_key(url)
@@ -81,7 +82,7 @@ class TelethonArchiver(Archiver):
                 if check_if_exists and self.storage.exists(key):
                     # only s3 storage supports storage.exists as not implemented on gd
                     cdn_url = self.storage.get_cdn_url(key)
-                    return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
+                    return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
 
                 key_thumb, thumb_index = None, None
                 group_id = post.grouped_id if post.grouped_id is not None else post.id
@@ -120,7 +121,7 @@ class TelethonArchiver(Archiver):
 
                 page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
 
-                return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
+                return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
 
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
-            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py
index 852df12..454cfe2 100644
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -70,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver):
         }, ensure_ascii=False, indent=4)
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 81f20ab..b868af5 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -85,8 +85,9 @@ class TwitterArchiver(Archiver):
 
         timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
 
     def choose_variant(self, variants):
         # choosing the highest quality possible
diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py
index a3af9db..91b8354 100644
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -70,4 +70,5 @@ class VkArchiver(Archiver):
         page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
         # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
+        wacz = self.get_wacz(url)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index 4de2fa8..e0ede90 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
             retries += 1
 
         if status_r.status_code != 200:
-            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
+            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
 
         status_json = status_r.json()
         if status_json['status'] != 'success':

From 57464f1506e0b4ccd50bbe081f92abeb8ae583e8 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 17 Oct 2022 14:07:31 +0100
Subject: [PATCH 5/5] refactors for edge cases in browsertrix and s3
 upload, adds timeout parameter

---
 .gitignore                    |  1 +
 README.md                     |  2 +-
 archivers/base_archiver.py    |  9 ++++-----
 configs/browsertrix_config.py |  1 +
 configs/config.py             |  9 ++++++---
 example.config.yaml           |  9 ++++++++-
 storages/s3_storage.py        | 17 +++++++++--------
 7 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/.gitignore b/.gitignore
index e525a6a..4d19b9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ gd-token.json
 credentials.json
 secrets/*
 browsertrix/*
+browsertrix-tmp/*
\ No newline at end of file
diff --git a/README.md b/README.md
index b8f3c75..9e77d19 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ You also need:
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
 6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
-   in addition to screenshots you will need to install Docker.
+   in addition to screenshots you will need to install [Docker](https://www.docker.com/).
 
 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index ea172f8..82d705a 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -198,17 +198,16 @@ class Archiver(ABC):
             logger.info("TimeoutException loading page for screenshot")
 
         self.driver.save_screenshot(filename)
-        self.storage.upload(filename, key, extra_args={
-                            'ACL': 'public-read', 'ContentType': 'image/png'})
+        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
 
         return self.storage.get_cdn_url(key)
 
     def get_wacz(self, url):
         logger.debug(f"getting wacz for {url}")
         key = self._get_key_from_url(url, ".wacz", append_datetime=True)
-        collection = key.replace(".wacz", "").replace("-", "")
+        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
 
-        browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
         cmd = [
             "docker", "run",
             "-v", f"{browsertrix_home}:/crawls/",
@@ -220,7 +219,7 @@ class Archiver(ABC):
             "--text",
             "--collection", collection,
             "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
-            "--behaviorTimeout", "90"
+            "--behaviorTimeout", str(self.browsertrix.timeout_seconds)
         ]
 
         if not os.path.isdir(browsertrix_home):
diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py
index 8b30dac..1039da3 100644
--- a/configs/browsertrix_config.py
+++ b/configs/browsertrix_config.py
@@ -3,3 +3,4 @@ from dataclasses import dataclass
 @dataclass
 class BrowsertrixConfig:
     profile: str
+    timeout_seconds: str
diff --git a/configs/config.py b/configs/config.py
index 4124236..beff612 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,5 +1,4 @@
-
-import argparse, yaml, json
+import argparse, yaml, json, os
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -84,8 +83,11 @@ class Config:
 
         # browsertrix config
         browsertrix_configs = execution.get("browsertrix", {})
+        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
+            browsertrix_profile = os.path.abspath(browsertrix_profile)
         self.browsertrix_config = BrowsertrixConfig(
-            profile=browsertrix_configs.get("profile")
+            profile=browsertrix_profile,
+            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
         )
 
         self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
@@ -271,6 +273,7 @@ class Config:
             "header": self.header,
             "check_if_exists": self.check_if_exists,
             "hash_algorithm": self.hash_algorithm,
+            "browsertrix_config": asdict(self.browsertrix_config),
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
diff --git a/example.config.yaml b/example.config.yaml
index b736eca..a8138af 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -8,7 +8,8 @@ secrets:
     key: "s3 API key"
     secret: "s3 API secret"
     # use region format like such
-    endpoint_url: "https://s3.{region}.amazonaws.com"
+    endpoint_url: "https://{region}.digitaloceanspaces.com"
+    # endpoint_url: "https://s3.{region}.amazonaws.com"
     #use bucket, region, and key (key is the archived file path generated when executing) format like such as:
     cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
     # if private:true S3 urls will not be readable online
@@ -101,6 +102,11 @@ execution:
     timeout_seconds: 120
     window_width: 1400
     window_height: 2000
+
+  # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
+  browsertrix:
+    profile: "./browsertrix/crawls/profile.tar.gz"
+    timeout_seconds: 90 # defaults to 90s
   # puts execution logs into /logs folder, defaults to false
   save_logs: true
   # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
@@ -120,5 +126,6 @@ execution:
     screenshot: screenshot
     hash: hash
     wacz: wacz
+    # if you want the replaywebpage column to work, make sure to allow CORS on your bucket
     replaywebpage: replaywebpage
 
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index 3dee2dc..563d2ea 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -67,13 +67,14 @@ class S3Storage(Storage):
             return False
 
     def uploadf(self, file, key, **kwargs):
-        if self.private:
-            extra_args = kwargs.get("extra_args", {})
-        else:
-            extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        if key.endswith('.wacz'):
-            extra_args['ContentType'] = "application/zip"
-        else:
-            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        extra_args = kwargs.get("extra_args", {})
+        if not self.private and 'ACL' not in extra_args:
+            extra_args['ACL'] = 'public-read'
+
+        if 'ContentType' not in extra_args:
+            try:
+                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+            except Exception as e:
+                logger.error(f"Unable to get mimetype for {key=}, error: {e}")
 
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)