diff --git a/.gitignore b/.gitignore index e525a6a..4d19b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ gd-token.json credentials.json secrets/* browsertrix/* +browsertrix-tmp/* \ No newline at end of file diff --git a/README.md b/README.md index b8f3c75..9e77d19 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ You also need: 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. 6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) - in addition to screenshots you will need to install Docker. + in addition to screenshots you will need to install [Docker](https://www.docker.com/). ### Configuration file Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index ea172f8..82d705a 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -198,17 +198,16 @@ class Archiver(ABC): logger.info("TimeoutException loading page for screenshot") self.driver.save_screenshot(filename) - self.storage.upload(filename, key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'image/png'}) + self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) return self.storage.get_cdn_url(key) def get_wacz(self, url): logger.debug(f"getting wacz for {url}") key = self._get_key_from_url(url, ".wacz", append_datetime=True) - collection = key.replace(".wacz", "").replace("-", "") + collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) - browsertrix_home = os.path.join(os.getcwd(), "browsertrix") + browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp") cmd = [ "docker", "run", "-v", f"{browsertrix_home}:/crawls/", @@ -220,7 +219,7 @@ class Archiver(ABC): "--text", "--collection", collection, "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", "90" + "--behaviorTimeout", str(self.browsertrix.timeout_seconds) ] if not os.path.isdir(browsertrix_home): diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py index 8b30dac..1039da3 100644 --- a/configs/browsertrix_config.py +++ b/configs/browsertrix_config.py @@ -3,3 +3,4 @@ from dataclasses import dataclass @dataclass class BrowsertrixConfig: profile: str + timeout_seconds: str diff --git a/configs/config.py b/configs/config.py index 4124236..beff612 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,4 @@ - -import argparse, yaml, json +import argparse, yaml, json, os import gspread from loguru import logger from selenium import webdriver @@ -84,8 +83,11 @@ class Config: # browsertrix config browsertrix_configs = execution.get("browsertrix", {}) + if len(browsertrix_profile := browsertrix_configs.get("profile", "")): + browsertrix_profile = os.path.abspath(browsertrix_profile) self.browsertrix_config = BrowsertrixConfig( - profile=browsertrix_configs.get("profile") + profile=browsertrix_profile, + timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") ) self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") @@ -271,6 +273,7 @@ class Config: "header": self.header, "check_if_exists": self.check_if_exists, "hash_algorithm": self.hash_algorithm, + "browsertrix_config": asdict(self.browsertrix_config), "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index b736eca..a8138af 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -8,7 +8,8 @@ secrets: key: "s3 API key" secret: "s3 API secret" # use region format like such - endpoint_url: "https://s3.{region}.amazonaws.com" + endpoint_url: "https://{region}.digitaloceanspaces.com" + # endpoint_url: "https://s3.{region}.amazonaws.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -101,6 +102,11 @@ execution: timeout_seconds: 120 window_width: 1400 window_height: 2000 + + # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) + browsertrix: + profile: "./browsertrix/crawls/profile.tar.gz" + timeout_seconds: 90 # defaults to 90s # puts execution logs into /logs folder, defaults to false save_logs: true # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" @@ -120,5 +126,6 @@ execution: screenshot: screenshot hash: hash wacz: wacz + # if you want the replaypage to work, make sure to allow CORS on your bucket replaywebpage: replaywebpage diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 3dee2dc..563d2ea 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -67,13 +67,14 @@ class S3Storage(Storage): return False def uploadf(self, file, key, **kwargs): - if self.private: - extra_args = kwargs.get("extra_args", {}) - else: - extra_args = kwargs.get("extra_args", {'ACL': 'public-read'}) - if key.endswith('.wacz'): - extra_args['ContentType'] = "application/zip" - else: - extra_args['ContentType'] = mimetypes.guess_type(key)[0] + extra_args = kwargs.get("extra_args", {}) + if not self.private and 'ACL' not in extra_args: + extra_args['ACL'] = 'public-read' + + if 'ContentType' not in extra_args: + try: + extra_args['ContentType'] = mimetypes.guess_type(key)[0] + except Exception as e: + logger.error(f"Unable to get mimetype for {key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)