refactors for edge cases in browsertrix and s3 upload, adds timeout parameter

This commit is contained in:
msramalho
2022-10-17 14:07:31 +01:00
parent dc0ca8bdd6
commit 57464f1506
7 changed files with 30 additions and 18 deletions

1
.gitignore vendored
View File

@@ -21,3 +21,4 @@ gd-token.json
credentials.json
secrets/*
browsertrix/*
browsertrix-tmp/*

View File

@@ -19,7 +19,7 @@ You also need:
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
in addition to screenshots you will need to install Docker.
in addition to screenshots you will need to install [Docker](https://www.docker.com/).
### Configuration file
Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)), and some properties of that file can be overwritten via command line arguments. Here is the current output of running `python auto_archive.py --help`:

View File

@@ -198,17 +198,16 @@ class Archiver(ABC):
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key)
def get_wacz(self, url):
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
collection = key.replace(".wacz", "").replace("-", "")
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
"-v", f"{browsertrix_home}:/crawls/",
@@ -220,7 +219,7 @@ class Archiver(ABC):
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", "90"
"--behaviorTimeout", str(self.browsertrix.timeout_seconds)
]
if not os.path.isdir(browsertrix_home):

View File

@@ -3,3 +3,4 @@ from dataclasses import dataclass
@dataclass
class BrowsertrixConfig:
    """Settings used when taking WACZ snapshots with browsertrix-crawler."""

    # Path to an optional browsertrix browser profile file.
    profile: str
    # Behavior timeout, in seconds, for the crawl.
    # NOTE(review): annotated as str, but a value read from YAML such as
    # `timeout_seconds: 90` would presumably arrive as an int — confirm
    # and align the annotation with what callers actually pass.
    timeout_seconds: str

View File

@@ -1,5 +1,4 @@
import argparse, yaml, json
import argparse, yaml, json, os
import gspread
from loguru import logger
from selenium import webdriver
@@ -84,8 +83,11 @@ class Config:
# browsertrix config
browsertrix_configs = execution.get("browsertrix", {})
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
browsertrix_profile = os.path.abspath(browsertrix_profile)
self.browsertrix_config = BrowsertrixConfig(
profile=browsertrix_configs.get("profile")
profile=browsertrix_profile,
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
)
self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
@@ -271,6 +273,7 @@ class Config:
"header": self.header,
"check_if_exists": self.check_if_exists,
"hash_algorithm": self.hash_algorithm,
"browsertrix_config": asdict(self.browsertrix_config),
"save_logs": self.save_logs,
"selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None,

View File

@@ -8,7 +8,8 @@ secrets:
key: "s3 API key"
secret: "s3 API secret"
# use the {region} placeholder in the endpoint URL, as in the example below
endpoint_url: "https://s3.{region}.amazonaws.com"
endpoint_url: "https://{region}.digitaloceanspaces.com"
# endpoint_url: "https://s3.{region}.amazonaws.com"
# use the {bucket}, {region}, and {key} placeholders (key is the archived file path generated when executing), for example:
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
@@ -101,6 +102,11 @@ execution:
timeout_seconds: 120
window_width: 1400
window_height: 2000
# optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
browsertrix:
profile: "./browsertrix/crawls/profile.tar.gz"
timeout_seconds: 90 # defaults to 90s
# puts execution logs into /logs folder, defaults to false
save_logs: true
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
@@ -120,5 +126,6 @@ execution:
screenshot: screenshot
hash: hash
wacz: wacz
# if you want the replaywebpage link to work, make sure to allow CORS on your bucket
replaywebpage: replaywebpage

View File

@@ -67,13 +67,14 @@ class S3Storage(Storage):
return False
def uploadf(self, file, key, **kwargs):
if self.private:
extra_args = kwargs.get("extra_args", {})
else:
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
if key.endswith('.wacz'):
extra_args['ContentType'] = "application/zip"
else:
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args:
try:
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
except Exception as e:
logger.error(f"Unable to get mimetype for {key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)