diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 82d705a..96e0fbf 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -203,6 +203,10 @@ class Archiver(ABC): return self.storage.get_cdn_url(key) def get_wacz(self, url): + if not self.browsertrix.enabled: + logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.") + return + logger.debug(f"getting wacz for {url}") key = self._get_key_from_url(url, ".wacz", append_datetime=True) collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) @@ -219,7 +223,8 @@ class Archiver(ABC): "--text", "--collection", collection, "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", str(self.browsertrix.timeout_seconds) + "--behaviorTimeout", str(self.browsertrix.timeout_seconds), + "--timeout", str(self.browsertrix.timeout_seconds) ] if not os.path.isdir(browsertrix_home): diff --git a/auto_archive.py b/auto_archive.py index d657061..50719a3 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -31,8 +31,9 @@ def update_sheet(gw, row, url, result: ArchiveResult): batch_if_valid('duration', result.duration, str(result.duration)) batch_if_valid('screenshot', result.screenshot) batch_if_valid('hash', result.hash) - batch_if_valid('wacz', result.wacz) - batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') + if result.wacz is not None: + batch_if_valid('wacz', result.wacz) + batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') if result.timestamp is not None: if type(result.timestamp) == int: diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py index 1039da3..bc7acd5 100644 --- a/configs/browsertrix_config.py +++ b/configs/browsertrix_config.py @@ -2,5 +2,6 @@ from dataclasses import dataclass @dataclass class BrowsertrixConfig: + enabled: bool profile: str timeout_seconds: str diff --git a/configs/config.py b/configs/config.py index beff612..6e97dc4 100644 --- a/configs/config.py +++ b/configs/config.py @@ -86,6 +86,7 @@ class Config: if len(browsertrix_profile := browsertrix_configs.get("profile", "")): browsertrix_profile = os.path.abspath(browsertrix_profile) self.browsertrix_config = BrowsertrixConfig( + enabled=bool(browsertrix_configs.get("enabled", False)), profile=browsertrix_profile, timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") ) diff --git a/example.config.yaml b/example.config.yaml index a8138af..e42d10f 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -103,10 +103,12 @@ execution: window_width: 1400 window_height: 2000 - # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) + # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) + # browsertrix will capture a WACZ archive of the page which can then be seen as the original on replaywebpage browsertrix: + enabled: true # defaults to false profile: "./browsertrix/crawls/profile.tar.gz" - timeout_seconds: 90 # defaults to 90s + timeout_seconds: 120 # defaults to 90s # puts execution logs into /logs folder, defaults to false save_logs: true # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"