mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
refactors for edges in browsertrix and s3 upload, adds timeout parameter
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -21,3 +21,4 @@ gd-token.json
|
||||
credentials.json
|
||||
secrets/*
|
||||
browsertrix/*
|
||||
browsertrix-tmp/*
|
||||
@@ -19,7 +19,7 @@ You also need:
|
||||
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
||||
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
|
||||
6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
|
||||
in addition to screenshots you will need to install Docker.
|
||||
in addition to screenshots you will need to install [Docker](https://www.docker.com/).
|
||||
|
||||
### Configuration file
|
||||
Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)), and some properties of that file can be overridden via command line arguments. Here is the current output of running `python auto_archive.py --help`:
|
||||
|
||||
@@ -198,17 +198,16 @@ class Archiver(ABC):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
|
||||
self.driver.save_screenshot(filename)
|
||||
self.storage.upload(filename, key, extra_args={
|
||||
'ACL': 'public-read', 'ContentType': 'image/png'})
|
||||
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
|
||||
|
||||
return self.storage.get_cdn_url(key)
|
||||
|
||||
def get_wacz(self, url):
|
||||
logger.debug(f"getting wacz for {url}")
|
||||
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
|
||||
collection = key.replace(".wacz", "").replace("-", "")
|
||||
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
|
||||
|
||||
browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
|
||||
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
@@ -220,7 +219,7 @@ class Archiver(ABC):
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", "90"
|
||||
"--behaviorTimeout", str(self.browsertrix.timeout_seconds)
|
||||
]
|
||||
|
||||
if not os.path.isdir(browsertrix_home):
|
||||
|
||||
@@ -3,3 +3,4 @@ from dataclasses import dataclass
|
||||
@dataclass
|
||||
class BrowsertrixConfig:
|
||||
profile: str
|
||||
timeout_seconds: str
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
|
||||
import argparse, yaml, json
|
||||
import argparse, yaml, json, os
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from selenium import webdriver
|
||||
@@ -84,8 +83,11 @@ class Config:
|
||||
|
||||
# browsertrix config
|
||||
browsertrix_configs = execution.get("browsertrix", {})
|
||||
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
|
||||
browsertrix_profile = os.path.abspath(browsertrix_profile)
|
||||
self.browsertrix_config = BrowsertrixConfig(
|
||||
profile=browsertrix_configs.get("profile")
|
||||
profile=browsertrix_profile,
|
||||
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
|
||||
)
|
||||
|
||||
self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
|
||||
@@ -271,6 +273,7 @@ class Config:
|
||||
"header": self.header,
|
||||
"check_if_exists": self.check_if_exists,
|
||||
"hash_algorithm": self.hash_algorithm,
|
||||
"browsertrix_config": asdict(self.browsertrix_config),
|
||||
"save_logs": self.save_logs,
|
||||
"selenium_config": asdict(self.selenium_config),
|
||||
"selenium_webdriver": self.webdriver != None,
|
||||
|
||||
@@ -8,7 +8,8 @@ secrets:
|
||||
key: "s3 API key"
|
||||
secret: "s3 API secret"
|
||||
# use a region format such as:
|
||||
endpoint_url: "https://s3.{region}.amazonaws.com"
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
# endpoint_url: "https://s3.{region}.amazonaws.com"
|
||||
# use bucket, region, and key (key is the archived file path generated when executing) in a format such as:
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
@@ -101,6 +102,11 @@ execution:
|
||||
timeout_seconds: 120
|
||||
window_width: 1400
|
||||
window_height: 2000
|
||||
|
||||
# optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
|
||||
browsertrix:
|
||||
profile: "./browsertrix/crawls/profile.tar.gz"
|
||||
timeout_seconds: 90 # defaults to 90s
|
||||
# puts execution logs into /logs folder, defaults to false
|
||||
save_logs: true
|
||||
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
|
||||
@@ -120,5 +126,6 @@ execution:
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
# if you want the replaywebpage to work, make sure to allow CORS on your bucket
|
||||
replaywebpage: replaywebpage
|
||||
|
||||
|
||||
@@ -67,13 +67,14 @@ class S3Storage(Storage):
|
||||
return False
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
if self.private:
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
else:
|
||||
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
|
||||
if key.endswith('.wacz'):
|
||||
extra_args['ContentType'] = "application/zip"
|
||||
else:
|
||||
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
if not self.private and 'ACL' not in extra_args:
|
||||
extra_args['ACL'] = 'public-read'
|
||||
|
||||
if 'ContentType' not in extra_args:
|
||||
try:
|
||||
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to get mimetype for {key=}, error: {e}")
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
|
||||
|
||||
Reference in New Issue
Block a user