Files
auto-archiver/storages/s3_storage.py
Ed Summers 3b87dffe6b Add browsertrix-crawler capture
The [browsertrix-crawler] utility is a browser-based crawler that can
crawl one or more pages. browsertrix-crawler creates archives in the
[WACZ] format which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc) which can then be replayed using the [ReplayWeb.page] web
component, or unzipped to get the original WARC data (the ISO standard
format used by the Internet Archive Wayback Machine).

This PR adds browsertrix-crawler to the archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, DigitalOcean, etc.) using the client-side ReplayWeb.page component. You can see an example of the spreadsheet here:

https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0

browsertrix-crawler requires Docker to be installed. If Docker is not
installed, an error message is logged and things continue as normal.

[browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler
[WACZ]: https://specs.webrecorder.net/wacz/latest/
[ReplayWeb.page]: https://replayweb.page
2022-09-25 19:46:29 +00:00

79 lines
2.6 KiB
Python

import uuid, os, mimetypes
from dataclasses import dataclass
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger
@dataclass
class S3Config:
    """Settings consumed by ``S3Storage`` to reach an S3-compatible bucket.

    The first four fields are required; the rest have defaults that target
    DigitalOcean Spaces.
    """

    # Name of the bucket that objects are uploaded to.
    bucket: str
    # Region name; also substituted into ``endpoint_url`` and ``cdn_url``.
    region: str
    # Access key id handed to the boto3 client.
    key: str
    # Secret access key handed to the boto3 client.
    secret: str
    # Prefix joined in front of every object key ("" means no prefix).
    folder: str = ""
    # Endpoint template; ``{region}`` is filled in when the client is created.
    endpoint_url: str = "https://{region}.digitaloceanspaces.com"
    # Public URL template; ``{key}`` here is the object path, not the
    # access key stored in the ``key`` field above.
    cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # When False, uploads default to a 'public-read' ACL.
    private: bool = False
    # 'default' keeps the given key as the object name; 'random' swaps it
    # for a generated UUID.
    key_path: str = "default"
class S3Storage(Storage):
    """Storage backend that uploads to an S3-compatible bucket through boto3.

    Every object key is placed under ``config.folder``. When
    ``config.key_path`` is ``"random"`` the key is replaced by a generated
    UUID that keeps the original file extension.
    """

    def __init__(self, config: S3Config):
        """Copy the settings from *config* and create the boto3 S3 client."""
        self.bucket = config.bucket
        self.region = config.region
        self.folder = config.folder
        self.private = config.private
        self.cdn_url = config.cdn_url
        self.key_path = config.key_path
        # Remembers the random name generated for each key so that repeated
        # lookups of the same key on this instance resolve to the same object.
        self.key_dict = {}
        self.s3 = boto3.client(
            's3',
            region_name=config.region,
            endpoint_url=config.endpoint_url.format(region=config.region),
            aws_access_key_id=config.key,
            aws_secret_access_key=config.secret
        )

    def _get_path(self, key):
        """
        Return the object key under which *key* is stored in the bucket.

        Depends on the self.key_path configuration:
        * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
        * default -> defaults to self.folder/key
        """
        # Local import keeps this fix self-contained. S3 object keys always
        # use "/" as the separator, whereas os.path.join would produce "\"
        # on Windows. On POSIX systems the result is unchanged.
        import posixpath

        # defaults to /key
        final_key = key
        if self.key_path == "random":
            if key not in self.key_dict:
                ext = os.path.splitext(key)[1]
                self.key_dict[key] = f"{uuid.uuid4()}{ext}"
            final_key = self.key_dict[key]
        return posixpath.join(self.folder, final_key)

    def get_cdn_url(self, key):
        """Return the URL for *key* built from the ``cdn_url`` template."""
        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))

    def exists(self, key):
        """Return True if a HEAD request for *key* succeeds.

        Any ``ClientError`` raised by ``head_object`` is reported as False.
        """
        try:
            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
            return True
        except ClientError:
            return False

    def uploadf(self, file, key, **kwargs):
        """Upload the file object *file* to the bucket under *key*.

        Args:
            file: File-like object handed to ``upload_fileobj``.
            key: Logical key; mapped to the stored path by ``_get_path``.
            **kwargs: May contain ``extra_args``, a dict forwarded as boto3
                ``ExtraArgs``. When it is omitted and the storage is not
                private, ``{'ACL': 'public-read'}`` is used.
        """
        default_args = {} if self.private else {'ACL': 'public-read'}
        # Copy so the ContentType added below never leaks into a dict owned
        # by the caller.
        extra_args = dict(kwargs.get("extra_args", default_args))

        if key.endswith('.wacz'):
            content_type = "application/zip"
        else:
            content_type = mimetypes.guess_type(key)[0]

        # guess_type returns None for unknown extensions. A None ContentType
        # is expected to fail botocore parameter validation, so the header is
        # left unset and the service default applies.
        if content_type is not None:
            extra_args['ContentType'] = content_type

        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)