mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
The [browsertrix-crawler] utility is a browser-based crawler that can crawl one or more pages. browsertrix-crawler creates archives in the [WACZ] format which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc) which can then be replayed using the [ReplayWeb.page] web component, or unzipped to get the original WARC data (the ISO standard format used by the Internet Archive Wayback Machine). This PR adds browsertrix-crawler to archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, digitalocean, etc) using the client-side ReplayWeb.page component. You can see an example of the spreadsheet here: https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0 browsertrix-crawler requires Docker to be installed. If Docker is not installed an error message will be logged and things continue as normal. [browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler [WACZ]: https://specs.webrecorder.net/wacz/latest/ [ReplayWeb.page]: https://replayweb.page
124 lines
4.6 KiB
YAML
124 lines
4.6 KiB
YAML
---
|
|
secrets:
|
|
# needed if you use storage=s3
|
|
s3:
|
|
# contains S3 info on region, bucket, key and secret
|
|
region: reg1
|
|
bucket: my-bucket
|
|
key: "s3 API key"
|
|
secret: "s3 API secret"
|
|
# use the {region} placeholder in the URL, like so:
|
|
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
|
# use the {bucket}, {region}, and {key} placeholders (key is the archived file path generated when executing), like so:
|
|
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
|
# if private is true, S3 URLs will not be readable online
|
|
private: false
|
|
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files; the alternative is 'default', or omit key_path from the config
|
|
key_path: random
|
|
|
|
# needed if you use storage=gd
|
|
google_drive:
|
|
# To authenticate with google you have two options (1. service account OR 2. OAuth token)
|
|
|
|
# 1. service account - storage space will count towards the developer account
|
|
# filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
|
|
# service_account: "service_account.json"
|
|
|
|
# 2. OAuth token - storage space will count towards the owner of the GDrive folder
|
|
# (use only 1. or 2. - if both are specified then option 2. takes precedence)
|
|
# needs write access on the server so refresh flow works
|
|
# To get the token, run the file `create_update_test_oauth_token.py`
|
|
# you can edit that file if you want a different token filename, default is "gd-token.json"
|
|
oauth_token_filename: "gd-token.json"
|
|
|
|
root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
|
|
|
|
# needed if you use storage=local
|
|
local:
|
|
# local path to save files in
|
|
save_to: "./local_archive"
|
|
|
|
wayback:
|
|
# to get credentials visit https://archive.org/account/s3.php
|
|
key: your API key
|
|
secret: your API secret
|
|
|
|
telegram:
|
|
# to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
|
|
api_id: your API ID (see the link above)
|
|
api_hash: your API hash
|
|
# optional, but allows access to more content such as large videos, talk to @botfather
|
|
bot_token: your bot-token
|
|
|
|
# twitter configuration - API V2 only
|
|
# if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead
|
|
twitter:
|
|
# either bearer_token only
|
|
bearer_token: ""
|
|
# OR all of the below
|
|
consumer_key: ""
|
|
consumer_secret: ""
|
|
access_token: ""
|
|
access_secret: ""
|
|
|
|
# vkontakte (vk.com) credentials
|
|
vk:
|
|
username: "phone number or email"
|
|
password: "password"
|
|
|
|
google_sheets:
|
|
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
|
|
service_account: "service_account.json"
|
|
|
|
facebook:
|
|
# optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'
|
|
cookie: ""
|
|
execution:
|
|
# can be overwritten with CMD --sheet=
|
|
sheet: your-sheet-name
|
|
|
|
# block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
|
|
# worksheet_allow and worksheet_block can be single values or lists
|
|
# if worksheet_allow is specified, worksheet_block is ignored
|
|
# worksheet_allow:
|
|
# - Sheet1
|
|
# - "Sheet 2"
|
|
# worksheet_block: BlockedSheet
|
|
|
|
# which row of your tabs contains the header, can be overwritten with CMD --header=
|
|
header: 1
|
|
# which storage to use, can be overwritten with CMD --storage=
|
|
storage: s3
|
|
# defaults to false, when true will try to avoid duplicate URL archives
|
|
check_if_exists: true
|
|
|
|
# choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
|
|
# hash_algorithm: SHA-256
|
|
|
|
# optional configurations for the selenium browser that takes screenshots, these are the defaults
|
|
selenium:
|
|
# values under 10s might cause screenshots to fail
|
|
timeout_seconds: 120
|
|
window_width: 1400
|
|
window_height: 2000
|
|
# puts execution logs into /logs folder, defaults to false
|
|
save_logs: true
|
|
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
|
|
# url and status are the only columns required to be present in the google sheet
|
|
column_names:
|
|
url: link
|
|
status: archive status
|
|
archive: archive location
|
|
# use this column to override default location data
|
|
folder: folder
|
|
date: archive date
|
|
thumbnail: thumbnail
|
|
thumbnail_index: thumbnail index
|
|
timestamp: upload timestamp
|
|
title: upload title
|
|
duration: duration
|
|
screenshot: screenshot
|
|
hash: hash
|
|
wacz: wacz
|
|
|