mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 04:38:29 +03:00
Added an example config section to the example.orchestration.yaml file to clarify how to store information about what has been archived and how to store the archive result
123 lines
3.5 KiB
YAML
123 lines
3.5 KiB
YAML
# Selects which module is active at each stage of a run. Several of the names
# used here (e.g. gsheet_feeder, hash_enricher, local_storage) have a
# same-named settings entry under `configurations`.
steps:
  # only 1 feeder allowed
  feeder: gsheet_feeder  # defaults to cli_feeder
  archivers:  # order matters, uncomment to activate
    # - vk_archiver
    # - telethon_archiver
    # - telegram_archiver
    # - twitter_archiver
    # - twitter_api_archiver
    # - instagram_tbot_archiver
    # - instagram_archiver
    # - tiktok_archiver
    - youtubedl_archiver
    - wayback_archiver_enricher
  enrichers:
    - hash_enricher
    # - screenshot_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
    # - wacz_enricher

  formatter: html_formatter  # defaults to mute_formatter
  storages:
    - local_storage
    # - s3_storage
    # - gdrive_storage
  databases:
    - console_db
    # - csv_db
    # - gsheet_db
    # - mongo_db
|
|
# Per-module settings, keyed by module name. Quoted UPPER_CASE values and
# strings such as "wayback key" are placeholders to be replaced with real
# values; keep real credentials out of version control.
configurations:
  gsheet_feeder:
    sheet: "your sheet name"
    header: 1
    service_account: "secrets/service_account.json"
    # allow_worksheets: "only parse this worksheet"
    # block_worksheets: "blocked sheet 1,blocked sheet 2"
    use_sheet_names_in_stored_paths: false
    # Maps each field name (left) to a column header text (right).
    columns:
      url: link
      status: archive status
      folder: destination folder
      archive: archive location
      date: archive date
      thumbnail: thumbnail
      timestamp: upload timestamp
      title: upload title
      text: textual content
      screenshot: screenshot
      hash: hash
      wacz: wacz
      replaywebpage: replaywebpage
  instagram_tbot_archiver:
    api_id: "TELEGRAM_BOT_API_ID"
    api_hash: "TELEGRAM_BOT_API_HASH"
    # session_file: "secrets/anon"
  telethon_archiver:
    api_id: "TELEGRAM_BOT_API_ID"
    api_hash: "TELEGRAM_BOT_API_HASH"
    # session_file: "secrets/anon"
    join_channels: false
    # NOTE: the ids below are unquoted, so YAML loaders read them as integers
    # and the leading zeros are not preserved; they are placeholders only.
    channel_invites:  # if you want to archive from private channels
      - invite: https://t.me/+123456789
        id: 0000000001
      - invite: https://t.me/+123456788
        id: 0000000002

  twitter_api_archiver:
    # either bearer_token only
    bearer_token: "TWITTER_BEARER_TOKEN"
    # OR all of the below
    # consumer_key: ""
    # consumer_secret: ""
    # access_token: ""
    # access_secret: ""
  instagram_archiver:
    username: "INSTAGRAM_USERNAME"
    password: "INSTAGRAM_PASSWORD"
    # session_file: "secrets/instaloader.session"

  vk_archiver:
    username: "or phone number"
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"

  screenshot_enricher:
    width: 1280
    height: 2300
  wayback_archiver_enricher:
    timeout: 10
    key: "wayback key"
    secret: "wayback secret"
  hash_enricher:
    algorithm: "SHA3-512"  # can also be SHA-256
  wacz_enricher:
    profile: secrets/profile.tar.gz
  local_storage:
    save_to: "./local_archive"
    save_absolute: true
    filename_generator: static
    path_generator: flat
  s3_storage:
    bucket: your-bucket-name
    region: reg1
    key: S3_KEY
    secret: S3_SECRET
    endpoint_url: "https://{region}.digitaloceanspaces.com"
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private:true S3 urls will not be readable online
    private: false
    # with 'random' you can generate a random UUID for the URL instead of a
    # predictable path, useful to still have public but unlisted files; the
    # alternative is 'default', or omit key_path from the config
    key_path: random
  gdrive_storage:
    path_generator: url
    filename_generator: random
    root_folder_id: folder_id_from_url
    # oauth_token needs to be generated with
    # scripts/create_update_gdrive_oauth_token.py
    oauth_token: secrets/gd-token.json
    service_account: "secrets/service_account.json"
  csv_db:
    csv_file: "./local_archive/db.csv"