mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Add browsertrix profile config option
This commit adds a browsertrix profile option to the configuration. To avoid having to pass the browsertrix config to every Archiver individually, the Archiver constructors (including the base class) were modified to accept a Storage and a Config instance. Some of the constructors then pick out the pieces they need from the Config, in addition to calling the parent constructor. To avoid the circular import that this change created, the Config object now defines the default hash function to use, rather than having it be a static property of the Archiver class.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -19,4 +19,5 @@ local_archive/
|
||||
vk_config*.json
|
||||
gd-token.json
|
||||
credentials.json
|
||||
secrets/*
|
||||
secrets/*
|
||||
browsertrix/*
|
||||
|
||||
@@ -18,7 +18,7 @@ You also need:
|
||||
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
|
||||
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
||||
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
|
||||
6. If you would like to take archival WACZ snapshots using browsertrix-crawler
|
||||
6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
|
||||
in addition to screenshots you will need to install Docker.
|
||||
|
||||
### Configuration file
|
||||
|
||||
@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.common.by import By
|
||||
from slugify import slugify
|
||||
|
||||
from configs import Config
|
||||
from storages import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
|
||||
@@ -28,13 +29,14 @@ class ArchiveResult:
|
||||
hash: str = None
|
||||
|
||||
class Archiver(ABC):
|
||||
HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
|
||||
name = "default"
|
||||
retry_regex = r"retrying at (\d+)$"
|
||||
|
||||
def __init__(self, storage: Storage, driver):
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
self.storage = storage
|
||||
self.driver = driver
|
||||
self.driver = config.webdriver
|
||||
self.hash_algorithm = config.hash_algorithm
|
||||
self.browsertrix = config.browsertrix_config
|
||||
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
@@ -163,11 +165,11 @@ class Archiver(ABC):
|
||||
def get_hash(self, filename):
|
||||
with open(filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
|
||||
logger.debug(f'Hash algorithm is {self.hash_algorithm}')
|
||||
|
||||
if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
|
||||
elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
|
||||
else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
|
||||
if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
|
||||
elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
|
||||
else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
|
||||
|
||||
return hash.hexdigest()
|
||||
|
||||
@@ -206,10 +208,10 @@ class Archiver(ABC):
|
||||
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
|
||||
collection = key.replace(".wacz", "").replace("-", "")
|
||||
|
||||
cwd = os.getcwd()
|
||||
browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"-v", f"{cwd}/browsertrix:/crawls/",
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
"-it",
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
@@ -220,13 +222,22 @@ class Archiver(ABC):
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", "90"
|
||||
]
|
||||
|
||||
if not os.path.isdir(browsertrix_home):
|
||||
os.mkdir(browsertrix_home)
|
||||
|
||||
if self.browsertrix.profile:
|
||||
shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
|
||||
cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
|
||||
subprocess.run(cmd, check=True)
|
||||
except Exception as e:
|
||||
logger.error(f"wacz generation failed: {e}")
|
||||
return
|
||||
|
||||
filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
self.storage.upload(filename, key, extra_args={
|
||||
'ACL': 'public-read', 'ContentType': 'application/zip'})
|
||||
|
||||
@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError
|
||||
|
||||
from storages import Storage
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
from configs import TelethonConfig
|
||||
from configs import Config
|
||||
from utils import getattr_or
|
||||
|
||||
|
||||
@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
|
||||
name = "telethon"
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: TelethonConfig):
|
||||
super().__init__(storage, driver)
|
||||
if config:
|
||||
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
|
||||
self.bot_token = config.bot_token
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
super().__init__(storage, config)
|
||||
if config.telegram_config:
|
||||
c = config.telegram_config
|
||||
self.client = TelegramClient("./anon", c.api_id, c.api_hash)
|
||||
self.bot_token = c.bot_token
|
||||
|
||||
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
|
||||
"""
|
||||
|
||||
@@ -5,7 +5,7 @@ from loguru import logger
|
||||
from pytwitter import Api
|
||||
|
||||
from storages.base_storage import Storage
|
||||
from configs import TwitterApiConfig
|
||||
from configs import Config
|
||||
from .base_archiver import ArchiveResult
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
|
||||
@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
|
||||
class TwitterApiArchiver(TwitterArchiver):
|
||||
name = "twitter_api"
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
|
||||
super().__init__(storage, driver)
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
super().__init__(storage, config)
|
||||
c = config.twitter_config
|
||||
|
||||
if config.bearer_token:
|
||||
self.api = Api(bearer_token=config.bearer_token)
|
||||
elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
|
||||
if c.bearer_token:
|
||||
self.api = Api(bearer_token=c.bearer_token)
|
||||
elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
|
||||
self.api = Api(
|
||||
consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
|
||||
consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
if not hasattr(self, "api"):
|
||||
|
||||
@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder
|
||||
|
||||
from storages import Storage
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
from configs import VkConfig
|
||||
from configs import Config
|
||||
|
||||
|
||||
class VkArchiver(Archiver):
|
||||
@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
|
||||
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: VkConfig):
|
||||
super().__init__(storage, driver)
|
||||
if config != None:
|
||||
self.vks = VkScraper(config.username, config.password)
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
super().__init__(storage, config)
|
||||
if config.vk_config != None:
|
||||
self.vks = VkScraper(config.vk_config.username, config.vk_config.password)
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
if not hasattr(self, "vks") or self.vks is None:
|
||||
|
||||
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from storages import Storage
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
from configs import WaybackConfig
|
||||
from configs import Config
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
|
||||
@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
|
||||
"""
|
||||
name = "wayback"
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
||||
super(WaybackArchiver, self).__init__(storage, driver)
|
||||
self.config = config
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
super(WaybackArchiver, self).__init__(storage, config)
|
||||
self.config = config.wayback_config
|
||||
self.seen_urls = {}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
|
||||
@@ -6,15 +6,16 @@ from loguru import logger
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
from storages import Storage
|
||||
from configs import Config
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "youtube_dl"
|
||||
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
||||
|
||||
def __init__(self, storage: Storage, driver, fb_cookie):
|
||||
super().__init__(storage, driver)
|
||||
self.fb_cookie = fb_cookie
|
||||
def __init__(self, storage: Storage, config: Config):
|
||||
super().__init__(storage, config)
|
||||
self.fb_cookie = config.facebook_cookie
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
netloc = self.get_netloc(url)
|
||||
|
||||
@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
from urllib.parse import quote
|
||||
|
||||
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
|
||||
from utils import GWorksheet, mkdir_if_not_exists, expand_url
|
||||
@@ -11,7 +12,7 @@ from storages import Storage
|
||||
random.seed()
|
||||
|
||||
|
||||
def update_sheet(gw, row, result: ArchiveResult):
|
||||
def update_sheet(gw, row, url, result: ArchiveResult):
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
@@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult):
|
||||
batch_if_valid('screenshot', result.screenshot)
|
||||
batch_if_valid('hash', result.hash)
|
||||
batch_if_valid('wacz', result.wacz)
|
||||
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
|
||||
|
||||
if result.timestamp is not None:
|
||||
if type(result.timestamp) == int:
|
||||
@@ -105,14 +107,14 @@ def process_sheet(c: Config):
|
||||
|
||||
# order matters, first to succeed excludes remaining
|
||||
active_archivers = [
|
||||
TelethonArchiver(storage, c.webdriver, c.telegram_config),
|
||||
TiktokArchiver(storage, c.webdriver),
|
||||
TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
|
||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
||||
TelegramArchiver(storage, c.webdriver),
|
||||
TwitterArchiver(storage, c.webdriver),
|
||||
VkArchiver(storage, c.webdriver, c.vk_config),
|
||||
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
||||
TelethonArchiver(storage, c),
|
||||
TiktokArchiver(storage, c),
|
||||
TwitterApiArchiver(storage, c),
|
||||
YoutubeDLArchiver(storage, c),
|
||||
TelegramArchiver(storage, c),
|
||||
TwitterArchiver(storage, c),
|
||||
VkArchiver(storage, c),
|
||||
WaybackArchiver(storage, c)
|
||||
]
|
||||
|
||||
for archiver in active_archivers:
|
||||
@@ -137,7 +139,7 @@ def process_sheet(c: Config):
|
||||
logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
|
||||
|
||||
if result:
|
||||
update_sheet(gw, row, result)
|
||||
update_sheet(gw, row, url, result)
|
||||
else:
|
||||
gw.set_cell(row, 'status', 'failed: no archiver')
|
||||
except KeyboardInterrupt:
|
||||
|
||||
5
configs/browsertrix_config.py
Normal file
5
configs/browsertrix_config.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class BrowsertrixConfig:
|
||||
profile: str
|
||||
@@ -1,6 +1,5 @@
|
||||
|
||||
import argparse, yaml, json
|
||||
from archivers.base_archiver import Archiver
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from selenium import webdriver
|
||||
@@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig
|
||||
from .selenium_config import SeleniumConfig
|
||||
from .vk_config import VkConfig
|
||||
from .twitter_api_config import TwitterApiConfig
|
||||
from .browsertrix_config import BrowsertrixConfig
|
||||
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
|
||||
|
||||
|
||||
@@ -82,7 +82,13 @@ class Config:
|
||||
)
|
||||
self.webdriver = "not initialized"
|
||||
|
||||
Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
|
||||
# browsertrix config
|
||||
browsertrix_configs = execution.get("browsertrix", {})
|
||||
self.browsertrix_config = BrowsertrixConfig(
|
||||
profile=browsertrix_configs.get("profile")
|
||||
)
|
||||
|
||||
self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
|
||||
|
||||
# ---------------------- SECRETS - APIs and service configurations
|
||||
secrets = self.config.get("secrets", {})
|
||||
@@ -208,6 +214,7 @@ class Config:
|
||||
update the folder in each of the storages
|
||||
"""
|
||||
self.folder = folder
|
||||
logger.info(f"setting folder to {folder}")
|
||||
# s3
|
||||
if hasattr(self, "s3_config"): self.s3_config.folder = folder
|
||||
if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
|
||||
@@ -263,7 +270,7 @@ class Config:
|
||||
"storage": self.storage,
|
||||
"header": self.header,
|
||||
"check_if_exists": self.check_if_exists,
|
||||
"hash_algorithm": Archiver.HASH_ALGORITHM,
|
||||
"hash_algorithm": self.hash_algorithm,
|
||||
"save_logs": self.save_logs,
|
||||
"selenium_config": asdict(self.selenium_config),
|
||||
"selenium_webdriver": self.webdriver != None,
|
||||
|
||||
@@ -8,7 +8,7 @@ secrets:
|
||||
key: "s3 API key"
|
||||
secret: "s3 API secret"
|
||||
# use the region in a format like this:
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
endpoint_url: "https://s3.{region}.amazonaws.com"
|
||||
# use the bucket, region, and key (key is the archived file path generated when executing) in a format such as:
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
@@ -120,4 +120,5 @@ execution:
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
replaywebpage: replaywebpage
|
||||
|
||||
|
||||
@@ -75,4 +75,5 @@ class S3Storage(Storage):
|
||||
extra_args['ContentType'] = "application/zip"
|
||||
else:
|
||||
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
|
||||
|
||||
@@ -21,7 +21,8 @@ class GWorksheet:
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz'
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
}
|
||||
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||
|
||||
Reference in New Issue
Block a user