Add browsertrix profile config option

This commit adds a browsertrix profile option to the configuration. In
order to not require the passing of the browsertrix config to every
Archiver, the Archiver constructors (including the base) were modified to
accept a Storage and Config instance. Some of the constructors then pick
out the pieces they need from the Config, in addition to calling the
parent constructor. In order to avoid a circular import that this
created, the Config object now defines the default hash function to use,
rather than having it be a static property of the Archiver class.
This commit is contained in:
Ed Summers
2022-10-11 16:14:25 -04:00
parent 3b87dffe6b
commit c34fb9cf10
14 changed files with 84 additions and 52 deletions

3
.gitignore vendored
View File

@@ -19,4 +19,5 @@ local_archive/
vk_config*.json
gd-token.json
credentials.json
secrets/*
secrets/*
browsertrix/*

View File

@@ -18,7 +18,7 @@ You also need:
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
6. If you would like to take archival WACZ snapshots using browsertrix-crawler
6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
in addition to screenshots you will need to install Docker.
### Configuration file

View File

@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify
from configs import Config
from storages import Storage
from utils import mkdir_if_not_exists
@@ -28,13 +29,14 @@ class ArchiveResult:
hash: str = None
class Archiver(ABC):
HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
name = "default"
retry_regex = r"retrying at (\d+)$"
def __init__(self, storage: Storage, driver):
def __init__(self, storage: Storage, config: Config):
self.storage = storage
self.driver = driver
self.driver = config.webdriver
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config
def __str__(self):
return self.__class__.__name__
@@ -163,11 +165,11 @@ class Archiver(ABC):
def get_hash(self, filename):
with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes
logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
logger.debug(f'Hash algorithm is {self.hash_algorithm}')
if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
return hash.hexdigest()
@@ -206,10 +208,10 @@ class Archiver(ABC):
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
collection = key.replace(".wacz", "").replace("-", "")
cwd = os.getcwd()
browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
cmd = [
"docker", "run",
"-v", f"{cwd}/browsertrix:/crawls/",
"-v", f"{browsertrix_home}:/crawls/",
"-it",
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
@@ -220,13 +222,22 @@ class Archiver(ABC):
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", "90"
]
if not os.path.isdir(browsertrix_home):
os.mkdir(browsertrix_home)
if self.browsertrix.profile:
shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
cmd.extend(["--profile", "/crawls/profile.tar.gz"])
try:
logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
except Exception as e:
logger.error(f"wacz generation failed: {e}")
return
filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'application/zip'})

View File

@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import TelethonConfig
from configs import Config
from utils import getattr_or
@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
name = "telethon"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
def __init__(self, storage: Storage, driver, config: TelethonConfig):
super().__init__(storage, driver)
if config:
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
if config.telegram_config:
c = config.telegram_config
self.client = TelegramClient("./anon", c.api_id, c.api_hash)
self.bot_token = c.bot_token
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
"""

View File

@@ -5,7 +5,7 @@ from loguru import logger
from pytwitter import Api
from storages.base_storage import Storage
from configs import TwitterApiConfig
from configs import Config
from .base_archiver import ArchiveResult
from .twitter_archiver import TwitterArchiver
@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
class TwitterApiArchiver(TwitterArchiver):
name = "twitter_api"
def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
super().__init__(storage, driver)
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
c = config.twitter_config
if config.bearer_token:
self.api = Api(bearer_token=config.bearer_token)
elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
if c.bearer_token:
self.api = Api(bearer_token=c.bearer_token)
elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
self.api = Api(
consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)
def download(self, url, check_if_exists=False):
if not hasattr(self, "api"):

View File

@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig
from configs import Config
class VkArchiver(Archiver):
@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
def __init__(self, storage: Storage, driver, config: VkConfig):
super().__init__(storage, driver)
if config != None:
self.vks = VkScraper(config.username, config.password)
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
if config.vk_config != None:
self.vks = VkScraper(config.vk_config.username, config.vk_config.password)
def download(self, url, check_if_exists=False):
if not hasattr(self, "vks") or self.vks is None:

View File

@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig
from configs import Config
class WaybackArchiver(Archiver):
@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
"""
name = "wayback"
def __init__(self, storage: Storage, driver, config: WaybackConfig):
super(WaybackArchiver, self).__init__(storage, driver)
self.config = config
def __init__(self, storage: Storage, config: Config):
super(WaybackArchiver, self).__init__(storage, config)
self.config = config.wayback_config
self.seen_urls = {}
def download(self, url, check_if_exists=False):

View File

@@ -6,15 +6,16 @@ from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
from configs import Config
class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
def __init__(self, storage: Storage, driver, fb_cookie):
super().__init__(storage, driver)
self.fb_cookie = fb_cookie
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
self.fb_cookie = config.facebook_cookie
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)

View File

@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile
from loguru import logger
from slugify import slugify
from urllib.parse import quote
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
@@ -11,7 +12,7 @@ from storages import Storage
random.seed()
def update_sheet(gw, row, result: ArchiveResult):
def update_sheet(gw, row, url, result: ArchiveResult):
cell_updates = []
row_values = gw.get_row(row)
@@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult):
batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash)
batch_if_valid('wacz', result.wacz)
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
if result.timestamp is not None:
if type(result.timestamp) == int:
@@ -105,14 +107,14 @@ def process_sheet(c: Config):
# order matters, first to succeed excludes remaining
active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TiktokArchiver(storage, c.webdriver),
TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
TelethonArchiver(storage, c),
TiktokArchiver(storage, c),
TwitterApiArchiver(storage, c),
YoutubeDLArchiver(storage, c),
TelegramArchiver(storage, c),
TwitterArchiver(storage, c),
VkArchiver(storage, c),
WaybackArchiver(storage, c)
]
for archiver in active_archivers:
@@ -137,7 +139,7 @@ def process_sheet(c: Config):
logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
if result:
update_sheet(gw, row, result)
update_sheet(gw, row, url, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
except KeyboardInterrupt:

View File

@@ -0,0 +1,5 @@
from dataclasses import dataclass
from typing import Optional


@dataclass
class BrowsertrixConfig:
    """Options for the browsertrix-crawler WACZ snapshot step."""

    # Path to a browser profile archive on the host. When set, the archiver
    # copies it into the browsertrix working directory as "profile.tar.gz"
    # and passes it to the crawler via "--profile". It is None when the
    # "profile" key is absent from the execution config, so it must be
    # optional rather than a required str.
    profile: Optional[str] = None

View File

@@ -1,6 +1,5 @@
import argparse, yaml, json
from archivers.base_archiver import Archiver
import gspread
from loguru import logger
from selenium import webdriver
@@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from .twitter_api_config import TwitterApiConfig
from .browsertrix_config import BrowsertrixConfig
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
@@ -82,7 +82,13 @@ class Config:
)
self.webdriver = "not initialized"
Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
# browsertrix config
browsertrix_configs = execution.get("browsertrix", {})
self.browsertrix_config = BrowsertrixConfig(
profile=browsertrix_configs.get("profile")
)
self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
# ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {})
@@ -208,6 +214,7 @@ class Config:
update the folder in each of the storages
"""
self.folder = folder
logger.info(f"setting folder to {folder}")
# s3
if hasattr(self, "s3_config"): self.s3_config.folder = folder
if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
@@ -263,7 +270,7 @@ class Config:
"storage": self.storage,
"header": self.header,
"check_if_exists": self.check_if_exists,
"hash_algorithm": Archiver.HASH_ALGORITHM,
"hash_algorithm": self.hash_algorithm,
"save_logs": self.save_logs,
"selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None,

View File

@@ -8,7 +8,7 @@ secrets:
key: "s3 API key"
secret: "s3 API secret"
# use region format like such
endpoint_url: "https://{region}.digitaloceanspaces.com"
endpoint_url: "https://s3.{region}.amazonaws.com"
# use the bucket, region, and key (key is the archived file path generated when executing) in a format such as:
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
@@ -120,4 +120,5 @@ execution:
screenshot: screenshot
hash: hash
wacz: wacz
replaywebpage: replaywebpage

View File

@@ -75,4 +75,5 @@ class S3Storage(Storage):
extra_args['ContentType'] = "application/zip"
else:
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View File

@@ -21,7 +21,8 @@ class GWorksheet:
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash',
'wacz': 'wacz'
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
}
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):