Add browsertrix profile config option

This commit adds a browsertrix profile option to the configuration. To avoid having to pass the browsertrix config to every Archiver individually, the Archiver constructors (including the base class) were modified to accept a Storage and a Config instance. Some of the constructors then pick out the pieces they need from the Config, in addition to calling the parent constructor. To avoid the circular import that this created, the Config object now defines the default hash function to use, rather than having it be a static property of the Archiver class.
2026-06-08 03:18:28 +03:00 · 2022-10-11 16:14:25 -04:00
parent 3b87dffe6b
commit c34fb9cf10
14 changed files with 84 additions and 52 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,5 @@ local_archive/
 vk_config*.json
 gd-token.json
 credentials.json
-secrets/*
+secrets/*
+browsertrix/*
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
-6. If you would like to take archival WACZ snapshots using browsertrix-crawler
+6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
   in addition to screenshots you will need to install Docker.

 ### Configuration file
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
 from slugify import slugify

+from configs import Config
 from storages import Storage
 from utils import mkdir_if_not_exists

@@ -28,13 +29,14 @@ class ArchiveResult:
    hash: str = None

 class Archiver(ABC):
-    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
    name = "default"
    retry_regex = r"retrying at (\d+)$"

-    def __init__(self, storage: Storage, driver):
+    def __init__(self, storage: Storage, config: Config):
        self.storage = storage
-        self.driver = driver
+        self.driver = config.webdriver
+        self.hash_algorithm = config.hash_algorithm
+        self.browsertrix = config.browsertrix_config

    def __str__(self):
        return self.__class__.__name__
@@ -163,11 +165,11 @@ class Archiver(ABC):
    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
-            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            logger.debug(f'Hash algorithm is {self.hash_algorithm}')

-            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
-            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
-            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+            if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")

        return hash.hexdigest()

@@ -206,10 +208,10 @@ class Archiver(ABC):
        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
        collection = key.replace(".wacz", "").replace("-", "")

-        cwd = os.getcwd()
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix")
        cmd = [
            "docker", "run",
-            "-v", f"{cwd}/browsertrix:/crawls/",
+            "-v", f"{browsertrix_home}:/crawls/",
            "-it",
            "webrecorder/browsertrix-crawler", "crawl",
            "--url", url,
@@ -220,13 +222,22 @@ class Archiver(ABC):
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", "90"
        ]
+
+        if not os.path.isdir(browsertrix_home):
+            os.mkdir(browsertrix_home)
+
+        if self.browsertrix.profile:
+            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
+            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+
        try:
+            logger.info(f"running browsertrix-crawler: {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
        except Exception as e:
            logger.error(f"wacz generation failed: {e}")
            return

-        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")

        self.storage.upload(filename, key, extra_args={
                            'ACL': 'public-read', 'ContentType': 'application/zip'})
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import TelethonConfig
+from configs import Config
 from utils import getattr_or


@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
    name = "telethon"
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")

-    def __init__(self, storage: Storage, driver, config: TelethonConfig):
-        super().__init__(storage, driver)
-        if config:
-            self.client = TelegramClient("./anon", config.api_id, config.api_hash)
-            self.bot_token = config.bot_token
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        if config.telegram_config:
+            c = config.telegram_config
+            self.client = TelegramClient("./anon", c.api_id, c.api_hash)
+            self.bot_token = c.bot_token

    def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
        """
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -5,7 +5,7 @@ from loguru import logger
 from pytwitter import Api

 from storages.base_storage import Storage
-from configs import TwitterApiConfig
+from configs import Config
 from .base_archiver import ArchiveResult
 from .twitter_archiver import TwitterArchiver

@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
    name = "twitter_api"

-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        c = config.twitter_config

-        if config.bearer_token:
-            self.api = Api(bearer_token=config.bearer_token)
-        elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
+        if c.bearer_token:
+            self.api = Api(bearer_token=c.bearer_token)
+        elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
            self.api = Api(
-                consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
+                consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)

    def download(self, url, check_if_exists=False):
        if not hasattr(self, "api"):
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import VkConfig
+from configs import Config


 class VkArchiver(Archiver):
@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
    wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
    photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")

-    def __init__(self, storage: Storage, driver, config: VkConfig):
-        super().__init__(storage, driver)
-        if config != None:
-            self.vks = VkScraper(config.username, config.password)
+    def __init__(self, storage: Storage, config: Config): 
+        super().__init__(storage, config)
+        if config.vk_config != None:
+            self.vks = VkScraper(config.vk_config.username, config.vk_config.password)

    def download(self, url, check_if_exists=False):
        if not hasattr(self, "vks") or self.vks is None:
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import WaybackConfig
+from configs import Config


 class WaybackArchiver(Archiver):
@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
    """
    name = "wayback"

-    def __init__(self, storage: Storage, driver, config: WaybackConfig):
-        super(WaybackArchiver, self).__init__(storage, driver)
-        self.config = config
+    def __init__(self, storage: Storage, config: Config):
+        super(WaybackArchiver, self).__init__(storage, config)
+        self.config = config.wayback_config
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -6,15 +6,16 @@ from loguru import logger

 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
+from configs import Config


 class YoutubeDLArchiver(Archiver):
    name = "youtube_dl"
    ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

-    def __init__(self, storage: Storage, driver, fb_cookie):
-        super().__init__(storage, driver)
-        self.fb_cookie = fb_cookie
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        self.fb_cookie = config.facebook_cookie

    def download(self, url, check_if_exists=False):
        netloc = self.get_netloc(url)
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile

 from loguru import logger
 from slugify import slugify
+from urllib.parse import quote

 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
@@ -11,7 +12,7 @@ from storages import Storage
 random.seed()


-def update_sheet(gw, row, result: ArchiveResult):
+def update_sheet(gw, row, url, result: ArchiveResult):
    cell_updates = []
    row_values = gw.get_row(row)

@@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult):
    batch_if_valid('screenshot', result.screenshot)
    batch_if_valid('hash', result.hash)
    batch_if_valid('wacz', result.wacz)
+    batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')

    if result.timestamp is not None:
        if type(result.timestamp) == int:
@@ -105,14 +107,14 @@ def process_sheet(c: Config):

                # order matters, first to succeed excludes remaining
                active_archivers = [
-                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                    TiktokArchiver(storage, c.webdriver),
-                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
-                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
-                    TelegramArchiver(storage, c.webdriver),
-                    TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage, c.webdriver, c.vk_config),
-                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
+                    TelethonArchiver(storage, c),
+                    TiktokArchiver(storage, c),
+                    TwitterApiArchiver(storage, c),
+                    YoutubeDLArchiver(storage, c),
+                    TelegramArchiver(storage, c),
+                    TwitterArchiver(storage, c),
+                    VkArchiver(storage, c),
+                    WaybackArchiver(storage, c)
                ]

                for archiver in active_archivers:
@@ -137,7 +139,7 @@ def process_sheet(c: Config):
                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

                if result:
-                    update_sheet(gw, row, result)
+                    update_sheet(gw, row, url, result)
                else:
                    gw.set_cell(row, 'status', 'failed: no archiver')
            except KeyboardInterrupt:
--- a/configs/browsertrix_config.py
+++ b/configs/browsertrix_config.py
@@ -0,0 +1,5 @@
+from dataclasses import dataclass
+
+@dataclass
+class BrowsertrixConfig:
+    profile: str
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,6 +1,5 @@

 import argparse, yaml, json
-from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig
 from .selenium_config import SeleniumConfig
 from .vk_config import VkConfig
 from .twitter_api_config import TwitterApiConfig
+from .browsertrix_config import BrowsertrixConfig
 from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig


@@ -82,7 +82,13 @@ class Config:
        )
        self.webdriver = "not initialized"

-        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+        # browsertrix config
+        browsertrix_configs = execution.get("browsertrix", {})
+        self.browsertrix_config = BrowsertrixConfig(
+            profile=browsertrix_configs.get("profile")
+        )
+
+        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")

        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets", {})
@@ -208,6 +214,7 @@ class Config:
        update the folder in each of the storages
        """
        self.folder = folder
+        logger.info(f"setting folder to {folder}")
        # s3
        if hasattr(self, "s3_config"): self.s3_config.folder = folder
        if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
@@ -263,7 +270,7 @@ class Config:
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
-            "hash_algorithm": Archiver.HASH_ALGORITHM,
+            "hash_algorithm": self.hash_algorithm,
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            "selenium_webdriver": self.webdriver != None,
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -8,7 +8,7 @@ secrets:
    key: "s3 API key"
    secret: "s3 API secret"
    # use region format like such
-    endpoint_url: "https://{region}.digitaloceanspaces.com"
+    endpoint_url: "https://s3.{region}.amazonaws.com"
    #use bucket, region, and key (key is the archived file path generated when executing) format like such as:
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private:true S3 urls will not be readable online
@@ -120,4 +120,5 @@ execution:
    screenshot: screenshot
    hash: hash
    wacz: wacz
+    replaywebpage: replaywebpage

--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -75,4 +75,5 @@ class S3Storage(Storage):
            extra_args['ContentType'] = "application/zip"
        else:
            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+
        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -21,7 +21,8 @@ class GWorksheet:
        'duration': 'duration',
        'screenshot': 'screenshot',
        'hash': 'hash',
-        'wacz': 'wacz'
+        'wacz': 'wacz',
+        'replaywebpage': 'replaywebpage',
    }

    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):