diff --git a/.gitignore b/.gitignore index 8da75c3..e525a6a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,5 @@ local_archive/ vk_config*.json gd-token.json credentials.json -secrets/* \ No newline at end of file +secrets/* +browsertrix/* diff --git a/README.md b/README.md index 0b79cbb..b8f3c75 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ You also need: 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. -6. If you would like to take archival WACZ snapshots using browsertrix-crawler +6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) in addition to screenshots you will need to install Docker. ### Configuration file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 91cc25a..4ee3433 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from slugify import slugify +from configs import Config from storages import Storage from utils import mkdir_if_not_exists @@ -28,13 +29,14 @@ class ArchiveResult: hash: str = None class Archiver(ABC): - HASH_ALGORITHM="SHA-256" # can be overwritten by user configs name = "default" retry_regex = r"retrying at (\d+)$" - def __init__(self, storage: Storage, driver): + def __init__(self, storage: Storage, config: Config): self.storage = storage - self.driver = driver + self.driver = config.webdriver + self.hash_algorithm = config.hash_algorithm + self.browsertrix = config.browsertrix_config def __str__(self): return self.__class__.__name__ @@ -163,11 +165,11 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}') + logger.debug(f'Hash algorithm is {self.hash_algorithm}') - if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes) - elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes) - else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}") + if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes) + elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes) + else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}") return hash.hexdigest() @@ -206,10 +208,10 @@ class Archiver(ABC): key = self._get_key_from_url(url, ".wacz", append_datetime=True) collection = key.replace(".wacz", "").replace("-", "") - cwd = os.getcwd() + browsertrix_home = os.path.join(os.getcwd(), "browsertrix") cmd = [ "docker", "run", - "-v", f"{cwd}/browsertrix:/crawls/", + "-v", f"{browsertrix_home}:/crawls/", "-it", "webrecorder/browsertrix-crawler", "crawl", "--url", url, @@ -220,13 +222,22 @@ class Archiver(ABC): "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", "--behaviorTimeout", "90" ] + + if not os.path.isdir(browsertrix_home): + os.mkdir(browsertrix_home) + + if self.browsertrix.profile: + shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz")) + cmd.extend(["--profile", "/crawls/profile.tar.gz"]) + try: + logger.info(f"running browsertrix-crawler: {' '.join(cmd)}") subprocess.run(cmd, check=True) except Exception as e: logger.error(f"wacz generation failed: {e}") return - filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz") + filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'application/zip'}) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index f35e323..d47cdc5 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import TelethonConfig +from configs import Config from utils import getattr_or @@ -15,11 +15,12 @@ class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig): - super().__init__(storage, driver) - if config: - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + if config.telegram_config: + c = config.telegram_config + self.client = TelegramClient("./anon", c.api_id, c.api_hash) + self.bot_token = c.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 6aa1742..852df12 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -5,7 +5,7 @@ from loguru import logger from pytwitter import Api from storages.base_storage import Storage -from configs import TwitterApiConfig +from configs import Config from .base_archiver import ArchiveResult from .twitter_archiver import TwitterArchiver @@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver class TwitterApiArchiver(TwitterArchiver): name = "twitter_api" - def __init__(self, storage: Storage, driver, config: TwitterApiConfig): - super().__init__(storage, driver) + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + c = config.twitter_config - if config.bearer_token: - self.api = Api(bearer_token=config.bearer_token) - elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret: + if c.bearer_token: + self.api = Api(bearer_token=c.bearer_token) + elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret: self.api = Api( - consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret) + consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret) def download(self, url, check_if_exists=False): if not hasattr(self, "api"): diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index c448367..a3af9db 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import VkConfig +from configs import Config class VkArchiver(Archiver): @@ -17,10 +17,10 @@ class VkArchiver(Archiver): wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - def __init__(self, storage: Storage, driver, config: VkConfig): - super().__init__(storage, driver) - if config != None: - self.vks = VkScraper(config.username, config.password) + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + if config.vk_config != None: + self.vks = VkScraper(config.vk_config.username, config.vk_config.password) def download(self, url, check_if_exists=False): if not hasattr(self, "vks") or self.vks is None: diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index cf32874..4de2fa8 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import WaybackConfig +from configs import Config class WaybackArchiver(Archiver): @@ -15,9 +15,9 @@ class WaybackArchiver(Archiver): """ name = "wayback" - def __init__(self, storage: Storage, driver, config: WaybackConfig): - super(WaybackArchiver, self).__init__(storage, driver) - self.config = config + def __init__(self, storage: Storage, config: Config): + super(WaybackArchiver, self).__init__(storage, config) + self.config = config.wayback_config self.seen_urls = {} def download(self, url, check_if_exists=False): diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index c66378d..5d09442 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -6,15 +6,16 @@ from loguru import logger from .base_archiver import Archiver, ArchiveResult from storages import Storage +from configs import Config class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - def __init__(self, storage: Storage, driver, fb_cookie): - super().__init__(storage, driver) - self.fb_cookie = fb_cookie + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + self.fb_cookie = config.facebook_cookie def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) diff --git a/auto_archive.py b/auto_archive.py index 86d951b..d657061 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile from loguru import logger from slugify import slugify +from urllib.parse import quote from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url @@ -11,7 +12,7 @@ from storages import Storage random.seed() -def update_sheet(gw, row, result: ArchiveResult): +def update_sheet(gw, row, url, result: ArchiveResult): cell_updates = [] row_values = gw.get_row(row) @@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult): batch_if_valid('screenshot', result.screenshot) batch_if_valid('hash', result.hash) batch_if_valid('wacz', result.wacz) + batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') if result.timestamp is not None: if type(result.timestamp) == int: @@ -105,14 +107,14 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TiktokArchiver(storage, c.webdriver), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TelegramArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), - WaybackArchiver(storage, c.webdriver, c.wayback_config) + TelethonArchiver(storage, c), + TiktokArchiver(storage, c), + TwitterApiArchiver(storage, c), + YoutubeDLArchiver(storage, c), + TelegramArchiver(storage, c), + TwitterArchiver(storage, c), + VkArchiver(storage, c), + WaybackArchiver(storage, c) ] for archiver in active_archivers: @@ -137,7 +139,7 @@ def process_sheet(c: Config): logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: - update_sheet(gw, row, result) + update_sheet(gw, row, url, result) else: gw.set_cell(row, 'status', 'failed: no archiver') except KeyboardInterrupt: diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py new file mode 100644 index 0000000..8b30dac --- /dev/null +++ b/configs/browsertrix_config.py @@ -0,0 +1,5 @@ +from dataclasses import dataclass + +@dataclass +class BrowsertrixConfig: + profile: str diff --git a/configs/config.py b/configs/config.py index 0d11467..4124236 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,6 +1,5 @@ import argparse, yaml, json -from archivers.base_archiver import Archiver import gspread from loguru import logger from selenium import webdriver @@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig from .selenium_config import SeleniumConfig from .vk_config import VkConfig from .twitter_api_config import TwitterApiConfig +from .browsertrix_config import BrowsertrixConfig from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -82,7 +82,13 @@ class Config: ) self.webdriver = "not initialized" - Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM) + # browsertrix config + browsertrix_configs = execution.get("browsertrix", {}) + self.browsertrix_config = BrowsertrixConfig( + profile=browsertrix_configs.get("profile") + ) + + self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -208,6 +214,7 @@ class Config: update the folder in each of the storages """ self.folder = folder + logger.info(f"setting folder to {folder}") # s3 if hasattr(self, "s3_config"): self.s3_config.folder = folder if hasattr(self, "s3_storage"): self.s3_storage.folder = folder @@ -263,7 +270,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, - "hash_algorithm": Archiver.HASH_ALGORITHM, + "hash_algorithm": self.hash_algorithm, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index c9dd323..b736eca 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -8,7 +8,7 @@ secrets: key: "s3 API key" secret: "s3 API secret" # use region format like such - endpoint_url: "https://{region}.digitaloceanspaces.com" + endpoint_url: "https://s3.{region}.amazonaws.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -120,4 +120,5 @@ execution: screenshot: screenshot hash: hash wacz: wacz + replaywebpage: replaywebpage diff --git a/storages/s3_storage.py b/storages/s3_storage.py index fa8e0b9..3dee2dc 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -75,4 +75,5 @@ class S3Storage(Storage): extra_args['ContentType'] = "application/zip" else: extra_args['ContentType'] = mimetypes.guess_type(key)[0] + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index eda2cc6..8fe640e 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -21,7 +21,8 @@ class GWorksheet: 'duration': 'duration', 'screenshot': 'screenshot', 'hash': 'hash', - 'wacz': 'wacz' + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', } def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):