From 3b87dffe6bdee04c2169c5603730d142b6baae4b Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Sun, 25 Sep 2022 19:40:20 +0000 Subject: [PATCH 1/5] Add browsertrix-crawler capture The [browsertrix-crawler] utility is a browser-based crawler that can crawl one or more pages. browsertrix-crawler creates archives in the [WACZ] format which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc.) which can then be replayed using the [ReplayWeb.page] web component, or unzipped to get the original WARC data (the ISO standard format used by the Internet Archive Wayback Machine). This PR adds browsertrix-crawler to archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, DigitalOcean, etc.) using the client-side ReplayWeb page. You can see an example of the spreadsheet here: https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0 browsertrix-crawler requires Docker to be installed. If Docker is not installed an error message will be logged and things continue as normal. [browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler [WACZ]: https://specs.webrecorder.net/wacz/latest/ [ReplayWeb.page]: https://replayweb.page --- README.md | 2 ++ archivers/base_archiver.py | 38 ++++++++++++++++++++++++++++++++- archivers/telegram_archiver.py | 3 ++- archivers/tiktok_archiver.py | 3 ++- archivers/twitter_archiver.py | 6 ++++-- archivers/wayback_archiver.py | 11 +++++----- archivers/youtubedl_archiver.py | 3 ++- auto_archive.py | 1 + example.config.yaml | 1 + storages/s3_storage.py | 5 ++++- utils/gworksheet.py | 3 ++- 11 files changed, 63 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ca5e06a..0b79cbb 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ You also need: 3. 
[firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. +6. If you would like to take archival WACZ snapshots using browsertrix-crawler + in addition to screenshots you will need to install Docker. ### Configuration file Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 902f626..91cc25a 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,4 +1,4 @@ -import os, datetime, shutil, hashlib, time, requests, re, mimetypes +import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -24,6 +24,7 @@ class ArchiveResult: title: str = None timestamp: datetime.datetime = None screenshot: str = None + wacz: str = None hash: str = None class Archiver(ABC): @@ -200,6 +201,41 @@ class Archiver(ABC): return self.storage.get_cdn_url(key) + def get_wacz(self, url): + logger.debug(f"getting wacz for {url}") + key = self._get_key_from_url(url, ".wacz", append_datetime=True) + collection = key.replace(".wacz", "").replace("-", "") + + cwd = os.getcwd() + cmd = [ + "docker", "run", + "-v", f"{cwd}/browsertrix:/crawls/", + "-it", + "webrecorder/browsertrix-crawler", "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--collection", collection, + "--behaviors", 
"autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", "90" + ] + try: + subprocess.run(cmd, check=True) + except Exception as e: + logger.error(f"wacz generation failed: {e}") + return + + filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz") + + self.storage.upload(filename, key, extra_args={ + 'ACL': 'public-read', 'ContentType': 'application/zip'}) + + # TODO: remove wacz collection, waiting for resolution on: + # https://github.com/webrecorder/browsertrix-crawler/issues/170 + + return self.storage.get_cdn_url(key) + def get_thumbnails(self, filename, key, duration=None): thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep key_folder = key.split('.')[0] + os.path.sep diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 0b6e777..d98f761 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -28,6 +28,7 @@ class TelegramArchiver(Archiver): url += "?embed=1" screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) t = requests.get(url, headers=headers) s = BeautifulSoup(t.content, 'html.parser') @@ -46,7 +47,7 @@ class TelegramArchiver(Archiver): time_elements = s.find_all('time') timestamp = time_elements[0].get('datetime') if len(time_elements) else None - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz) video_url = video.get('src') video_id = video_url.split('/')[-1].split('?')[0] diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 8100bb1..bdaad52 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -48,6 +48,7 @@ class TiktokArchiver(Archiver): hash = self.get_hash(filename) screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) 
try: os.remove(filename) except FileNotFoundError: @@ -57,7 +58,7 @@ class TiktokArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), - timestamp=timestamp, hash=hash, screenshot=screenshot) + timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) except tiktok_downloader.Except.InvalidUrl as e: status = 'Invalid URL' diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 8f646fd..81f20ab 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -39,8 +39,9 @@ class TwitterArchiver(Archiver): if tweet.media is None: logger.debug(f'No media found, archiving tweet text only') screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json())) - return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot) + return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz) urls = [] @@ -59,8 +60,9 @@ class TwitterArchiver(Archiver): page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json()) screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz) def download_alternative(self, url, tweet_id): # https://stackoverflow.com/a/71867055/6196010 diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index f46d1cb..cf32874 100644 --- 
a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -28,6 +28,8 @@ class WaybackArchiver(Archiver): if url in self.seen_urls: return self.seen_urls[url] screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) + logger.debug(f"POSTing {url=} to web.archive.org") ia_headers = { "Accept": "application/json", @@ -37,10 +39,10 @@ class WaybackArchiver(Archiver): if r.status_code != 200: logger.warning(f"Internet archive failed with status of {r.status_code}") - return ArchiveResult(status="Internet archive failed", screenshot=screenshot) + return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz) if 'job_id' not in r.json() and 'message' in r.json(): - return self.custom_retry(r.json(), screenshot=screenshot) + return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz) job_id = r.json()['job_id'] logger.debug(f"GETting status for {job_id=} on {url=}") @@ -63,7 +65,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - return self.custom_retry(status_json, screenshot=screenshot) + return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" @@ -75,8 +77,7 @@ class WaybackArchiver(Archiver): title = 'Could not get title' except: title = "Could not get title" - screenshot = self.get_screenshot(url) - self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot) + self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz) return self.seen_urls[url] def custom_retry(self, json_data, **kwargs): diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 7990131..c66378d 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -93,6 +93,7 @@ class YoutubeDLArchiver(Archiver): hash 
= self.get_hash(filename) screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) # get duration duration = info.get('duration') @@ -113,4 +114,4 @@ class YoutubeDLArchiver(Archiver): timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot) + title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/auto_archive.py b/auto_archive.py index f12b9c4..86d951b 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -30,6 +30,7 @@ def update_sheet(gw, row, result: ArchiveResult): batch_if_valid('duration', result.duration, str(result.duration)) batch_if_valid('screenshot', result.screenshot) batch_if_valid('hash', result.hash) + batch_if_valid('wacz', result.wacz) if result.timestamp is not None: if type(result.timestamp) == int: diff --git a/example.config.yaml b/example.config.yaml index acbe52c..c9dd323 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -119,4 +119,5 @@ execution: duration: duration screenshot: screenshot hash: hash + wacz: wacz diff --git a/storages/s3_storage.py b/storages/s3_storage.py index b124aae..fa8e0b9 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -71,5 +71,8 @@ class S3Storage(Storage): extra_args = kwargs.get("extra_args", {}) else: extra_args = kwargs.get("extra_args", {'ACL': 'public-read'}) - extra_args['ContentType'] = mimetypes.guess_type(key)[0] + if key.endswith('.wacz'): + extra_args['ContentType'] = "application/zip" + else: + extra_args['ContentType'] = mimetypes.guess_type(key)[0] self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index 
0e05ab6..eda2cc6 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -20,7 +20,8 @@ class GWorksheet: 'title': 'upload title', 'duration': 'duration', 'screenshot': 'screenshot', - 'hash': 'hash' + 'hash': 'hash', + 'wacz': 'wacz' } def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1): From c34fb9cf105648ba59aa40139a066c4fd8d5420d Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Tue, 11 Oct 2022 16:14:25 -0400 Subject: [PATCH 2/5] Add browsertrix profile config option This commit adds a browsertrix profile option to the configuration. In order to not require the passing of the browsertrix config to every Archiver, the Archiver constructors (including the base) were modified to accept a Storage and Config instance. Some of the constructors then pick out the pieces they need from the Config, in addition to calling the parent constructor. In order to avoid the circular import that this created, the Config object now defines the default hash function to use, rather than having it be a static property of the Archiver class. 
--- .gitignore | 3 ++- README.md | 2 +- archivers/base_archiver.py | 31 +++++++++++++++++++++---------- archivers/telethon_archiver.py | 13 +++++++------ archivers/twitter_api_archiver.py | 15 ++++++++------- archivers/vk_archiver.py | 10 +++++----- archivers/wayback_archiver.py | 8 ++++---- archivers/youtubedl_archiver.py | 7 ++++--- auto_archive.py | 22 ++++++++++++---------- configs/browsertrix_config.py | 5 +++++ configs/config.py | 13 ++++++++++--- example.config.yaml | 3 ++- storages/s3_storage.py | 1 + utils/gworksheet.py | 3 ++- 14 files changed, 84 insertions(+), 52 deletions(-) create mode 100644 configs/browsertrix_config.py diff --git a/.gitignore b/.gitignore index 8da75c3..e525a6a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,5 @@ local_archive/ vk_config*.json gd-token.json credentials.json -secrets/* \ No newline at end of file +secrets/* +browsertrix/* diff --git a/README.md b/README.md index 0b79cbb..b8f3c75 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ You also need: 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. -6. If you would like to take archival WACZ snapshots using browsertrix-crawler +6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) in addition to screenshots you will need to install Docker. 
### Configuration file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 91cc25a..4ee3433 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from slugify import slugify +from configs import Config from storages import Storage from utils import mkdir_if_not_exists @@ -28,13 +29,14 @@ class ArchiveResult: hash: str = None class Archiver(ABC): - HASH_ALGORITHM="SHA-256" # can be overwritten by user configs name = "default" retry_regex = r"retrying at (\d+)$" - def __init__(self, storage: Storage, driver): + def __init__(self, storage: Storage, config: Config): self.storage = storage - self.driver = driver + self.driver = config.webdriver + self.hash_algorithm = config.hash_algorithm + self.browsertrix = config.browsertrix_config def __str__(self): return self.__class__.__name__ @@ -163,11 +165,11 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}') + logger.debug(f'Hash algorithm is {self.hash_algorithm}') - if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes) - elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes) - else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}") + if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes) + elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes) + else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}") return hash.hexdigest() @@ -206,10 +208,10 @@ class Archiver(ABC): key = self._get_key_from_url(url, ".wacz", append_datetime=True) collection = key.replace(".wacz", "").replace("-", "") - cwd = os.getcwd() + browsertrix_home = os.path.join(os.getcwd(), "browsertrix") cmd = [ "docker", "run", - "-v", f"{cwd}/browsertrix:/crawls/", + "-v", 
f"{browsertrix_home}:/crawls/", "-it", "webrecorder/browsertrix-crawler", "crawl", "--url", url, @@ -220,13 +222,22 @@ class Archiver(ABC): "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", "--behaviorTimeout", "90" ] + + if not os.path.isdir(browsertrix_home): + os.mkdir(browsertrix_home) + + if self.browsertrix.profile: + shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz")) + cmd.extend(["--profile", "/crawls/profile.tar.gz"]) + try: + logger.info(f"running browsertrix-crawler: {' '.join(cmd)}") subprocess.run(cmd, check=True) except Exception as e: logger.error(f"wacz generation failed: {e}") return - filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz") + filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'application/zip'}) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index f35e323..d47cdc5 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import TelethonConfig +from configs import Config from utils import getattr_or @@ -15,11 +15,12 @@ class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig): - super().__init__(storage, driver) - if config: - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + if config.telegram_config: + c = config.telegram_config + self.client = TelegramClient("./anon", c.api_id, c.api_hash) + self.bot_token = c.bot_token def 
_get_media_posts_in_group(self, chat, original_post, max_amp=10): """ diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 6aa1742..852df12 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -5,7 +5,7 @@ from loguru import logger from pytwitter import Api from storages.base_storage import Storage -from configs import TwitterApiConfig +from configs import Config from .base_archiver import ArchiveResult from .twitter_archiver import TwitterArchiver @@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver class TwitterApiArchiver(TwitterArchiver): name = "twitter_api" - def __init__(self, storage: Storage, driver, config: TwitterApiConfig): - super().__init__(storage, driver) + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + c = config.twitter_config - if config.bearer_token: - self.api = Api(bearer_token=config.bearer_token) - elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret: + if c.bearer_token: + self.api = Api(bearer_token=c.bearer_token) + elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret: self.api = Api( - consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret) + consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret) def download(self, url, check_if_exists=False): if not hasattr(self, "api"): diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index c448367..a3af9db 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import VkConfig +from configs import Config class VkArchiver(Archiver): @@ -17,10 +17,10 @@ 
class VkArchiver(Archiver): wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - def __init__(self, storage: Storage, driver, config: VkConfig): - super().__init__(storage, driver) - if config != None: - self.vks = VkScraper(config.username, config.password) + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + if config.vk_config != None: + self.vks = VkScraper(config.vk_config.username, config.vk_config.password) def download(self, url, check_if_exists=False): if not hasattr(self, "vks") or self.vks is None: diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index cf32874..4de2fa8 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from storages import Storage from .base_archiver import Archiver, ArchiveResult -from configs import WaybackConfig +from configs import Config class WaybackArchiver(Archiver): @@ -15,9 +15,9 @@ class WaybackArchiver(Archiver): """ name = "wayback" - def __init__(self, storage: Storage, driver, config: WaybackConfig): - super(WaybackArchiver, self).__init__(storage, driver) - self.config = config + def __init__(self, storage: Storage, config: Config): + super(WaybackArchiver, self).__init__(storage, config) + self.config = config.wayback_config self.seen_urls = {} def download(self, url, check_if_exists=False): diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index c66378d..5d09442 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -6,15 +6,16 @@ from loguru import logger from .base_archiver import Archiver, ArchiveResult from storages import Storage +from configs import Config class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - def __init__(self, storage: Storage, driver, fb_cookie): - 
super().__init__(storage, driver) - self.fb_cookie = fb_cookie + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + self.fb_cookie = config.facebook_cookie def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) diff --git a/auto_archive.py b/auto_archive.py index 86d951b..d657061 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile from loguru import logger from slugify import slugify +from urllib.parse import quote from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url @@ -11,7 +12,7 @@ from storages import Storage random.seed() -def update_sheet(gw, row, result: ArchiveResult): +def update_sheet(gw, row, url, result: ArchiveResult): cell_updates = [] row_values = gw.get_row(row) @@ -31,6 +32,7 @@ def update_sheet(gw, row, result: ArchiveResult): batch_if_valid('screenshot', result.screenshot) batch_if_valid('hash', result.hash) batch_if_valid('wacz', result.wacz) + batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') if result.timestamp is not None: if type(result.timestamp) == int: @@ -105,14 +107,14 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TiktokArchiver(storage, c.webdriver), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TelegramArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), - WaybackArchiver(storage, c.webdriver, c.wayback_config) + TelethonArchiver(storage, c), + TiktokArchiver(storage, c), + 
TwitterApiArchiver(storage, c), + YoutubeDLArchiver(storage, c), + TelegramArchiver(storage, c), + TwitterArchiver(storage, c), + VkArchiver(storage, c), + WaybackArchiver(storage, c) ] for archiver in active_archivers: @@ -137,7 +139,7 @@ def process_sheet(c: Config): logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: - update_sheet(gw, row, result) + update_sheet(gw, row, url, result) else: gw.set_cell(row, 'status', 'failed: no archiver') except KeyboardInterrupt: diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py new file mode 100644 index 0000000..8b30dac --- /dev/null +++ b/configs/browsertrix_config.py @@ -0,0 +1,5 @@ +from dataclasses import dataclass + +@dataclass +class BrowsertrixConfig: + profile: str diff --git a/configs/config.py b/configs/config.py index 0d11467..4124236 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,6 +1,5 @@ import argparse, yaml, json -from archivers.base_archiver import Archiver import gspread from loguru import logger from selenium import webdriver @@ -13,6 +12,7 @@ from .telethon_config import TelethonConfig from .selenium_config import SeleniumConfig from .vk_config import VkConfig from .twitter_api_config import TwitterApiConfig +from .browsertrix_config import BrowsertrixConfig from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -82,7 +82,13 @@ class Config: ) self.webdriver = "not initialized" - Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM) + # browsertrix config + browsertrix_configs = execution.get("browsertrix", {}) + self.browsertrix_config = BrowsertrixConfig( + profile=browsertrix_configs.get("profile") + ) + + self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -208,6 +214,7 @@ class Config: update the folder in each of the 
storages """ self.folder = folder + logger.info(f"setting folder to {folder}") # s3 if hasattr(self, "s3_config"): self.s3_config.folder = folder if hasattr(self, "s3_storage"): self.s3_storage.folder = folder @@ -263,7 +270,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, - "hash_algorithm": Archiver.HASH_ALGORITHM, + "hash_algorithm": self.hash_algorithm, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index c9dd323..b736eca 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -8,7 +8,7 @@ secrets: key: "s3 API key" secret: "s3 API secret" # use region format like such - endpoint_url: "https://{region}.digitaloceanspaces.com" + endpoint_url: "https://s3.{region}.amazonaws.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -120,4 +120,5 @@ execution: screenshot: screenshot hash: hash wacz: wacz + replaywebpage: replaywebpage diff --git a/storages/s3_storage.py b/storages/s3_storage.py index fa8e0b9..3dee2dc 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -75,4 +75,5 @@ class S3Storage(Storage): extra_args['ContentType'] = "application/zip" else: extra_args['ContentType'] = mimetypes.guess_type(key)[0] + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index eda2cc6..8fe640e 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -21,7 +21,8 @@ class GWorksheet: 'duration': 'duration', 'screenshot': 'screenshot', 'hash': 'hash', - 'wacz': 'wacz' + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', } def __init__(self, worksheet, columns=COLUMN_NAMES, 
header_row=1): From 20ca50dc90cbde60055d2ed3e7c643b5bc19c9af Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Tue, 11 Oct 2022 16:49:19 -0400 Subject: [PATCH 3/5] Clean up browsertrix-crawler files Remove any local browsertrix-crawler files after the WACZ has been copied to storage. Note: until the fix for the issue below is released on DockerHub, the local files cannot be deleted, because Docker on Linux creates them as root: https://github.com/webrecorder/browsertrix-crawler/issues/170 The code catches the resulting PermissionError and logs a warning instead of failing and losing the work that has been completed. --- archivers/base_archiver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 4ee3433..ea172f8 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -231,10 +231,10 @@ class Archiver(ABC): cmd.extend(["--profile", "/crawls/profile.tar.gz"]) try: - logger.info(f"running browsertrix-crawler: {' '.join(cmd)}") + logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") subprocess.run(cmd, check=True) except Exception as e: - logger.error(f"wacz generation failed: {e}") + logger.error(f"WACZ generation failed: {e}") return filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") @@ -242,8 +242,11 @@ class Archiver(ABC): self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'application/zip'}) - # TODO: remove wacz collection, waiting for resolution on: - # https://github.com/webrecorder/browsertrix-crawler/issues/170 + # clean up the local browsertrix files + try: + shutil.rmtree(browsertrix_home) + except PermissionError: + logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}") return self.storage.get_cdn_url(key) From dc0ca8bdd60e1480d54c826a63e34db85aeb3ffb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 17 Oct 2022 
14:06:50 +0100 Subject: [PATCH 4/5] adds browsertrix to all archivers flows --- archivers/telegram_archiver.py | 2 +- archivers/telethon_archiver.py | 7 ++++--- archivers/twitter_api_archiver.py | 3 ++- archivers/twitter_archiver.py | 3 ++- archivers/vk_archiver.py | 3 ++- archivers/wayback_archiver.py | 2 +- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index d98f761..026bdd0 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -86,4 +86,4 @@ class TelegramArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot) + duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index d47cdc5..9f9bbbf 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -74,6 +74,7 @@ class TelethonArchiver(Archiver): logger.debug(f'got {len(media_posts)=} for {url=}') screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) if len(media_posts) > 0: key = self.get_html_key(url) @@ -81,7 +82,7 @@ class TelethonArchiver(Archiver): if check_if_exists and self.storage.exists(key): # only s3 storage supports storage.exists as not implemented on gd cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) + return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) key_thumb, thumb_index = None, None group_id = post.grouped_id if post.grouped_id is not None else post.id @@ -120,7 
+121,7 @@ class TelethonArchiver(Archiver): page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index) + return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 852df12..454cfe2 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -70,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver): }, ensure_ascii=False, indent=4) screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 81f20ab..b868af5 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -85,8 +85,9 @@ class TwitterArchiver(Archiver): timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") screenshot 
= self.get_screenshot(url) + wacz = self.get_wacz(url) page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"]) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz) def choose_variant(self, variants): # choosing the highest quality possible diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index a3af9db..91b8354 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -70,4 +70,5 @@ class VkArchiver(Archiver): page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail) # # if multiple wall/photos/videos are present the screenshot will only grab the 1st screenshot = self.get_screenshot(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title) + wacz = self.get_wacz(url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 4de2fa8..e0ede90 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -61,7 +61,7 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot) + return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz) status_json = status_r.json() if 
status_json['status'] != 'success': From 57464f1506e0b4ccd50bbe081f92abeb8ae583e8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 17 Oct 2022 14:07:31 +0100 Subject: [PATCH 5/5] refactors for edges in browsertrix and s3 upload, adds timeout parameter --- .gitignore | 1 + README.md | 2 +- archivers/base_archiver.py | 9 ++++----- configs/browsertrix_config.py | 1 + configs/config.py | 9 ++++++--- example.config.yaml | 9 ++++++++- storages/s3_storage.py | 17 +++++++++-------- 7 files changed, 30 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index e525a6a..4d19b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ gd-token.json credentials.json secrets/* browsertrix/* +browsertrix-tmp/* \ No newline at end of file diff --git a/README.md b/README.md index b8f3c75..9e77d19 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ You also need: 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. 6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) - in addition to screenshots you will need to install Docker. + in addition to screenshots you will need to install [Docker](https://www.docker.com/). ### Configuration file Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. 
Here is the current result from running the `python auto_archive.py --help`: diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index ea172f8..82d705a 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -198,17 +198,16 @@ class Archiver(ABC): logger.info("TimeoutException loading page for screenshot") self.driver.save_screenshot(filename) - self.storage.upload(filename, key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'image/png'}) + self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) return self.storage.get_cdn_url(key) def get_wacz(self, url): logger.debug(f"getting wacz for {url}") key = self._get_key_from_url(url, ".wacz", append_datetime=True) - collection = key.replace(".wacz", "").replace("-", "") + collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) - browsertrix_home = os.path.join(os.getcwd(), "browsertrix") + browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp") cmd = [ "docker", "run", "-v", f"{browsertrix_home}:/crawls/", @@ -220,7 +219,7 @@ class Archiver(ABC): "--text", "--collection", collection, "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", "90" + "--behaviorTimeout", str(self.browsertrix.timeout_seconds) ] if not os.path.isdir(browsertrix_home): diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py index 8b30dac..1039da3 100644 --- a/configs/browsertrix_config.py +++ b/configs/browsertrix_config.py @@ -3,3 +3,4 @@ from dataclasses import dataclass @dataclass class BrowsertrixConfig: profile: str + timeout_seconds: str diff --git a/configs/config.py b/configs/config.py index 4124236..beff612 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,4 @@ - -import argparse, yaml, json +import argparse, yaml, json, os import gspread from loguru import logger from selenium import webdriver @@ -84,8 +83,11 @@ class Config: # browsertrix config browsertrix_configs = 
execution.get("browsertrix", {}) + if len(browsertrix_profile := browsertrix_configs.get("profile", "")): + browsertrix_profile = os.path.abspath(browsertrix_profile) self.browsertrix_config = BrowsertrixConfig( - profile=browsertrix_configs.get("profile") + profile=browsertrix_profile, + timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") ) self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") @@ -271,6 +273,7 @@ class Config: "header": self.header, "check_if_exists": self.check_if_exists, "hash_algorithm": self.hash_algorithm, + "browsertrix_config": asdict(self.browsertrix_config), "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index b736eca..a8138af 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -8,7 +8,8 @@ secrets: key: "s3 API key" secret: "s3 API secret" # use region format like such - endpoint_url: "https://s3.{region}.amazonaws.com" + endpoint_url: "https://{region}.digitaloceanspaces.com" + # endpoint_url: "https://s3.{region}.amazonaws.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -101,6 +102,11 @@ execution: timeout_seconds: 120 window_width: 1400 window_height: 2000 + + # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) + browsertrix: + profile: "./browsertrix/crawls/profile.tar.gz" + timeout_seconds: 90 # defaults to 90s # puts execution logs into /logs folder, defaults to false save_logs: true # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" @@ -120,5 +126,6 @@ execution: screenshot: screenshot hash: hash wacz: wacz + # if you want the replaypage to work, 
make sure to allow CORS on your bucket replaywebpage: replaywebpage diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 3dee2dc..563d2ea 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -67,13 +67,14 @@ class S3Storage(Storage): return False def uploadf(self, file, key, **kwargs): - if self.private: - extra_args = kwargs.get("extra_args", {}) - else: - extra_args = kwargs.get("extra_args", {'ACL': 'public-read'}) - if key.endswith('.wacz'): - extra_args['ContentType'] = "application/zip" - else: - extra_args['ContentType'] = mimetypes.guess_type(key)[0] + extra_args = kwargs.get("extra_args", {}) + if not self.private and 'ACL' not in extra_args: + extra_args['ACL'] = 'public-read' + + if 'ContentType' not in extra_args: + try: + extra_args['ContentType'] = mimetypes.guess_type(key)[0] + except Exception as e: + logger.error(f"Unable to get mimetype for {key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)