diff --git a/example.orchestration.yaml b/example.orchestration.yaml index a36b125..f1eed2a 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -2,6 +2,7 @@ steps: # only 1 feeder allowed feeder: gsheet_feeder # defaults to cli_feeder archivers: # order matters, uncomment to activate + - bluesky_archiver # - vk_archiver # - telethon_archiver # - telegram_archiver @@ -94,9 +95,33 @@ configurations: password: "vk pass" session_file: "secrets/vk_config.v2.json" + youtubedl_archiver: + subtitles: true + # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser + # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp + # cookie_file: "secrets/youtube_cookies.txt" + # cookies_from_browser: firefox + # proxy: socks5://proxy-user:password@proxy-ip:port + screenshot_enricher: width: 1280 height: 2300 + # to save as pdf, uncomment the following lines and adjust the print options + # save_to_pdf: true + # print_options: + # for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html + # background: true + # orientation: "portrait" + # scale: 1 + # page_width: 8.5in + # page_height: 11in + # margin_top: 0.4in + # margin_bottom: 0.4in + # margin_left: 0.4in + # margin_right: 0.4in + # page_ranges: "" + # shrink_to_fit: true + wayback_archiver_enricher: timeout: 10 key: "wayback key" diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index ac92fde..996ca3b 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -8,4 +8,5 @@ from .tiktok_archiver import TiktokArchiver from .telegram_archiver import TelegramArchiver from .vk_archiver import VkArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .instagram_api_archiver import InstagramAPIArchiver \ No newline at end of 
class BlueskyArchiver(Archiver):
    """
    Uses an unauthenticated Bluesky API to archive posts including metadata, images and videos.
    Relies on `public.api.bsky.app/xrpc` and `bsky.social/xrpc`. Avoids ATProto to avoid auth.

    Some inspiration from https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/bluesky.py
    """
    name = "bluesky_archiver"
    # captures (handle-or-did, post id) from URLs shaped like .../profile/<user>/post/<id>
    BSKY_POST = re.compile(r"/profile/([^/]+)/post/([a-zA-Z0-9]+)")
    # seconds; passed to every HTTP call so a stalled connection cannot block the archiver forever
    REQUEST_TIMEOUT = 30

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This archiver has no configurable options."""
        return {}

    def download(self, item: Metadata) -> Metadata:
        """
        Archives a single Bluesky post: text, timestamp, author/facet metadata and embedded media.

        Returns False when the item's URL does not look like a Bluesky post URL,
        otherwise a new Metadata marked as successful with the "bluesky" context.
        HTTP errors from the Bluesky API propagate as `requests` exceptions.
        """
        url = item.get_url()
        if not self.BSKY_POST.search(url):
            return False

        logger.debug(f"Identified a Bluesky post: {url}, archiving...")
        result = Metadata()

        # fetch post info and update result
        post = self._get_post_from_uri(url)
        logger.debug(f"Extracted post info: {post['record']['text']}")
        result.set_title(post["record"]["text"])
        result.set_timestamp(post["record"]["createdAt"])
        for k, v in self._get_post_data(post).items():
            if v:
                result.set(k, v)

        # download if embeds present (1 video XOR >=1 images)
        for media in self._download_bsky_embeds(post):
            result.add_media(media)
        logger.debug(f"Downloaded {len(result.media)} media files")

        return result.success("bluesky")

    def _get_post_from_uri(self, post_uri: str) -> dict:
        """
        Calls a public (no auth needed) Bluesky API to get a post from its uri,
        uses .getPostThread as it brings author info as well (unlike .getPost).

        `post_uri` must match BSKY_POST (callers check this beforehand).
        Raises `requests.HTTPError` on a non-2xx response and AssertionError when
        the API answers with anything other than a regular thread view
        (i.e. the `$type` is not `app.bsky.feed.defs#threadViewPost`).
        """
        username, post_id = self.BSKY_POST.search(post_uri).groups()
        at_uri = f"at://{username}/app.bsky.feed.post/{post_id}"
        # `params=` lets requests percent-encode the at:// URI; the lexicon parameter is
        # `parentHeight` (camelCase), a misspelled name would be ignored and parents fetched anyway
        r = requests.get(
            "https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread",
            params={"uri": at_uri, "depth": 0, "parentHeight": 0},
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        thread = r.json()["thread"]
        thread_type = thread.get("$type")
        # raised explicitly instead of via `assert` so the check survives `python -O`,
        # while keeping the exception type the original check produced
        if thread_type != "app.bsky.feed.defs#threadViewPost":
            raise AssertionError(f"Unexpected Bluesky thread type {thread_type!r} for {post_uri}")
        return thread["post"]

    def _download_bsky_embeds(self, post: dict) -> list[Media]:
        """
        Iterates over image(s) or video in a Bluesky post and downloads them.

        Looks both at the top-level embed and at the nested `media` embed,
        returns the downloaded files as Media (images first, then videos).
        """
        embed = post.get("record", {}).get("embed", {})
        nested = embed.get("media", {})
        author_did = post["author"]["did"]

        image_blobs = [img["image"] for img in embed.get("images", []) + nested.get("images", [])]
        video_blobs = [v for v in (embed.get("video"), nested.get("video")) if v]

        return [
            self._download_bsky_file_as_media(blob["ref"]["$link"], author_did)
            for blob in image_blobs + video_blobs
        ]

    def _download_bsky_file_as_media(self, cid: str, did: str) -> Media:
        """
        Uses the Bluesky API to download a file by its `cid` and `did`.

        The blob is streamed into the archiving tmp dir as `<cid><ext>`, where the
        extension is guessed from the response Content-Type (no extension if it
        cannot be guessed). The returned Media has its "src" set to the blob URL.
        Raises `requests.HTTPError` on a non-2xx response.
        """
        # TODO: replace with self.download_from_url once that function has been cleaned-up
        file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
        # context manager releases the streamed connection even if writing fails
        with requests.get(file_url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            # drop parameters such as "; charset=..." which guess_extension does not understand
            content_type = response.headers.get("Content-Type", "").split(";")[0].strip()
            # guess_extension returns None for unknown types, avoid producing "<cid>None"
            ext = mimetypes.guess_extension(content_type) or ""
            filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        media = Media(filename=filename)
        media.set("src", file_url)
        return media

    def _get_post_data(self, post: dict) -> dict:
        """
        Extracts relevant information returned by the .getPostThread api call
        (excluding text/created_at): author, mentions, tags, links.

        The result always has "author"; "mentions", "tags" and "links" are only
        present when non-empty. The input `post` is left untouched.
        """
        # work on a filtered copy: drop "associated" and empty "labels" without mutating the caller's dict
        author = {
            k: v for k, v in post["author"].items()
            if k != "associated" and not (k == "labels" and not v)
        }

        # facet feature type -> (result key, field holding the value)
        feature_fields = {
            "app.bsky.richtext.facet#mention": ("mentions", "did"),
            "app.bsky.richtext.facet#tag": ("tags", "tag"),
            "app.bsky.richtext.facet#link": ("links", "uri"),
        }
        collected = {"mentions": [], "tags": [], "links": []}
        for facet in post.get("record", {}).get("facets", []):
            for feature in facet["features"]:
                target = feature_fields.get(feature["$type"])
                if target:
                    bucket, field = target
                    collected[bucket].append(feature[field])

        res = {"author": author}
        res.update({k: v for k, v in collected.items() if v})
        return res
yt_dlp.extractor.twitter import TwitterIE from slugify import slugify @@ -49,7 +48,7 @@ class TwitterArchiver(Archiver): username, tweet_id = self.get_username_tweet_id(url) if not username: return False - strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication] + strategies = [self.download_yt_dlp, self.download_syndication] for strategy in strategies: logger.debug(f"Trying {strategy.__name__} for {url=}") try: @@ -61,45 +60,6 @@ class TwitterArchiver(Archiver): logger.warning(f"No free strategy worked for {url}") return False - - def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]: - scr = TwitterTweetScraper(tweet_id) - try: - tweet = next(scr.get_items()) - except Exception as ex: - logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") - return False - - result = Metadata() - result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date) - if tweet.media is None: - logger.debug(f'No media found, archiving tweet text only') - return result - - for i, tweet_media in enumerate(tweet.media): - media = Media(filename="") - mimetype = "" - if type(tweet_media) == Video: - variant = max( - [v for v in tweet_media.variants if v.bitrate], key=lambda v: v.bitrate) - media.set("src", variant.url).set("duration", tweet_media.duration) - mimetype = variant.contentType - elif type(tweet_media) == Gif: - variant = tweet_media.variants[0] - media.set("src", variant.url) - mimetype = variant.contentType - elif type(tweet_media) == Photo: - media.set("src", UrlUtil.twitter_best_quality_url(tweet_media.fullUrl)) - mimetype = "image/jpeg" - else: - logger.warning(f"Could not get media URL of {tweet_media}") - continue - ext = mimetypes.guess_extension(mimetype) - media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}') - result.add_media(media) - - return result.success("twitter-snscrape") - def 
download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]: """ Hack alternative working again. diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py index 9a42bdf..b13cceb 100644 --- a/src/auto_archiver/archivers/youtubedl_archiver.py +++ b/src/auto_archiver/archivers/youtubedl_archiver.py @@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver): "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 
'inf' means no limit."}, + "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, + "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, } def download(self, item: Metadata) -> Metadata: @@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver): if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie - + ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} + + if item.netloc in ['youtube.com', 'www.youtube.com']: + if self.cookies_from_browser: + logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube') + ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,) + elif self.cookie_file: + logger.debug(f'Using cookies from file {self.cookie_file}') + ydl_options['cookiefile'] = self.cookie_file + ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" try: diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 69f466b..b2ef096 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -1,5 +1,7 @@ from loguru import logger import time, os +import base64 + from selenium.common.exceptions import TimeoutException @@ -18,22 +20,31 
@@ class ScreenshotEnricher(Enricher): "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + "print_options": {"default": {}, "help": "options to pass to the pdf printer"} } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() + if UrlUtil.is_auth_wall(url): logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") return logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver: try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") + if self.save_to_pdf: + pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf = driver.print_page(driver.print_options) + with open(pdf_file, "wb") as f: + f.write(base64.b64decode(pdf)) + to_enrich.add_media(Media(filename=pdf_file), id="pdf") except TimeoutException: logger.info("TimeoutException loading page for screenshot") except Exception as e: diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py index 12eb6e3..305bfcf 100644 --- a/src/auto_archiver/enrichers/wayback_enricher.py +++ 
b/src/auto_archiver/enrichers/wayback_enricher.py @@ -1,3 +1,4 @@ +import json from loguru import logger import time, requests @@ -70,11 +71,16 @@ class WaybackArchiverEnricher(Enricher, Archiver): return False # check job status - job_id = r.json().get('job_id') - if not job_id: - logger.error(f"Wayback failed with {r.json()}") + try: + job_id = r.json().get('job_id') + if not job_id: + logger.error(f"Wayback failed with {r.json()}") + return False + except json.decoder.JSONDecodeError as e: + logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}") return False + # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information start_time = time.time() wayback_url = False @@ -92,6 +98,9 @@ class WaybackArchiverEnricher(Enricher, Archiver): except requests.exceptions.RequestException as e: logger.warning(f"RequestException: fetching status for {url=} due to: {e}") break + except json.decoder.JSONDecodeError as e: + logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}") + break except Exception as e: logger.warning(f"error fetching status for {url=} due to: {e}") if not wayback_url: diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html index 13ec15a..8bdf5ef 100644 --- a/src/auto_archiver/formatters/templates/html_template.html +++ b/src/auto_archiver/formatters/templates/html_template.html @@ -286,11 +286,11 @@ // logic for enabled/disabled greyscale // Get references to the checkboxes and images/videos const safeImageViewCheckbox = document.getElementById('safe-media-view'); - const imagesVideos = document.querySelectorAll('img, video'); + const visualPreviews = document.querySelectorAll('img, video,embed'); // Function to toggle grayscale effect function toggleGrayscale() { - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { if (safeImageViewCheckbox.checked) { // Enable grayscale effect 
element.style.filter = 'grayscale(1)'; @@ -307,7 +307,7 @@ safeImageViewCheckbox.addEventListener('change', toggleGrayscale); // Handle the hover effect using JavaScript - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { element.addEventListener('mouseenter', () => { // Disable grayscale effect on hover element.style.filter = 'none'; diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html index c187f8c..772138f 100644 --- a/src/auto_archiver/formatters/templates/macros.html +++ b/src/auto_archiver/formatters/templates/macros.html @@ -32,6 +32,10 @@ No URL available for {{ m.key }}. Your browser does not support the video element. +{% elif 'application/pdf' in m.mimetype %} +
+ +
{% elif 'audio' in m.mimetype %}