Merge branch 'main' into feat/unittest

This commit is contained in:
Patrick Robertson
2025-01-08 10:35:45 +01:00
10 changed files with 199 additions and 10 deletions

View File

@@ -2,6 +2,7 @@ steps:
# only 1 feeder allowed # only 1 feeder allowed
feeder: gsheet_feeder # defaults to cli_feeder feeder: gsheet_feeder # defaults to cli_feeder
archivers: # order matters, uncomment to activate archivers: # order matters, uncomment to activate
- bluesky_archiver
# - vk_archiver # - vk_archiver
# - telethon_archiver # - telethon_archiver
# - telegram_archiver # - telegram_archiver
@@ -94,9 +95,33 @@ configurations:
password: "vk pass" password: "vk pass"
session_file: "secrets/vk_config.v2.json" session_file: "secrets/vk_config.v2.json"
youtubedl_archiver:
subtitles: true
# use one of the following two methods to authenticate with YouTube - either provide a cookies file or use the cookies from the given browser
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
# cookie_file: "secrets/youtube_cookies.txt"
# cookies_from_browser: firefox
# proxy: socks5://proxy-user:password@proxy-ip:port
screenshot_enricher: screenshot_enricher:
width: 1280 width: 1280
height: 2300 height: 2300
# to save as pdf, uncomment the following lines and adjust the print options
# save_to_pdf: true
# print_options:
# for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
# background: true
# orientation: "portrait"
# scale: 1
# page_width: 8.5in
# page_height: 11in
# margin_top: 0.4in
# margin_bottom: 0.4in
# margin_left: 0.4in
# margin_right: 0.4in
# page_ranges: ""
# shrink_to_fit: true
wayback_archiver_enricher: wayback_archiver_enricher:
timeout: 10 timeout: 10
key: "wayback key" key: "wayback key"

View File

@@ -8,4 +8,5 @@ from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver from .vk_archiver import VkArchiver
from .youtubedl_archiver import YoutubeDLArchiver from .youtubedl_archiver import YoutubeDLArchiver
from .instagram_api_archiver import InstagramAPIArchiver from .instagram_api_archiver import InstagramAPIArchiver
from .bluesky_archiver import BlueskyArchiver

View File

@@ -48,6 +48,8 @@ class Archiver(Step):
""" """
downloads a URL to provided filename, or inferred from URL, returns local filename downloads a URL to provided filename, or inferred from URL, returns local filename
""" """
# TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
# TODO: should we guess the extension?
if not to_filename: if not to_filename:
to_filename = url.split('/')[-1].split('?')[0] to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64: if len(to_filename) > 64:

View File

@@ -0,0 +1,119 @@
import os
import re, requests, mimetypes
from loguru import logger
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
class BlueskyArchiver(Archiver):
    """
    Uses an unauthenticated Bluesky API to archive posts including metadata, images and videos. Relies on `public.api.bsky.app/xrpc` and `bsky.social/xrpc`. Avoids ATProto to avoid auth.

    Some inspiration from https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/bluesky.py
    """
    name = "bluesky_archiver"
    # captures (handle-or-did, post id) from a bsky post URL path
    BSKY_POST = re.compile(r"/profile/([^/]+)/post/([a-zA-Z0-9]+)")
    # seconds to wait on the Bluesky endpoints; without a timeout `requests` may block forever
    REQUEST_TIMEOUT = 30

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This archiver exposes no configuration options."""
        return {}

    def download(self, item: Metadata) -> Metadata:
        """
        Archives a single Bluesky post: text, timestamp, author/facet data and any embedded media.

        Returns False when the item's URL does not match `BSKY_POST`; otherwise returns
        the populated result after calling `.success("bluesky")` on it.
        """
        url = item.get_url()
        if not re.search(self.BSKY_POST, url):
            return False

        logger.debug(f"Identified a Bluesky post: {url}, archiving...")
        result = Metadata()

        # fetch post info and update result
        post = self._get_post_from_uri(url)
        logger.debug(f"Extracted post info: {post['record']['text']}")
        result.set_title(post["record"]["text"])
        result.set_timestamp(post["record"]["createdAt"])
        for k, v in self._get_post_data(post).items():
            if v:
                result.set(k, v)

        # download if embeds present (1 video XOR >=1 images)
        for media in self._download_bsky_embeds(post):
            result.add_media(media)
        logger.debug(f"Downloaded {len(result.media)} media files")

        return result.success("bluesky")

    def _get_post_from_uri(self, post_uri: str) -> dict:
        """
        Calls a public (no auth needed) Bluesky API to get a post from its uri, uses .getPostThread as it brings author info as well (unlike .getPost).

        Raises `requests` exceptions on HTTP/connection errors and AssertionError when
        the returned thread is not a `threadViewPost`.
        """
        post_match = re.search(self.BSKY_POST, post_uri)
        username = post_match.group(1)
        post_id = post_match.group(2)
        at_uri = f'at://{username}/app.bsky.feed.post/{post_id}'
        r = requests.get(
            f"https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread?uri={at_uri}&depth=0&parent_height=0",
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        thread = r.json()
        thread_type = thread["thread"]["$type"]
        # explicit raise (same exception type as the former `assert`) so the check
        # is not stripped when Python runs with -O
        if thread_type != "app.bsky.feed.defs#threadViewPost":
            raise AssertionError(f"Unexpected Bluesky thread type: {thread_type}")
        return thread["thread"]["post"]

    def _download_bsky_embeds(self, post: dict) -> list[Media]:
        """
        Iterates over image(s) or video in a Bluesky post and downloads them.

        Looks both directly under the record's `embed` and under `embed.media`.
        """
        media = []
        embed = post.get("record", {}).get("embed", {})
        nested = embed.get("media", {})
        image_medias = embed.get("images", []) + nested.get("images", [])
        video_medias = [e for e in [embed.get("video"), nested.get("video")] if e]
        did = post["author"]["did"]

        for image_media in image_medias:
            media.append(self._download_bsky_file_as_media(image_media["image"]["ref"]["$link"], did))
        for video_media in video_medias:
            media.append(self._download_bsky_file_as_media(video_media["ref"]["$link"], did))
        return media

    def _download_bsky_file_as_media(self, cid: str, did: str) -> Media:
        """
        Uses the Bluesky API to download a file by its `cid` and `did`.

        The file is streamed into the archiving tmp dir as `<cid><ext>`, where the
        extension is guessed from the response Content-Type (empty if unknown).
        """
        # TODO: replace with self.download_from_url once that function has been cleaned-up
        file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
        # context manager releases the streamed connection even if writing fails
        with requests.get(file_url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            # drop parameters such as "; charset=..." which make guess_extension return None
            content_type = response.headers.get("Content-Type", "").split(";")[0].strip()
            ext = mimetypes.guess_extension(content_type) or ""
            filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        media = Media(filename=filename)
        media.set("src", file_url)
        return media

    def _get_post_data(self, post: dict) -> dict:
        """
        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.

        The author entry is a filtered copy (without `associated`, and without
        `labels` when empty) so the caller's `post` dict is left untouched.
        """
        author = {
            k: v for k, v in post["author"].items()
            if k != "associated" and not (k == "labels" and not v)
        }

        mentions, tags, links = [], [], []
        facets = post.get("record", {}).get("facets", [])
        for f in facets:
            for feature in f["features"]:
                if feature["$type"] == "app.bsky.richtext.facet#mention":
                    mentions.append(feature["did"])
                elif feature["$type"] == "app.bsky.richtext.facet#tag":
                    tags.append(feature["tag"])
                elif feature["$type"] == "app.bsky.richtext.facet#link":
                    links.append(feature["uri"])

        res = {"author": author}
        if mentions:
            res["mentions"] = mentions
        if tags:
            res["tags"] = tags
        if links:
            res["links"] = links
        return res

View File

@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
} }
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie') logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
try: try:

View File

@@ -1,5 +1,7 @@
from loguru import logger from loguru import logger
import time, os import time, os
import base64
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
@@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher):
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url): if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return return
logger.debug(f"Enriching screenshot for {url=}") logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
try: try:
driver.get(url) driver.get(url)
time.sleep(int(self.sleep_before_screenshot)) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))
to_enrich.add_media(Media(filename=pdf_file), id="pdf")
except TimeoutException: except TimeoutException:
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
except Exception as e: except Exception as e:

View File

@@ -1,3 +1,4 @@
import json
from loguru import logger from loguru import logger
import time, requests import time, requests
@@ -70,11 +71,16 @@ class WaybackArchiverEnricher(Enricher, Archiver):
return False return False
# check job status # check job status
job_id = r.json().get('job_id') try:
if not job_id: job_id = r.json().get('job_id')
logger.error(f"Wayback failed with {r.json()}") if not job_id:
logger.error(f"Wayback failed with {r.json()}")
return False
except json.decoder.JSONDecodeError as e:
logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}")
return False return False
# waits at most timeout seconds until job is completed, otherwise only enriches the job_id information # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
start_time = time.time() start_time = time.time()
wayback_url = False wayback_url = False
@@ -92,6 +98,9 @@ class WaybackArchiverEnricher(Enricher, Archiver):
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
logger.warning(f"RequestException: fetching status for {url=} due to: {e}") logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
break break
except json.decoder.JSONDecodeError as e:
logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
break
except Exception as e: except Exception as e:
logger.warning(f"error fetching status for {url=} due to: {e}") logger.warning(f"error fetching status for {url=} due to: {e}")
if not wayback_url: if not wayback_url:

View File

@@ -286,11 +286,11 @@
// logic for enabled/disabled greyscale // logic for enabled/disabled greyscale
// Get references to the checkboxes and images/videos // Get references to the checkboxes and images/videos
const safeImageViewCheckbox = document.getElementById('safe-media-view'); const safeImageViewCheckbox = document.getElementById('safe-media-view');
const imagesVideos = document.querySelectorAll('img, video'); const visualPreviews = document.querySelectorAll('img, video,embed');
// Function to toggle grayscale effect // Function to toggle grayscale effect
function toggleGrayscale() { function toggleGrayscale() {
imagesVideos.forEach(element => { visualPreviews.forEach(element => {
if (safeImageViewCheckbox.checked) { if (safeImageViewCheckbox.checked) {
// Enable grayscale effect // Enable grayscale effect
element.style.filter = 'grayscale(1)'; element.style.filter = 'grayscale(1)';
@@ -307,7 +307,7 @@
safeImageViewCheckbox.addEventListener('change', toggleGrayscale); safeImageViewCheckbox.addEventListener('change', toggleGrayscale);
// Handle the hover effect using JavaScript // Handle the hover effect using JavaScript
imagesVideos.forEach(element => { visualPreviews.forEach(element => {
element.addEventListener('mouseenter', () => { element.addEventListener('mouseenter', () => {
// Disable grayscale effect on hover // Disable grayscale effect on hover
element.style.filter = 'none'; element.style.filter = 'none';

View File

@@ -32,6 +32,10 @@ No URL available for {{ m.key }}.
Your browser does not support the video element. Your browser does not support the video element.
</video> </video>
</div> </div>
{% elif 'application/pdf' in m.mimetype %}
<div>
<embed src="{{ url }}" width="100%" height="400px"/>
</div>
{% elif 'audio' in m.mimetype %} {% elif 'audio' in m.mimetype %}
<div> <div>
<audio controls> <audio controls>

View File

@@ -2,18 +2,24 @@ from __future__ import annotations
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.proxy import Proxy, ProxyType from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.print_page_options import PrintOptions
from loguru import logger from loguru import logger
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
import time import time
class Webdriver: class Webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = None) -> None:
    """
    Stores the settings for a later webdriver session and builds its PDF print options.

    `print_options` maps PrintOptions attribute names to values; each pair is
    applied with setattr. None (the default) or an empty dict leaves the
    PrintOptions defaults in place.
    """
    self.width = width
    self.height = height
    self.timeout_seconds = timeout_seconds
    self.facebook_accept_cookies = facebook_accept_cookies
    self.http_proxy = http_proxy
    # create and set print options
    self.print_options = PrintOptions()
    # `or {}` replaces the former mutable `{}` default and also tolerates an explicit None
    for k, v in (print_options or {}).items():
        setattr(self.print_options, k, v)
def __enter__(self) -> webdriver: def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
@@ -24,6 +30,7 @@ class Webdriver:
self.driver = webdriver.Firefox(options=options) self.driver = webdriver.Firefox(options=options)
self.driver.set_window_size(self.width, self.height) self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds) self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e: except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")