Merge branch 'main' into feat/unittest

2026-06-15 06:38:29 +03:00 · 2025-01-08 10:35:45 +01:00
parent 1bd017000e a697f0a212
commit ab9335bb7a
10 changed files with 199 additions and 10 deletions
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -2,6 +2,7 @@ steps:
  # only 1 feeder allowed
  feeder: gsheet_feeder # defaults to cli_feeder
  archivers: # order matters, uncomment to activate
+    - bluesky_archiver
    # - vk_archiver
    # - telethon_archiver
    # - telegram_archiver
@@ -94,9 +95,33 @@ configurations:
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"

+  youtubedl_archiver:
+    subtitles: true
+    # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
+    # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
+    # cookie_file: "secrets/youtube_cookies.txt"
+    # cookies_from_browser: firefox
+    # proxy: socks5://proxy-user:password@proxy-ip:port
+
  screenshot_enricher:
    width: 1280
    height: 2300
+    # to save as pdf, uncomment the following lines and adjust the print options
+    # save_to_pdf: true
+    # print_options:
+      # for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
+      # background: true
+      # orientation: "portrait"
+      # scale: 1
+      # page_width: 8.5in
+      # page_height: 11in
+      # margin_top: 0.4in
+      # margin_bottom: 0.4in
+      # margin_left: 0.4in
+      # margin_right: 0.4in
+      # page_ranges: ""
+      # shrink_to_fit: true
+
  wayback_archiver_enricher:
    timeout: 10
    key: "wayback key"
--- a/src/auto_archiver/archivers/init.py
+++ b/src/auto_archiver/archivers/init.py
@@ -8,4 +8,5 @@ from .tiktok_archiver import TiktokArchiver
 from .telegram_archiver import TelegramArchiver
 from .vk_archiver import VkArchiver
 from .youtubedl_archiver import YoutubeDLArchiver
-from .instagram_api_archiver import InstagramAPIArchiver
+from .instagram_api_archiver import InstagramAPIArchiver
+from .bluesky_archiver import BlueskyArchiver
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -48,6 +48,8 @@ class Archiver(Step):
        """
        downloads a URL to provided filename, or inferred from URL, returns local filename
        """
+        # TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
+        # TODO: should we guess the extension?
        if not to_filename:
            to_filename = url.split('/')[-1].split('?')[0]
            if len(to_filename) > 64:
--- a/src/auto_archiver/archivers/bluesky_archiver.py
+++ b/src/auto_archiver/archivers/bluesky_archiver.py
@@ -0,0 +1,119 @@
+import os
+import re, requests, mimetypes
+from loguru import logger
+
+
+from . import Archiver
+from ..core import Metadata, Media, ArchivingContext
+
+
+class BlueskyArchiver(Archiver):
+    """
+    Uses an unauthenticated Bluesky API to archive posts including metadata, images and videos. Relies on `public.api.bsky.app/xrpc` and `bsky.social/xrpc`. Avoids ATProto to avoid auth.
+
+    Some inspiration from https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/bluesky.py
+    """
+    name = "bluesky_archiver"
+    BSKY_POST = re.compile(r"/profile/([^/]+)/post/([a-zA-Z0-9]+)")
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)  # nothing archiver-specific to set up: only the parent initialiser runs
+
+    @staticmethod
+    def configs() -> dict:
+        return {}  # this archiver declares no configuration options
+
+    def download(self, item: Metadata) -> Metadata:
+        """Archives a Bluesky post (text, timestamp, extra metadata, embedded media); returns False when the URL is not a Bluesky post."""
+        url = item.get_url()
+        if self.BSKY_POST.search(url) is None:
+            return False
+
+        logger.debug(f"Identified a Bluesky post: {url}, archiving...")
+        post = self._get_post_from_uri(url)
+        record = post["record"]
+        logger.debug(f"Extracted post info: {record['text']}")
+        # title and timestamp come from the post record, then every non-empty extra field is copied over
+        result = Metadata()
+        result.set_title(record["text"])
+        result.set_timestamp(record["createdAt"])
+        for key, value in self._get_post_data(post).items():
+            if value: result.set(key, value)
+
+        # download if embeds present (1 video XOR >=1 images)
+        for media in self._download_bsky_embeds(post):
+            result.add_media(media)
+        logger.debug(f"Downloaded {len(result.media)} media files")
+        return result.success("bluesky")
+
+    def _get_post_from_uri(self, post_uri: str) -> dict:
+        """
+        Calls a public (no auth needed) Bluesky API to get a post from its uri, uses .getPostThread as it brings author info as well (unlike .getPost). Raises requests' HTTPError on a bad status and AssertionError when the thread is not a regular post view.
+        """
+        username, post_id = re.search(self.BSKY_POST, post_uri).groups()
+        # let requests url-encode the at:// uri; the timeout stops a stalled connection from hanging the archiver forever
+        params = {"uri": f"at://{username}/app.bsky.feed.post/{post_id}", "depth": 0, "parentHeight": 0}
+        r = requests.get("https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread", params=params, timeout=30)
+        r.raise_for_status()
+        thread = r.json()["thread"]
+        if thread.get("$type") != "app.bsky.feed.defs#threadViewPost":  # explicit raise: a bare `assert` is stripped under `python -O`
+            raise AssertionError(f"unexpected Bluesky thread type: {thread.get('$type')}")
+        return thread["post"]
+
+    def _download_bsky_embeds(self, post: dict) -> list[Media]:
+        """
+        Downloads the image(s) and/or video attached to a Bluesky post, checking both the embed itself and its nested `media` entry; returns them as a list of Media, images first.
+        """
+        embed = post.get("record", {}).get("embed", {})
+        nested = embed.get("media", {})
+        images = embed.get("images", []) + nested.get("images", [])
+        videos = [video for video in (embed.get("video"), nested.get("video")) if video]
+
+        downloaded = []
+        for image in images:
+            # an image entry wraps its blob under the "image" key, a video entry is the blob itself
+            downloaded.append(self._download_bsky_file_as_media(image["image"]["ref"]["$link"], post["author"]["did"]))
+        for video in videos:
+            downloaded.append(self._download_bsky_file_as_media(video["ref"]["$link"], post["author"]["did"]))
+        return downloaded
+
+    def _download_bsky_file_as_media(self, cid: str, did: str) -> Media:
+        """
+        Uses the Bluesky API to download a file by its `cid` and `did` into the tmp dir and returns it as Media with `src` set to the blob URL. The extension is guessed from the Content-Type header and left empty when the type is missing or unknown. Raises requests' HTTPError on a bad status.
+        """
+        # TODO: replace with self.download_from_url once that function has been cleaned-up
+        file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
+        with requests.get(file_url, stream=True, timeout=30) as response:  # `with` releases the streamed connection; timeout avoids hanging on a stalled server
+            response.raise_for_status()
+            ext = mimetypes.guess_extension(response.headers.get("Content-Type", "").split(";")[0].strip()) or ""  # drop "; charset=..." params; avoid a literal "None" suffix
+            filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
+            with open(filename, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        media = Media(filename=filename)
+        media.set("src", file_url)
+        return media
+
+    def _get_post_data(self, post: dict) -> dict:
+        """
+        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links. Empty mention/tag/link lists are omitted and `post` itself is never modified.
+        """
+        # filtered copy instead of `del` on the caller's dict: drops "associated" always and "labels" when empty
+        author = {k: v for k, v in post["author"].items() if k != "associated" and (k != "labels" or v)}
+
+        # each rich-text facet feature type maps to (result key, field holding its value)
+        facet_fields = {
+            "app.bsky.richtext.facet#mention": ("mentions", "did"),
+            "app.bsky.richtext.facet#tag": ("tags", "tag"),
+            "app.bsky.richtext.facet#link": ("links", "uri"),
+        }
+        found = {"mentions": [], "tags": [], "links": []}
+        for facet in post.get("record", {}).get("facets", []):
+            for feature in facet["features"]:
+                if feature["$type"] in facet_fields:
+                    key, field = facet_fields[feature["$type"]]
+                    found[key].append(feature[field])
+
+        res = {"author": author}
+        res.update({key: values for key, values in found.items() if values})
+        return res
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
+            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
+            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
        }

    def download(self, item: Metadata) -> Metadata:
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-
+        
        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+
+        if item.netloc in ['youtube.com', 'www.youtube.com']:
+            if self.cookies_from_browser:
+                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
+                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
+            elif self.cookie_file:
+                logger.debug(f'Using cookies from file {self.cookie_file}')
+                ydl_options['cookiefile'] = self.cookie_file
+
        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        try:
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -1,5 +1,7 @@
 from loguru import logger
 import time, os
+import base64
+
 from selenium.common.exceptions import TimeoutException


@@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher):
            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
+            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
+            "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
+
        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return

        logger.debug(f"Enriching screenshot for {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
            try:
                driver.get(url)
                time.sleep(int(self.sleep_before_screenshot))
                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
+                if self.save_to_pdf:
+                    pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
+                    pdf = driver.print_page(driver.print_options)
+                    with open(pdf_file, "wb") as f:
+                        f.write(base64.b64decode(pdf))
+                    to_enrich.add_media(Media(filename=pdf_file), id="pdf")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
--- a/src/auto_archiver/enrichers/wayback_enricher.py
+++ b/src/auto_archiver/enrichers/wayback_enricher.py
@@ -1,3 +1,4 @@
+import json
 from loguru import logger
 import time, requests

@@ -70,11 +71,16 @@ class WaybackArchiverEnricher(Enricher, Archiver):
            return False

        # check job status
-        job_id = r.json().get('job_id')
-        if not job_id:
-            logger.error(f"Wayback failed with {r.json()}")
+        try:
+            job_id = r.json().get('job_id')
+            if not job_id:
+                logger.error(f"Wayback failed with {r.json()}")
+                return False
+        except json.decoder.JSONDecodeError as e:
+            logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}")
            return False

+
        # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
        start_time = time.time()
        wayback_url = False
@@ -92,6 +98,9 @@ class WaybackArchiverEnricher(Enricher, Archiver):
            except requests.exceptions.RequestException as e:
                logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
                break
+            except json.decoder.JSONDecodeError as e:
+                logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
+                break
            except Exception as e:
                logger.warning(f"error fetching status for {url=} due to: {e}")
            if not wayback_url:
--- a/src/auto_archiver/formatters/templates/html_template.html
+++ b/src/auto_archiver/formatters/templates/html_template.html
@@ -286,11 +286,11 @@
        // logic for enabled/disabled greyscale
        // Get references to the checkboxes and images/videos
        const safeImageViewCheckbox = document.getElementById('safe-media-view');
-        const imagesVideos = document.querySelectorAll('img, video');
+        const visualPreviews = document.querySelectorAll('img, video,embed');

        // Function to toggle grayscale effect
        function toggleGrayscale() {
-            imagesVideos.forEach(element => {
+            visualPreviews.forEach(element => {
                if (safeImageViewCheckbox.checked) {
                    // Enable grayscale effect
                    element.style.filter = 'grayscale(1)';
@@ -307,7 +307,7 @@
        safeImageViewCheckbox.addEventListener('change', toggleGrayscale);

        // Handle the hover effect using JavaScript
-        imagesVideos.forEach(element => {
+        visualPreviews.forEach(element => {
            element.addEventListener('mouseenter', () => {
                // Disable grayscale effect on hover
                element.style.filter = 'none';
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -32,6 +32,10 @@ No URL available for {{ m.key }}.
        Your browser does not support the video element.
    </video>
 </div>
+{% elif 'application/pdf' in m.mimetype %}
+<div>
+    <embed src="{{ url }}" width="100%" height="400px"/>
+</div>
 {% elif 'audio' in m.mimetype %}
 <div>
    <audio controls>
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -2,18 +2,24 @@ from __future__ import annotations
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.proxy import Proxy, ProxyType
+from selenium.webdriver.common.print_page_options import PrintOptions
+
 from loguru import logger
 from selenium.webdriver.common.by import By
 import time


 class Webdriver:
-    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "") -> webdriver:
+    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
        self.width = width
        self.height = height
        self.timeout_seconds = timeout_seconds
        self.facebook_accept_cookies = facebook_accept_cookies
        self.http_proxy = http_proxy
+        # create and set print options
+        self.print_options = PrintOptions()
+        for k, v in print_options.items():
+            setattr(self.print_options, k, v)

    def __enter__(self) -> webdriver:
        options = webdriver.FirefoxOptions()
@@ -24,6 +30,7 @@ class Webdriver:
            self.driver = webdriver.Firefox(options=options)
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
+            self.driver.print_options = self.print_options
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")