from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor class Bluesky(GenericDropin): def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() result.set_url(url) result.set_title(post["record"]["text"]) result.set_timestamp(post["record"]["createdAt"]) for k, v in self._get_post_data(post).items(): if v: result.set(k, v) # download if embeds present (1 video XOR >=1 images) for media in self._download_bsky_embeds(post, archiver): result.add_media(media) logger.debug(f"Downloaded {len(result.media)} media files") return result def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict: # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below handle, video_id = ie_instance._match_valid_url(url).group("handle", "id") return ie_instance._extract_post(handle=handle, post_id=video_id) def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]: """ Iterates over image(s) or video in a Bluesky post and downloads them """ media = [] embed = post.get("record", {}).get("embed", {}) image_medias = embed.get("images", []) + embed.get("media", {}).get("images", []) video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e] media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}" for image_media in image_medias: url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"]) filename = archiver.download_from_url(url) if filename: media.append(Media(filename)) else: logger.warning(f"Failed to download Bluesky image from {url}") for video_media in video_medias: url = media_url.format(video_media["ref"]["$link"], post["author"]["did"]) filename = archiver.download_from_url(url) if filename: media.append(Media(filename)) else: logger.warning(f"Failed to download Bluesky video from {url}") return media def _get_post_data(self, post: dict) -> dict: """ Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links. """ author = post["author"] if "labels" in author and not author["labels"]: del author["labels"] if "associated" in author: del author["associated"] mentions, tags, links = [], [], [] facets = post.get("record", {}).get("facets", []) for f in facets: for feature in f["features"]: if feature["$type"] == "app.bsky.richtext.facet#mention": mentions.append(feature["did"]) elif feature["$type"] == "app.bsky.richtext.facet#tag": tags.append(feature["tag"]) elif feature["$type"] == "app.bsky.richtext.facet#link": links.append(feature["uri"]) res = {"author": author} if mentions: res["mentions"] = mentions if tags: res["tags"] = tags if links: res["links"] = links return res