Add yt-dlp based archiving for TwitterArchiver (#138 )

* Add ytdlp archiving capability * Add type annotation * version bump --------- Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2026-06-12 21:28:29 +03:00 · 2024-04-15 19:54:55 +01:00
2 changed files with 43 additions and 7 deletions
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -2,6 +2,8 @@ import re, requests, mimetypes, json
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 from yt_dlp import YoutubeDL
 from yt_dlp.extractor.twitter import TwitterIE
 from slugify import slugify
 from . import Archiver
@@ -98,7 +100,9 @@ class TwitterArchiver(Archiver):
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
-        if r.status_code != 200: return False
+        if r.status_code != 200 or r.json()=={}: 
            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
            return self.download_ytdl(item, url, tweet_id)
        tweet = r.json()
        urls = []
@@ -108,7 +112,7 @@ class TwitterArchiver(Archiver):
        # 1 tweet has 1 video max
        if "video" in tweet:
            v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", [])))
+            urls.append(self.choose_variant(v.get("variants", []))['url'])
        logger.debug(f"Twitter hack got {urls=}")
@@ -126,6 +130,38 @@ class TwitterArchiver(Archiver):
        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
        return result.success("twitter-hack")
    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
        downloader = YoutubeDL()
        tie = TwitterIE(downloader)
        tweet = tie._extract_status(tweet_id)
        result = Metadata()
        result\
            .set_title(tweet.get('full_text', ''))\
            .set_content(json.dumps(tweet, ensure_ascii=False))\
            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
            mimetype = ""
            if tw_media["type"] == "photo":
                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
                mimetype = "image/jpeg"
            elif tw_media["type"] == "video":
                variant = self.choose_variant(tw_media['video_info']['variants'])
                media.set("src", variant['url'])
                mimetype = variant['content_type']
            elif tw_media["type"] == "animated_gif":
                variant = tw_media['video_info']['variants'][0]
                media.set("src", variant['url'])
                mimetype = variant['content_type']
            ext = mimetypes.guess_extension(mimetype)
            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
            result.add_media(media)
        return result.success("twitter-ytdl")
    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
        matches = self.link_pattern.findall(url)
@@ -140,13 +176,13 @@ class TwitterArchiver(Archiver):
        # choosing the highest quality possible
        variant, width, height = None, 0, 0
        for var in variants:
-            if var.get("type", "") == "video/mp4":
+            if var.get("content_type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                if width_height:
                    w, h = int(width_height[1]), int(width_height[2])
                    if w > width or h > height:
                        width, height = w, h
-                        variant = var.get("src", variant)
+                        variant = var
            else:
-                variant = var.get("src") if not variant else variant
+                variant = var if not variant else variant
        return variant
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "2"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""