Replace nitter fallback with fxtwitter API for tweet extraction

2026-06-07 19:08:30 +03:00 · 2026-03-02 12:31:28 +00:00
parent 139d647197
commit bc66dd4f2a
2 changed files with 305 additions and 44 deletions
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -10,7 +10,6 @@ from auto_archiver.core.extractor import Extractor
 from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
 from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
 import requests
-from bs4 import BeautifulSoup
 from retrying import retry


@@ -37,56 +36,80 @@ class Twitter(GenericDropin):
            if not post_data or not post_data.get("user") or not post_data.get("created_at"):
                raise ValueError("Error retrieving post with twitter dropin")
            return post_data
-        except Exception:
-            # try nitter
-            nitter_url = f"https://nitter.net/i/status/{twid}"
-            # nitter_url = f"https://nitter.space/i/status/{twid}"
-            logger.info(f"Falling back to nitter.net for tweet extraction at {nitter_url}")
+        except Exception as e:
+            logger.debug(f"yt-dlp twitter extraction failed: {e}")
+            # try fxtwitter API as fallback
+            return self._fetch_fxtwitter(twid)

-            @retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
-            def fetch_nitter_soup(url):
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
-                }
-                resp = requests.get(url, headers=headers, timeout=10)
-                if resp.status_code != 200:
-                    raise ValueError("Failed to retrieve tweet from nitter.net")
-                logger.error(resp.text)
-                soup = BeautifulSoup(resp.text, "html.parser")
-                tweet_container = soup.find("div", {"class": "main-tweet"})
-                if not tweet_container:
-                    raise ValueError("Could not find tweet container on nitter.net page")
-                return tweet_container
+    def _fetch_fxtwitter(self, twid: str) -> dict:
+        """Fetch tweet data from fxtwitter API and convert to expected format."""
+        fxtwitter_url = f"https://api.fxtwitter.com/status/{twid}"
+        logger.info(f"Falling back to fxtwitter API for tweet extraction: {fxtwitter_url}")

-            tweet_container = fetch_nitter_soup(nitter_url)
-            user = tweet_container.find("a", {"class": "username"})
-            author = user.text.strip() if user else ""
-            created_at = tweet_container.find("span", {"class": "tweet-date"})
-            timestamp = created_at.find("a")["title"] if created_at and created_at.find("a") else ""
+        @retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
+        def fetch_fxtwitter_data(url):
+            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"}
+            resp = requests.get(url, headers=headers, timeout=15)
+            if resp.status_code != 200:
+                raise ValueError(f"Failed to retrieve tweet from fxtwitter API: {resp.status_code}")
+            data = resp.json()
+            if "tweet" not in data:
+                raise ValueError(f"No tweet data in fxtwitter response: {data.get('message', 'Unknown error')}")
+            return data["tweet"]

-            full_text = tweet_container.find("div", {"class": "tweet-content"})
-            text = full_text.text.strip() if full_text else ""
+        tweet = fetch_fxtwitter_data(fxtwitter_url)

-            media = []
-            media_tags = tweet_container.find_all("a", {"class": "still-image"})
-            for m in media_tags:
-                img_url = m["href"]
-                if img_url.startswith("/"):
-                    img_url = "https://nitter.net" + img_url
-                media.append({"type": "photo", "media_url_https": img_url})
+        # Convert fxtwitter format to expected format
+        author = tweet.get("author", {}).get("name", "")
+        created_at = tweet.get("created_at", "")  # Format: "Sun Feb 08 18:45:00 +0000 2026"
+        full_text = tweet.get("text", "") or tweet.get("raw_text", "")

-            video_tags = tweet_container.find_all("video")
-            for v in video_tags:
-                src = v.find("source")
-                if src and src.get("src"):
-                    video_url = src["src"]
-                    if video_url.startswith("/"):
-                        video_url = "https://nitter.net" + video_url
-                    media.append(
-                        {"type": "video", "video_info": {"variants": [{"url": video_url, "content_type": "video/mp4"}]}}
+        # Convert media format
+        media = []
+        fx_media = tweet.get("media", {})
+
+        # Handle photos
+        for photo in fx_media.get("photos", []):
+            media.append({"type": "photo", "media_url_https": photo.get("url", "")})
+
+        # Handle videos
+        for video in fx_media.get("videos", []):
+            variants = video.get("variants", [])
+            # Convert to expected variant format
+            converted_variants = []
+            for var in variants:
+                converted_variants.append(
+                    {
+                        "url": var.get("url", ""),
+                        "content_type": var.get("content_type", "video/mp4"),
+                        "bitrate": var.get("bitrate", 0),
+                    }
+                )
+            if converted_variants:
+                media.append({"type": "video", "video_info": {"variants": converted_variants}})
+
+        # Handle animated gifs found in the combined "all" list (fxtwitter may also include these in videos)
+        for item in fx_media.get("all", []):
+            if item.get("type") == "gif":
+                variants = item.get("variants", [])
+                converted_variants = []
+                for var in variants:
+                    converted_variants.append(
+                        {
+                            "url": var.get("url", ""),
+                            "content_type": var.get("content_type", "video/mp4"),
+                            "bitrate": var.get("bitrate", 0),
+                        }
                    )
+                if converted_variants:
+                    media.append({"type": "animated_gif", "video_info": {"variants": converted_variants}})

-            return {"user": {"name": author}, "created_at": timestamp, "full_text": text, "entities": {"media": media}}
+        return {
+            "user": {"name": author},
+            "created_at": created_at,
+            "full_text": full_text,
+            "entities": {"media": media},
+        }

    def keys_to_clean(self, video_data, info_extractor):
        return ["user", "created_at", "entities", "favorited", "translator_type"]
--- a/tests/extractors/test_twitter_dropin.py
+++ b/tests/extractors/test_twitter_dropin.py
@@ -0,0 +1,238 @@
+"""
+Tests for the Twitter dropin extractor with fxtwitter fallback
+"""
+
+import pytest
+from unittest.mock import Mock, patch
+
+from auto_archiver.modules.generic_extractor.twitter import Twitter
+
+
+@pytest.fixture
+def twitter_dropin():
+    return Twitter()
+
+
+class TestTwitterFxTwitterFallback:
+    """Test the fxtwitter API fallback functionality."""
+
+    @pytest.fixture
+    def mock_fxtwitter_video_response(self):
+        return {
+            "code": 200,
+            "message": "OK",
+            "tweet": {
+                "url": "https://x.com/user/status/123456789",
+                "id": "123456789",
+                "text": "Test tweet with video",
+                "author": {
+                    "id": "111",
+                    "name": "Test User",
+                    "screen_name": "testuser",
+                },
+                "created_at": "Sun Feb 08 18:45:00 +0000 2026",
+                "media": {
+                    "all": [
+                        {
+                            "type": "video",
+                            "url": "https://video.twimg.com/test.mp4",
+                            "variants": [
+                                {"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
+                                {
+                                    "url": "https://video.twimg.com/test_480.mp4",
+                                    "content_type": "video/mp4",
+                                    "bitrate": 632000,
+                                },
+                                {
+                                    "url": "https://video.twimg.com/test_720.mp4",
+                                    "content_type": "video/mp4",
+                                    "bitrate": 2176000,
+                                },
+                            ],
+                        }
+                    ],
+                    "videos": [
+                        {
+                            "url": "https://video.twimg.com/test.mp4",
+                            "variants": [
+                                {"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
+                                {
+                                    "url": "https://video.twimg.com/test_480.mp4",
+                                    "content_type": "video/mp4",
+                                    "bitrate": 632000,
+                                },
+                                {
+                                    "url": "https://video.twimg.com/test_720.mp4",
+                                    "content_type": "video/mp4",
+                                    "bitrate": 2176000,
+                                },
+                            ],
+                        }
+                    ],
+                },
+            },
+        }
+
+    @pytest.fixture
+    def mock_fxtwitter_photo_response(self):
+        return {
+            "code": 200,
+            "message": "OK",
+            "tweet": {
+                "url": "https://x.com/user/status/123456790",
+                "id": "123456790",
+                "text": "Test tweet with photo",
+                "author": {
+                    "id": "111",
+                    "name": "Test User",
+                    "screen_name": "testuser",
+                },
+                "created_at": "Mon Feb 09 10:30:00 +0000 2026",
+                "media": {
+                    "all": [
+                        {
+                            "type": "photo",
+                            "url": "https://pbs.twimg.com/media/test.jpg?name=orig",
+                        }
+                    ],
+                    "photos": [
+                        {
+                            "type": "photo",
+                            "url": "https://pbs.twimg.com/media/test.jpg?name=orig",
+                        }
+                    ],
+                },
+            },
+        }
+
+    def test_fetch_fxtwitter_video(self, twitter_dropin, mock_fxtwitter_video_response):
+        """Test fetching a tweet with video via fxtwitter API."""
+        with patch("requests.get") as mock_get:
+            mock_response = Mock()
+            mock_response.status_code = 200
+            mock_response.json.return_value = mock_fxtwitter_video_response
+            mock_get.return_value = mock_response
+
+            result = twitter_dropin._fetch_fxtwitter("123456789")
+
+            assert result["user"]["name"] == "Test User"
+            assert result["created_at"] == "Sun Feb 08 18:45:00 +0000 2026"
+            assert result["full_text"] == "Test tweet with video"
+            assert len(result["entities"]["media"]) == 1
+            assert result["entities"]["media"][0]["type"] == "video"
+            assert "video_info" in result["entities"]["media"][0]
+            assert len(result["entities"]["media"][0]["video_info"]["variants"]) == 3
+
+    def test_fetch_fxtwitter_photo(self, twitter_dropin, mock_fxtwitter_photo_response):
+        """Test fetching a tweet with photo via fxtwitter API."""
+        with patch("requests.get") as mock_get:
+            mock_response = Mock()
+            mock_response.status_code = 200
+            mock_response.json.return_value = mock_fxtwitter_photo_response
+            mock_get.return_value = mock_response
+
+            result = twitter_dropin._fetch_fxtwitter("123456790")
+
+            assert result["user"]["name"] == "Test User"
+            assert result["created_at"] == "Mon Feb 09 10:30:00 +0000 2026"
+            assert result["full_text"] == "Test tweet with photo"
+            assert len(result["entities"]["media"]) == 1
+            assert result["entities"]["media"][0]["type"] == "photo"
+            assert result["entities"]["media"][0]["media_url_https"] == "https://pbs.twimg.com/media/test.jpg?name=orig"
+
+    def test_fetch_fxtwitter_no_media(self, twitter_dropin):
+        """Test fetching a text-only tweet via fxtwitter API."""
+        mock_response_data = {
+            "code": 200,
+            "message": "OK",
+            "tweet": {
+                "id": "123456791",
+                "text": "Just text, no media",
+                "author": {"name": "Text Only User"},
+                "created_at": "Tue Feb 10 12:00:00 +0000 2026",
+                "media": {},
+            },
+        }
+        with patch("requests.get") as mock_get:
+            mock_response = Mock()
+            mock_response.status_code = 200
+            mock_response.json.return_value = mock_response_data
+            mock_get.return_value = mock_response
+
+            result = twitter_dropin._fetch_fxtwitter("123456791")
+
+            assert result["user"]["name"] == "Text Only User"
+            assert result["full_text"] == "Just text, no media"
+            assert result["entities"]["media"] == []
+
+    def test_fetch_fxtwitter_api_error(self, twitter_dropin):
+        """Test handling of fxtwitter API errors."""
+        with patch("requests.get") as mock_get:
+            mock_response = Mock()
+            mock_response.status_code = 404
+            mock_get.return_value = mock_response
+
+            with pytest.raises(Exception):
+                twitter_dropin._fetch_fxtwitter("nonexistent")
+
+
+class TestTwitterChooseVariant:
+    """Test the video variant selection logic."""
+
+    def test_choose_highest_quality_video(self, twitter_dropin):
+        """Test that the highest quality video variant is selected."""
+        variants = [
+            {"url": "https://video.twimg.com/vid/320x240/test.mp4", "content_type": "video/mp4"},
+            {"url": "https://video.twimg.com/vid/1280x720/test.mp4", "content_type": "video/mp4"},
+            {"url": "https://video.twimg.com/vid/640x480/test.mp4", "content_type": "video/mp4"},
+        ]
+
+        result = twitter_dropin.choose_variant(variants)
+
+        assert result["url"] == "https://video.twimg.com/vid/1280x720/test.mp4"
+
+    def test_choose_variant_fallback_for_non_mp4(self, twitter_dropin):
+        """Test fallback when no mp4 variant is available."""
+        variants = [
+            {"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
+        ]
+
+        result = twitter_dropin.choose_variant(variants)
+
+        assert result["url"] == "https://video.twimg.com/test.m3u8"
+
+    def test_choose_variant_prefers_mp4(self, twitter_dropin):
+        """Test that mp4 is preferred over other formats when quality is equal."""
+        variants = [
+            {"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
+            {"url": "https://video.twimg.com/vid/1280x720/test.mp4", "content_type": "video/mp4"},
+        ]
+
+        result = twitter_dropin.choose_variant(variants)
+
+        assert result["content_type"] == "video/mp4"
+
+
+@pytest.mark.download
+class TestTwitterFxTwitterLive:
+    """Live integration tests for fxtwitter API - requires network access."""
+
+    @pytest.mark.parametrize(
+        "tweet_id,expected_media_type",
+        [
+            ("2020569571682312581", "video"),  # Video tweet
+            ("2020410438198890618", "video"),  # Video tweet
+            ("2020341585502957801", "photo"),  # Photo tweet
+        ],
+    )
+    def test_fetch_real_tweets(self, twitter_dropin, tweet_id, expected_media_type):
+        """Test fetching real tweets from fxtwitter API."""
+        result = twitter_dropin._fetch_fxtwitter(tweet_id)
+
+        assert result["user"]["name"]  # Author should be non-empty
+        assert result["created_at"]  # Should have timestamp
+        assert result["full_text"]  # Should have text content
+
+        media = result["entities"]["media"]
+        assert len(media) >= 1
+        assert media[0]["type"] == expected_media_type