mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
fxtwitter working instead of nitter
This commit is contained in:
@@ -10,7 +10,6 @@ from auto_archiver.core.extractor import Extractor
|
|||||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
|
|
||||||
|
|
||||||
@@ -37,56 +36,80 @@ class Twitter(GenericDropin):
|
|||||||
if not post_data or not post_data.get("user") or not post_data.get("created_at"):
|
if not post_data or not post_data.get("user") or not post_data.get("created_at"):
|
||||||
raise ValueError("Error retrieving post with twitter dropin")
|
raise ValueError("Error retrieving post with twitter dropin")
|
||||||
return post_data
|
return post_data
|
||||||
except Exception:
|
except Exception as e:
|
||||||
# try nitter
|
logger.debug(f"yt-dlp twitter extraction failed: {e}")
|
||||||
nitter_url = f"https://nitter.net/i/status/{twid}"
|
# try fxtwitter API as fallback
|
||||||
# nitter_url = f"https://nitter.space/i/status/{twid}"
|
return self._fetch_fxtwitter(twid)
|
||||||
logger.info(f"Falling back to nitter.net for tweet extraction at {nitter_url}")
|
|
||||||
|
|
||||||
@retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
|
def _fetch_fxtwitter(self, twid: str) -> dict:
|
||||||
def fetch_nitter_soup(url):
|
"""Fetch tweet data from fxtwitter API and convert to expected format."""
|
||||||
headers = {
|
fxtwitter_url = f"https://api.fxtwitter.com/status/{twid}"
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
|
logger.info(f"Falling back to fxtwitter API for tweet extraction: {fxtwitter_url}")
|
||||||
}
|
|
||||||
resp = requests.get(url, headers=headers, timeout=10)
|
|
||||||
if resp.status_code != 200:
|
|
||||||
raise ValueError("Failed to retrieve tweet from nitter.net")
|
|
||||||
logger.error(resp.text)
|
|
||||||
soup = BeautifulSoup(resp.text, "html.parser")
|
|
||||||
tweet_container = soup.find("div", {"class": "main-tweet"})
|
|
||||||
if not tweet_container:
|
|
||||||
raise ValueError("Could not find tweet container on nitter.net page")
|
|
||||||
return tweet_container
|
|
||||||
|
|
||||||
tweet_container = fetch_nitter_soup(nitter_url)
|
@retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
|
||||||
user = tweet_container.find("a", {"class": "username"})
|
def fetch_fxtwitter_data(url):
|
||||||
author = user.text.strip() if user else ""
|
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"}
|
||||||
created_at = tweet_container.find("span", {"class": "tweet-date"})
|
resp = requests.get(url, headers=headers, timeout=15)
|
||||||
timestamp = created_at.find("a")["title"] if created_at and created_at.find("a") else ""
|
if resp.status_code != 200:
|
||||||
|
raise ValueError(f"Failed to retrieve tweet from fxtwitter API: {resp.status_code}")
|
||||||
|
data = resp.json()
|
||||||
|
if "tweet" not in data:
|
||||||
|
raise ValueError(f"No tweet data in fxtwitter response: {data.get('message', 'Unknown error')}")
|
||||||
|
return data["tweet"]
|
||||||
|
|
||||||
full_text = tweet_container.find("div", {"class": "tweet-content"})
|
tweet = fetch_fxtwitter_data(fxtwitter_url)
|
||||||
text = full_text.text.strip() if full_text else ""
|
|
||||||
|
|
||||||
media = []
|
# Convert fxtwitter format to expected format
|
||||||
media_tags = tweet_container.find_all("a", {"class": "still-image"})
|
author = tweet.get("author", {}).get("name", "")
|
||||||
for m in media_tags:
|
created_at = tweet.get("created_at", "") # Format: "Sun Feb 08 18:45:00 +0000 2026"
|
||||||
img_url = m["href"]
|
full_text = tweet.get("text", "") or tweet.get("raw_text", "")
|
||||||
if img_url.startswith("/"):
|
|
||||||
img_url = "https://nitter.net" + img_url
|
|
||||||
media.append({"type": "photo", "media_url_https": img_url})
|
|
||||||
|
|
||||||
video_tags = tweet_container.find_all("video")
|
# Convert media format
|
||||||
for v in video_tags:
|
media = []
|
||||||
src = v.find("source")
|
fx_media = tweet.get("media", {})
|
||||||
if src and src.get("src"):
|
|
||||||
video_url = src["src"]
|
# Handle photos
|
||||||
if video_url.startswith("/"):
|
for photo in fx_media.get("photos", []):
|
||||||
video_url = "https://nitter.net" + video_url
|
media.append({"type": "photo", "media_url_https": photo.get("url", "")})
|
||||||
media.append(
|
|
||||||
{"type": "video", "video_info": {"variants": [{"url": video_url, "content_type": "video/mp4"}]}}
|
# Handle videos
|
||||||
|
for video in fx_media.get("videos", []):
|
||||||
|
variants = video.get("variants", [])
|
||||||
|
# Convert to expected variant format
|
||||||
|
converted_variants = []
|
||||||
|
for var in variants:
|
||||||
|
converted_variants.append(
|
||||||
|
{
|
||||||
|
"url": var.get("url", ""),
|
||||||
|
"content_type": var.get("content_type", "video/mp4"),
|
||||||
|
"bitrate": var.get("bitrate", 0),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if converted_variants:
|
||||||
|
media.append({"type": "video", "video_info": {"variants": converted_variants}})
|
||||||
|
|
||||||
|
# Handle animated gifs (fxtwitter may include these in videos)
|
||||||
|
for item in fx_media.get("all", []):
|
||||||
|
if item.get("type") == "gif":
|
||||||
|
variants = item.get("variants", [])
|
||||||
|
converted_variants = []
|
||||||
|
for var in variants:
|
||||||
|
converted_variants.append(
|
||||||
|
{
|
||||||
|
"url": var.get("url", ""),
|
||||||
|
"content_type": var.get("content_type", "video/mp4"),
|
||||||
|
"bitrate": var.get("bitrate", 0),
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
if converted_variants:
|
||||||
|
media.append({"type": "animated_gif", "video_info": {"variants": converted_variants}})
|
||||||
|
|
||||||
return {"user": {"name": author}, "created_at": timestamp, "full_text": text, "entities": {"media": media}}
|
return {
|
||||||
|
"user": {"name": author},
|
||||||
|
"created_at": created_at,
|
||||||
|
"full_text": full_text,
|
||||||
|
"entities": {"media": media},
|
||||||
|
}
|
||||||
|
|
||||||
def keys_to_clean(self, video_data, info_extractor):
|
def keys_to_clean(self, video_data, info_extractor):
|
||||||
return ["user", "created_at", "entities", "favorited", "translator_type"]
|
return ["user", "created_at", "entities", "favorited", "translator_type"]
|
||||||
|
|||||||
238
tests/extractors/test_twitter_dropin.py
Normal file
238
tests/extractors/test_twitter_dropin.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""
|
||||||
|
Tests for the Twitter dropin extractor with fxtwitter fallback
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
|
from auto_archiver.modules.generic_extractor.twitter import Twitter
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def twitter_dropin():
|
||||||
|
return Twitter()
|
||||||
|
|
||||||
|
|
||||||
|
class TestTwitterFxTwitterFallback:
|
||||||
|
"""Test the fxtwitter API fallback functionality."""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_fxtwitter_video_response(self):
|
||||||
|
return {
|
||||||
|
"code": 200,
|
||||||
|
"message": "OK",
|
||||||
|
"tweet": {
|
||||||
|
"url": "https://x.com/user/status/123456789",
|
||||||
|
"id": "123456789",
|
||||||
|
"text": "Test tweet with video",
|
||||||
|
"author": {
|
||||||
|
"id": "111",
|
||||||
|
"name": "Test User",
|
||||||
|
"screen_name": "testuser",
|
||||||
|
},
|
||||||
|
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||||||
|
"media": {
|
||||||
|
"all": [
|
||||||
|
{
|
||||||
|
"type": "video",
|
||||||
|
"url": "https://video.twimg.com/test.mp4",
|
||||||
|
"variants": [
|
||||||
|
{"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/test_480.mp4",
|
||||||
|
"content_type": "video/mp4",
|
||||||
|
"bitrate": 632000,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/test_720.mp4",
|
||||||
|
"content_type": "video/mp4",
|
||||||
|
"bitrate": 2176000,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"videos": [
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/test.mp4",
|
||||||
|
"variants": [
|
||||||
|
{"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/test_480.mp4",
|
||||||
|
"content_type": "video/mp4",
|
||||||
|
"bitrate": 632000,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/test_720.mp4",
|
||||||
|
"content_type": "video/mp4",
|
||||||
|
"bitrate": 2176000,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_fxtwitter_photo_response(self):
|
||||||
|
return {
|
||||||
|
"code": 200,
|
||||||
|
"message": "OK",
|
||||||
|
"tweet": {
|
||||||
|
"url": "https://x.com/user/status/123456790",
|
||||||
|
"id": "123456790",
|
||||||
|
"text": "Test tweet with photo",
|
||||||
|
"author": {
|
||||||
|
"id": "111",
|
||||||
|
"name": "Test User",
|
||||||
|
"screen_name": "testuser",
|
||||||
|
},
|
||||||
|
"created_at": "Mon Feb 09 10:30:00 +0000 2026",
|
||||||
|
"media": {
|
||||||
|
"all": [
|
||||||
|
{
|
||||||
|
"type": "photo",
|
||||||
|
"url": "https://pbs.twimg.com/media/test.jpg?name=orig",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"photos": [
|
||||||
|
{
|
||||||
|
"type": "photo",
|
||||||
|
"url": "https://pbs.twimg.com/media/test.jpg?name=orig",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_fetch_fxtwitter_video(self, twitter_dropin, mock_fxtwitter_video_response):
|
||||||
|
"""Test fetching a tweet with video via fxtwitter API."""
|
||||||
|
with patch("requests.get") as mock_get:
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.json.return_value = mock_fxtwitter_video_response
|
||||||
|
mock_get.return_value = mock_response
|
||||||
|
|
||||||
|
result = twitter_dropin._fetch_fxtwitter("123456789")
|
||||||
|
|
||||||
|
assert result["user"]["name"] == "Test User"
|
||||||
|
assert result["created_at"] == "Sun Feb 08 18:45:00 +0000 2026"
|
||||||
|
assert result["full_text"] == "Test tweet with video"
|
||||||
|
assert len(result["entities"]["media"]) == 1
|
||||||
|
assert result["entities"]["media"][0]["type"] == "video"
|
||||||
|
assert "video_info" in result["entities"]["media"][0]
|
||||||
|
assert len(result["entities"]["media"][0]["video_info"]["variants"]) == 3
|
||||||
|
|
||||||
|
def test_fetch_fxtwitter_photo(self, twitter_dropin, mock_fxtwitter_photo_response):
|
||||||
|
"""Test fetching a tweet with photo via fxtwitter API."""
|
||||||
|
with patch("requests.get") as mock_get:
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.json.return_value = mock_fxtwitter_photo_response
|
||||||
|
mock_get.return_value = mock_response
|
||||||
|
|
||||||
|
result = twitter_dropin._fetch_fxtwitter("123456790")
|
||||||
|
|
||||||
|
assert result["user"]["name"] == "Test User"
|
||||||
|
assert result["created_at"] == "Mon Feb 09 10:30:00 +0000 2026"
|
||||||
|
assert result["full_text"] == "Test tweet with photo"
|
||||||
|
assert len(result["entities"]["media"]) == 1
|
||||||
|
assert result["entities"]["media"][0]["type"] == "photo"
|
||||||
|
assert result["entities"]["media"][0]["media_url_https"] == "https://pbs.twimg.com/media/test.jpg?name=orig"
|
||||||
|
|
||||||
|
def test_fetch_fxtwitter_no_media(self, twitter_dropin):
|
||||||
|
"""Test fetching a text-only tweet via fxtwitter API."""
|
||||||
|
mock_response_data = {
|
||||||
|
"code": 200,
|
||||||
|
"message": "OK",
|
||||||
|
"tweet": {
|
||||||
|
"id": "123456791",
|
||||||
|
"text": "Just text, no media",
|
||||||
|
"author": {"name": "Text Only User"},
|
||||||
|
"created_at": "Tue Feb 10 12:00:00 +0000 2026",
|
||||||
|
"media": {},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
with patch("requests.get") as mock_get:
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.json.return_value = mock_response_data
|
||||||
|
mock_get.return_value = mock_response
|
||||||
|
|
||||||
|
result = twitter_dropin._fetch_fxtwitter("123456791")
|
||||||
|
|
||||||
|
assert result["user"]["name"] == "Text Only User"
|
||||||
|
assert result["full_text"] == "Just text, no media"
|
||||||
|
assert result["entities"]["media"] == []
|
||||||
|
|
||||||
|
def test_fetch_fxtwitter_api_error(self, twitter_dropin):
|
||||||
|
"""Test handling of fxtwitter API errors."""
|
||||||
|
with patch("requests.get") as mock_get:
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.status_code = 404
|
||||||
|
mock_get.return_value = mock_response
|
||||||
|
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
twitter_dropin._fetch_fxtwitter("nonexistent")
|
||||||
|
|
||||||
|
|
||||||
|
class TestTwitterChooseVariant:
|
||||||
|
"""Test the video variant selection logic."""
|
||||||
|
|
||||||
|
def test_choose_highest_quality_video(self, twitter_dropin):
|
||||||
|
"""Test that the highest quality video variant is selected."""
|
||||||
|
variants = [
|
||||||
|
{"url": "https://video.twimg.com/vid/320x240/test.mp4", "content_type": "video/mp4"},
|
||||||
|
{"url": "https://video.twimg.com/vid/1280x720/test.mp4", "content_type": "video/mp4"},
|
||||||
|
{"url": "https://video.twimg.com/vid/640x480/test.mp4", "content_type": "video/mp4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
result = twitter_dropin.choose_variant(variants)
|
||||||
|
|
||||||
|
assert result["url"] == "https://video.twimg.com/vid/1280x720/test.mp4"
|
||||||
|
|
||||||
|
def test_choose_variant_fallback_for_non_mp4(self, twitter_dropin):
|
||||||
|
"""Test fallback when no mp4 variant is available."""
|
||||||
|
variants = [
|
||||||
|
{"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
|
||||||
|
]
|
||||||
|
|
||||||
|
result = twitter_dropin.choose_variant(variants)
|
||||||
|
|
||||||
|
assert result["url"] == "https://video.twimg.com/test.m3u8"
|
||||||
|
|
||||||
|
def test_choose_variant_prefers_mp4(self, twitter_dropin):
|
||||||
|
"""Test that mp4 is preferred over other formats when quality is equal."""
|
||||||
|
variants = [
|
||||||
|
{"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
|
||||||
|
{"url": "https://video.twimg.com/vid/1280x720/test.mp4", "content_type": "video/mp4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
result = twitter_dropin.choose_variant(variants)
|
||||||
|
|
||||||
|
assert result["content_type"] == "video/mp4"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.download
|
||||||
|
class TestTwitterFxTwitterLive:
|
||||||
|
"""Live integration tests for fxtwitter API - requires network access."""
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"tweet_id,expected_media_type",
|
||||||
|
[
|
||||||
|
("2020569571682312581", "video"), # Video tweet
|
||||||
|
("2020410438198890618", "video"), # Video tweet
|
||||||
|
("2020341585502957801", "photo"), # Photo tweet
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_fetch_real_tweets(self, twitter_dropin, tweet_id, expected_media_type):
|
||||||
|
"""Test fetching real tweets from fxtwitter API."""
|
||||||
|
result = twitter_dropin._fetch_fxtwitter(tweet_id)
|
||||||
|
|
||||||
|
assert result["user"]["name"] # Author should be non-empty
|
||||||
|
assert result["created_at"] # Should have timestamp
|
||||||
|
assert result["full_text"] # Should have text content
|
||||||
|
|
||||||
|
media = result["entities"]["media"]
|
||||||
|
assert len(media) >= 1
|
||||||
|
assert media[0]["type"] == expected_media_type
|
||||||
Reference in New Issue
Block a user