mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
see #400 WIP nitter not working as of now
This commit is contained in:
@@ -9,6 +9,9 @@ from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
|||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from retrying import retry
|
||||||
|
|
||||||
|
|
||||||
class Twitter(GenericDropin):
|
class Twitter(GenericDropin):
|
||||||
@@ -29,7 +32,61 @@ class Twitter(GenericDropin):
|
|||||||
|
|
||||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||||
twid = ie_instance._match_valid_url(url).group("id")
|
twid = ie_instance._match_valid_url(url).group("id")
|
||||||
return ie_instance._extract_status(twid=twid)
|
try:
|
||||||
|
post_data = ie_instance._extract_status(twid=twid)
|
||||||
|
if not post_data or not post_data.get("user") or not post_data.get("created_at"):
|
||||||
|
raise ValueError("Error retrieving post with twitter dropin")
|
||||||
|
return post_data
|
||||||
|
except Exception:
|
||||||
|
# try nitter
|
||||||
|
nitter_url = f"https://nitter.net/i/status/{twid}"
|
||||||
|
# nitter_url = f"https://nitter.space/i/status/{twid}"
|
||||||
|
logger.info(f"Falling back to nitter.net for tweet extraction at {nitter_url}")
|
||||||
|
|
||||||
|
@retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
|
||||||
|
def fetch_nitter_soup(url):
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
|
||||||
|
}
|
||||||
|
resp = requests.get(url, headers=headers, timeout=10)
|
||||||
|
if resp.status_code != 200:
|
||||||
|
raise ValueError("Failed to retrieve tweet from nitter.net")
|
||||||
|
logger.error(resp.text)
|
||||||
|
soup = BeautifulSoup(resp.text, "html.parser")
|
||||||
|
tweet_container = soup.find("div", {"class": "main-tweet"})
|
||||||
|
if not tweet_container:
|
||||||
|
raise ValueError("Could not find tweet container on nitter.net page")
|
||||||
|
return tweet_container
|
||||||
|
|
||||||
|
tweet_container = fetch_nitter_soup(nitter_url)
|
||||||
|
user = tweet_container.find("a", {"class": "username"})
|
||||||
|
author = user.text.strip() if user else ""
|
||||||
|
created_at = tweet_container.find("span", {"class": "tweet-date"})
|
||||||
|
timestamp = created_at.find("a")["title"] if created_at and created_at.find("a") else ""
|
||||||
|
|
||||||
|
full_text = tweet_container.find("div", {"class": "tweet-content"})
|
||||||
|
text = full_text.text.strip() if full_text else ""
|
||||||
|
|
||||||
|
media = []
|
||||||
|
media_tags = tweet_container.find_all("a", {"class": "still-image"})
|
||||||
|
for m in media_tags:
|
||||||
|
img_url = m["href"]
|
||||||
|
if img_url.startswith("/"):
|
||||||
|
img_url = "https://nitter.net" + img_url
|
||||||
|
media.append({"type": "photo", "media_url_https": img_url})
|
||||||
|
|
||||||
|
video_tags = tweet_container.find_all("video")
|
||||||
|
for v in video_tags:
|
||||||
|
src = v.find("source")
|
||||||
|
if src and src.get("src"):
|
||||||
|
video_url = src["src"]
|
||||||
|
if video_url.startswith("/"):
|
||||||
|
video_url = "https://nitter.net" + video_url
|
||||||
|
media.append(
|
||||||
|
{"type": "video", "video_info": {"variants": [{"url": video_url, "content_type": "video/mp4"}]}}
|
||||||
|
)
|
||||||
|
|
||||||
|
return {"user": {"name": author}, "created_at": timestamp, "full_text": text, "entities": {"media": media}}
|
||||||
|
|
||||||
def keys_to_clean(self, video_data, info_extractor):
|
def keys_to_clean(self, video_data, info_extractor):
|
||||||
return ["user", "created_at", "entities", "favorited", "translator_type"]
|
return ["user", "created_at", "entities", "favorited", "translator_type"]
|
||||||
|
|||||||
Reference in New Issue
Block a user