From b9ab26ed5a01bcb28aa96a4d308549ae60f63161 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 23 Feb 2026 12:20:10 +0000
Subject: [PATCH] see #400 WIP nitter not working as of now

---
Review notes: the nitter fallback now logs the primary extractor error, no
longer dumps the fetched page at ERROR level, and tolerates anchors without
href/title. The post-image blob id on the index line predates these revisions.

 .../modules/generic_extractor/twitter.py      | 68 ++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py
index 0141e1b..c20fdd0 100644
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -9,6 +9,9 @@ from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
 from auto_archiver.core.extractor import Extractor
 from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
 from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
+import requests
+from bs4 import BeautifulSoup
+from retrying import retry
 
 
 class Twitter(GenericDropin):
@@ -29,7 +32,70 @@ class Twitter(GenericDropin):
 
     def extract_post(self, url: str, ie_instance: InfoExtractor):
+        """Return the raw post data for the tweet at `url`.
+
+        Tries `ie_instance._extract_status` first. If that raises or returns data
+        without "user"/"created_at", scrapes https://nitter.net instead (WIP, see
+        #400) and returns a dict with "user", "created_at", "full_text" and
+        "entities" keys. Errors from the nitter fallback propagate to the caller.
+        """
         twid = ie_instance._match_valid_url(url).group("id")
-        return ie_instance._extract_status(twid=twid)
+        try:
+            post_data = ie_instance._extract_status(twid=twid)
+            if not post_data or not post_data.get("user") or not post_data.get("created_at"):
+                raise ValueError("Error retrieving post with twitter dropin")
+            return post_data
+        except Exception as e:
+            # do not swallow the primary failure silently before trying nitter
+            logger.warning(f"Twitter dropin extraction failed for tweet {twid}: {e}")
+            nitter_base = "https://nitter.net"
+            nitter_url = f"{nitter_base}/i/status/{twid}"
+            logger.info(f"Falling back to nitter.net for tweet extraction at {nitter_url}")
+
+            def absolute(href):
+                # media links on the page may be site-relative; make them absolute
+                return nitter_base + href if href.startswith("/") else href
+
+            @retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
+            def fetch_nitter_soup(url):
+                headers = {
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
+                }
+                resp = requests.get(url, headers=headers, timeout=10)
+                if resp.status_code != 200:
+                    raise ValueError("Failed to retrieve tweet from nitter.net")
+                # full page dump is only useful while debugging the scraper
+                logger.debug(resp.text)
+                soup = BeautifulSoup(resp.text, "html.parser")
+                tweet_container = soup.find("div", {"class": "main-tweet"})
+                if not tweet_container:
+                    raise ValueError("Could not find tweet container on nitter.net page")
+                return tweet_container
+
+            tweet_container = fetch_nitter_soup(nitter_url)
+            user = tweet_container.find("a", {"class": "username"})
+            author = user.text.strip() if user else ""
+            date_span = tweet_container.find("span", {"class": "tweet-date"})
+            date_link = date_span.find("a") if date_span else None
+            timestamp = date_link.get("title", "") if date_link else ""
+
+            full_text = tweet_container.find("div", {"class": "tweet-content"})
+            text = full_text.text.strip() if full_text else ""
+
+            media = []
+            for m in tweet_container.find_all("a", {"class": "still-image"}):
+                href = m.get("href")
+                if href:  # skip anchors without a link instead of raising KeyError
+                    media.append({"type": "photo", "media_url_https": absolute(href)})
+
+            for v in tweet_container.find_all("video"):
+                src = v.find("source")
+                if src and src.get("src"):
+                    video_url = absolute(src["src"])
+                    media.append(
+                        {"type": "video", "video_info": {"variants": [{"url": video_url, "content_type": "video/mp4"}]}}
+                    )
+
+            return {"user": {"name": author}, "created_at": timestamp, "full_text": text, "entities": {"media": media}}
 
     def keys_to_clean(self, video_data, info_extractor):
         return ["user", "created_at", "entities", "favorited", "translator_type"]