Merge branch 'JustAnotherArchivist:master' into master

2026-06-11 03:48:29 +03:00 · 2023-03-01 22:49:43 +03:00
parent 7061ad2eb5 03ef3debaf
commit aa8d93e07c
3 changed files with 43 additions and 31 deletions
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ The following services are currently supported:
 * Mastodon: user profiles and toots (single or thread)
 * Reddit: users, subreddits, and searches (via Pushshift)
 * Telegram: channels
-* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
+* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends
 * VKontakte: user profiles
 * Weibo (Sina Weibo): user profiles

--- a/snscrape/_cli.py
+++ b/snscrape/_cli.py
@@ -6,6 +6,7 @@ import datetime
 import importlib.metadata
 import inspect
 import logging
+import os
 import requests
 # Imported in parse_args() after setting up the logger:
 #import snscrape.base
@@ -307,32 +308,36 @@ def main():

 	i = 0
 	with _dump_locals_on_exception():
-		if args.withEntity and (entity := scraper.entity):
-			if args.jsonl:
-				print(entity.json())
+		try:
+			if args.withEntity and (entity := scraper.entity):
+				if args.jsonl:
+					print(entity.json())
+				else:
+					print(entity)
+			if args.maxResults == 0:
+				logger.info('Exiting after 0 results')
+				return
+			for i, item in enumerate(scraper.get_items(), start = 1):
+				if args.since is not None and item.date < args.since:
+					logger.info(f'Exiting due to reaching older results than {args.since}')
+					break
+				if args.jsonl:
+					print(item.json())
+				elif args.format is not None:
+					print(args.format.format(item))
+				else:
+					print(item)
+				if args.progress and i % 100 == 0:
+					print(f'Scraping, {i} results so far', file = sys.stderr)
+				if args.maxResults and i >= args.maxResults:
+					logger.info(f'Exiting after {i} results')
+					if args.progress:
+						print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
+					break
 			else:
-				print(entity)
-		if args.maxResults == 0:
-			logger.info('Exiting after 0 results')
-			return
-		for i, item in enumerate(scraper.get_items(), start = 1):
-			if args.since is not None and item.date < args.since:
-				logger.info(f'Exiting due to reaching older results than {args.since}')
-				break
-			if args.jsonl:
-				print(item.json())
-			elif args.format is not None:
-				print(args.format.format(item))
-			else:
-				print(item)
-			if args.progress and i % 100 == 0:
-				print(f'Scraping, {i} results so far', file = sys.stderr)
-			if args.maxResults and i >= args.maxResults:
-				logger.info(f'Exiting after {i} results')
+				logger.info(f'Done, found {i} results')
 				if args.progress:
-					print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
-				break
-		else:
-			logger.info(f'Done, found {i} results')
-			if args.progress:
-				print(f'Finished, {i} results', file = sys.stderr)
+					print(f'Finished, {i} results', file = sys.stderr)
+		except BrokenPipeError:
+			os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
+			sys.exit(1)
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -1412,14 +1412,21 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
 			#TODO Tombstones will cause a crash here.
 			kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result'])
 		if 'quoted_status_result' in result:
-			kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str']))
+			if 'result' not in result['quoted_status_result']:
+				_logger.warning(f'quoted_status_result for {tweet["quoted_status_id_str"]} without an actual result on tweet {self._get_tweet_id(tweet)}, using TweetRef')
+				kwargs['quotedTweet'] = TweetRef(int(tweet['quoted_status_id_str']))
+			else:
+				kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str']))
 		elif result.get('quotedRefResult'):
 			if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone':
 				kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str']))
 			else:
-				if result['quotedRefResult']['result']['__typename'] != 'Tweet':
+				qTweet = result['quotedRefResult']['result']
+				if result['quotedRefResult']['result']['__typename'] not in ('Tweet', 'TweetWithVisibilityResults'):
 					_logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef')
-				kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id']))
+				elif result['quotedRefResult']['result']['__typename'] == 'TweetWithVisibilityResults':
+					qTweet = qTweet['tweet']
+				kwargs['quotedTweet'] = TweetRef(id = int(qTweet['rest_id']))
 		elif 'quoted_status_id_str' in tweet:
 			# Omit the TweetRef if this is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet.
 			if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'):