Move Tweet object generation to TwitterAPIScraper

2026-06-12 20:38:29 +03:00 · 2020-09-01 02:25:00 +00:00
parent dc5d55004b
commit bfb92a47b9
1 changed file with 29 additions and 26 deletions
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -200,6 +200,34 @@ class TwitterAPIScraper(TwitterCommonScraper):
 			reqParams = paginationParams.copy()
 			reqParams['cursor'] = cursor
 	def _instructions_to_tweets(self, obj):
 		'''Yield a Tweet for each handled timeline entry found in obj.

 		obj is indexed as a mapping with a 'timeline' -> 'instructions' list and
 		'globalObjects' -> 'tweets' / 'users' lookup tables (presumably one decoded
 		API response; confirm against the caller).

 		Only 'addEntries' and 'replaceEntry' instructions are looked at, and within
 		them only entries whose entryId starts with 'sq-I-t-'. Promoted tweets are
 		skipped.

 		Raises snscrape.base.ScraperException for a matching entry that carries
 		neither a tweet nor a tombstone referencing a tweet. Any missing key
 		propagates as the underlying lookup error (see the comment below).
 		'''
 		# No data format test, just a hard and loud crash if anything's wrong :-)
 		for instruction in obj['timeline']['instructions']:
 			# Normalise both supported instruction kinds to a list of entries.
 			if 'addEntries' in instruction:
 				entries = instruction['addEntries']['entries']
 			elif 'replaceEntry' in instruction:
 				entries = [instruction['replaceEntry']['entry']]
 			else:
 				# Any other instruction kind is ignored.
 				continue
 			for entry in entries:
 				# Entries with any other ID prefix are silently ignored.
 				# NOTE(review): 'sq-I-t-' presumably marks tweet items — confirm.
 				if entry['entryId'].startswith('sq-I-t-'):
 					# The entry only references the tweet by ID; the tweet data itself is looked up in globalObjects.
 					if 'tweet' in entry['content']['item']['content']:
 						if 'promotedMetadata' in entry['content']['item']['content']['tweet']: # Promoted tweet aka ads
 							continue
 						tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tweet']['id']]
 					elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']:
 						# A tombstone that still references a tweet is resolved the same way.
 						tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']]
 					else:
 						raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}')
 					tweetID = tweet['id']
 					content = tweet['full_text']
 					# The author's screen name comes from the users table, keyed by the tweet's user_id_str.
 					username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
 					# The format string requires a literal '+0000' offset, so the parsed naive datetime is tagged as UTC.
 					date = datetime.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo = datetime.timezone.utc)
 					# Expanded target URLs and their shortened 'url' counterparts, in the same order.
 					outlinks = [u['expanded_url'] for u in tweet['entities']['urls']]
 					tcooutlinks = [u['url'] for u in tweet['entities']['urls']]
 					url = f'https://twitter.com/{username}/status/{tweetID}'
 					# Both link lists are passed twice: once as a list and once as a space-joined string.
 					yield Tweet(url, date, content, tweetID, username, outlinks, ' '.join(outlinks), tcooutlinks, ' '.join(tcooutlinks))
 class TwitterSearchScraper(TwitterAPIScraper):
 	name = 'twitter-search'
@@ -255,32 +283,7 @@ class TwitterSearchScraper(TwitterAPIScraper):
 			d['ext'] = 'ext=mediaStats%2ChighlightedLabel'
 		for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', params, paginationParams):
-			# No data format test, just a hard and loud crash if anything's wrong :-)
+			yield from self._instructions_to_tweets(obj)
 			for instruction in obj['timeline']['instructions']:
 				if 'addEntries' in instruction:
 					entries = instruction['addEntries']['entries']
 				elif 'replaceEntry' in instruction:
 					entries = [instruction['replaceEntry']['entry']]
 				else:
 					continue
 				for entry in entries:
 					if entry['entryId'].startswith('sq-I-t-'):
 						if 'tweet' in entry['content']['item']['content']:
 							if 'promotedMetadata' in entry['content']['item']['content']['tweet']: # Promoted tweet aka ads
 								continue
 							tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tweet']['id']]
 						elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']:
 							tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']]
 						else:
 							raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}')
 						tweetID = tweet['id']
 						content = tweet['full_text']
 						username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
 						date = datetime.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo = datetime.timezone.utc)
 						outlinks = [u['expanded_url'] for u in tweet['entities']['urls']]
 						tcooutlinks = [u['url'] for u in tweet['entities']['urls']]
 						url = f'https://twitter.com/{username}/status/{tweetID}'
 						yield Tweet(url, date, content, tweetID, username, outlinks, ' '.join(outlinks), tcooutlinks, ' '.join(tcooutlinks))
 	@classmethod
 	def setup_parser(cls, subparser):