Force TwitterThreadScraper and TwitterListMembersScraper to fetch the old design

Port TwitterSearchScraper to redesign
Fixes #57
2026-06-09 19:08:28 +03:00 · 2020-03-04 00:40:49 +00:00 · 2020-03-04 00:40:49 +00:00 · 2020-02-09 23:48:59 +00:00 · 2020-02-07 11:30:16 +00:00 · 2020-02-05 16:15:10 +00:00
4 changed files with 133 additions and 71 deletions
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -80,6 +80,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 			return False, None

 	def _soup_to_items(self, soup, baseUrl, mode):
+		cleanUrl = None # Value from previous iteration is used for warning on link-less entries
 		for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019
 			entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry.
 			mediaSetA = entry.find('a', class_ = '_17z-')
@@ -96,6 +97,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 					logger.warning(f'Ignoring odd link: {href}')
 				continue
 			dirtyUrl = urllib.parse.urljoin(baseUrl, href)
+			cleanUrl = self._clean_url(dirtyUrl)
 			date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
 			contentDiv = entry.find('div', class_ = '_5pbx')
 			if contentDiv:
@@ -116,7 +118,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 				outlink = query['u'][0]
 				if outlink.startswith('http://') or outlink.startswith('https://') and outlink not in outlinks:
 					outlinks.append(outlink)
-			yield FacebookPost(cleanUrl = self._clean_url(dirtyUrl), dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks, outlinksss = ' '.join(outlinks))
+			yield FacebookPost(cleanUrl = cleanUrl, dirtyUrl = dirtyUrl, date = date, content = content, outlinks = outlinks, outlinksss = ' '.join(outlinks))


 class FacebookUserScraper(FacebookCommonScraper):
--- a/snscrape/modules/instagram.py
+++ b/snscrape/modules/instagram.py
@@ -16,6 +16,11 @@ class InstagramPost(typing.NamedTuple, snscrape.base.Item):
 	content: str
 	thumbnailUrl: str
 	displayUrl: str
+	username: str
+	likes: int
+	comments: int
+	commentsDisabled: bool
+	isVideo: bool

 	def __str__(self):
 		return self.cleanUrl
@@ -57,7 +62,8 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 	def _response_to_items(self, response):
 		for node in response[self._responseContainer][self._edgeXToMedia]['edges']:
 			code = node['node']['shortcode']
-			usernameQuery = '?taken-by=' + node['node']['owner']['username'] if 'username' in node['node']['owner'] else ''
+			username = node['node']['owner']['username'] if 'username' in node['node']['owner'] else ''
+			usernameQuery = '?taken-by=' + username
 			cleanUrl = f'https://www.instagram.com/p/{code}/'
 			yield InstagramPost(
 			  cleanUrl = cleanUrl,
@@ -66,6 +72,11 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 			  content = node['node']['edge_media_to_caption']['edges'][0]['node']['text'] if len(node['node']['edge_media_to_caption']['edges']) else None,
 			  thumbnailUrl = node['node']['thumbnail_src'],
 			  displayUrl = node['node']['display_url'],
+			  username = username,
+			  likes = node['node']['edge_media_preview_like']['count'],
+			  comments = node['node']['edge_media_to_comment']['count'],
+			  commentsDisabled = node['node']['comments_disabled'],
+			  isVideo = node['node']['is_video'],
 			 )

 	def _check_initial_page_callback(self, r):
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -3,8 +3,10 @@ import datetime
 import json
 import random
 import logging
+import re
 import snscrape.base
 import typing
+import urllib.parse


 logger = logging.getLogger(__name__)
@@ -86,82 +88,122 @@ class TwitterCommonScraper(snscrape.base.Scraper):
 class TwitterSearchScraper(TwitterCommonScraper):
 	name = 'twitter-search'

-	def __init__(self, query, maxPosition = None, **kwargs):
+	def __init__(self, query, cursor = None, **kwargs):
 		super().__init__(**kwargs)
 		self._query = query
-		self._maxPosition = maxPosition
+		self._cursor = cursor
+		self._userAgent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.{random.randint(0, 9999)} Safari/537.{random.randint(0, 99)}'
+		self._baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': self._query, 'src': 'spelling_expansion_revert_click'})

-	def _get_feed_from_html(self, html, withMinPosition):
-		soup = bs4.BeautifulSoup(html, 'lxml')
-		feed = soup.find_all('li', 'js-stream-item')
-		if withMinPosition:
-			streamContainer = soup.find('div', 'stream-container')
-			if not streamContainer or not streamContainer.has_attr('data-min-position'):
-				if soup.find('div', 'SearchEmptyTimeline'):
-					# No results found
-					minPosition = None
-				else:
-					# Unknown error condition
-					raise RuntimeError('Unable to find min-position')
-			else:
-				minPosition = streamContainer['data-min-position']
-		else:
-			minPosition = None
-		return feed, minPosition
+	def _get_guest_token(self):
+		logger.info(f'Retrieving guest token from search page')
+		r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent})
+		match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+);', r.text)
+		if not match:
+			raise RuntimeError('Unable to find guest token')
+		return match.group(1)
+
+	def _check_scroll_response(self, r):
+		if r.status_code == 429:
+			# Accept a 429 response as "valid" to prevent retries; handled explicitly in get_items
+			return True, None
+		if r.headers.get('content-type') != 'application/json;charset=utf-8':
+			return False, f'content type is not JSON'
+		if r.status_code != 200:
+			return False, f'non-200 status code'
+		return True, None

 	def get_items(self):
-		headers = {'User-Agent': f'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.{random.randint(1, 3500)}.{random.randint(1, 160)} Safari/537.36'}
-
-		# First page
-		if self._maxPosition is None:
-			logger.info(f'Retrieving search page for {self._query}')
-			r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'spxr', 'qf': 'off'}, headers = headers)
-
-			feed, maxPosition = self._get_feed_from_html(r.text, True)
-			if not feed:
-				logger.warning(f'No results for {self._query}')
-				return
-			yield from self._feed_to_items(feed)
-		else:
-			maxPosition = self._maxPosition
-
-		if not maxPosition:
-			return
-
+		headers = {
+			'User-Agent': self._userAgent,
+			'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
+			'Referer': self._baseUrl,
+		}
+		guestToken = None
+		cursor = self._cursor
 		while True:
-			logger.info(f'Retrieving scroll page {maxPosition}')
-			r = self._get('https://twitter.com/i/search/timeline',
-				params = {
-					'f': 'tweets',
-					'vertical': 'default',
-					'lang': 'en',
-					'q': self._query,
-					'include_available_features': '1',
-					'include_entities': '1',
-					'reset_error_state': 'false',
-					'src': 'spxr',
-					'qf': 'off',
-					'max_position': maxPosition,
-				},
-				headers = headers,
-				responseOkCallback = self._check_json_callback)
+			if not guestToken:
+				guestToken = self._get_guest_token()
+				headers['x-guest-token'] = guestToken

-			obj = json.loads(r.text)
-			feed, _ = self._get_feed_from_html(obj['items_html'], False)
-			if feed:
-				yield from self._feed_to_items(feed)
-			if obj['min_position'] == maxPosition:
-				return
-			maxPosition = obj['min_position']
+			logger.info(f'Retrieving scroll page {cursor}')
+			params = {
+				'include_profile_interstitial_type': '1',
+				'include_blocking': '1',
+				'include_blocked_by': '1',
+				'include_followed_by': '1',
+				'include_want_retweets': '1',
+				'include_mute_edge': '1',
+				'include_can_dm': '1',
+				'include_can_media_tag': '1',
+				'skip_status': '1',
+				'cards_platform': 'Web-12',
+				'include_cards': '1',
+				'include_composer_source': 'true',
+				'include_ext_alt_text': 'true',
+				'include_reply_count': '1',
+				'tweet_mode': 'extended',
+				'include_entities': 'true',
+				'include_user_entities': 'true',
+				'include_ext_media_color': 'true',
+				'include_ext_media_availability': 'true',
+				'send_error_codes': 'true',
+				'simple_quoted_tweets': 'true',
+				'q': self._query,
+				'tweet_search_mode': 'live',
+				'count': '100',
+				'query_source': 'spelling_expansion_revert_click',
+			}
+			if cursor:
+				params['cursor'] = cursor
+			params['pc'] = '1'
+			params['spelling_corrections'] = '1'
+			params['ext'] = 'mediaStats%2CcameraMoment'
+			r = self._get('https://api.twitter.com/2/search/adaptive.json', params = params, headers = headers, responseOkCallback = self._check_scroll_response)
+			if r.status_code == 429:
+				guestToken = None
+				continue
+			try:
+				obj = r.json()
+			except json.JSONDecodeError as e:
+				logger.error(f'Received invalid JSON from Twitter: {e!s}')
+				raise RuntimeError('Received invalid JSON from Twitter') from e
+
+			# No data format test, just a hard and loud crash if anything's wrong :-)
+			newCursor = None
+			for instruction in obj['timeline']['instructions']:
+				if 'addEntries' in instruction:
+					entries = instruction['addEntries']['entries']
+				elif 'replaceEntry' in instruction:
+					entries = [instruction['replaceEntry']['entry']]
+				else:
+					continue
+				for entry in entries:
+					if entry['entryId'].startswith('sq-I-t-'):
+						tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tweet']['id']]
+						tweetID = tweet['id']
+						content = tweet['full_text']
+						username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
+						date = datetime.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo = datetime.timezone.utc)
+						outlinks = [u['expanded_url'] for u in tweet['entities']['urls']]
+						tcooutlinks = [u['url'] for u in tweet['entities']['urls']]
+						url = f'https://twitter.com/{username}/status/{tweetID}'
+						yield Tweet(url, date, content, tweetID, username, outlinks, ' '.join(outlinks), tcooutlinks, ' '.join(tcooutlinks))
+					elif entry['entryId'] == 'sq-cursor-bottom':
+						newCursor = entry['content']['operation']['cursor']['value']
+			if not newCursor or newCursor == cursor:
+				# End of pagination
+				break
+			cursor = newCursor

 	@classmethod
 	def setup_parser(cls, subparser):
-		subparser.add_argument('--max-position', metavar = 'POSITION', dest = 'maxPosition')
+		subparser.add_argument('--cursor', metavar = 'CURSOR')
 		subparser.add_argument('query', help = 'A Twitter search string')

 	@classmethod
 	def from_args(cls, args):
-		return cls(args.query, maxPosition = args.maxPosition, retries = args.retries)
+		return cls(args.query, cursor = args.cursor, retries = args.retries)


 class TwitterUserScraper(TwitterSearchScraper):
@@ -205,7 +247,7 @@ class TwitterThreadScraper(TwitterCommonScraper):
 		self._tweetID = tweetID

 	def get_items(self):
-		headers = {'User-Agent': f'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.{random.randint(1, 3500)}.{random.randint(1, 160)} Safari/537.36'}
+		headers = {'User-Agent': f'Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18'}

 		# Fetch the page of the last tweet in the thread
 		r = self._get(f'https://twitter.com/user/status/{self._tweetID}', headers = headers)
@@ -283,7 +325,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
 		self._user, self._list = listName.split('/')

 	def get_items(self):
-		headers = {'User-Agent': f'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.{random.randint(1, 3500)}.{random.randint(1, 160)} Safari/537.36'}
+		headers = {'User-Agent': f'Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18'}

 		baseUrl = f'https://twitter.com/{self._user}/lists/{self._list}/members'
 		r = self._get(baseUrl, headers = headers)
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -56,6 +56,12 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 			logger.error('Private profile')
 			return

+		profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
+		if profileDeleted:
+			# Unclear what this state represents, so just log website text.
+			logger.error(profileDeleted.text)
+			return
+
 		newestPost = soup.find('div', class_ = 'post')
 		if not newestPost:
 			logger.info('Wall has no posts')
@@ -80,14 +86,15 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 			if r.status_code != 200:
 				logger.error(f'Got status code {r.status_code}')
 				return
-			fields = r.content.split(b'<!>')
-			if fields[5].startswith(b'<div class="page_block no_posts">'):
+			# Convert to JSON and read the HTML payload.  Note that this implicitly converts the data to a Python string (i.e., Unicode) instead of windows-1251-encoded bytes.
+			posts = r.json()['payload'][1][0]
+			if posts.startswith('<div class="page_block no_posts">'):
 				# Reached the end
 				break
-			if not fields[5].startswith(b'<div id="post'):
-				logger.error(f'Got an unknown response: {fields[5][:200]!r}...')
+			if not posts.startswith('<div id="post'):
+				logger.error(f'Got an unknown response: {posts[:200]!r}...')
 				break
-			soup = bs4.BeautifulSoup(fields[5], 'lxml', from_encoding = r.encoding)
+			soup = bs4.BeautifulSoup(posts, 'lxml')
 			yield from self._soup_to_items(soup, baseUrl)

 	@classmethod
Author	SHA1	Message	Date
JustAnotherArchivist	b6cc3180d9	Force TwitterThreadScraper and TwitterListMembersScraper to fetch the old design	2020-03-04 00:40:49 +00:00
JustAnotherArchivist	613395d1c2	Port TwitterSearchScraper to redesign (fixes #57)	2020-03-04 00:40:49 +00:00
JustAnotherArchivist	82a87b7b5a	Merge pull request #53 from JackDallas/add-more-insta-fields — Add more fields to the Instagram scraper	2020-02-09 23:48:59 +00:00
Jack Dallas	9568028bf9	Update changed fields	2020-02-07 11:30:16 +00:00
JustAnotherArchivist	6df351772e	Fix crash in Facebook scraper on link-less entries	2020-02-05 16:15:10 +00:00
JustAnotherArchivist	541173b0c8	Merge pull request #54 from jodizzle/fix/vkontakte-user — Fix vkontakte-user: pagination returns JSON now, and handle some unscrapable profiles.	2020-02-05 14:56:12 +00:00
Jody Leonard	b6772d3778	vkontakte-user: Handle additional un-scrapeable profile case	2019-10-31 16:01:29 -04:00
Jody Leonard	20ea117a2c	Fix vkontakte-user pagination	2019-10-30 22:29:49 -04:00
JackDallas	ff54c350bc	Add more fields to the instagram scraper	2019-08-30 12:43:02 +01:00