Add --max-empty-pages option to stop long (potentially infinite) empty pagination

Fixes #636
2026-06-10 19:38:29 +03:00 · 2023-01-13 02:35:48 +00:00
parent 7de8d734e9
commit 129ad3fc34
1 changed file with 12 additions and 3 deletions
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
@@ -628,7 +628,7 @@ class _TwitterAPIType(enum.Enum):


 class _TwitterAPIScraper(snscrape.base.Scraper):
-	def __init__(self, baseUrl, *, guestTokenManager = None, **kwargs):
+	def __init__(self, baseUrl, *, guestTokenManager = None, maxEmptyPages = 0, **kwargs):
 		super().__init__(**kwargs)
 		self._baseUrl = baseUrl
 		if guestTokenManager is None:
@@ -637,6 +637,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
 				_globalGuestTokenManager = GuestTokenManager()
 			guestTokenManager = _globalGuestTokenManager
 		self._guestTokenManager = guestTokenManager
+		self._maxEmptyPages = maxEmptyPages
 		self._apiHeaders = {
 			'User-Agent': None,
 			'Authorization': _API_AUTHORIZATION_HEADER,
@@ -729,6 +730,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
 			dir = 'bottom'
 		stopOnEmptyResponse = False
 		emptyResponsesOnCursor = 0
+		emptyPages = 0
 		while True:
 			_logger.info(f'Retrieving scroll page {cursor}')
 			obj = self._get_api_data(endpoint, apiType, reqParams)
@@ -785,6 +787,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
 				emptyResponsesOnCursor += 1
 				if emptyResponsesOnCursor > self._retries:
 					break
+			if tweetCount == 0:
+				emptyPages += 1
+				if self._maxEmptyPages and emptyPages >= self._maxEmptyPages:
+					_logger.warning(f'Stopping after {emptyPages} empty pages')
+					break
 			if not newCursor or (stopOnEmptyResponse and tweetCount == 0):
 				# End of pagination
 				if promptCursor is not None:
@@ -1403,9 +1410,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
 class TwitterSearchScraper(_TwitterAPIScraper):
 	name = 'twitter-search'

-	def __init__(self, query, *, cursor = None, top = False, **kwargs):
+	def __init__(self, query, *, cursor = None, top = False, maxEmptyPages = 20, **kwargs):
 		if not query.strip():
 			raise ValueError('empty query')
+		kwargs['maxEmptyPages'] = maxEmptyPages
 		super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs)
 		self._query = query  # Note: may get replaced by subclasses when using user ID resolution
 		self._cursor = cursor
@@ -1472,11 +1480,12 @@ class TwitterSearchScraper(_TwitterAPIScraper):
 	def _cli_setup_parser(cls, subparser):
 		subparser.add_argument('--cursor', metavar = 'CURSOR')
 		subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological')
+		subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable')
 		subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string')

 	@classmethod
 	def _cli_from_args(cls, args):
-		return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top)
+		return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top, maxEmptyPages = args.maxEmptyPages)


 class TwitterUserScraper(TwitterSearchScraper):