mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-10 19:38:29 +03:00
Add --max-empty-pages option to stop long (potentially infinite) empty pagination
Fixes #636
This commit is contained in:
@@ -628,7 +628,7 @@ class _TwitterAPIType(enum.Enum):
|
||||
|
||||
|
||||
class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
def __init__(self, baseUrl, *, guestTokenManager = None, **kwargs):
|
||||
def __init__(self, baseUrl, *, guestTokenManager = None, maxEmptyPages = 0, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._baseUrl = baseUrl
|
||||
if guestTokenManager is None:
|
||||
@@ -637,6 +637,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
_globalGuestTokenManager = GuestTokenManager()
|
||||
guestTokenManager = _globalGuestTokenManager
|
||||
self._guestTokenManager = guestTokenManager
|
||||
self._maxEmptyPages = maxEmptyPages
|
||||
self._apiHeaders = {
|
||||
'User-Agent': None,
|
||||
'Authorization': _API_AUTHORIZATION_HEADER,
|
||||
@@ -729,6 +730,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
dir = 'bottom'
|
||||
stopOnEmptyResponse = False
|
||||
emptyResponsesOnCursor = 0
|
||||
emptyPages = 0
|
||||
while True:
|
||||
_logger.info(f'Retrieving scroll page {cursor}')
|
||||
obj = self._get_api_data(endpoint, apiType, reqParams)
|
||||
@@ -785,6 +787,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
emptyResponsesOnCursor += 1
|
||||
if emptyResponsesOnCursor > self._retries:
|
||||
break
|
||||
if tweetCount == 0:
|
||||
emptyPages += 1
|
||||
if self._maxEmptyPages and emptyPages >= self._maxEmptyPages:
|
||||
_logger.warning(f'Stopping after {emptyPages} empty pages')
|
||||
break
|
||||
if not newCursor or (stopOnEmptyResponse and tweetCount == 0):
|
||||
# End of pagination
|
||||
if promptCursor is not None:
|
||||
@@ -1403,9 +1410,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
class TwitterSearchScraper(_TwitterAPIScraper):
|
||||
name = 'twitter-search'
|
||||
|
||||
def __init__(self, query, *, cursor = None, top = False, **kwargs):
|
||||
def __init__(self, query, *, cursor = None, top = False, maxEmptyPages = 20, **kwargs):
|
||||
if not query.strip():
|
||||
raise ValueError('empty query')
|
||||
kwargs['maxEmptyPages'] = maxEmptyPages
|
||||
super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs)
|
||||
self._query = query # Note: may get replaced by subclasses when using user ID resolution
|
||||
self._cursor = cursor
|
||||
@@ -1472,11 +1480,12 @@ class TwitterSearchScraper(_TwitterAPIScraper):
|
||||
def _cli_setup_parser(cls, subparser):
|
||||
subparser.add_argument('--cursor', metavar = 'CURSOR')
|
||||
subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological')
|
||||
subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable')
|
||||
subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string')
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top)
|
||||
return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top, maxEmptyPages = args.maxEmptyPages)
|
||||
|
||||
|
||||
class TwitterUserScraper(TwitterSearchScraper):
|
||||
|
||||
Reference in New Issue
Block a user