mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 04:18:29 +03:00
@@ -773,7 +773,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
else:
|
else:
|
||||||
# TweetDetail
|
# TweetDetail
|
||||||
instructions = obj['data'].get('threaded_conversation_with_injections_v2', {}).get('instructions', [])
|
instructions = obj['data'].get('threaded_conversation_with_injections_v2', {}).get('instructions', [])
|
||||||
tweetCount = 0
|
entryCount = 0
|
||||||
for instruction in instructions:
|
for instruction in instructions:
|
||||||
if 'addEntries' in instruction:
|
if 'addEntries' in instruction:
|
||||||
entries = instruction['addEntries']['entries']
|
entries = instruction['addEntries']['entries']
|
||||||
@@ -783,7 +783,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
entries = instruction['entries']
|
entries = instruction['entries']
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
tweetCount += self._count_tweets(entries)
|
entryCount += self._count_tweets_and_users(entries)
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
if not (entry['entryId'].startswith('sq-cursor-') or entry['entryId'].startswith('cursor-')):
|
if not (entry['entryId'].startswith('sq-cursor-') or entry['entryId'].startswith('cursor-')):
|
||||||
continue
|
continue
|
||||||
@@ -806,20 +806,20 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
newBottomCursorAndStop = (entryCursor, entryCursorStop or False)
|
newBottomCursorAndStop = (entryCursor, entryCursorStop or False)
|
||||||
if bottomCursorAndStop is None and newBottomCursorAndStop is not None:
|
if bottomCursorAndStop is None and newBottomCursorAndStop is not None:
|
||||||
bottomCursorAndStop = newBottomCursorAndStop
|
bottomCursorAndStop = newBottomCursorAndStop
|
||||||
if newCursor == cursor and tweetCount == 0:
|
if newCursor == cursor and entryCount == 0:
|
||||||
# Twitter sometimes returns the same cursor as requested and no results even though there are more results.
|
# Twitter sometimes returns the same cursor as requested and no results even though there are more results.
|
||||||
# When this happens, retry the same cursor up to the retries setting.
|
# When this happens, retry the same cursor up to the retries setting.
|
||||||
emptyResponsesOnCursor += 1
|
emptyResponsesOnCursor += 1
|
||||||
if emptyResponsesOnCursor > self._retries:
|
if emptyResponsesOnCursor > self._retries:
|
||||||
break
|
break
|
||||||
if tweetCount == 0:
|
if entryCount == 0:
|
||||||
emptyPages += 1
|
emptyPages += 1
|
||||||
if self._maxEmptyPages and emptyPages >= self._maxEmptyPages:
|
if self._maxEmptyPages and emptyPages >= self._maxEmptyPages:
|
||||||
_logger.warning(f'Stopping after {emptyPages} empty pages')
|
_logger.warning(f'Stopping after {emptyPages} empty pages')
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
emptyPages = 0
|
emptyPages = 0
|
||||||
if not newCursor or (stopOnEmptyResponse and tweetCount == 0):
|
if not newCursor or (stopOnEmptyResponse and entryCount == 0):
|
||||||
# End of pagination
|
# End of pagination
|
||||||
if promptCursor is not None:
|
if promptCursor is not None:
|
||||||
newCursor = promptCursor
|
newCursor = promptCursor
|
||||||
@@ -838,10 +838,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
else:
|
else:
|
||||||
reqParams['variables']['cursor'] = cursor
|
reqParams['variables']['cursor'] = cursor
|
||||||
|
|
||||||
def _count_tweets(self, entries):
|
def _count_tweets_and_users(self, entries):
|
||||||
return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') for entry in entries)
|
return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') or entry['entryId'].startswith('user-') for entry in entries)
|
||||||
|
|
||||||
def _v2_timeline_instructions_to_tweets(self, obj):
|
def _v2_timeline_instructions_to_tweets_or_users(self, obj):
|
||||||
# No data format test, just a hard and loud crash if anything's wrong :-)
|
# No data format test, just a hard and loud crash if anything's wrong :-)
|
||||||
for instruction in obj['timeline']['instructions']:
|
for instruction in obj['timeline']['instructions']:
|
||||||
if 'addEntries' in instruction:
|
if 'addEntries' in instruction:
|
||||||
@@ -853,6 +853,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
for entry in entries:
|
for entry in entries:
|
||||||
if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'):
|
if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'):
|
||||||
yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj)
|
yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj)
|
||||||
|
elif entry['entryId'].startswith('user-'):
|
||||||
|
yield self._user_to_user(obj['globalObjects']['users'][entry['content']['item']['content']['user']['id']])
|
||||||
|
|
||||||
def _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj):
|
def _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj):
|
||||||
if 'tweet' in entry['item']['content']:
|
if 'tweet' in entry['item']['content']:
|
||||||
@@ -1550,17 +1552,35 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
|||||||
return super()._cli_construct(argparseArgs, *args, **kwargs)
|
return super()._cli_construct(argparseArgs, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class TwitterSearchScraperMode(enum.Enum):
    """Kind of search a Twitter search scraper should run.

    Each member's value is its lowercase name.
    """

    LIVE = 'live'  # selected when neither CLI flag is set
    TOP = 'top'
    USER = 'user'

    @classmethod
    def _cli_from_args(cls, args):
        """Translate parsed command-line arguments into a search mode.

        Args:
            args: Parsed argument namespace; its ``top`` and ``user``
                attributes are read and interpreted as booleans.

        Returns:
            ``TOP`` when ``args.top`` is truthy, otherwise ``USER`` when
            ``args.user`` is truthy, otherwise ``LIVE``.
        """
        # `top` is tested first, so it wins if both flags are somehow set.
        if args.top:
            return cls.TOP
        if args.user:
            return cls.USER
        return cls.LIVE
|
||||||
|
|
||||||
|
|
||||||
class TwitterSearchScraper(_TwitterAPIScraper):
|
class TwitterSearchScraper(_TwitterAPIScraper):
|
||||||
name = 'twitter-search'
|
name = 'twitter-search'
|
||||||
|
|
||||||
def __init__(self, query, *, cursor = None, top = False, maxEmptyPages = 20, **kwargs):
|
def __init__(self, query, *, cursor = None, mode = TwitterSearchScraperMode.LIVE, top = None, maxEmptyPages = 20, **kwargs):
|
||||||
if not query.strip():
|
if not query.strip():
|
||||||
raise ValueError('empty query')
|
raise ValueError('empty query')
|
||||||
kwargs['maxEmptyPages'] = maxEmptyPages
|
kwargs['maxEmptyPages'] = maxEmptyPages
|
||||||
super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs)
|
super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs)
|
||||||
self._query = query # Note: may get replaced by subclasses when using user ID resolution
|
self._query = query # Note: may get replaced by subclasses when using user ID resolution
|
||||||
self._cursor = cursor
|
self._cursor = cursor
|
||||||
self._top = top
|
if top is not None:
|
||||||
|
replacement = f'{__name__}.TwitterSearchScraperMode.' + ('TOP' if top else 'LIVE')
|
||||||
|
warnings.warn(f'`top` argument is deprecated, use `mode = {replacement}` instead of `top = {bool(top)}`', snscrape.base.DeprecatedFeatureWarning, stacklevel = 2)
|
||||||
|
mode = TwitterSearchScraperMode.TOP if top else TwitterSearchScraperMode.LIVE
|
||||||
|
self._mode = mode
|
||||||
|
|
||||||
def get_items(self):
|
def get_items(self):
|
||||||
if not self._query.strip():
|
if not self._query.strip():
|
||||||
@@ -1596,7 +1616,22 @@ class TwitterSearchScraper(_TwitterAPIScraper):
|
|||||||
'send_error_codes': 'true',
|
'send_error_codes': 'true',
|
||||||
'simple_quoted_tweet': 'true',
|
'simple_quoted_tweet': 'true',
|
||||||
'q': self._query,
|
'q': self._query,
|
||||||
'tweet_search_mode': 'live',
|
}
|
||||||
|
if self._mode is TwitterSearchScraperMode.LIVE:
|
||||||
|
paginationParams = {
|
||||||
|
**paginationParams,
|
||||||
|
'tweet_search_mode': 'live',
|
||||||
|
}
|
||||||
|
elif self._mode is TwitterSearchScraperMode.TOP:
|
||||||
|
pass
|
||||||
|
elif self._mode is TwitterSearchScraperMode.USER:
|
||||||
|
paginationParams = {
|
||||||
|
**paginationParams,
|
||||||
|
'result_filter': 'user',
|
||||||
|
'query_source': '',
|
||||||
|
}
|
||||||
|
paginationParams = {
|
||||||
|
**paginationParams,
|
||||||
'count': '20',
|
'count': '20',
|
||||||
'query_source': 'spelling_expansion_revert_click',
|
'query_source': 'spelling_expansion_revert_click',
|
||||||
'cursor': None,
|
'cursor': None,
|
||||||
@@ -1608,23 +1643,21 @@ class TwitterSearchScraper(_TwitterAPIScraper):
|
|||||||
params = paginationParams.copy()
|
params = paginationParams.copy()
|
||||||
del params['cursor']
|
del params['cursor']
|
||||||
|
|
||||||
if self._top:
|
|
||||||
del params['tweet_search_mode']
|
|
||||||
del paginationParams['tweet_search_mode']
|
|
||||||
|
|
||||||
for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', _TwitterAPIType.V2, params, paginationParams, cursor = self._cursor):
|
for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', _TwitterAPIType.V2, params, paginationParams, cursor = self._cursor):
|
||||||
yield from self._v2_timeline_instructions_to_tweets(obj)
|
yield from self._v2_timeline_instructions_to_tweets_or_users(obj)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _cli_setup_parser(cls, subparser):
|
def _cli_setup_parser(cls, subparser):
|
||||||
subparser.add_argument('--cursor', metavar = 'CURSOR')
|
subparser.add_argument('--cursor', metavar = 'CURSOR')
|
||||||
subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological')
|
group = subparser.add_mutually_exclusive_group(required = False)
|
||||||
|
group.add_argument('--top', action = 'store_true', default = False, help = 'Search top tweets instead of live/chronological')
|
||||||
|
group.add_argument('--user', action = 'store_true', default = False, help = 'Search users instead of tweets')
|
||||||
subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable')
|
subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable')
|
||||||
subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string')
|
subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _cli_from_args(cls, args):
|
def _cli_from_args(cls, args):
|
||||||
return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top, maxEmptyPages = args.maxEmptyPages)
|
return cls._cli_construct(args, args.query, cursor = args.cursor, mode = TwitterSearchScraperMode._cli_from_args(args), maxEmptyPages = args.maxEmptyPages)
|
||||||
|
|
||||||
|
|
||||||
class TwitterUserScraper(TwitterSearchScraper):
|
class TwitterUserScraper(TwitterSearchScraper):
|
||||||
|
|||||||
Reference in New Issue
Block a user