From ef7c4fad3e480d945c4446e7bd8519fde82f3f79 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 05:06:34 +0000 Subject: [PATCH 1/9] Fix AttributeError for DescriptionURL on from-import --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 048df34..9cd5a89 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,5 +1,5 @@ __all__ = [ - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionUrl', 'Coordinates', 'Place', + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionURL', 'Coordinates', 'Place', 'User', 'UserLabel', 'Trend', 'GuestTokenManager', From 01cf6a09b3d244d6e03be1f706fec06b2d1e9d4d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 05:06:51 +0000 Subject: [PATCH 2/9] Fix type of description URL objects --- snscrape/modules/twitter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 9cd5a89..74aa024 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1303,7 +1303,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] if user['entities']['description'].get('urls'): - kwargs['descriptionUrls'] = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['entities']['description']['urls']] + kwargs['descriptionUrls'] = [DescriptionURL( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['entities']['description']['urls']] kwargs['verified'] = user.get('verified') kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at']) kwargs['followersCount'] = user['followers_count'] @@ -1451,7 +1456,12 @@ class TwitterUserScraper(TwitterSearchScraper): displayname = user['legacy']['name'], description = description, rawDescription = rawDescription, - descriptionUrls = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['legacy']['entities']['description']['urls']], + descriptionUrls = [DescriptionURL( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['legacy']['entities']['description']['urls']], verified = user['legacy']['verified'], created = email.utils.parsedate_to_datetime(user['legacy']['created_at']), followersCount = user['legacy']['followers_count'], From dc6bc9bf9d85d33302589530577136cfda4f48ee Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 07:16:04 +0000 Subject: [PATCH 3/9] Refactor how links on Twitter are handled All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears. Closes #478 --- snscrape/modules/twitter.py | 114 ++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 39 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 74aa024..841b2c5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,5 +1,5 @@ __all__ = [ - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionURL', 'Coordinates', 'Place', + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'TextLink', 'Coordinates', 'Place', 'User', 'UserLabel', 'Trend', 'GuestTokenManager', @@ -31,6 +31,18 @@ import string import time import typing import urllib.parse +import warnings + + +# DescriptionURL deprecation +_DEPRECATED_NAMES = {'DescriptionURL': 'TextLink'} +def __getattr__(name): + if name in _DEPRECATED_NAMES: + warnings.warn(f'{name} is deprecated, use {_DEPRECATED_NAMES[name]} instead', FutureWarning, stacklevel = 2) + return globals()[_DEPRECATED_NAMES[name]] + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') +def __dir__(): + return sorted(__all__ + list(_DEPRECATED_NAMES.keys())) _logger = logging.getLogger(__name__) @@ -56,8 +68,7 @@ class Tweet(snscrape.base.Item): source: str sourceUrl: typing.Optional[str] = None sourceLabel: typing.Optional[str] = None - outlinks: typing.Optional[typing.List[str]] = None - tcooutlinks: typing.Optional[typing.List[str]] = None + links: typing.Optional[typing.List['TextLink']] = None media: typing.Optional[typing.List['Medium']] = None retweetedTweet: typing.Optional['Tweet'] = None quotedTweet: typing.Optional['Tweet'] = None @@ -71,13 +82,23 @@ class Tweet(snscrape.base.Item): card: typing.Optional['Card'] = None username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username') - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks') - tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(self.tcooutlinks) if self.tcooutlinks else '', 'tcooutlinks') + outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)') + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)') + tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)') + tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)') def __str__(self): return self.url +@dataclasses.dataclass +class TextLink: + text: typing.Optional[str] + url: str + tcourl: str + indices: typing.Tuple[int, int] + + class Medium: pass @@ -109,14 +130,6 @@ class Gif(Medium): variants: typing.List[VideoVariant] -@dataclasses.dataclass -class DescriptionURL: - text: typing.Optional[str] - url: str - tcourl: str - indices: typing.Tuple[int, int] - - @dataclasses.dataclass class Coordinates: longitude: float @@ -447,7 +460,7 @@ class User(snscrape.base.Entity): displayname: typing.Optional[str] = None description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact - descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None + descriptionLinks: typing.Optional[typing.List[TextLink]] = None verified: typing.Optional[bool] = None created: typing.Optional[datetime.datetime] = None followersCount: typing.Optional[int] = None @@ -458,12 +471,15 @@ class User(snscrape.base.Entity): mediaCount: typing.Optional[int] = None location: typing.Optional[str] = None protected: typing.Optional[bool] = None - linkUrl: typing.Optional[str] = None - linkTcourl: typing.Optional[str] = None + link: typing.Optional[TextLink] = None profileImageUrl: typing.Optional[str] = None profileBannerUrl: typing.Optional[str] = None label: typing.Optional['UserLabel'] = None + descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks') + linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url') + linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl') + @property def url(self): return f'https://twitter.com/{self.username}' @@ -815,8 +831,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['user'] = user kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at']) if tweet['entities'].get('urls'): - kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']] - kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']] + kwargs['links'] = [TextLink( + text = u.get('display_url'), + url = u['expanded_url'], + tcourl = u['url'], + indices = tuple(u['indices']), + ) for u in tweet['entities']['urls']] kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}' kwargs['replyCount'] = tweet['reply_count'] kwargs['retweetCount'] = tweet['retweet_count'] @@ -877,10 +897,15 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if hasattr(card, 'url') and '//t.co/' in card.url: # Try to convert the URL to the non-shortened/t.co one # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case. - if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']: - card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)] - elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks: - card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)] + candidates = [] + if 'links' in kwargs: + candidates.extend(kwargs['links']) + if retweetedTweet: + candidates.extend(retweetedTweet.links) + for u in candidates: + if u.tcourl == card.url: + card.url = u.url + break else: _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}') return Tweet(**kwargs) @@ -1303,12 +1328,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] if user['entities']['description'].get('urls'): - kwargs['descriptionUrls'] = [DescriptionURL( - text = x.get('display_url'), - url = x['expanded_url'], - tcourl = x['url'], - indices = tuple(x['indices']), - ) for x in user['entities']['description']['urls']] + kwargs['descriptionLinks'] = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['entities']['description']['urls']] kwargs['verified'] = user.get('verified') kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at']) kwargs['followersCount'] = user['followers_count'] @@ -1319,9 +1344,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['mediaCount'] = user['media_count'] kwargs['location'] = user['location'] kwargs['protected'] = user.get('protected') - if 'url' in user['entities']: - kwargs['linkUrl'] = (user['entities']['url']['urls'][0].get('expanded_url') or user.get('url')) - kwargs['linkTcourl'] = user.get('url') + if user.get('url'): + entity = user['entities'].get('url', {}).get('urls', [None])[0] + if not entity or entity['url'] != user['url']: + self.logger.warning(f'Link inconsistency on user {kwargs["id"]}') + if not entity: + entity = {'display_url': None, 'expanded_url': user['url'], 'indices': (0, len(user['url']))} + kwargs['link'] = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): @@ -1447,6 +1476,14 @@ class TwitterUserScraper(TwitterSearchScraper): user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + link = None + if user['legacy'].get('url'): + entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] + if not entity or entity['url'] != user['legacy']['url']: + self.logger.warning(f'Link inconsistency on user') + if not entity: + entity = {'display_url': None, 'expanded_url': user['legacy']['url'], 'indices': (0, len(user['legacy']['url']))} + link = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) label = None if (labelO := user['affiliates_highlighted_label'].get('label')): label = self._user_label_to_user_label(labelO) @@ -1456,12 +1493,12 @@ class TwitterUserScraper(TwitterSearchScraper): displayname = user['legacy']['name'], description = description, rawDescription = rawDescription, - descriptionUrls = [DescriptionURL( - text = x.get('display_url'), - url = x['expanded_url'], - tcourl = x['url'], - indices = tuple(x['indices']), - ) for x in user['legacy']['entities']['description']['urls']], + descriptionLinks = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['legacy']['entities']['description']['urls']], verified = user['legacy']['verified'], created = email.utils.parsedate_to_datetime(user['legacy']['created_at']), followersCount = user['legacy']['followers_count'], @@ -1472,8 +1509,7 @@ class TwitterUserScraper(TwitterSearchScraper): mediaCount = user['legacy']['media_count'], location = user['legacy']['location'], protected = user['legacy']['protected'], - linkUrl = user['legacy']['entities']['url']['urls'][0]['expanded_url'] if 'url' in user['legacy']['entities'] else None, - linkTcourl = user['legacy'].get('url'), + link = link, profileImageUrl = user['legacy']['profile_image_url_https'], profileBannerUrl = user['legacy'].get('profile_banner_url'), label = label, From 530f4fa12277b2437d09d175e13a5146d5f0ac8e Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 17:23:38 +0000 Subject: [PATCH 4/9] Fix KeyErrors on display_url and expanded_url for certain users with broken profile links Fixes #480 --- snscrape/modules/twitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 841b2c5..e4a2318 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1349,8 +1349,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if not entity or entity['url'] != user['url']: self.logger.warning(f'Link inconsistency on user {kwargs["id"]}') if not entity: - entity = {'display_url': None, 'expanded_url': user['url'], 'indices': (0, len(user['url']))} - kwargs['link'] = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['url'], indices = tuple(entity['indices'])) + entity = {'indices': (0, len(user['url']))} + kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): @@ -1482,8 +1482,8 @@ class TwitterUserScraper(TwitterSearchScraper): if not entity or entity['url'] != user['legacy']['url']: self.logger.warning(f'Link inconsistency on user') if not entity: - entity = {'display_url': None, 'expanded_url': user['legacy']['url'], 'indices': (0, len(user['legacy']['url']))} - link = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) + entity = {'indices': (0, len(user['legacy']['url']))} + link = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['legacy']['url']), tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) label = None if (labelO := user['affiliates_highlighted_label'].get('label')): label = self._user_label_to_user_label(labelO) From 0d361685ff77425aaf7cc6c6580237d914041a55 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 1 Jun 2022 17:35:38 +0000 Subject: [PATCH 5/9] Fix AttributeError crash on scrapers using the default CLI constructor Introduced by 267b7d0e Fixes #483 --- snscrape/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/base.py b/snscrape/base.py index 71ab649..0e1ba1b 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -229,7 +229,7 @@ class Scraper: @classmethod def _cli_from_args(cls, args): - return cls._construct(args) + return cls._cli_construct(args) @classmethod def _cli_construct(cls, argparseArgs, *args, **kwargs): From bcad6923c243421e7ffcb5a3f7d7721033e500c8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 14 Jun 2022 00:35:02 +0000 Subject: [PATCH 6/9] Rename Tweet.content to rawContent and User.description to renderedDescription for consistency Closes #479 --- snscrape/modules/twitter.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index e4a2318..abf3f2b 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -55,7 +55,7 @@ _GUEST_TOKEN_VALIDITY = 10800 class Tweet(snscrape.base.Item): url: str date: datetime.datetime - content: str + rawContent: str renderedContent: str id: int user: 'User' @@ -86,6 +86,7 @@ class Tweet(snscrape.base.Item): outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)') tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)') tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)') + content = snscrape.base._DeprecatedProperty('content', lambda self: self.rawContent, 'rawContent') def __str__(self): return self.url @@ -458,8 +459,8 @@ class User(snscrape.base.Entity): username: str id: int displayname: typing.Optional[str] = None - description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact + renderedDescription: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced descriptionLinks: typing.Optional[typing.List[TextLink]] = None verified: typing.Optional[bool] = None created: typing.Optional[datetime.datetime] = None @@ -479,6 +480,7 @@ class User(snscrape.base.Entity): descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks') linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url') linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl') + description = snscrape.base._DeprecatedProperty('description', lambda self: self.renderedDescription, 'renderedDescription') @property def url(self): @@ -826,7 +828,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): tweetId = self._get_tweet_id(tweet) kwargs = {} kwargs['id'] = tweetId - kwargs['content'] = tweet['full_text'] + kwargs['rawContent'] = tweet['full_text'] kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls')) kwargs['user'] = user kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at']) @@ -1325,8 +1327,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['username'] = user['screen_name'] kwargs['id'] = id_ if id_ else user['id'] if 'id' in user else int(user['id_str']) kwargs['displayname'] = user['name'] - kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] + kwargs['renderedDescription'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) if user['entities']['description'].get('urls'): kwargs['descriptionLinks'] = [TextLink( text = x.get('display_url'), @@ -1475,7 +1477,7 @@ class TwitterUserScraper(TwitterSearchScraper): return None user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] - description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + renderedDescription = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) link = None if user['legacy'].get('url'): entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] @@ -1491,8 +1493,8 @@ class TwitterUserScraper(TwitterSearchScraper): username = user['legacy']['screen_name'], id = int(user['rest_id']), displayname = user['legacy']['name'], - description = description, rawDescription = rawDescription, + renderedDescription = renderedDescription, descriptionLinks = [TextLink( text = x.get('display_url'), url = x['expanded_url'], From 50899c01f39c1be54161d204f1cf3a5b3bd3cf88 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 16 Jun 2022 17:12:04 +0000 Subject: [PATCH 7/9] Fix crash on malformed guest token cache file Fixes #494 --- snscrape/modules/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index abf3f2b..334851c 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -566,7 +566,12 @@ class _CLIGuestTokenManager(GuestTokenManager): return None _logger.info(f'Reading guest token from {self._file}') with open(self._file, 'r') as fp: - o = json.load(fp) + try: + o = json.load(fp) + except json.JSONDecodeError as e: + _logger.warning(f'Malformed guest token file {self._file}: {e!s}') + self.reset() + return None self._token = o['token'] self._setTime = o['setTime'] if self._setTime < time.time() - _GUEST_TOKEN_VALIDITY: From d5b406bc1bc08af7fe5a179f4b14414b0de73ee1 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 23 Jun 2022 19:50:17 +0000 Subject: [PATCH 8/9] Update API parameters to what Twitter currently uses The `count` reduction does not affect anything as Twitter ignores that parameter now. Cf. #481 --- snscrape/modules/twitter.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 334851c..6942db6 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1414,6 +1414,7 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', + 'include_ext_has_nft_avatar': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', @@ -1425,16 +1426,18 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 'true', 'send_error_codes': 'true', - 'simple_quoted_tweets': 'true', + 'simple_quoted_tweet': 'true', 'q': self._query, 'tweet_search_mode': 'live', - 'count': '100', + 'count': '20', 'query_source': 'spelling_expansion_revert_click', 'cursor': None, 'pc': '1', 'spelling_corrections': '1', - 'ext': 'mediaStats,highlightedLabel', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', } params = paginationParams.copy() del params['cursor'] @@ -1726,6 +1729,7 @@ class TwitterTrendsScraper(_TwitterAPIScraper): 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', + 'include_ext_has_nft_avatar': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', @@ -1737,13 +1741,15 @@ class TwitterTrendsScraper(_TwitterAPIScraper): 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 'true', 'send_error_codes': 'true', 'simple_quoted_tweet': 'true', 'count': '20', 'candidate_source': 'trends', 'include_page_configuration': 'false', 'entity_tokens': 'false', - 'ext': 'mediaStats,highlightedLabel,voiceInfo', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', } obj = self._get_api_data('https://twitter.com/i/api/2/guide.json', _TwitterAPIType.V2, params) for instruction in obj['timeline']['instructions']: From d72b51953f0ec05ee18761ea31c1bb82f886f7a9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 24 Jun 2022 23:12:50 +0000 Subject: [PATCH 9/9] Fix missing r prefix on string with regex backslashes --- snscrape/modules/vkontakte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index ea33b44..06ddf7a 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -32,7 +32,7 @@ _logger = logging.getLogger(__name__) _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] _datePattern = re.compile(r'^(?Ptoday' r'|yesterday' - r'|(?P\d+)\s+(?P' + '|'.join(_months) + ')(\s+(?P\d{4}))?' + r'|(?P\d+)\s+(?P' + '|'.join(_months) + r')(\s+(?P\d{4}))?' r'|(?P' + '|'.join(_months) + r')\s+(?P\d+),\s+(?P\d{4})' ')' r'\s+at\s+(?P\d+):(?P\d+)\s+(?P[ap]m)$')