From ec5626097a66e4723f5dadbafb5b1d976458d6a7 Mon Sep 17 00:00:00 2001 From: TheTechRobo <52163910+TheTechRobo@users.noreply.github.com> Date: Tue, 4 Jan 2022 12:39:49 -0500 Subject: [PATCH 01/94] Create bug_report.yml --- .github/ISSUE_TEMPLATE/bug_report.yml | 79 +++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..ef602de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,79 @@ +name: Bug report +description: Create a report to help us improve +labels: 'bug' +body: + - type: markdown + attributes: + value: | + ## Self Check + - Try searching existing GitHub Issues (open or closed) for similar issues. + - type: textarea + validations: + required: true + attributes: + label: Describe the bug + description: A clear description of what the bug is. + placeholder: e.g. I see an AssertionError when trying to scrape a Twitter user! + - type: textarea + validations: + required: true + attributes: + label: How to reproduce + description: | + How to reproduce the problem. + placeholder: e.g. I can reproduce this issue by scraping the textfiles user with the twitter-user scraper. + - type: textarea + validations: + required: true + attributes: + label: Expected behavior + description: A brief description of what should happen. + - type: textarea + attributes: + label: Screenshots and recordings + description: | + If applicable, add screenshots or videos to help explain your problem. (Videos should be as short as possible! Avoid watermarks too.) + - type: input + validations: + required: true + attributes: + label: OS / Distro + description: Include the version too, please! + placeholder: e.g. Windows 10, Ubuntu 20.04, macOS 10.15... 
+ - type: input + validations: + required: true + attributes: + label: Output from `snscrape --version` + - type: input + validations: + required: true + attributes: + label: Scraper + placeholder: e.g. twitter-user, reddit-search,... + - type: textarea + validations: + required: false + attributes: + label: Backtrace + description: What is the error snscrape gives you, if any? + - type: input + validations: + required: false + attributes: + label: Dump of locals + description: | + Here put a link to the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path). + Please note that it may contain identifying info such as IP address, if the website returns that. + You can also optionally request to exchange the file in private. + Finally, if snscrape didn't crash, leave this field blank. + - type: dropdown + validations: + required: true + attributes: + label: How are you using snscrape? + options: ['CLI', 'Module'] + - type: textarea + attributes: + label: Additional context + description: Add any other context about the problem here. From afb6bfc429737ff8a7858e6089226edc148fbc78 Mon Sep 17 00:00:00 2001 From: TheTechRobo <52163910+TheTechRobo@users.noreply.github.com> Date: Tue, 4 Jan 2022 12:55:41 -0500 Subject: [PATCH 02/94] add feature_request and question templates --- .github/ISSUE_TEMPLATE/bug_report.yml | 4 ++-- .github/ISSUE_TEMPLATE/feature_request.yml | 27 ++++++++++++++++++++++ .github/ISSUE_TEMPLATE/question.md | 6 +++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/question.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index ef602de..ad484ce 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -57,13 +57,13 @@ body: attributes: label: Backtrace description: What is the error snscrape gives you, if any? 
- - type: input + - type: textarea validations: required: false attributes: label: Dump of locals description: | - Here put a link to the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path). + Here attach the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path). Please note that it may contain identifying info such as IP address, if the website returns that. You can also optionally request to exchange the file in private. Finally, if snscrape didn't crash, leave this field blank. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..5cf91e1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,27 @@ +name: Feature Request +description: Want a feature? Ask; we don't bite! +labels: 'enhancement' +body: + - type: markdown + attributes: + value: | + ## Self Check + - Try searching existing GitHub Issues (open or closed) for similar issues. + - type: textarea + validations: + required: true + attributes: + label: Describe the feature + description: A clear description of what the feature is. + - type: textarea + validations: + required: false + attributes: + label: Would this fix a problem you're experiencing? If so, specify. + - type: textarea + attributes: + label: Did you consider other alternatives? + description: If so, specify + - type: input + attributes: + label: Additional context diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..e23dcd5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,6 @@ +--- +name: Question +about: Ask away! 
+labels: 'question' + +--- From ef7c4fad3e480d945c4446e7bd8519fde82f3f79 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 05:06:34 +0000 Subject: [PATCH 03/94] Fix AttributeError for DescriptionURL on from-import --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 048df34..9cd5a89 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,5 +1,5 @@ __all__ = [ - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionUrl', 'Coordinates', 'Place', + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionURL', 'Coordinates', 'Place', 'User', 'UserLabel', 'Trend', 'GuestTokenManager', From 01cf6a09b3d244d6e03be1f706fec06b2d1e9d4d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 05:06:51 +0000 Subject: [PATCH 04/94] Fix type of description URL objects --- snscrape/modules/twitter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 9cd5a89..74aa024 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1303,7 +1303,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] if user['entities']['description'].get('urls'): - kwargs['descriptionUrls'] = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['entities']['description']['urls']] + kwargs['descriptionUrls'] = [DescriptionURL( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['entities']['description']['urls']] kwargs['verified'] = user.get('verified') kwargs['created'] = 
email.utils.parsedate_to_datetime(user['created_at']) kwargs['followersCount'] = user['followers_count'] @@ -1451,7 +1456,12 @@ class TwitterUserScraper(TwitterSearchScraper): displayname = user['legacy']['name'], description = description, rawDescription = rawDescription, - descriptionUrls = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['legacy']['entities']['description']['urls']], + descriptionUrls = [DescriptionURL( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['legacy']['entities']['description']['urls']], verified = user['legacy']['verified'], created = email.utils.parsedate_to_datetime(user['legacy']['created_at']), followersCount = user['legacy']['followers_count'], From dc6bc9bf9d85d33302589530577136cfda4f48ee Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 07:16:04 +0000 Subject: [PATCH 05/94] Refactor how links on Twitter are handled All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears. 
Closes #478 --- snscrape/modules/twitter.py | 114 ++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 39 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 74aa024..841b2c5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1,5 +1,5 @@ __all__ = [ - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionURL', 'Coordinates', 'Place', + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'TextLink', 'Coordinates', 'Place', 'User', 'UserLabel', 'Trend', 'GuestTokenManager', @@ -31,6 +31,18 @@ import string import time import typing import urllib.parse +import warnings + + +# DescriptionURL deprecation +_DEPRECATED_NAMES = {'DescriptionURL': 'TextLink'} +def __getattr__(name): + if name in _DEPRECATED_NAMES: + warnings.warn(f'{name} is deprecated, use {_DEPRECATED_NAMES[name]} instead', FutureWarning, stacklevel = 2) + return globals()[_DEPRECATED_NAMES[name]] + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') +def __dir__(): + return sorted(__all__ + list(_DEPRECATED_NAMES.keys())) _logger = logging.getLogger(__name__) @@ -56,8 +68,7 @@ class Tweet(snscrape.base.Item): source: str sourceUrl: typing.Optional[str] = None sourceLabel: typing.Optional[str] = None - outlinks: typing.Optional[typing.List[str]] = None - tcooutlinks: typing.Optional[typing.List[str]] = None + links: typing.Optional[typing.List['TextLink']] = None media: typing.Optional[typing.List['Medium']] = None retweetedTweet: typing.Optional['Tweet'] = None quotedTweet: typing.Optional['Tweet'] = None @@ -71,13 +82,23 @@ class Tweet(snscrape.base.Item): card: typing.Optional['Card'] = None username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username') - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks') - tcooutlinksss = 
snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(self.tcooutlinks) if self.tcooutlinks else '', 'tcooutlinks') + outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)') + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)') + tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)') + tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)') def __str__(self): return self.url +@dataclasses.dataclass +class TextLink: + text: typing.Optional[str] + url: str + tcourl: str + indices: typing.Tuple[int, int] + + class Medium: pass @@ -109,14 +130,6 @@ class Gif(Medium): variants: typing.List[VideoVariant] -@dataclasses.dataclass -class DescriptionURL: - text: typing.Optional[str] - url: str - tcourl: str - indices: typing.Tuple[int, int] - - @dataclasses.dataclass class Coordinates: longitude: float @@ -447,7 +460,7 @@ class User(snscrape.base.Entity): displayname: typing.Optional[str] = None description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact - descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None + descriptionLinks: typing.Optional[typing.List[TextLink]] = None verified: typing.Optional[bool] = None created: typing.Optional[datetime.datetime] = None followersCount: typing.Optional[int] = None @@ -458,12 +471,15 @@ class User(snscrape.base.Entity): mediaCount: typing.Optional[int] = None location: typing.Optional[str] = None protected: typing.Optional[bool] = None - linkUrl: 
typing.Optional[str] = None - linkTcourl: typing.Optional[str] = None + link: typing.Optional[TextLink] = None profileImageUrl: typing.Optional[str] = None profileBannerUrl: typing.Optional[str] = None label: typing.Optional['UserLabel'] = None + descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks') + linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url') + linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl') + @property def url(self): return f'https://twitter.com/{self.username}' @@ -815,8 +831,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['user'] = user kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at']) if tweet['entities'].get('urls'): - kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']] - kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']] + kwargs['links'] = [TextLink( + text = u.get('display_url'), + url = u['expanded_url'], + tcourl = u['url'], + indices = tuple(u['indices']), + ) for u in tweet['entities']['urls']] kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}' kwargs['replyCount'] = tweet['reply_count'] kwargs['retweetCount'] = tweet['retweet_count'] @@ -877,10 +897,15 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if hasattr(card, 'url') and '//t.co/' in card.url: # Try to convert the URL to the non-shortened/t.co one # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case. 
- if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']: - card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)] - elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks: - card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)] + candidates = [] + if 'links' in kwargs: + candidates.extend(kwargs['links']) + if retweetedTweet: + candidates.extend(retweetedTweet.links) + for u in candidates: + if u.tcourl == card.url: + card.url = u.url + break else: _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}') return Tweet(**kwargs) @@ -1303,12 +1328,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] if user['entities']['description'].get('urls'): - kwargs['descriptionUrls'] = [DescriptionURL( - text = x.get('display_url'), - url = x['expanded_url'], - tcourl = x['url'], - indices = tuple(x['indices']), - ) for x in user['entities']['description']['urls']] + kwargs['descriptionLinks'] = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + ) for x in user['entities']['description']['urls']] kwargs['verified'] = user.get('verified') kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at']) kwargs['followersCount'] = user['followers_count'] @@ -1319,9 +1344,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['mediaCount'] = user['media_count'] kwargs['location'] = user['location'] kwargs['protected'] = user.get('protected') - if 'url' in user['entities']: - kwargs['linkUrl'] = (user['entities']['url']['urls'][0].get('expanded_url') or user.get('url')) - kwargs['linkTcourl'] = user.get('url') + if user.get('url'): + entity = user['entities'].get('url', {}).get('urls', [None])[0] + if not entity or 
entity['url'] != user['url']: + self.logger.warning(f'Link inconsistency on user {kwargs["id"]}') + if not entity: + entity = {'display_url': None, 'expanded_url': user['url'], 'indices': (0, len(user['url']))} + kwargs['link'] = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): @@ -1447,6 +1476,14 @@ class TwitterUserScraper(TwitterSearchScraper): user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + link = None + if user['legacy'].get('url'): + entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] + if not entity or entity['url'] != user['legacy']['url']: + self.logger.warning(f'Link inconsistency on user') + if not entity: + entity = {'display_url': None, 'expanded_url': user['legacy']['url'], 'indices': (0, len(user['legacy']['url']))} + link = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) label = None if (labelO := user['affiliates_highlighted_label'].get('label')): label = self._user_label_to_user_label(labelO) @@ -1456,12 +1493,12 @@ class TwitterUserScraper(TwitterSearchScraper): displayname = user['legacy']['name'], description = description, rawDescription = rawDescription, - descriptionUrls = [DescriptionURL( - text = x.get('display_url'), - url = x['expanded_url'], - tcourl = x['url'], - indices = tuple(x['indices']), - ) for x in user['legacy']['entities']['description']['urls']], + descriptionLinks = [TextLink( + text = x.get('display_url'), + url = x['expanded_url'], + tcourl = x['url'], + indices = tuple(x['indices']), + 
) for x in user['legacy']['entities']['description']['urls']], verified = user['legacy']['verified'], created = email.utils.parsedate_to_datetime(user['legacy']['created_at']), followersCount = user['legacy']['followers_count'], @@ -1472,8 +1509,7 @@ class TwitterUserScraper(TwitterSearchScraper): mediaCount = user['legacy']['media_count'], location = user['legacy']['location'], protected = user['legacy']['protected'], - linkUrl = user['legacy']['entities']['url']['urls'][0]['expanded_url'] if 'url' in user['legacy']['entities'] else None, - linkTcourl = user['legacy'].get('url'), + link = link, profileImageUrl = user['legacy']['profile_image_url_https'], profileBannerUrl = user['legacy'].get('profile_banner_url'), label = label, From 530f4fa12277b2437d09d175e13a5146d5f0ac8e Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 29 May 2022 17:23:38 +0000 Subject: [PATCH 06/94] Fix KeyErrors on display_url and expanded_url for certain users with broken profile links Fixes #480 --- snscrape/modules/twitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 841b2c5..e4a2318 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1349,8 +1349,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if not entity or entity['url'] != user['url']: self.logger.warning(f'Link inconsistency on user {kwargs["id"]}') if not entity: - entity = {'display_url': None, 'expanded_url': user['url'], 'indices': (0, len(user['url']))} - kwargs['link'] = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['url'], indices = tuple(entity['indices'])) + entity = {'indices': (0, len(user['url']))} + kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] 
kwargs['profileBannerUrl'] = user.get('profile_banner_url') if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): @@ -1482,8 +1482,8 @@ class TwitterUserScraper(TwitterSearchScraper): if not entity or entity['url'] != user['legacy']['url']: self.logger.warning(f'Link inconsistency on user') if not entity: - entity = {'display_url': None, 'expanded_url': user['legacy']['url'], 'indices': (0, len(user['legacy']['url']))} - link = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) + entity = {'indices': (0, len(user['legacy']['url']))} + link = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['legacy']['url']), tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) label = None if (labelO := user['affiliates_highlighted_label'].get('label')): label = self._user_label_to_user_label(labelO) From 0d361685ff77425aaf7cc6c6580237d914041a55 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 1 Jun 2022 17:35:38 +0000 Subject: [PATCH 07/94] Fix AttributeError crash on scrapers using the default CLI constructor Introduced by 267b7d0e Fixes #483 --- snscrape/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/base.py b/snscrape/base.py index 71ab649..0e1ba1b 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -229,7 +229,7 @@ class Scraper: @classmethod def _cli_from_args(cls, args): - return cls._construct(args) + return cls._cli_construct(args) @classmethod def _cli_construct(cls, argparseArgs, *args, **kwargs): From bcad6923c243421e7ffcb5a3f7d7721033e500c8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 14 Jun 2022 00:35:02 +0000 Subject: [PATCH 08/94] Rename Tweet.content to rawContent and User.description to renderedDescription for consistency Closes #479 --- snscrape/modules/twitter.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 
deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index e4a2318..abf3f2b 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -55,7 +55,7 @@ _GUEST_TOKEN_VALIDITY = 10800 class Tweet(snscrape.base.Item): url: str date: datetime.datetime - content: str + rawContent: str renderedContent: str id: int user: 'User' @@ -86,6 +86,7 @@ class Tweet(snscrape.base.Item): outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)') tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)') tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)') + content = snscrape.base._DeprecatedProperty('content', lambda self: self.rawContent, 'rawContent') def __str__(self): return self.url @@ -458,8 +459,8 @@ class User(snscrape.base.Entity): username: str id: int displayname: typing.Optional[str] = None - description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact + renderedDescription: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced descriptionLinks: typing.Optional[typing.List[TextLink]] = None verified: typing.Optional[bool] = None created: typing.Optional[datetime.datetime] = None @@ -479,6 +480,7 @@ class User(snscrape.base.Entity): descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks') linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url') linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', 
lambda self: self.link.tcourl if self.link else None, 'link.tcourl') + description = snscrape.base._DeprecatedProperty('description', lambda self: self.renderedDescription, 'renderedDescription') @property def url(self): @@ -826,7 +828,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): tweetId = self._get_tweet_id(tweet) kwargs = {} kwargs['id'] = tweetId - kwargs['content'] = tweet['full_text'] + kwargs['rawContent'] = tweet['full_text'] kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls')) kwargs['user'] = user kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at']) @@ -1325,8 +1327,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['username'] = user['screen_name'] kwargs['id'] = id_ if id_ else user['id'] if 'id' in user else int(user['id_str']) kwargs['displayname'] = user['name'] - kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) kwargs['rawDescription'] = user['description'] + kwargs['renderedDescription'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls')) if user['entities']['description'].get('urls'): kwargs['descriptionLinks'] = [TextLink( text = x.get('display_url'), @@ -1475,7 +1477,7 @@ class TwitterUserScraper(TwitterSearchScraper): return None user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] - description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) + renderedDescription = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls']) link = None if user['legacy'].get('url'): entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] @@ -1491,8 +1493,8 @@ class TwitterUserScraper(TwitterSearchScraper): username = user['legacy']['screen_name'], id = int(user['rest_id']), displayname = user['legacy']['name'], - description = 
description, rawDescription = rawDescription, + renderedDescription = renderedDescription, descriptionLinks = [TextLink( text = x.get('display_url'), url = x['expanded_url'], From 50899c01f39c1be54161d204f1cf3a5b3bd3cf88 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 16 Jun 2022 17:12:04 +0000 Subject: [PATCH 09/94] Fix crash on malformed guest token cache file Fixes #494 --- snscrape/modules/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index abf3f2b..334851c 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -566,7 +566,12 @@ class _CLIGuestTokenManager(GuestTokenManager): return None _logger.info(f'Reading guest token from {self._file}') with open(self._file, 'r') as fp: - o = json.load(fp) + try: + o = json.load(fp) + except json.JSONDecodeError as e: + _logger.warning(f'Malformed guest token file {self._file}: {e!s}') + self.reset() + return None self._token = o['token'] self._setTime = o['setTime'] if self._setTime < time.time() - _GUEST_TOKEN_VALIDITY: From d5b406bc1bc08af7fe5a179f4b14414b0de73ee1 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 23 Jun 2022 19:50:17 +0000 Subject: [PATCH 10/94] Update API parameters to what Twitter currently uses The `count` reduction does not affect anything as Twitter ignores that parameter now. Cf. 
#481 --- snscrape/modules/twitter.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 334851c..6942db6 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1414,6 +1414,7 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', + 'include_ext_has_nft_avatar': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', @@ -1425,16 +1426,18 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 'true', 'send_error_codes': 'true', - 'simple_quoted_tweets': 'true', + 'simple_quoted_tweet': 'true', 'q': self._query, 'tweet_search_mode': 'live', - 'count': '100', + 'count': '20', 'query_source': 'spelling_expansion_revert_click', 'cursor': None, 'pc': '1', 'spelling_corrections': '1', - 'ext': 'mediaStats,highlightedLabel', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', } params = paginationParams.copy() del params['cursor'] @@ -1726,6 +1729,7 @@ class TwitterTrendsScraper(_TwitterAPIScraper): 'include_mute_edge': '1', 'include_can_dm': '1', 'include_can_media_tag': '1', + 'include_ext_has_nft_avatar': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', @@ -1737,13 +1741,15 @@ class TwitterTrendsScraper(_TwitterAPIScraper): 'include_user_entities': 'true', 'include_ext_media_color': 'true', 'include_ext_media_availability': 'true', + 'include_ext_sensitive_media_warning': 'true', + 'include_ext_trusted_friends_metadata': 'true', 'send_error_codes': 'true', 'simple_quoted_tweet': 'true', 'count': '20', 'candidate_source': 'trends', 'include_page_configuration': 'false', 'entity_tokens': 
'false', - 'ext': 'mediaStats,highlightedLabel,voiceInfo', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', } obj = self._get_api_data('https://twitter.com/i/api/2/guide.json', _TwitterAPIType.V2, params) for instruction in obj['timeline']['instructions']: From d72b51953f0ec05ee18761ea31c1bb82f886f7a9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 24 Jun 2022 23:12:50 +0000 Subject: [PATCH 11/94] Fix missing r prefix on string with regex backslashes --- snscrape/modules/vkontakte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index ea33b44..06ddf7a 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -32,7 +32,7 @@ _logger = logging.getLogger(__name__) _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] _datePattern = re.compile(r'^(?Ptoday' r'|yesterday' - r'|(?P\d+)\s+(?P' + '|'.join(_months) + ')(\s+(?P\d{4}))?' + r'|(?P\d+)\s+(?P' + '|'.join(_months) + r')(\s+(?P\d{4}))?' 
r'|(?P' + '|'.join(_months) + r')\s+(?P\d+),\s+(?P\d{4})' ')' r'\s+at\s+(?P\d+):(?P\d+)\s+(?P[ap]m)$') From 279d1cf4a1fa14893366c4cfd7b4b56778b6765d Mon Sep 17 00:00:00 2001 From: hgrsd Date: Sat, 16 Jul 2022 18:27:02 +0100 Subject: [PATCH 12/94] fix(vkontakte): update photo detection --- snscrape/modules/vkontakte.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 06ddf7a..3e431f3 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -117,6 +117,9 @@ class VKontakteUserScraper(snscrape.base.Scraper): return urllib.parse.unquote(a['href'][13 : end]) return None + def is_photo(self, a): + return 'aria-label' in a.attrs and a.attrs['aria-label'].startswith('photo') + def _date_span_to_date(self, dateSpan): if not dateSpan: return None @@ -172,7 +175,7 @@ class VKontakteUserScraper(snscrape.base.Scraper): not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes photos = [] for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'): - if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs: + if not self.is_photo(a) and 'data-video' not in a.attrs: _logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}') continue if 'data-video' in a.attrs: From da3d870e10236f6c45c6621b00bc87f9417e9425 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 13 Aug 2022 21:17:55 +0000 Subject: [PATCH 13/94] Drop app icons when Twitter didn't actually include them in the response Fixes #470 --- snscrape/modules/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 6942db6..c08c4bc 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1242,6 +1242,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): vKwargs['ratingAverage'] = 
var['ratings']['star'] vKwargs['ratingCount'] = var['ratings']['count'] vKwargs['url'] = f'https://play.google.com/store/apps/details?id={var["id"]}' if var['type'] == 'android_app' else f'https://itunes.apple.com/app/id{var["id"]}' + if 'iconMediumKey' in vKwargs and vKwargs['iconMediumKey'] not in kwargs['media']: + # https://github.com/JustAnotherArchivist/snscrape/issues/470 + _logger.warning(f'Tweet {tweetId} contains an app icon medium key {vKwargs["iconMediumKey"]!r} on app {vKwargs["type"]!r}/{vKwargs["id"]!r}, but the corresponding medium is missing; dropping') + del vKwargs['iconMediumKey'] variants.append(UnifiedCardApp(**vKwargs)) kwargs['apps'][k] = variants From ff18f6f771802368eca71e44b3f2cd7a4e677c69 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 21 Aug 2022 01:40:31 +0000 Subject: [PATCH 14/94] Fix video extraction on Weibo Fixes #509 --- snscrape/modules/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index 796f864..1865027 100644 --- a/snscrape/modules/weibo.py +++ b/snscrape/modules/weibo.py @@ -92,7 +92,7 @@ class WeiboUserScraper(snscrape.base.Scraper): likesCount = mblog.get('attitudes_count'), picturesCount = mblog.get('pic_num'), pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None, - video = mblog['page_info']['media_info']['mp4_720p_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None, + video = (urls := mblog['page_info']['urls']).get('mp4_720p_mp4') or urls.get('mp4_hd_mp4') or urls['mp4_ld_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None, link = mblog['page_info']['page_url'] if 'page_info' in mblog and mblog['page_info']['type'] == 'webpage' else None, repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None, ) From 2c7a85a62095ce74278c146a73944bc4963308c5 Mon Sep 17 00:00:00 2001 From: 
JustAnotherArchivist Date: Sun, 21 Aug 2022 01:40:49 +0000 Subject: [PATCH 15/94] Add warning on unknown page_info types --- snscrape/modules/weibo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index 1865027..a2b692e 100644 --- a/snscrape/modules/weibo.py +++ b/snscrape/modules/weibo.py @@ -81,6 +81,8 @@ class WeiboUserScraper(snscrape.base.Scraper): return True, None def _mblog_to_item(self, mblog): + if mblog.get('page_info', {}).get('type') not in (None, 'video', 'webpage'): + _logger.warning(f'Skipping unknown page info {mblog["page_info"]["type"]!r} on status {mblog["id"]}') return Post( url = f'https://m.weibo.cn/status/{mblog["bid"]}', id = mblog['id'], From 9294c26ffaafbf8ab31584a30a17a455442f8d64 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 21 Aug 2022 01:58:41 +0000 Subject: [PATCH 16/94] Make PeriscopeBroadcastCard.thumbnailUrl optional to handle tweets without a thumbnail Fixes #507 --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index c08c4bc..fd093cb 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -226,7 +226,7 @@ class PeriscopeBroadcastCard(Card): description: str state: str totalParticipants: int - thumbnailUrl: str + thumbnailUrl: typing.Optional[str] = None source: typing.Optional[str] = None broadcaster: typing.Optional['User'] = None siteUser: typing.Optional['User'] = None From e13033fea0da6e64684513b2fbc489883698f28a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 24 Aug 2022 15:53:21 +0000 Subject: [PATCH 17/94] Fix AttributeError on certain videos included from other platforms --- snscrape/modules/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index a2b692e..dff1fad 100644 --- a/snscrape/modules/weibo.py +++ 
b/snscrape/modules/weibo.py @@ -94,7 +94,7 @@ class WeiboUserScraper(snscrape.base.Scraper): likesCount = mblog.get('attitudes_count'), picturesCount = mblog.get('pic_num'), pictures = [x['large']['url'] for x in mblog['pics']] if 'pics' in mblog else None, - video = (urls := mblog['page_info']['urls']).get('mp4_720p_mp4') or urls.get('mp4_hd_mp4') or urls['mp4_ld_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' else None, + video = urls.get('mp4_720p_mp4') or urls.get('mp4_hd_mp4') or urls['mp4_ld_mp4'] if 'page_info' in mblog and mblog['page_info']['type'] == 'video' and (urls := mblog['page_info']['urls']) else None, link = mblog['page_info']['page_url'] if 'page_info' in mblog and mblog['page_info']['type'] == 'webpage' else None, repostedPost = self._mblog_to_item(mblog['retweeted_status']) if 'retweeted_status' in mblog else None, ) From 59abeaf04c38c85b3afed3fb6b907a6868019877 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 4 Sep 2022 15:04:20 +0000 Subject: [PATCH 18/94] Make newsletter card images optional Fixes #546 --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index fd093cb..598f40b 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -254,10 +254,10 @@ class Event: class NewsletterCard(Card): title: str description: str - imageUrl: str url: str revueAccountId: int issueCount: int + imageUrl: typing.Optional[str] = None @dataclasses.dataclass From 46a603053cfbc0ce3c54d43d7e1ac2427fa82b4d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 16 Oct 2022 21:13:46 +0000 Subject: [PATCH 19/94] Handle users with extensions but no label Fixes #559 --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 598f40b..4bf84f1 100644 --- a/snscrape/modules/twitter.py +++ 
b/snscrape/modules/twitter.py @@ -1364,7 +1364,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices'])) kwargs['profileImageUrl'] = user['profile_image_url_https'] kwargs['profileBannerUrl'] = user.get('profile_banner_url') - if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): + if 'ext' in user and 'highlightedLabel' in user['ext'] and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')): kwargs['label'] = self._user_label_to_user_label(label) return User(**kwargs) From aa325fa1a5ccab19aa3e1ac8526920bb3e649a5f Mon Sep 17 00:00:00 2001 From: Casey Ho Date: Mon, 14 Nov 2022 17:38:03 -0800 Subject: [PATCH 20/94] Handle UnifiedCardApp with no category --- snscrape/modules/twitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 4bf84f1..cf6a506 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -421,9 +421,9 @@ class UnifiedCardApp: type: str id: str title: str - category: str countryCode: str url: str + category: typing.Optional[str] = None description: typing.Optional[str] = None iconMediumKey: typing.Optional[UnifiedCardMediumKey] = None size: typing.Optional[int] = None @@ -1237,7 +1237,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): vKwargs['title'] = var['title']['content'] if 'description' in var: vKwargs['description'] = var['description']['content'] - vKwargs['category'] = var['category']['content'] + if 'category' in var: + vKwargs['category'] = var['category']['content'] if (ratings := var['ratings']): vKwargs['ratingAverage'] = var['ratings']['star'] vKwargs['ratingCount'] = var['ratings']['count'] From e09aea70e74583f58540aeb584c089813cca547b Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 3 Dec 2022 06:36:52 
+0000 Subject: [PATCH 21/94] Fix Twitter username length limit Although 15 characters is the official, current limit, there are accounts with longer usernames. 20 is the longest observed example, but it's unclear what the true limit is. --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 4bf84f1..060ed6c 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1539,7 +1539,7 @@ class TwitterUserScraper(TwitterSearchScraper): @staticmethod def is_valid_username(s): - return 1 <= len(s) <= 15 and s.strip(string.ascii_letters + string.digits + '_') == '' + return 1 <= len(s) <= 20 and s.strip(string.ascii_letters + string.digits + '_') == '' @classmethod def _cli_setup_parser(cls, subparser): From 4ff4af13cfe12a4f95c3164703411dbed0071970 Mon Sep 17 00:00:00 2001 From: engkimo Date: Tue, 6 Dec 2022 11:23:01 +0900 Subject: [PATCH 22/94] Add returning Twitter Place IDs --- snscrape/modules/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 060ed6c..2420b52 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -139,6 +139,7 @@ class Coordinates: @dataclasses.dataclass class Place: + id: str fullName: str name: str type: str @@ -891,7 +892,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if (coords := tweet['geo']['coordinates']) and len(coords) == 2: kwargs['coordinates'] = Coordinates(coords[1], coords[0]) if tweet.get('place'): - kwargs['place'] = Place(tweet['place']['full_name'], tweet['place']['name'], tweet['place']['place_type'], tweet['place']['country'], tweet['place']['country_code']) + kwargs['place'] = Place(tweet['place']['id'], tweet['place']['full_name'], tweet['place']['name'], tweet['place']['place_type'], tweet['place']['country'], tweet['place']['country_code']) if 'coordinates' not in kwargs and 
tweet['place'].get('bounding_box') and (coords := tweet['place']['bounding_box']['coordinates']) and coords[0] and len(coords[0][0]) == 2: # Take the first (longitude, latitude) couple of the "place square" kwargs['coordinates'] = Coordinates(coords[0][0][0], coords[0][0][1]) From e449d5cdbea3777d370aed489352aaad111ba960 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 11 Jan 2023 19:59:47 +0000 Subject: [PATCH 23/94] Expose individual error messages when all request retries fail --- snscrape/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snscrape/base.py b/snscrape/base.py index 0e1ba1b..9be17d8 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -164,6 +164,7 @@ class Scraper: def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None): proxies = proxies or self._proxies or {} + errors = [] for attempt in range(self._retries + 1): # The request is newly prepared on each retry because of potential cookie updates. req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) @@ -184,6 +185,7 @@ class Scraper: retrying = '' level = logging.ERROR logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') + errors.append(repr(exc)) else: redirected = f' (redirected to {r.url})' if r.history else '' logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') @@ -192,6 +194,7 @@ class Scraper: logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})') if responseOkCallback is not None: success, msg = responseOkCallback(r) + errors.append(msg) else: success, msg = (True, None) msg = f': {msg}' if msg else '' @@ -214,6 +217,7 @@ class Scraper: else: msg = f'{self._retries + 1} requests to {req.url} failed, giving up.' 
logger.fatal(msg) + logger.fatal(f'Errors: {", ".join(errors)}') raise ScraperException(msg) raise RuntimeError('Reached unreachable code') From 996cf882cc9baffc6b90f8bcbbdafe49cd24602d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 11 Jan 2023 20:00:08 +0000 Subject: [PATCH 24/94] Expose status code for non-200 Twitter responses --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 742af0b..ec7b391 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -681,7 +681,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if r.headers.get('content-type', '').replace(' ', '') != 'application/json;charset=utf-8': return False, 'content type is not JSON' if r.status_code != 200: - return False, 'non-200 status code' + return False, f'non-200 status code ({r.status_code})' return True, None def _get_api_data(self, endpoint, apiType, params): From ceb06664f012a5506a2cffe52b369c70300311ad Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 11 Jan 2023 22:52:52 +0000 Subject: [PATCH 25/94] Clarify descriptions of issue templates --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/question.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index ad484ce..a514e69 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,5 +1,5 @@ name: Bug report -description: Create a report to help us improve +description: Are you experiencing a problem? Create a report to help us improve! labels: 'bug' body: - type: markdown diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index e23dcd5..2632c39 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -1,6 +1,6 @@ --- name: Question -about: Ask away! 
+about: Ask away! (Do not use this for bugs or features.) labels: 'question' --- From 7de8d734e97acc2bc89c93d934b467d5596c6ca8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 02:25:39 +0000 Subject: [PATCH 26/94] Override TLS ciphers to get past Twitter's new fingerprinting Fixes #647 --- snscrape/modules/twitter.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index ec7b391..ee9a259 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -26,11 +26,13 @@ import random import logging import os import re +import requests.adapters import snscrape.base import string import time import typing import urllib.parse +import urllib3.util.ssl_ import warnings @@ -49,6 +51,7 @@ _logger = logging.getLogger(__name__) _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _globalGuestTokenManager = None _GUEST_TOKEN_VALIDITY = 10800 +_CIPHERS_CHROME = 'TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA:AES256-SHA' @dataclasses.dataclass @@ -612,6 +615,13 @@ class _CLIGuestTokenManager(GuestTokenManager): pass +class _TwitterTLSAdapter(requests.adapters.HTTPAdapter): + def init_poolmanager(self, *args, **kwargs): + #FIXME: When urllib3 2.0.0 is out and can be required, this should use urllib3.util.create_urllib3_context instead of the private, undocumented ssl_ module. 
+ kwargs['ssl_context'] = urllib3.util.ssl_.create_urllib3_context(ciphers = _CIPHERS_CHROME) + return super().init_poolmanager(*args, **kwargs) + + class _TwitterAPIType(enum.Enum): V2 = 0 # Introduced with the redesign GRAPHQL = 1 @@ -633,6 +643,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): 'Referer': self._baseUrl, 'Accept-Language': 'en-US,en;q=0.5', } + adapter = _TwitterTLSAdapter() + self._session.mount('https://twitter.com', adapter) + self._session.mount('https://api.twitter.com', adapter) self._set_random_user_agent() def _set_random_user_agent(self): From 129ad3fc34a7d1c6d2db92cf781d2e1a61d1144c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 02:35:48 +0000 Subject: [PATCH 27/94] Add --max-empty-pages option to stop long (potentially infinite) empty pagination Fixes #636 --- snscrape/modules/twitter.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index ee9a259..d2ba5cf 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -628,7 +628,7 @@ class _TwitterAPIType(enum.Enum): class _TwitterAPIScraper(snscrape.base.Scraper): - def __init__(self, baseUrl, *, guestTokenManager = None, **kwargs): + def __init__(self, baseUrl, *, guestTokenManager = None, maxEmptyPages = 0, **kwargs): super().__init__(**kwargs) self._baseUrl = baseUrl if guestTokenManager is None: @@ -637,6 +637,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _globalGuestTokenManager = GuestTokenManager() guestTokenManager = _globalGuestTokenManager self._guestTokenManager = guestTokenManager + self._maxEmptyPages = maxEmptyPages self._apiHeaders = { 'User-Agent': None, 'Authorization': _API_AUTHORIZATION_HEADER, @@ -729,6 +730,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): dir = 'bottom' stopOnEmptyResponse = False emptyResponsesOnCursor = 0 + emptyPages = 0 while True: _logger.info(f'Retrieving scroll page {cursor}') obj = 
self._get_api_data(endpoint, apiType, reqParams) @@ -785,6 +787,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): emptyResponsesOnCursor += 1 if emptyResponsesOnCursor > self._retries: break + if tweetCount == 0: + emptyPages += 1 + if self._maxEmptyPages and emptyPages >= self._maxEmptyPages: + _logger.warning(f'Stopping after {emptyPages} empty pages') + break if not newCursor or (stopOnEmptyResponse and tweetCount == 0): # End of pagination if promptCursor is not None: @@ -1403,9 +1410,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): class TwitterSearchScraper(_TwitterAPIScraper): name = 'twitter-search' - def __init__(self, query, *, cursor = None, top = False, **kwargs): + def __init__(self, query, *, cursor = None, top = False, maxEmptyPages = 20, **kwargs): if not query.strip(): raise ValueError('empty query') + kwargs['maxEmptyPages'] = maxEmptyPages super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs) self._query = query # Note: may get replaced by subclasses when using user ID resolution self._cursor = cursor @@ -1472,11 +1480,12 @@ class TwitterSearchScraper(_TwitterAPIScraper): def _cli_setup_parser(cls, subparser): subparser.add_argument('--cursor', metavar = 'CURSOR') subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological') + subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable') subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string') @classmethod def _cli_from_args(cls, args): - return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top) + return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top, maxEmptyPages = 
args.maxEmptyPages) class TwitterUserScraper(TwitterSearchScraper): From ff5e2d61ee83a6f10c2b166b7e449936dcc95670 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 03:01:48 +0000 Subject: [PATCH 28/94] Update search API parameters --- snscrape/modules/twitter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index d2ba5cf..c744fdf 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1442,13 +1442,18 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'include_can_dm': '1', 'include_can_media_tag': '1', 'include_ext_has_nft_avatar': '1', + 'include_ext_is_blue_verified': '1', + 'include_ext_verified_type': '1', 'skip_status': '1', 'cards_platform': 'Web-12', 'include_cards': '1', 'include_ext_alt_text': 'true', + 'include_ext_limited_action_results': 'false', 'include_quote_count': 'true', 'include_reply_count': '1', 'tweet_mode': 'extended', + 'include_ext_collab_control': 'true', + 'include_ext_views': 'true', 'include_entities': 'true', 'include_user_entities': 'true', 'include_ext_media_color': 'true', @@ -1464,7 +1469,8 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'cursor': None, 'pc': '1', 'spelling_corrections': '1', - 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo', + 'include_ext_edit_control': 'true', + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe', } params = paginationParams.copy() del params['cursor'] From a0414d92cff16ebcd270e9e61d3f1c73f491a15d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 03:13:10 +0000 Subject: [PATCH 29/94] Extract alt text for media on Twitter Closes #588 --- snscrape/modules/twitter.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py 
b/snscrape/modules/twitter.py index c744fdf..8c5134d 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -111,6 +111,7 @@ class Medium: class Photo(Medium): previewUrl: str fullUrl: str + altText: typing.Optional[str] = None @dataclasses.dataclass @@ -126,12 +127,14 @@ class Video(Medium): variants: typing.List[VideoVariant] duration: typing.Optional[float] = None views: typing.Optional[int] = None + altText: typing.Optional[str] = None @dataclasses.dataclass class Gif(Medium): thumbnailUrl: str variants: typing.List[VideoVariant] + altText: typing.Optional[str] = None @dataclasses.dataclass @@ -949,10 +952,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if format not in ('jpg', 'png'): _logger.warning(f'Skipping photo with unknown format on tweet {tweetId}: {format!r}') return - return Photo( - previewUrl = f'{baseUrl}?format={format}&name=small', - fullUrl = f'{baseUrl}?format={format}&name=large', - ) + mKwargs = { + 'previewUrl': f'{baseUrl}?format={format}&name=small', + 'fullUrl': f'{baseUrl}?format={format}&name=large', + } + if medium.get('ext_alt_text'): + mKwargs['altText'] = medium['ext_alt_text'] + return Photo(**mKwargs) elif medium['type'] == 'video' or medium['type'] == 'animated_gif': variants = [] for variant in medium['video_info']['variants']: @@ -970,6 +976,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): cls = Video elif medium['type'] == 'animated_gif': cls = Gif + if medium.get('ext_alt_text'): + mKwargs['altText'] = medium['ext_alt_text'] return cls(**mKwargs) else: _logger.warning(f'Unsupported medium type on tweet {tweetId}: {medium["type"]!r}') From 3e297c9a42c4a1da3b201c5b2aeac7b9d70e2ccf Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 04:00:31 +0000 Subject: [PATCH 30/94] Update GraphQL API parameters --- snscrape/modules/twitter.py | 81 +++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/snscrape/modules/twitter.py 
b/snscrape/modules/twitter.py index 8c5134d..bc7bfee 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -15,6 +15,7 @@ __all__ = [ import collections +import copy import dataclasses import datetime import email.utils @@ -704,7 +705,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _get_api_data(self, endpoint, apiType, params): self._ensure_guest_token() if apiType is _TwitterAPIType.GRAPHQL: - params = urllib.parse.urlencode({'variables': json.dumps(params, separators = (',', ':'))}, quote_via = urllib.parse.quote) + params = urllib.parse.urlencode({k: json.dumps(v, separators = (',', ':')) for k, v in params.items()}, quote_via = urllib.parse.quote) r = self._get(endpoint, params = params, headers = self._apiHeaders, responseOkCallback = self._check_api_response) try: obj = r.json() @@ -724,8 +725,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if cursor is None: reqParams = params else: - reqParams = paginationParams.copy() - reqParams['cursor'] = cursor + reqParams = copy.deepcopy(paginationParams) + if apiType is _TwitterAPIType.V2: + reqParams['cursor'] = cursor + else: + reqParams['variables']['cursor'] = cursor bottomCursorAndStop = None if direction is _ScrollDirection.TOP or direction is _ScrollDirection.BOTH: dir = 'top' @@ -808,8 +812,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if newCursor != cursor: emptyResponsesOnCursor = 0 cursor = newCursor - reqParams = paginationParams.copy() - reqParams['cursor'] = cursor + reqParams = copy.deepcopy(paginationParams) + if apiType is _TwitterAPIType.V2: + reqParams['cursor'] = cursor + else: + reqParams['variables']['cursor'] = cursor def _count_tweets(self, entries): return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') for entry in entries) @@ -1522,7 +1529,7 @@ class TwitterUserScraper(TwitterSearchScraper): fieldName = 'userId' endpoint = 'https://twitter.com/i/api/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId' variables 
= {fieldName: str(self._user), 'withSafetyModeUserFields': True, 'withSuperFollowsUserFields': True} - obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = variables) + obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = {'variables': variables}) if not obj['data'] or obj['data']['user']['result']['__typename'] == 'UserUnavailable': return None user = obj['data']['user']['result'] @@ -1618,9 +1625,30 @@ class TwitterProfileScraper(TwitterUserScraper): } variables = paginationVariables.copy() del variables['cursor'] + features = { + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'verified_phone_label_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'view_counts_public_visibility_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'responsive_web_uc_gql_enabled': True, + 'vibe_api_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_text_conversations_enabled': False, + 'responsive_web_enhance_cards_enabled': False, + } + + params = {'variables': variables, 'features': features} + paginationParams = {'variables': paginationVariables, 'features': features} gotPinned = False - for obj in self._iter_api_data('https://twitter.com/i/api/graphql/BSKxQ9_IaCoVyIvQHQROIQ/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, variables, paginationVariables): + for obj in self._iter_api_data('https://twitter.com/i/api/graphql/W3HCLclD2VauuL6RcQm9MA/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, params, paginationParams): instructions = obj['data']['user']['result']['timeline']['timeline']['instructions'] if not gotPinned: for 
instruction in instructions: @@ -1677,10 +1705,8 @@ class TwitterTweetScraper(_TwitterAPIScraper): 'includePromotedContent': True, 'withCommunity': True, 'withQuickPromoteEligibilityTweetFields': True, - 'withTweetQuoteCount': True, 'withBirdwatchNotes': True, 'withSuperFollowsUserFields': True, - 'withBirdwatchPivots': False, 'withDownvotePerspective': False, 'withReactionsMetadata': False, 'withReactionsPerspective': False, @@ -1690,9 +1716,30 @@ class TwitterTweetScraper(_TwitterAPIScraper): } variables = paginationVariables.copy() del variables['cursor'], variables['referrer'] - url = 'https://twitter.com/i/api/graphql/8svRea_Lc0_mdhwP6dqe0Q/TweetDetail' + features = { + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'verified_phone_label_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'view_counts_public_visibility_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'responsive_web_uc_gql_enabled': True, + 'vibe_api_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_text_conversations_enabled': False, + 'responsive_web_enhance_cards_enabled': False, + } + + params = {'variables': variables, 'features': features} + paginationParams = {'variables': paginationVariables, 'features': features} + url = 'https://twitter.com/i/api/graphql/HQ_gjq7zDNvSiJOCSkwUEw/TweetDetail' if self._mode is TwitterTweetScraperMode.SINGLE: - obj = self._get_api_data(url, _TwitterAPIType.GRAPHQL, params = variables) + obj = self._get_api_data(url, _TwitterAPIType.GRAPHQL, params = params) if not obj['data']: return for instruction in 
obj['data']['threaded_conversation_with_injections']['instructions']: @@ -1703,7 +1750,7 @@ class TwitterTweetScraper(_TwitterAPIScraper): yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) break elif self._mode is TwitterTweetScraperMode.SCROLL: - for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, variables, paginationVariables, direction = _ScrollDirection.BOTH): + for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, params, paginationParams, direction = _ScrollDirection.BOTH): if not obj['data']: continue yield from self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections']['instructions'], includeConversationThreads = True) @@ -1713,11 +1760,11 @@ class TwitterTweetScraper(_TwitterAPIScraper): queue.append(self._tweetId) while queue: tweetId = queue.popleft() - thisPagVariables = paginationVariables.copy() - thisPagVariables['focalTweetId'] = str(tweetId) - thisVariables = thisPagVariables.copy() - del thisPagVariables['cursor'], thisPagVariables['referrer'] - for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, thisVariables, thisPagVariables, direction = _ScrollDirection.BOTH): + thisPagParams = copy.deepcopy(paginationVariables) + thisPagParams['variables']['focalTweetId'] = str(tweetId) + thisParams = copy.deepcopy(thisPagParams) + del thisPagParams['variables']['cursor'], thisPagParams['variables']['referrer'] + for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, thisParams, thisPagParams, direction = _ScrollDirection.BOTH): if not obj['data']: continue for tweet in self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections']['instructions'], includeConversationThreads = True): From faf09b2f5e75391ad9a48f41ea57c1255fd9eb54 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 04:00:50 +0000 Subject: [PATCH 31/94] Extract tweet view counts Closes #629 --- 
snscrape/modules/twitter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index bc7bfee..c3e78f7 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -84,6 +84,7 @@ class Tweet(snscrape.base.Item): hashtags: typing.Optional[typing.List[str]] = None cashtags: typing.Optional[typing.List[str]] = None card: typing.Optional['Card'] = None + viewCount: typing.Optional[int] = None username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username') outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)') @@ -860,9 +861,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _get_tweet_id(self, tweet): return tweet['id'] if 'id' in tweet else int(tweet['id_str']) - def _make_tweet(self, tweet, user, retweetedTweet = None, quotedTweet = None, card = None): + def _make_tweet(self, tweet, user, retweetedTweet = None, quotedTweet = None, card = None, **kwargs): tweetId = self._get_tweet_id(tweet) - kwargs = {} kwargs['id'] = tweetId kwargs['rawContent'] = tweet['full_text'] kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls')) @@ -1308,6 +1308,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['quotedTweet'] = self._tweet_to_tweet(obj['globalObjects']['tweets'][tweet['quoted_status_id_str']], obj) if 'card' in tweet: kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2, self._get_tweet_id(tweet)) + if 'ext_views' in tweet and 'count' in tweet['ext_views']: + kwargs['viewCount'] = int(tweet['ext_views']['count']) return self._make_tweet(tweet, user, **kwargs) def _graphql_timeline_tweet_item_result_to_tweet(self, result): @@ -1338,6 +1340,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) 
if 'card' in result: kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL, self._get_tweet_id(tweet)) + if 'views' in result and 'count' in result['views']: + kwargs['viewCount'] = int(result['views']['count']) return self._make_tweet(tweet, user, **kwargs) def _graphql_timeline_instructions_to_tweets(self, instructions, includeConversationThreads = False): From 2196bdf3e81cb10c17905ebf8f5464d70df8758d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 04:09:00 +0000 Subject: [PATCH 32/94] Extract vibe --- snscrape/modules/twitter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index c3e78f7..1d99068 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -85,6 +85,7 @@ class Tweet(snscrape.base.Item): cashtags: typing.Optional[typing.List[str]] = None card: typing.Optional['Card'] = None viewCount: typing.Optional[int] = None + vibe: typing.Optional['Vibe'] = None username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username') outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)') @@ -451,6 +452,13 @@ class UnifiedCardSwipeableLayoutSlide: componentKey: UnifiedCardComponentKey +@dataclasses.dataclass +class Vibe: + text: str + imageUrl: str + imageDescription: str + + @dataclasses.dataclass class TweetRef(snscrape.base.Item): '''A reference to a tweet for which no proper Tweet object could be produced from the data returned by Twitter''' @@ -1299,6 +1307,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _logger.warning(f'Unsupported card type on tweet {tweetId}: {cardName!r}') + def _make_vibe(self, vibe): + return Vibe( + text = vibe['text'], + imageUrl = vibe['imgUrl'], + imageDescription = vibe['imgDescription'], + ) + def _tweet_to_tweet(self, tweet, obj): user = 
self._user_to_user(obj['globalObjects']['users'][tweet['user_id_str']]) kwargs = {} @@ -1310,6 +1325,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2, self._get_tweet_id(tweet)) if 'ext_views' in tweet and 'count' in tweet['ext_views']: kwargs['viewCount'] = int(tweet['ext_views']['count']) + if 'vibe' in tweet.get('ext', {}): + kwargs['vibe'] = self._make_vibe(tweet['ext']['vibe']['r']['ok']) return self._make_tweet(tweet, user, **kwargs) def _graphql_timeline_tweet_item_result_to_tweet(self, result): @@ -1342,6 +1359,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL, self._get_tweet_id(tweet)) if 'views' in result and 'count' in result['views']: kwargs['viewCount'] = int(result['views']['count']) + if 'vibe' in result: + kwargs['vibe'] = self._make_vibe(result['vibe']) return self._make_tweet(tweet, user, **kwargs) def _graphql_timeline_instructions_to_tweets(self, instructions, includeConversationThreads = False): From 28f5a45825bc8ec979528950992d6944603121bd Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 06:59:51 +0000 Subject: [PATCH 33/94] Fix empty page counter not getting reset on results --- snscrape/modules/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 1d99068..ca27478 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -808,6 +808,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if self._maxEmptyPages and emptyPages >= self._maxEmptyPages: _logger.warning(f'Stopping after {emptyPages} empty pages') break + else: + emptyPages = 0 if not newCursor or (stopOnEmptyResponse and tweetCount == 0): # End of pagination if promptCursor is not None: From 3e19f8f84bf7429c88f01b658f73846b8664be40 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 07:36:53 
+0000 Subject: [PATCH 34/94] Add support for image_collection_website unified cards --- snscrape/modules/twitter.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index ca27478..2c2d4a9 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -324,11 +324,12 @@ class UnifiedCard(Card): apps: typing.Optional[typing.Dict[UnifiedCardAppKey, typing.List['UnifiedCardApp']]] = None components: typing.Optional[typing.List[UnifiedCardComponentKey]] = None swipeableLayoutSlides: typing.Optional[typing.List['UnifiedCardSwipeableLayoutSlide']] = None + collectionLayoutSlides: typing.Optional[typing.List['UnifiedCardCollectionLayoutSlide']] = None type: typing.Optional[str] = None def __post_init__(self): - if (self.components is None) == (self.swipeableLayoutSlides is None): - raise ValueError('did not get exactly one of components or swipeableLayoutSlides') + if (self.components is not None) + (self.swipeableLayoutSlides is not None) + (self.collectionLayoutSlides is not None) != 1: + raise ValueError('did not get exactly one of components, swipeableLayoutSlides, and collectionLayoutSlides') if self.components and not all(k in self.componentObjects for k in self.components): raise ValueError('missing components') if self.swipeableLayoutSlides and not all(s.mediumComponentKey in self.componentObjects and s.componentKey in self.componentObjects for s in self.swipeableLayoutSlides): @@ -452,6 +453,12 @@ class UnifiedCardSwipeableLayoutSlide: componentKey: UnifiedCardComponentKey +@dataclasses.dataclass +class UnifiedCardCollectionLayoutSlide: + detailsComponentKey: UnifiedCardComponentKey + mediumComponentKey: UnifiedCardComponentKey + + @dataclasses.dataclass class Vibe: text: str @@ -1190,6 +1197,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): 'image_app', 'image_carousel_app', 'image_carousel_website', + 'image_collection_website', 
'image_multi_dest_carousel_website', 'image_website', 'mixed_media_multi_dest_carousel_website', @@ -1300,10 +1308,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['components'] = o['components'] if 'layout' in o: - if o['layout']['type'] != 'swipeable': + if o['layout']['type'] == 'swipeable': + kwargs['swipeableLayoutSlides'] = [UnifiedCardSwipeableLayoutSlide(mediumComponentKey = v[0], componentKey = v[1]) for v in o['layout']['data']['slides']] + elif o['layout']['type'] == 'collection': + kwargs['collectionLayoutSlides'] = [UnifiedCardCollectionLayoutSlide(detailsComponentKey = v[0], mediumComponentKey = v[1]) for v in o['layout']['data']['slides']] + else: _logger.warning(f'Unsupported unified_card layout type on tweet {tweetId}: {o["layout"]["type"]!r}') return - kwargs['swipeableLayoutSlides'] = [UnifiedCardSwipeableLayoutSlide(mediumComponentKey = v[0], componentKey = v[1]) for v in o['layout']['data']['slides']] return UnifiedCard(**kwargs) From cbeb65d5c9b47cdf1b43cae2a26dec3e9781e74d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 07:57:31 +0000 Subject: [PATCH 35/94] Fix KeyError crash on some tweets with AmplifyCards Fixes #601 --- snscrape/modules/twitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 2c2d4a9..0cafde9 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -119,8 +119,8 @@ class Photo(Medium): @dataclasses.dataclass class VideoVariant: - contentType: str url: str + contentType: typing.Optional[str] bitrate: typing.Optional[int] @@ -1165,7 +1165,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): id = bindingValues['amplify_content_id'], video = Video( thumbnailUrl = bindingValues['player_image'], - variants = [VideoVariant(contentType = bindingValues['player_stream_content_type'], url = bindingValues['amplify_url_vmap'], bitrate = None)], + variants = [VideoVariant(url = 
bindingValues['amplify_url_vmap'], contentType = bindingValues.get('player_stream_content_type'), bitrate = None)], ), ) elif cardName == 'appplayer': From e846a6a4cdece7090de750d71b5db58e763975bd Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 08:06:57 +0000 Subject: [PATCH 36/94] Fix KeyError in card user handling --- snscrape/modules/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 0cafde9..b147248 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1020,7 +1020,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): assert userId not in userRefs userRefs[userId] = self._user_to_user(o) elif apiType is _TwitterAPIType.GRAPHQL: - for o in card['legacy'].get('user_refs', {}): + for o in card['legacy'].get('user_refs_results', []): + o = o['result'] userId = int(o['rest_id']) if userId in userRefs: _logger.warning(f'Duplicate user {userId} in card on tweet {tweetId}') From 238bdcd56069cc8ee63a3523fca3437d531a7a04 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 08:28:52 +0000 Subject: [PATCH 37/94] Reduce warnings about duplicate users on cards --- snscrape/modules/twitter.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index b147248..62a5a01 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1023,13 +1023,15 @@ class _TwitterAPIScraper(snscrape.base.Scraper): for o in card['legacy'].get('user_refs_results', []): o = o['result'] userId = int(o['rest_id']) - if userId in userRefs: - _logger.warning(f'Duplicate user {userId} in card on tweet {tweetId}') - continue if 'legacy' in o: - userRefs[userId] = self._user_to_user(o['legacy'], id_ = userId) + user = self._user_to_user(o['legacy'], id_ = userId) else: - userRefs[userId] = UserRef(id = userId) + user = UserRef(id = userId) + 
if userId in userRefs: + if userRefs[userId] != user: + _logger.warning(f'Duplicate user {userId} with differing data in card on tweet {tweetId}') + continue + userRefs[userId] = user if apiType is _TwitterAPIType.V2: messyBindingValues = card['binding_values'].items() From 27374285a257369e3d2c614212ec2c875b720c77 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 08:32:02 +0000 Subject: [PATCH 38/94] Fix crash on missing source label data This data had been announced in mid-November to disappear but was still always returned by the API until very recently. --- snscrape/modules/twitter.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 62a5a01..3aa36ed 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -69,7 +69,7 @@ class Tweet(snscrape.base.Item): quoteCount: int conversationId: int lang: str - source: str + source: typing.Optional[str] = None sourceUrl: typing.Optional[str] = None sourceLabel: typing.Optional[str] = None links: typing.Optional[typing.List['TextLink']] = None @@ -899,11 +899,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['quoteCount'] = tweet['quote_count'] kwargs['conversationId'] = tweet['conversation_id'] if 'conversation_id' in tweet else int(tweet['conversation_id_str']) kwargs['lang'] = tweet['lang'] - kwargs['source'] = tweet['source'] - if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): - kwargs['sourceUrl'] = match.group(1) - if (match := re.search(r'>([^<]*)<', tweet['source'])): - kwargs['sourceLabel'] = match.group(1) + if 'source' in tweet: + kwargs['source'] = tweet['source'] + if (match := re.search(r'href=[\'"]?([^\'" >]+)', tweet['source'])): + kwargs['sourceUrl'] = match.group(1) + if (match := re.search(r'>([^<]*)<', tweet['source'])): + kwargs['sourceLabel'] = match.group(1) if 'extended_entities' in tweet and 'media' in tweet['extended_entities']: 
media = [] for medium in tweet['extended_entities']['media']: From bf0e720b5a767d395541e9e6b5cdb648c1b72e3c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 09:01:15 +0000 Subject: [PATCH 39/94] Fix crash on empty tweet entries in timelines Fixes #620 --- snscrape/modules/twitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 3aa36ed..a17f19b 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1387,6 +1387,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): for entry in instruction['entries']: if entry['entryId'].startswith('tweet-'): if entry['content']['entryType'] == 'TimelineTimelineItem' and entry['content']['itemContent']['itemType'] == 'TimelineTweet': + if 'result' not in entry['content']['itemContent']['tweet_results']: + _logger.warning(f'Skipping empty tweet entry {entry["entryId"]}') + continue yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) else: logger.warning('Got unrecognised timeline tweet item(s)') From 564a5eca7704aeb0e3ab47d5c42349c8972c5d83 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 09:12:16 +0000 Subject: [PATCH 40/94] Fix crash on unavailable users in cards --- snscrape/modules/twitter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index a17f19b..1c41b07 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1023,6 +1023,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): elif apiType is _TwitterAPIType.GRAPHQL: for o in card['legacy'].get('user_refs_results', []): o = o['result'] + if o['__typename'] == 'UserUnavailable': + _logger.warning(f'Unavailable user in card on tweet {tweetId}') + continue userId = int(o['rest_id']) if 'legacy' in o: user = self._user_to_user(o['legacy'], id_ = userId) @@ -1056,7 
+1059,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): elif value['type'] == 'BOOLEAN': bindingValues[key] = value['boolean_value'] elif value['type'] == 'USER': - bindingValues[key] = userRefs[int(value['user_value']['id_str'])] + userId = int(value['user_value']['id_str']) + bindingValues[key] = userRefs.get(userId) + if bindingValues[key] is None: + _logger.warning(f'User {userId} not found in user refs in card on tweet {tweetId}') else: _logger.warning(f'Unsupported card value type on {key!r} on tweet {tweetId}: {value["type"]!r}') From d81d247a877888f0a78f3679699295628decd0d3 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 10:07:58 +0000 Subject: [PATCH 41/94] Port Reddit scraper to new Pushshift API Fixes #619 --- snscrape/modules/reddit.py | 58 +++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py index 55af939..f93b96e 100644 --- a/snscrape/modules/reddit.py +++ b/snscrape/modules/reddit.py @@ -133,6 +133,21 @@ class _RedditPushshiftScraper(snscrape.base.Scraper): return cls(**kwargs) + def _iter_api(self, url, params = None): + '''Iterate through the Pushshift API using the 'until' parameter and yield the items.''' + lowestIdSeen = None + if params is None: + params = {} + while True: + obj = self._get_api(url, params = params) + if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination + break + for d in obj['data']: + if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1: + yield self._api_obj_to_item(d) + lowestIdSeen = d['id'] + params['until'] = obj["data"][-1]["created_utc"] + 1 + class _RedditPushshiftSearchScraper(_RedditPushshiftScraper): def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs): @@ -148,35 +163,20 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper): if 
not self._submissions and not self._comments: raise ValueError('At least one of submissions and comments must be True') - def _iter_api(self, url, params = None): - '''Iterate through the Pushshift API using the 'before' parameter and yield the items.''' - lowestIdSeen = None - if params is None: - params = {} - if self._before is not None: - params['before'] = self._before - if self._after is not None: - params['after'] = self._after - params['sort'] = 'desc' - while True: - obj = self._get_api(url, params = params) - if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination - break - for d in obj['data']: - if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1: - yield self._api_obj_to_item(d) - lowestIdSeen = d['id'] - params['before'] = obj["data"][-1]["created_utc"] + 1 - def _iter_api_submissions_and_comments(self, params: dict): # Retrieve both submissions and comments, interleave the results to get a reverse-chronological order - params['size'] = '1000' + params['limit'] = '1000' + if self._before is not None: + params['until'] = self._before + if self._after is not None: + params['since'] = self._after + if self._submissions: - submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict + submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict else: submissionsIter = iter(()) if self._comments: - commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy()) + commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment', params.copy()) else: commentsIter = iter(()) @@ -260,21 +260,15 @@ class RedditSubmissionScraper(_RedditPushshiftScraper): 
self._submissionId = submissionId def get_items(self): - obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}') + obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission?ids={self._submissionId}') if not obj['data']: return if len(obj['data']) != 1: raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1') yield self._api_obj_to_item(obj['data'][0]) - obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}') - if not obj['data']: - return - commentIds = obj['data'] - for i in range(0, len(commentIds), 500): - ids = commentIds[i : i + 500] - obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}') - yield from map(self._api_obj_to_item, obj['data']) + # Upstream bug: link_id must be provided in decimal https://old.reddit.com/r/pushshift/comments/zkggt0/update_on_colo_switchover_bug_fixes_reindexing/ + yield from self._iter_api('https://api.pushshift.io/reddit/search/comment', {'link_id': int(self._submissionId, 36), 'limit': 1000}) @classmethod def _cli_setup_parser(cls, subparser): From 1fb5c39168d84c3a967ce5409a9e4666d81db3ba Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 10:12:39 +0000 Subject: [PATCH 42/94] Add Python 3.11 classifier --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c026667..b4e484e 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ setuptools.setup( 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], packages = ['snscrape', 'snscrape.modules'], setup_requires = ['setuptools_scm'], From 8ad26fc7d1349fabf79ab22c7d3db3f05c67f5c6 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 13 Jan 2023 18:52:03 +0000 Subject: [PATCH 43/94] Switch from setup.py to pyproject.toml --- pyproject.toml | 
37 +++++++++++++++++++++++++++++++++++++ setup.py | 43 ------------------------------------------- 2 files changed, 37 insertions(+), 43 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ccf5bd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ['setuptools>=61', 'setuptools_scm>=6.2'] +build-backend = 'setuptools.build_meta' + +[tool.setuptools] +packages = ['snscrape', 'snscrape.modules'] + +[tool.setuptools_scm] + +[project] +name = 'snscrape' +description = 'A social networking service scraper' +readme = 'README.md' +authors = [{name = 'JustAnotherArchivist'}] +classifiers = [ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', +] +dependencies = [ + 'requests[socks]', + 'lxml', + 'beautifulsoup4', + 'pytz; python_version < "3.9.0"', + 'filelock', +] +requires-python = '~=3.8' +dynamic = ['version'] + +[project.urls] +repository = "https://github.com/JustAnotherArchivist/snscrape" + +[project.scripts] +snscrape = 'snscrape._cli:main' diff --git a/setup.py b/setup.py deleted file mode 100644 index b4e484e..0000000 --- a/setup.py +++ /dev/null @@ -1,43 +0,0 @@ -import os.path -import setuptools - - -with open(os.path.join(os.path.dirname(__file__), 'README.md')) as fp: - readme = fp.read() - - -setuptools.setup( - name = 'snscrape', - description = 'A social networking service scraper', - long_description = readme, - long_description_content_type = 'text/markdown', - author = 'JustAnotherArchivist', - url = 'https://github.com/JustAnotherArchivist/snscrape', - classifiers = [ - 'Development Status :: 4 - Beta', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', - 
'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - ], - packages = ['snscrape', 'snscrape.modules'], - setup_requires = ['setuptools_scm'], - use_scm_version = True, - install_requires = [ - 'requests[socks]', - 'lxml', - 'beautifulsoup4', - 'pytz; python_version < "3.9.0"', - 'filelock', - ], - python_requires = '~=3.8', - extras_require = { - 'test': ['coverage'], - }, - entry_points = { - 'console_scripts': [ - 'snscrape = snscrape._cli:main', - ], - }, -) From b7cb270b6e52abd7f31a3162b478f1b8fca5f595 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 15 Jan 2023 12:31:28 +0000 Subject: [PATCH 44/94] Fix crash on empty user objects --- snscrape/modules/twitter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 1c41b07..56a49d7 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1022,6 +1022,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): userRefs[userId] = self._user_to_user(o) elif apiType is _TwitterAPIType.GRAPHQL: for o in card['legacy'].get('user_refs_results', []): + if 'result' not in o: + _logger.warning(f'Empty user ref object in card on tweet {tweetId}') + continue o = o['result'] if o['__typename'] == 'UserUnavailable': _logger.warning(f'Unavailable user in card on tweet {tweetId}') @@ -1579,7 +1582,8 @@ class TwitterUserScraper(TwitterSearchScraper): endpoint = 'https://twitter.com/i/api/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId' variables = {fieldName: str(self._user), 'withSafetyModeUserFields': True, 'withSuperFollowsUserFields': True} obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = {'variables': variables}) - if not obj['data'] or obj['data']['user']['result']['__typename'] == 'UserUnavailable': + if not obj['data'] or 'result' not in obj['data']['user'] or 
obj['data']['user']['result']['__typename'] == 'UserUnavailable': + _logger.warning('Empty response or unavailable user') return None user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] From 5d3f27bc2b8c5912557c07b379505cb9faedd0f8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 15 Jan 2023 16:36:04 +0000 Subject: [PATCH 45/94] Fix title-less BroadcastCard crash --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 56a49d7..3d4df08 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -220,7 +220,7 @@ class PromoConvoAction: class BroadcastCard(Card): id: str url: str - title: str + title: typing.Optional[str] = None state: typing.Optional[str] = None broadcaster: typing.Optional['User'] = None thumbnailUrl: typing.Optional[str] = None From d0fb9ab8a9499120b16c72a53a754983db969eb8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 16 Jan 2023 02:39:05 +0000 Subject: [PATCH 46/94] Log TLS connection details for debugging --- snscrape/base.py | 30 ++++++++++++++++++++++++++++++ snscrape/modules/twitter.py | 4 ++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index 9be17d8..80b0045 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -6,6 +6,8 @@ import functools import json import logging import requests +import requests.adapters +import urllib3.connection import time import warnings @@ -130,6 +132,33 @@ class URLItem(Item): return self._url +class _HTTPSAdapter(requests.adapters.HTTPAdapter): + def init_poolmanager(self, *args, **kwargs): + super().init_poolmanager(*args, **kwargs) + #FIXME: Uses private urllib3.PoolManager attribute pool_classes_by_scheme. 
+ try: + self.poolmanager.pool_classes_by_scheme['https'].ConnectionCls = _HTTPSConnection + except (AttributeError, KeyError) as e: + logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}') + + +class _HTTPSConnection(urllib3.connection.HTTPSConnection): + def connect(self, *args, **kwargs): + conn = super().connect(*args, **kwargs) + #FIXME: Uses undocumented attribute self.sock and beyond. + try: + logger.debug(f'Connected to: {self.sock.getpeername()}') + except AttributeError: + # self.sock might be a urllib3.util.ssltransport.SSLTransport, which lacks getpeername. + pass + try: + logger.debug(f'Connection cipher: {self.sock.cipher()}') + except AttributeError: + # Shouldn't be possible, but better safe than sorry. + pass + return conn + + class ScraperException(Exception): pass @@ -143,6 +172,7 @@ class Scraper: self._retries = retries self._proxies = proxies self._session = requests.Session() + self._session.mount('https://', _HTTPSAdapter()) @abc.abstractmethod def get_items(self): diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 3d4df08..1f57887 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -635,11 +635,11 @@ class _CLIGuestTokenManager(GuestTokenManager): pass -class _TwitterTLSAdapter(requests.adapters.HTTPAdapter): +class _TwitterTLSAdapter(snscrape.base._HTTPSAdapter): def init_poolmanager(self, *args, **kwargs): #FIXME: When urllib3 2.0.0 is out and can be required, this should use urllib3.util.create_urllib3_context instead of the private, undocumented ssl_ module. 
kwargs['ssl_context'] = urllib3.util.ssl_.create_urllib3_context(ciphers = _CIPHERS_CHROME) - return super().init_poolmanager(*args, **kwargs) + super().init_poolmanager(*args, **kwargs) class _TwitterAPIType(enum.Enum): From 49270f6d3ae1831c3523cad5038af5e04f8b3d5e Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 16 Jan 2023 03:47:46 +0000 Subject: [PATCH 47/94] Fix debug messages for redirects to report the correct status code and redirect location --- snscrape/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/base.py b/snscrape/base.py index 80b0045..7002a7a 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -221,7 +221,7 @@ class Scraper: logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') if r.history: for i, redirect in enumerate(r.history): - logger.debug(f'... request {i}: {redirect.request.url}: {r.status_code} (Location: {r.headers.get("Location")})') + logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})') if responseOkCallback is not None: success, msg = responseOkCallback(r) errors.append(msg) From 36e85c54c17cbc650dcc7bb143b350e822cba6d2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 16 Jan 2023 03:48:21 +0000 Subject: [PATCH 48/94] Log response headers for debugging --- snscrape/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/base.py b/snscrape/base.py index 7002a7a..452c018 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -219,9 +219,11 @@ class Scraper: else: redirected = f' (redirected to {r.url})' if r.history else '' logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') + logger.debug(f'... with response headers: {r.headers!r}') if r.history: for i, redirect in enumerate(r.history): logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})') + logger.debug(f'... ... 
with response headers: {redirect.headers!r}') if responseOkCallback is not None: success, msg = responseOkCallback(r) errors.append(msg) From b515a66b937276eafff41cf6b897c53ecf3d54a2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 19 Jan 2023 16:18:15 +0000 Subject: [PATCH 49/94] Fix crash in recursive tweet scraping Introduced by 3e297c9a Fixes #684 --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 1f57887..e961753 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1813,7 +1813,7 @@ class TwitterTweetScraper(_TwitterAPIScraper): queue.append(self._tweetId) while queue: tweetId = queue.popleft() - thisPagParams = copy.deepcopy(paginationVariables) + thisPagParams = copy.deepcopy(paginationParams) thisPagParams['variables']['focalTweetId'] = str(tweetId) thisParams = copy.deepcopy(thisPagParams) del thisPagParams['variables']['cursor'], thisPagParams['variables']['referrer'] From 35c0c32c38dd7ffa9e6a10537f89095fcb485829 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 2 Feb 2023 21:02:16 +0000 Subject: [PATCH 50/94] Refine bug report template --- .github/ISSUE_TEMPLATE/bug_report.yml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index a514e69..40cfdb1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -26,7 +26,7 @@ body: validations: required: true attributes: - label: Expected behavior + label: Expected behaviour description: A brief description of what should happen. - type: textarea attributes: @@ -37,20 +37,25 @@ body: validations: required: true attributes: - label: OS / Distro + label: Operating system description: Include the version too, please! placeholder: e.g. Windows 10, Ubuntu 20.04, macOS 10.15... 
- type: input validations: required: true attributes: - label: Output from `snscrape --version` + label: Python version: output of `python3 --version` + - type: input + validations: + required: true + attributes: + label: snscrape version: output of `snscrape --version` - type: input validations: required: true attributes: label: Scraper - placeholder: e.g. twitter-user, reddit-search,... + placeholder: e.g. twitter-user, reddit-search, TwitterSearchScraper, ... - type: textarea validations: required: false @@ -62,7 +67,7 @@ body: required: false attributes: label: Dump of locals - description: | + description: | Here attach the dump of your snscrape locals, if it's a crash. (snscrape should tell you the path). Please note that it may contain identifying info such as IP address, if the website returns that. You can also optionally request to exchange the file in private. @@ -72,7 +77,7 @@ body: required: true attributes: label: How are you using snscrape? - options: ['CLI', 'Module'] + options: ['CLI (`snscrape ...` as a command, e.g. 
in a terminal)', 'Module (`import snscrape.modules.something` in Python code)'] - type: textarea attributes: label: Additional context From 23ebdd2a3ce6c3e93012e2b5bc7c2b02c749aaf2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 2 Feb 2023 21:03:52 +0000 Subject: [PATCH 51/94] Fix YAML syntax --- .github/ISSUE_TEMPLATE/bug_report.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 40cfdb1..fc50dc7 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -44,12 +44,14 @@ body: validations: required: true attributes: - label: Python version: output of `python3 --version` + label: | + Python version: output of `python3 --version` - type: input validations: required: true attributes: - label: snscrape version: output of `snscrape --version` + label: | + snscrape version: output of `snscrape --version` - type: input validations: required: true From d60ce38b6a2d8aa55f29502f5517d47c1d9e4a5c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 10 Feb 2023 02:39:06 +0000 Subject: [PATCH 52/94] Make (most) consistency errors in unified cards non-fatal Fixes #703 --- snscrape/modules/twitter.py | 53 +++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index e961753..fc3abdc 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -330,28 +330,6 @@ class UnifiedCard(Card): def __post_init__(self): if (self.components is not None) + (self.swipeableLayoutSlides is not None) + (self.collectionLayoutSlides is not None) != 1: raise ValueError('did not get exactly one of components, swipeableLayoutSlides, and collectionLayoutSlides') - if self.components and not all(k in self.componentObjects for k in self.components): - raise ValueError('missing components') - if 
self.swipeableLayoutSlides and not all(s.mediumComponentKey in self.componentObjects and s.componentKey in self.componentObjects for s in self.swipeableLayoutSlides): - raise ValueError('missing components') - if any(c.destinationKey not in self.destinations for c in self.componentObjects.values() if hasattr(c, 'destinationKey')): - raise ValueError('missing destinations') - if any(b.destinationKey not in self.destinations for c in self.componentObjects.values() if isinstance(c, UnifiedCardButtonGroupComponentObject) for b in c.buttons): - raise ValueError('missing destinations') - mediaKeys = [] - for c in self.componentObjects.values(): - if isinstance(c, UnifiedCardMediumComponentObject): - mediaKeys.append(c.mediumKey) - elif isinstance(c, UnifiedCardSwipeableMediaComponentObject): - mediaKeys.extend(x.mediumKey for x in c.media) - mediaKeys.extend(d.mediumKey for d in self.destinations.values() if d.mediumKey is not None) - mediaKeys.extend(a.iconMediumKey for l in (self.apps.values() if self.apps is not None else []) for a in l if a.iconMediumKey is not None) - if any(k not in self.media for k in mediaKeys): - raise ValueError('missing media') - if any(c.appKey not in self.apps for c in self.componentObjects.values() if hasattr(c, 'appKey')): - raise ValueError('missing apps') - if any(d.appKey not in self.apps for d in self.destinations.values() if d.appKey is not None): - raise ValueError('missing apps') class UnifiedCardComponentObject: @@ -1329,7 +1307,36 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _logger.warning(f'Unsupported unified_card layout type on tweet {tweetId}: {o["layout"]["type"]!r}') return - return UnifiedCard(**kwargs) + card = UnifiedCard(**kwargs) + + # Consistency checks + missingParts = set() + if card.components and not all(k in card.componentObjects for k in card.components): + missingParts.add('components') + if card.swipeableLayoutSlides and not all(s.mediumComponentKey in card.componentObjects and s.componentKey in 
card.componentObjects for s in card.swipeableLayoutSlides): + missingParts.add('components') + if any(c.destinationKey not in card.destinations for c in card.componentObjects.values() if hasattr(c, 'destinationKey')): + missingParts.add('destinations') + if any(b.destinationKey not in card.destinations for c in card.componentObjects.values() if isinstance(c, UnifiedCardButtonGroupComponentObject) for b in c.buttons): + missingParts.add('destinations') + mediaKeys = [] + for c in card.componentObjects.values(): + if isinstance(c, UnifiedCardMediumComponentObject): + mediaKeys.append(c.mediumKey) + elif isinstance(c, UnifiedCardSwipeableMediaComponentObject): + mediaKeys.extend(x.mediumKey for x in c.media) + mediaKeys.extend(d.mediumKey for d in card.destinations.values() if d.mediumKey is not None) + mediaKeys.extend(a.iconMediumKey for l in (card.apps.values() if card.apps is not None else []) for a in l if a.iconMediumKey is not None) + if any(k not in card.media for k in mediaKeys): + missingParts.add('media') + if any(c.appKey not in card.apps for c in card.componentObjects.values() if hasattr(c, 'appKey')): + missingParts.add('apps') + if any(d.appKey not in card.apps for d in card.destinations.values() if d.appKey is not None): + missingParts.add('apps') + if missingParts: + _logger.warning(f'Consistency errors in unified card on tweet {tweetId}: missing {", ".join(missingParts)}') + + return card _logger.warning(f'Unsupported card type on tweet {tweetId}: {cardName!r}') From 0933a30e3741e514efc3457510a1b300ff03ffbf Mon Sep 17 00:00:00 2001 From: quentinwolf <18410863+quentinwolf@users.noreply.github.com> Date: Mon, 13 Feb 2023 16:45:44 -0700 Subject: [PATCH 53/94] change fullUrl to use 'orig' instead of 'large' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changing fullUrl from '&name=large' to '&name=orig' since large is capped at half the resolution of orig which may not be ideal for scraping/archiving. 
Large images are 2048px x 1365px Original images are up to 4096px × 2730px Alternatively one could add largeUrl as an alternative to download the Large image and utilize fullUrl as above to download the original image for those that do wish to save either version, but I feel there is no reason for saving the middle-resolution image. --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index fc3abdc..5def66f 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -957,7 +957,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): return mKwargs = { 'previewUrl': f'{baseUrl}?format={format}&name=small', - 'fullUrl': f'{baseUrl}?format={format}&name=large', + 'fullUrl': f'{baseUrl}?format={format}&name=orig', } if medium.get('ext_alt_text'): mKwargs['altText'] = medium['ext_alt_text'] From 8709282ba0fa18efa8b376c0ab161b7404a2fce9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Feb 2023 02:51:47 +0000 Subject: [PATCH 54/94] Add deprecated properties to JSON Cf. 
#611 --- snscrape/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index 452c018..bac500c 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -15,6 +15,10 @@ import warnings logger = logging.getLogger(__name__) +class DeprecatedPropertyAccessWarning(FutureWarning): + pass + + class _DeprecatedProperty: def __init__(self, name, repl, replStr): self.name = name @@ -24,7 +28,7 @@ class _DeprecatedProperty: def __get__(self, obj, objType): if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class: return self - warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', FutureWarning, stacklevel = 2) + warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedPropertyAccessWarning, stacklevel = 2) return self.repl(obj) @@ -45,9 +49,9 @@ def _json_dataclass_to_dict(obj): if field.name.startswith('_'): continue out[field.name] = _json_dataclass_to_dict(getattr(obj, field.name)) - # Add in (non-deprecated) properties + # Add properties for k in dir(obj): - if isinstance(getattr(type(obj), k, None), property): + if isinstance(getattr(type(obj), k, None), (property, _DeprecatedProperty)): assert k != '_type' if k.startswith('_'): continue @@ -70,7 +74,9 @@ class _JSONDataclass: def json(self): '''Convert the object to a JSON string''' - out = _json_dataclass_to_dict(self) + with warnings.catch_warnings(): + warnings.filterwarnings(action = 'ignore', category = DeprecatedPropertyAccessWarning) + out = _json_dataclass_to_dict(self) for key, value in list(out.items()): # Modifying the dict below, so make a copy first if isinstance(value, IntWithGranularity): out[key] = int(value) From fe5d90b748f98357303f22047a86b2128069a22f Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Feb 2023 03:29:39 +0000 Subject: [PATCH 55/94] Fix tweets behind 'Show more replies' button getting missed Fixes #572 
--- snscrape/modules/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 5def66f..8bbe57c 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -776,7 +776,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): newCursor = entryCursor if entryCursorStop is not None: stopOnEmptyResponse = entryCursorStop - elif entry['entryId'].startswith('cursor-showMoreThreadsPrompt-'): # E.g. 'offensive' replies button + elif entry['entryId'].startswith('cursor-showMoreThreadsPrompt-') or entry['entryId'].startswith('cursor-showmorethreads-'): + # E.g. 'offensive' replies and 'Show more replies' button promptCursor = entryCursor elif direction is _ScrollDirection.BOTH and bottomCursorAndStop is None and (entry['entryId'] == 'sq-cursor-bottom' or entry['entryId'].startswith('cursor-bottom-')): newBottomCursorAndStop = (entryCursor, entryCursorStop or False) From 206907612d78d7c8a502c9608d191396d8141559 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Feb 2023 05:12:47 +0000 Subject: [PATCH 56/94] Fix double dump on exceptions with --dump-locals --- snscrape/_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snscrape/_cli.py b/snscrape/_cli.py index d4b66ea..9f846b8 100644 --- a/snscrape/_cli.py +++ b/snscrape/_cli.py @@ -23,7 +23,7 @@ logger = logging # Replaced below after setting the logger class class Logger(logging.Logger): def _log_with_stack(self, level, *args, **kwargs): super().log(level, *args, **kwargs) - if dumpLocals: + if dumpLocals and not kwargs.get('extra', {}).get('_snscrapeSuppressDumpLocals', False): stack = inspect.stack() if len(stack) >= 3: name = _dump_stack_and_locals(stack[2:][::-1]) @@ -118,7 +118,7 @@ def _dump_locals_on_exception(): trace = inspect.trace() if len(trace) >= 2: name = _dump_stack_and_locals(trace[1:], exc = e) - logger.fatal(f'Dumped stack and locals to {name}') + 
logger.fatal(f'Dumped stack and locals to {name}', extra = {'_snscrapeSuppressDumpLocals': True}) raise From c65e36a094aba2d86bd32fce780968e22a59a6c9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Feb 2023 05:59:42 +0000 Subject: [PATCH 57/94] Bump GraphQL endpoints --- snscrape/modules/twitter.py | 43 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8bbe57c..4850b66 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -746,10 +746,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): elif apiType is _TwitterAPIType.GRAPHQL: if 'user' in obj['data']: # UserTweets, UserTweetsAndReplies - instructions = obj['data']['user']['result']['timeline']['timeline']['instructions'] + instructions = obj['data']['user']['result']['timeline_v2']['timeline']['instructions'] else: # TweetDetail - instructions = obj['data'].get('threaded_conversation_with_injections', {}).get('instructions', []) + instructions = obj['data'].get('threaded_conversation_with_injections_v2', {}).get('instructions', []) tweetCount = 0 for instruction in instructions: if 'addEntries' in instruction: @@ -1682,22 +1682,25 @@ class TwitterProfileScraper(TwitterUserScraper): 'withReactionsPerspective': False, 'withSuperFollowsTweetFields': True, 'withVoice': True, - 'withV2Timeline': False, + 'withV2Timeline': True, } variables = paginationVariables.copy() del variables['cursor'] features = { 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': False, 'verified_phone_label_enabled': False, 'responsive_web_graphql_timeline_navigation_enabled': True, - 'view_counts_public_visibility_enabled': True, - 'view_counts_everywhere_api_enabled': True, - 'longform_notetweets_consumption_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 
'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, 'tweetypie_unmention_optimization_enabled': True, - 'responsive_web_uc_gql_enabled': True, 'vibe_api_enabled': True, 'responsive_web_edit_tweet_api_enabled': True, 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': False, 'standardized_nudges_misinfo': True, 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, 'interactive_text_enabled': True, @@ -1709,8 +1712,8 @@ class TwitterProfileScraper(TwitterUserScraper): paginationParams = {'variables': paginationVariables, 'features': features} gotPinned = False - for obj in self._iter_api_data('https://twitter.com/i/api/graphql/W3HCLclD2VauuL6RcQm9MA/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, params, paginationParams): - instructions = obj['data']['user']['result']['timeline']['timeline']['instructions'] + for obj in self._iter_api_data('https://twitter.com/i/api/graphql/nrdle2catTyGnTyj1Qa7wA/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, params, paginationParams): + instructions = obj['data']['user']['result']['timeline_v2']['timeline']['instructions'] if not gotPinned: for instruction in instructions: if instruction['type'] == 'TimelinePinEntry': @@ -1766,29 +1769,31 @@ class TwitterTweetScraper(_TwitterAPIScraper): 'includePromotedContent': True, 'withCommunity': True, 'withQuickPromoteEligibilityTweetFields': True, - 'withBirdwatchNotes': True, + 'withBirdwatchNotes': False, 'withSuperFollowsUserFields': True, 'withDownvotePerspective': False, 'withReactionsMetadata': False, 'withReactionsPerspective': False, 'withSuperFollowsTweetFields': True, 'withVoice': True, - 'withV2Timeline': False, + 'withV2Timeline': True, } variables = paginationVariables.copy() del variables['cursor'], variables['referrer'] 
features = { 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': False, 'verified_phone_label_enabled': False, 'responsive_web_graphql_timeline_navigation_enabled': True, - 'view_counts_public_visibility_enabled': True, - 'view_counts_everywhere_api_enabled': True, - 'longform_notetweets_consumption_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, 'tweetypie_unmention_optimization_enabled': True, - 'responsive_web_uc_gql_enabled': True, 'vibe_api_enabled': True, 'responsive_web_edit_tweet_api_enabled': True, 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': False, 'standardized_nudges_misinfo': True, 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, 'interactive_text_enabled': True, @@ -1798,12 +1803,12 @@ class TwitterTweetScraper(_TwitterAPIScraper): params = {'variables': variables, 'features': features} paginationParams = {'variables': paginationVariables, 'features': features} - url = 'https://twitter.com/i/api/graphql/HQ_gjq7zDNvSiJOCSkwUEw/TweetDetail' + url = 'https://twitter.com/i/api/graphql/NNiD2K-nEYUfXlMwGCocMQ/TweetDetail' if self._mode is TwitterTweetScraperMode.SINGLE: obj = self._get_api_data(url, _TwitterAPIType.GRAPHQL, params = params) if not obj['data']: return - for instruction in obj['data']['threaded_conversation_with_injections']['instructions']: + for instruction in obj['data']['threaded_conversation_with_injections_v2']['instructions']: if instruction['type'] != 'TimelineAddEntries': continue for entry in instruction['entries']: @@ -1814,7 +1819,7 @@ class TwitterTweetScraper(_TwitterAPIScraper): for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, params, paginationParams, direction = 
_ScrollDirection.BOTH): if not obj['data']: continue - yield from self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections']['instructions'], includeConversationThreads = True) + yield from self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections_v2']['instructions'], includeConversationThreads = True) elif self._mode is TwitterTweetScraperMode.RECURSE: seenTweets = set() queue = collections.deque() @@ -1828,7 +1833,7 @@ class TwitterTweetScraper(_TwitterAPIScraper): for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, thisParams, thisPagParams, direction = _ScrollDirection.BOTH): if not obj['data']: continue - for tweet in self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections']['instructions'], includeConversationThreads = True): + for tweet in self._graphql_timeline_instructions_to_tweets(obj['data']['threaded_conversation_with_injections_v2']['instructions'], includeConversationThreads = True): if tweet.id not in seenTweets: yield tweet seenTweets.add(tweet.id) From 71fb33af700455339a9e1fde92910595a8a3d4a1 Mon Sep 17 00:00:00 2001 From: Ali Madihi Date: Mon, 20 Feb 2023 22:14:34 +0330 Subject: [PATCH 58/94] fix: telegram channel members count --- snscrape/modules/telegram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 11cc19b..b49c186 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -151,7 +151,7 @@ class TelegramChannelScraper(snscrape.base.Scraper): raise snscrape.base.ScraperException(f'Got status code {r.status_code}') soup = bs4.BeautifulSoup(r.text, 'lxml') membersDiv = soup.find('div', class_ = 'tgme_page_extra') - if membersDiv.text.endswith(' members'): + if membersDiv.text.endswith(' subscribers'): kwargs['members'] = int(membersDiv.text[:-8].replace(' ', '')) kwargs['photo'] = soup.find('img', class_ = 
'tgme_page_photo_image').attrs['src'] From 280b972f22d9e5b8212f58af735b6b41236082a8 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sun, 19 Feb 2023 06:26:35 +0000 Subject: [PATCH 59/94] Fix extraction of tweets behind 'offensive' replies button --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 4850b66..01951c2 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -776,7 +776,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): newCursor = entryCursor if entryCursorStop is not None: stopOnEmptyResponse = entryCursorStop - elif entry['entryId'].startswith('cursor-showMoreThreadsPrompt-') or entry['entryId'].startswith('cursor-showmorethreads-'): + elif entry['entryId'].startswith('cursor-showmorethreadsprompt-') or entry['entryId'].startswith('cursor-showmorethreads-'): # E.g. 'offensive' replies and 'Show more replies' button promptCursor = entryCursor elif direction is _ScrollDirection.BOTH and bottomCursorAndStop is None and (entry['entryId'] == 'sq-cursor-bottom' or entry['entryId'].startswith('cursor-bottom-')): From b5694e01a2a92065422bc5b804d34943b5b57ed2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 03:57:19 +0000 Subject: [PATCH 60/94] Fix logger typo --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 01951c2..5df81e6 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1409,7 +1409,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): continue yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) else: - logger.warning('Got unrecognised timeline tweet item(s)') + _logger.warning('Got unrecognised timeline tweet item(s)') elif includeConversationThreads and 
entry['entryId'].startswith('conversationthread-'): #TODO show more cursor? for item in entry['content']['items']: if item['entryId'].startswith(f'{entry["entryId"]}-tweet-'): From 9a2f1524c285425b1b822363b27b9406eb3453a4 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 04:03:30 +0000 Subject: [PATCH 61/94] Remove dead code --- snscrape/modules/twitter.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 5df81e6..be2cdfb 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -818,7 +818,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _count_tweets(self, entries): return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') for entry in entries) - def _v2_timeline_instructions_to_tweets(self, obj, includeConversationThreads = False): + def _v2_timeline_instructions_to_tweets(self, obj): # No data format test, just a hard and loud crash if anything's wrong :-) for instruction in obj['timeline']['instructions']: if 'addEntries' in instruction: @@ -830,10 +830,6 @@ class _TwitterAPIScraper(snscrape.base.Scraper): for entry in entries: if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'): yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj) - elif includeConversationThreads and entry['entryId'].startswith('conversationThread-') and not entry['entryId'].endswith('-show_more_cursor'): - for item in entry['content']['timelineModule']['items']: - if item['entryId'].startswith('tweet-'): - yield from self._v2_instruction_tweet_entry_to_tweet(item['entryId'], item, obj) def _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj): if 'tweet' in entry['item']['content']: @@ -843,13 +839,6 @@ class _TwitterAPIScraper(snscrape.base.Scraper): _logger.warning(f'Skipping tweet {entry["item"]["content"]["tweet"]["id"]} which 
is not in globalObjects') return tweet = obj['globalObjects']['tweets'][entry['item']['content']['tweet']['id']] - elif 'tombstone' in entry['item']['content']: - if 'tweet' not in entry['item']['content']['tombstone']: # E.g. deleted reply - return - if entry['item']['content']['tombstone']['tweet']['id'] not in obj['globalObjects']['tweets']: - _logger.warning(f'Skipping tweet {entry["item"]["content"]["tombstone"]["tweet"]["id"]} which is not in globalObjects') - return - tweet = obj['globalObjects']['tweets'][entry['item']['content']['tombstone']['tweet']['id']] else: raise snscrape.base.ScraperException(f'Unable to handle entry {entryId!r}') yield self._tweet_to_tweet(tweet, obj) From 3d6cd63a0011248da81d2118cc35944ca334518d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 04:17:40 +0000 Subject: [PATCH 62/94] Fix more logger typos --- snscrape/modules/twitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index be2cdfb..c4cdf54 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1444,7 +1444,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if user.get('url'): entity = user['entities'].get('url', {}).get('urls', [None])[0] if not entity or entity['url'] != user['url']: - self.logger.warning(f'Link inconsistency on user {kwargs["id"]}') + _logger.warning(f'Link inconsistency on user {kwargs["id"]}') if not entity: entity = {'indices': (0, len(user['url']))} kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices'])) @@ -1589,7 +1589,7 @@ class TwitterUserScraper(TwitterSearchScraper): if user['legacy'].get('url'): entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0] if not entity or entity['url'] != user['legacy']['url']: - self.logger.warning(f'Link inconsistency on user') + _logger.warning(f'Link 
inconsistency on user') if not entity: entity = {'indices': (0, len(user['legacy']['url']))} link = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['legacy']['url']), tcourl = user['legacy']['url'], indices = tuple(entity['indices'])) From 6a6b02cb28817f303758806227ae0d93f806c945 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 04:18:12 +0000 Subject: [PATCH 63/94] Handle tombstones Closes #392 Fixes #603 --- snscrape/modules/twitter.py | 59 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index c4cdf54..e291612 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -102,7 +102,7 @@ class Tweet(snscrape.base.Item): class TextLink: text: typing.Optional[str] url: str - tcourl: str + tcourl: typing.Optional[str] indices: typing.Tuple[int, int] @@ -454,6 +454,18 @@ class TweetRef(snscrape.base.Item): return f'https://twitter.com/i/web/status/{self.id}' +@dataclasses.dataclass +class Tombstone(snscrape.base.Item): + '''A placeholder for a tweet that cannot be accessed''' + + id: int + text: typing.Optional[str] = None + textLinks: typing.Optional[typing.List[TextLink]] = None + + def __str__(self): + return f'https://twitter.com/i/web/status/{self.id}' + + @dataclasses.dataclass class User(snscrape.base.Entity): # Most fields can be None if they're not known. 
@@ -1352,12 +1364,26 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['vibe'] = self._make_vibe(tweet['ext']['vibe']['r']['ok']) return self._make_tweet(tweet, user, **kwargs) - def _graphql_timeline_tweet_item_result_to_tweet(self, result): + def _make_tombstone(self, tweetId, info): + if tweetId is None: + raise snscrape.base.ScraperException('Cannot create tombstone without tweet ID') + if info and (text := info.get('richText', info['text'])): + return Tombstone( + id = tweetId, + text = text['text'], + textLinks = [TextLink(text = text['text'][x['fromIndex']:x['toIndex']], url = x['ref']['url'], tcourl = None, indices = (x['fromIndex'], x['toIndex'])) for x in text['entities']], + ) + else: + return Tombstone(id = tweetId) + + def _graphql_timeline_tweet_item_result_to_tweet(self, result, tweetId = None): if result['__typename'] == 'Tweet': pass elif result['__typename'] == 'TweetWithVisibilityResults': #TODO Include result['softInterventionPivot'] in the Tweet object result = result['tweet'] + elif result['__typename'] == 'TweetTombstone': + return self._make_tombstone(tweetId, result.get('tombstone')) else: raise snscrape.base.ScraperException(f'Unknown result type {result["__typename"]!r}') tweet = result['legacy'] @@ -1365,19 +1391,21 @@ class _TwitterAPIScraper(snscrape.base.Scraper): user = self._user_to_user(result['core']['user_results']['result']['legacy'], id_ = userId) kwargs = {} if 'retweeted_status_result' in tweet: + #TODO Tombstones will cause a crash here. 
kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result']) if 'quoted_status_result' in result: - if result['quoted_status_result']['result']['__typename'] == 'TweetTombstone': - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) - else: - kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result']) - elif 'quotedRefResult' in result: + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) + elif result.get('quotedRefResult'): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str'])) else: + if result['quotedRefResult']['result']['__typename'] != 'Tweet': + _logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef') kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id'])) elif 'quoted_status_id_str' in tweet: - kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) + # Omit the TweetRef if this is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet. 
+ if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'): + kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str'])) if 'card' in result: kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL, self._get_tweet_id(tweet)) if 'views' in result and 'count' in result['views']: @@ -1392,17 +1420,21 @@ class _TwitterAPIScraper(snscrape.base.Scraper): continue for entry in instruction['entries']: if entry['entryId'].startswith('tweet-'): + tweetId = int(entry['entryId'].split('-', 1)[1]) if entry['content']['entryType'] == 'TimelineTimelineItem' and entry['content']['itemContent']['itemType'] == 'TimelineTweet': if 'result' not in entry['content']['itemContent']['tweet_results']: _logger.warning(f'Skipping empty tweet entry {entry["entryId"]}') continue - yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) + yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result'], tweetId = tweetId) else: _logger.warning('Got unrecognised timeline tweet item(s)') elif includeConversationThreads and entry['entryId'].startswith('conversationthread-'): #TODO show more cursor? 
for item in entry['content']['items']: if item['entryId'].startswith(f'{entry["entryId"]}-tweet-'): - yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result']) + tweetId = int(item['entryId'][len(entry['entryId']) + 7:]) + yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) + elif not entry['entryId'].startswith('cursor-'): + _logger.warning(f'Skipping unrecognised entry ID: {entry["entryId"]!r}') def _render_text_with_urls(self, text, urls): if not urls: @@ -1707,7 +1739,8 @@ class TwitterProfileScraper(TwitterUserScraper): for instruction in instructions: if instruction['type'] == 'TimelinePinEntry': gotPinned = True - yield self._graphql_timeline_tweet_item_result_to_tweet(instruction['entry']['content']['itemContent']['tweet_results']['result']) + tweetId = int(instruction['entry']['entryId'][6:]) if instruction['entry']['entryId'].startswith('tweet-') else None + yield self._graphql_timeline_tweet_item_result_to_tweet(instruction['entry']['content']['itemContent']['tweet_results']['result'], tweetId = tweetId) yield from self._graphql_timeline_instructions_to_tweets(instructions) @@ -1802,7 +1835,7 @@ class TwitterTweetScraper(_TwitterAPIScraper): continue for entry in instruction['entries']: if entry['entryId'] == f'tweet-{self._tweetId}' and entry['content']['entryType'] == 'TimelineTimelineItem' and entry['content']['itemContent']['itemType'] == 'TimelineTweet': - yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result']) + yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result'], tweetId = self._tweetId) break elif self._mode is TwitterTweetScraperMode.SCROLL: for obj in self._iter_api_data(url, _TwitterAPIType.GRAPHQL, params, paginationParams, direction = _ScrollDirection.BOTH): From 
82f64a6472129d62c83c79380e5f4d39328e2eb7 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 06:22:13 +0000 Subject: [PATCH 64/94] Remove dead code --- snscrape/modules/twitter.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index e291612..340d26f 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1515,16 +1515,6 @@ class TwitterSearchScraper(_TwitterAPIScraper): self._cursor = cursor self._top = top - def _check_scroll_response(self, r): - if r.status_code == 429: - # Accept a 429 response as "valid" to prevent retries; handled explicitly in get_items - return True, None - if r.headers.get('content-type').replace(' ', '') != 'application/json;charset=utf-8': - return False, 'content type is not JSON' - if r.status_code != 200: - return False, 'non-200 status code' - return True, None - def get_items(self): if not self._query.strip(): raise ValueError('empty query') From 57b126c656e00a56c50a10dcc1853de7683fbace Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 20:15:57 +0000 Subject: [PATCH 65/94] Add support for scraping Twitter Communities Closes #614 --- snscrape/modules/twitter.py | 150 ++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 340d26f..de7d482 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -14,6 +14,7 @@ __all__ = [ ] +import base64 import collections import copy import dataclasses @@ -515,6 +516,27 @@ class UserLabel: @dataclasses.dataclass class UserRef: id: int + text: typing.Optional[str] = None + textLinks: typing.Optional[typing.List[TextLink]] = None + + def __str__(self): + return f'https://twitter.com/i/user/{self.id}' + + +@dataclasses.dataclass +class Community(snscrape.base.Entity): + id: int + name: str + description: str + created: datetime.datetime + admin: 
typing.Union[User, UserRef] + creator: typing.Union[User, UserRef] + membersFacepile: typing.List[typing.Union[User, UserRef]] + moderatorsCount: int + membersCount: int + rules: typing.List[str] + theme: str + bannerUrl: str @dataclasses.dataclass @@ -1428,6 +1450,14 @@ class _TwitterAPIScraper(snscrape.base.Scraper): yield self._graphql_timeline_tweet_item_result_to_tweet(entry['content']['itemContent']['tweet_results']['result'], tweetId = tweetId) else: _logger.warning('Got unrecognised timeline tweet item(s)') + elif entry['entryId'].startswith('homeConversation-'): + if entry['content']['entryType'] == 'TimelineTimelineModule': + for item in entry['content']['items']: + if not item['entryId'].startswith('homeConversation-') or '-tweet-' not in item['entryId']: + raise snscrape.base.ScraperException(f'Unexpected home conversation entry ID: {item["entryId"]!r}') + tweetId = int(item['entryId'].split('-tweet-', 1)[1]) + if item['item']['itemContent']['itemType'] == 'TimelineTweet': + yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) elif includeConversationThreads and entry['entryId'].startswith('conversationthread-'): #TODO show more cursor? 
for item in entry['content']['items']: if item['entryId'].startswith(f'{entry["entryId"]}-tweet-'): @@ -1497,6 +1527,30 @@ class _TwitterAPIScraper(snscrape.base.Scraper): labelKwargs['longDescription'] = label['longDescription']['text'] return UserLabel(**labelKwargs) + def _graphql_user_results_to_user_ref(self, obj): + if 'id' not in obj: + return None + if isinstance(obj['id'], int): + userId = obj['id'] + elif obj['id'].startswith('VXNlclJlc3VsdHM6'): + # UserResults: in base64 + try: + userId = base64.b64decode(obj['id']) + except ValueError: + return None + assert userId.startswith(b'UserResults:') + userId = int(userId.split(b':', 1)[1]) + kwargs = {} + if 'result' in obj and obj['result']['__typename'] == 'UserUnavailable' and 'unavailable_message' in obj['result']: + kwargs['text'] = obj['result']['unavailable_message']['text'] + kwargs['textLinks'] = [TextLink(text = kwargs['text'][x['fromIndex']:x['toIndex']], url = x['ref']['url'], tcourl = None, indices = (x['fromIndex'], x['toIndex'])) for x in obj['result']['unavailable_message']['entities']] + return UserRef(id = userId, **kwargs) + + def _graphql_user_results_to_user(self, results): + if 'result' not in results or results['result']['__typename'] == 'UserUnavailable': + return self._graphql_user_results_to_user_ref(results) + return self._user_to_user(results['result']['legacy'], id_ = int(results['result']['rest_id'])) + @classmethod def _cli_construct(cls, argparseArgs, *args, **kwargs): kwargs['guestTokenManager'] = _CLIGuestTokenManager() @@ -1880,6 +1934,102 @@ class TwitterListPostsScraper(TwitterSearchScraper): return cls._cli_construct(args, args.list) +class TwitterCommunityScraper(_TwitterAPIScraper): + name = 'twitter-community' + + def __init__(self, communityId, **kwargs): + self._communityId = communityId + super().__init__(f'https://twitter.com/i/communities/{self._communityId}', **kwargs) + + def _get_entity(self): + self._ensure_guest_token() + params = { + 'variables': { + 
'communityId': str(self._communityId), + 'withDmMuting': False, + 'withSafetyModeUserFields': False, + 'withSuperFollowsUserFields': True, + }, + 'features': { + 'responsive_web_graphql_exclude_directive_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'verified_phone_label_enabled': False, + }, + } + obj = self._get_api_data('https://api.twitter.com/graphql/MO8cE7aTvaenXJX_teUGcA/CommunitiesFetchOneQuery', _TwitterAPIType.GRAPHQL, params = params) + if not obj['data'] or 'result' not in obj['data']['communityResults'] or obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable': + _logger.warning('Empty response or unavailable community') + return None + community = obj['data']['communityResults']['result'] + return Community( + id = int(community['id_str']), + name = community['name'], + description = community['description'], + created = datetime.datetime.fromtimestamp(community['created_at'] / 1000, tz = datetime.timezone.utc), + admin = self._graphql_user_results_to_user(community['admin_results']), + creator = self._graphql_user_results_to_user(community['creator_results']), + membersFacepile = [self._graphql_user_results_to_user(m) for m in community['members_facepile_results']], + moderatorsCount = community['moderator_count'], + membersCount = community['member_count'], + rules = [r['name'] for r in community['rules']], + theme = community.get('custom_theme', community['default_theme']), + bannerUrl = community.get('custom_banner_media', community['default_banner_media'])['media_info']['original_img_url'], + ) + + def get_items(self): + paginationVariables = { + 'count': 20, + 'cursor': None, + 'communityId': str(self._communityId), + 'withCommunity': True, + 'withSuperFollowsUserFields': True, + 'withDownvotePerspective': False, + 'withReactionsMetadata': 
False, + 'withReactionsPerspective': False, + 'withSuperFollowsTweetFields': True, + } + variables = paginationVariables.copy() + del variables['count'], variables['cursor'] + features = { + 'responsive_web_twitter_blue_verified_badge_is_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': False, + 'verified_phone_label_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'vibe_api_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': False, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_text_conversations_enabled': False, + 'responsive_web_enhance_cards_enabled': False, + } + params = {'variables': variables, 'features': features} + paginationParams = {'variables': paginationVariables, 'features': features} + + for obj in self._iter_api_data('https://api.twitter.com/graphql/Qvst9FkHq45wuqicCvMpVw/CommunityTweetsTimeline', _TwitterAPIType.GRAPHQL, params, paginationParams): + if obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable': + _logger.warning('Community unavailable') + break + yield from self._graphql_timeline_instructions_to_tweets(obj['data']['communityResults']['result']['community_timeline']['timeline']['instructions']) + + @classmethod + def _cli_setup_parser(cls, subparser): + subparser.add_argument('communityId', type = int, help = 'A community ID') + + @classmethod + def _cli_from_args(cls, args): + return cls._cli_construct(args, args.communityId) + + class 
TwitterTrendsScraper(_TwitterAPIScraper): name = 'twitter-trends' From 880a0a7f55f92756d8c89deff93ee8bd11db7161 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 20:16:23 +0000 Subject: [PATCH 66/94] Handle TweetUnavailable results Fixes #433 --- snscrape/modules/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index de7d482..82ee1f2 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1406,6 +1406,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): result = result['tweet'] elif result['__typename'] == 'TweetTombstone': return self._make_tombstone(tweetId, result.get('tombstone')) + elif result['__typename'] == 'TweetUnavailable': + if tweetId is None: + raise snscrape.base.ScraperException('Cannot handle unavailable tweet without tweet ID') + return TweetRef(id = tweetId) else: raise snscrape.base.ScraperException(f'Unknown result type {result["__typename"]!r}') tweet = result['legacy'] From 7327a013979cb4313c2f72d597b984a2c63eb215 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 21:23:12 +0000 Subject: [PATCH 67/94] Refactor module-level deprecation code --- snscrape/base.py | 14 ++++++++++++-- snscrape/modules/twitter.py | 14 +++----------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index bac500c..6425dc7 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -14,8 +14,18 @@ import warnings logger = logging.getLogger(__name__) +def _module_deprecation_helper(all, **names): + def __getattr__(name): + if name in names: + warnings.warn(f'{name} is deprecated, use {names[name].__name__} instead', DeprecatedFeatureWarning, stacklevel = 2) + return names[name] + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') + def __dir__(): + return sorted(all + list(names.keys())) + return __getattr__, __dir__ -class 
DeprecatedPropertyAccessWarning(FutureWarning): + +class DeprecatedFeatureWarning(FutureWarning): pass @@ -28,7 +38,7 @@ class _DeprecatedProperty: def __get__(self, obj, objType): if obj is None: # if the access is through the class using _DeprecatedProperty rather than an instance of the class: return self - warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedPropertyAccessWarning, stacklevel = 2) + warnings.warn(f'{self.name} is deprecated, use {self.replStr} instead', DeprecatedFeatureWarning, stacklevel = 2) return self.repl(obj) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 82ee1f2..5cfa4c5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -38,17 +38,6 @@ import urllib3.util.ssl_ import warnings -# DescriptionURL deprecation -_DEPRECATED_NAMES = {'DescriptionURL': 'TextLink'} -def __getattr__(name): - if name in _DEPRECATED_NAMES: - warnings.warn(f'{name} is deprecated, use {_DEPRECATED_NAMES[name]} instead', FutureWarning, stacklevel = 2) - return globals()[_DEPRECATED_NAMES[name]] - raise AttributeError(f'module {__name__!r} has no attribute {name!r}') -def __dir__(): - return sorted(__all__ + list(_DEPRECATED_NAMES.keys())) - - _logger = logging.getLogger(__name__) _API_AUTHORIZATION_HEADER = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _globalGuestTokenManager = None @@ -2082,3 +2071,6 @@ class TwitterTrendsScraper(_TwitterAPIScraper): for item in entry['content']['timelineModule']['items']: trend = item['item']['content']['trend'] yield Trend(name = trend['name'], metaDescription = trend['trendMetadata'].get('metaDescription'), domainContext = trend['trendMetadata']['domainContext']) + + +__getattr__, __dir__ = snscrape.base._module_deprecation_helper(__all__, DescriptionURL = TextLink) From 4e70306f9976cf946e3366ed036d7f7347d378c6 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 
2023 21:24:00 +0000 Subject: [PATCH 68/94] Deprecate Entity type There is no meaningful distinction from Items, and it complicates the integration of scrapers for user searches --- snscrape/base.py | 20 +++++++------------- snscrape/modules/facebook.py | 2 +- snscrape/modules/instagram.py | 2 +- snscrape/modules/mastodon.py | 2 +- snscrape/modules/telegram.py | 2 +- snscrape/modules/twitter.py | 4 ++-- snscrape/modules/vkontakte.py | 2 +- snscrape/modules/weibo.py | 2 +- 8 files changed, 15 insertions(+), 21 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index 6425dc7..f2e3486 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -1,3 +1,6 @@ +__all__ = ['DeprecatedFeatureWarning', 'IntWithGranularity', 'Item', 'Scraper', 'ScraperException'] + + import abc import copy import dataclasses @@ -97,7 +100,7 @@ class _JSONDataclass: @dataclasses.dataclass class Item(_JSONDataclass): - '''An abstract base class for an item returned by the scraper's get_items generator. + '''An abstract base class for an item returned by the scraper. An item can really be anything. The string representation should be useful for the CLI output (e.g. a direct URL for the item). ''' @@ -107,18 +110,6 @@ class Item(_JSONDataclass): pass -@dataclasses.dataclass -class Entity(_JSONDataclass): - '''An abstract base class for an entity returned by the scraper's entity property. - - An entity is typically the account of a person or organisation. The string representation should be the preferred direct URL to the entity's page on the network. 
- ''' - - @abc.abstractmethod - def __str__(self): - pass - - class IntWithGranularity(int): '''A number with an associated granularity @@ -296,3 +287,6 @@ def nonempty_string(name): raise ValueError('must not be an empty string') f.__name__ = name return f + + +__getattr__, __dir__ = _module_deprecation_helper(__all__, Entity = Item) diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 6b6bbde..c2839f1 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -30,7 +30,7 @@ class FacebookPost(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): username: str pageId: int name: str diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py index 14483e0..1cd5db7 100644 --- a/snscrape/modules/instagram.py +++ b/snscrape/modules/instagram.py @@ -32,7 +32,7 @@ class InstagramPost(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): username: str name: typing.Optional[str] followers: snscrape.base.IntWithGranularity diff --git a/snscrape/modules/mastodon.py b/snscrape/modules/mastodon.py index cfe69e4..653e83a 100644 --- a/snscrape/modules/mastodon.py +++ b/snscrape/modules/mastodon.py @@ -67,7 +67,7 @@ class PollOption: @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): account: str # @username@domain.invalid displayName: typing.Optional[str] = None displayNameWithCustomEmojis: typing.Optional[typing.List[typing.Union[str, 'CustomEmoji']]] = None diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index b49c186..d2140f8 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -39,7 +39,7 @@ class TelegramPost(snscrape.base.Item): @dataclasses.dataclass -class Channel(snscrape.base.Entity): +class Channel(snscrape.base.Item): username: str title: str verified: bool diff --git a/snscrape/modules/twitter.py 
b/snscrape/modules/twitter.py index 5cfa4c5..ff240e7 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -457,7 +457,7 @@ class Tombstone(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): # Most fields can be None if they're not known. username: str @@ -513,7 +513,7 @@ class UserRef: @dataclasses.dataclass -class Community(snscrape.base.Entity): +class Community(snscrape.base.Item): id: int name: str description: str diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 3e431f3..1b577ae 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -75,7 +75,7 @@ class Video: @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): username: str name: str verified: bool diff --git a/snscrape/modules/weibo.py b/snscrape/modules/weibo.py index dff1fad..654fd62 100644 --- a/snscrape/modules/weibo.py +++ b/snscrape/modules/weibo.py @@ -34,7 +34,7 @@ class Post(snscrape.base.Item): @dataclasses.dataclass -class User(snscrape.base.Entity): +class User(snscrape.base.Item): screenname: str uid: int verified: bool From 4e6956e564f897e3fea42274866a9d7e7a8c13b7 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 21:25:01 +0000 Subject: [PATCH 69/94] Remove dead code --- snscrape/base.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index f2e3486..1316a02 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -125,20 +125,6 @@ class IntWithGranularity(int): return (IntWithGranularity, (int(self), self.granularity)) -class URLItem(Item): - '''A generic item which only holds a URL string.''' - - def __init__(self, url): - self._url = url - - @property - def url(self): - return self._url - - def __str__(self): - return self._url - - class _HTTPSAdapter(requests.adapters.HTTPAdapter): def init_poolmanager(self, *args, **kwargs): 
super().init_poolmanager(*args, **kwargs) From 7330e0a9a07ef69b04feaa9862b2009745b0ece2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 21:26:00 +0000 Subject: [PATCH 70/94] Rename private logger variable --- snscrape/base.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index 1316a02..c03b8ed 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -15,7 +15,8 @@ import time import warnings -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) + def _module_deprecation_helper(all, **names): def __getattr__(name): @@ -132,7 +133,7 @@ class _HTTPSAdapter(requests.adapters.HTTPAdapter): try: self.poolmanager.pool_classes_by_scheme['https'].ConnectionCls = _HTTPSConnection except (AttributeError, KeyError) as e: - logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}') + _logger.debug(f'Could not install TLS cipher logger: {type(e).__module__}.{type(e).__name__} {e!s}') class _HTTPSConnection(urllib3.connection.HTTPSConnection): @@ -140,12 +141,12 @@ class _HTTPSConnection(urllib3.connection.HTTPSConnection): conn = super().connect(*args, **kwargs) #FIXME: Uses undocumented attribute self.sock and beyond. try: - logger.debug(f'Connected to: {self.sock.getpeername()}') + _logger.debug(f'Connected to: {self.sock.getpeername()}') except AttributeError: # self.sock might be a urllib3.util.ssltransport.SSLTransport, which lacks getpeername. pass try: - logger.debug(f'Connection cipher: {self.sock.cipher()}') + _logger.debug(f'Connection cipher: {self.sock.cipher()}') except AttributeError: # Shouldn't be possible, but better safe than sorry. pass @@ -192,12 +193,12 @@ class Scraper: # The request is newly prepared on each retry because of potential cookie updates. 
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) - logger.info(f'Retrieving {req.url}') - logger.debug(f'... with headers: {headers!r}') + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... with headers: {headers!r}') if data: - logger.debug(f'... with data: {data!r}') + _logger.debug(f'... with data: {data!r}') if environmentSettings: - logger.debug(f'... with environmentSettings: {environmentSettings!r}') + _logger.debug(f'... with environmentSettings: {environmentSettings!r}') try: r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings) except requests.exceptions.RequestException as exc: @@ -207,16 +208,16 @@ class Scraper: else: retrying = '' level = logging.ERROR - logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') + _logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') errors.append(repr(exc)) else: redirected = f' (redirected to {r.url})' if r.history else '' - logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') - logger.debug(f'... with response headers: {r.headers!r}') + _logger.info(f'Retrieved {req.url}{redirected}: {r.status_code}') + _logger.debug(f'... with response headers: {r.headers!r}') if r.history: for i, redirect in enumerate(r.history): - logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})') - logger.debug(f'... ... with response headers: {redirect.headers!r}') + _logger.debug(f'... request {i}: {redirect.request.url}: {redirect.status_code} (Location: {redirect.headers.get("Location")})') + _logger.debug(f'... ... 
with response headers: {redirect.headers!r}') if responseOkCallback is not None: success, msg = responseOkCallback(r) errors.append(msg) @@ -225,7 +226,7 @@ class Scraper: msg = f': {msg}' if msg else '' if success: - logger.debug(f'{req.url} retrieved successfully{msg}') + _logger.debug(f'{req.url} retrieved successfully{msg}') return r else: if attempt < self._retries: @@ -234,15 +235,15 @@ class Scraper: else: retrying = '' level = logging.ERROR - logger.log(level, f'Error retrieving {req.url}{msg}{retrying}') + _logger.log(level, f'Error retrieving {req.url}{msg}{retrying}') if attempt < self._retries: sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc. - logger.info(f'Waiting {sleepTime:.0f} seconds') + _logger.info(f'Waiting {sleepTime:.0f} seconds') time.sleep(sleepTime) else: msg = f'{self._retries + 1} requests to {req.url} failed, giving up.' - logger.fatal(msg) - logger.fatal(f'Errors: {", ".join(errors)}') + _logger.fatal(msg) + _logger.fatal(f'Errors: {", ".join(errors)}') raise ScraperException(msg) raise RuntimeError('Reached unreachable code') From f109f3fd46e661913df479ff062cf8d482ed4bb4 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 21:59:06 +0000 Subject: [PATCH 71/94] Fix forgotten warning name change (cf. 
7327a013) --- snscrape/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/base.py b/snscrape/base.py index c03b8ed..c9e75d9 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -89,7 +89,7 @@ class _JSONDataclass: '''Convert the object to a JSON string''' with warnings.catch_warnings(): - warnings.filterwarnings(action = 'ignore', category = DeprecatedPropertyAccessWarning) + warnings.filterwarnings(action = 'ignore', category = DeprecatedFeatureWarning) out = _json_dataclass_to_dict(self) for key, value in list(out.items()): # Modifying the dict below, so make a copy first if isinstance(value, IntWithGranularity): From f329b69ed4022c4e8af5be8a07d4f5a4125bd8d9 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 22:07:40 +0000 Subject: [PATCH 72/94] Add support for scraping Twitter's user search #263 --- snscrape/modules/twitter.py | 69 +++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index ff240e7..8cf0791 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -773,7 +773,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): else: # TweetDetail instructions = obj['data'].get('threaded_conversation_with_injections_v2', {}).get('instructions', []) - tweetCount = 0 + entryCount = 0 for instruction in instructions: if 'addEntries' in instruction: entries = instruction['addEntries']['entries'] @@ -783,7 +783,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): entries = instruction['entries'] else: continue - tweetCount += self._count_tweets(entries) + entryCount += self._count_tweets_and_users(entries) for entry in entries: if not (entry['entryId'].startswith('sq-cursor-') or entry['entryId'].startswith('cursor-')): continue @@ -806,20 +806,20 @@ class _TwitterAPIScraper(snscrape.base.Scraper): newBottomCursorAndStop = (entryCursor, entryCursorStop or False) if bottomCursorAndStop 
is None and newBottomCursorAndStop is not None: bottomCursorAndStop = newBottomCursorAndStop - if newCursor == cursor and tweetCount == 0: + if newCursor == cursor and entryCount == 0: # Twitter sometimes returns the same cursor as requested and no results even though there are more results. # When this happens, retry the same cursor up to the retries setting. emptyResponsesOnCursor += 1 if emptyResponsesOnCursor > self._retries: break - if tweetCount == 0: + if entryCount == 0: emptyPages += 1 if self._maxEmptyPages and emptyPages >= self._maxEmptyPages: _logger.warning(f'Stopping after {emptyPages} empty pages') break else: emptyPages = 0 - if not newCursor or (stopOnEmptyResponse and tweetCount == 0): + if not newCursor or (stopOnEmptyResponse and entryCount == 0): # End of pagination if promptCursor is not None: newCursor = promptCursor @@ -838,10 +838,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): else: reqParams['variables']['cursor'] = cursor - def _count_tweets(self, entries): - return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') for entry in entries) + def _count_tweets_and_users(self, entries): + return sum(entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-') or entry['entryId'].startswith('user-') for entry in entries) - def _v2_timeline_instructions_to_tweets(self, obj): + def _v2_timeline_instructions_to_tweets_or_users(self, obj): # No data format test, just a hard and loud crash if anything's wrong :-) for instruction in obj['timeline']['instructions']: if 'addEntries' in instruction: @@ -853,6 +853,8 @@ class _TwitterAPIScraper(snscrape.base.Scraper): for entry in entries: if entry['entryId'].startswith('sq-I-t-') or entry['entryId'].startswith('tweet-'): yield from self._v2_instruction_tweet_entry_to_tweet(entry['entryId'], entry['content'], obj) + elif entry['entryId'].startswith('user-'): + yield 
self._user_to_user(obj['globalObjects']['users'][entry['content']['item']['content']['user']['id']]) def _v2_instruction_tweet_entry_to_tweet(self, entryId, entry, obj): if 'tweet' in entry['item']['content']: @@ -1550,17 +1552,35 @@ class _TwitterAPIScraper(snscrape.base.Scraper): return super()._cli_construct(argparseArgs, *args, **kwargs) +class TwitterSearchScraperMode(enum.Enum): + LIVE = 'live' + TOP = 'top' + USER = 'user' + + @classmethod + def _cli_from_args(cls, args): + if args.top: + return cls.TOP + if args.user: + return cls.USER + return cls.LIVE + + class TwitterSearchScraper(_TwitterAPIScraper): name = 'twitter-search' - def __init__(self, query, *, cursor = None, top = False, maxEmptyPages = 20, **kwargs): + def __init__(self, query, *, cursor = None, mode = TwitterSearchScraperMode.LIVE, top = None, maxEmptyPages = 20, **kwargs): if not query.strip(): raise ValueError('empty query') kwargs['maxEmptyPages'] = maxEmptyPages super().__init__(baseUrl = 'https://twitter.com/search?' + urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs) self._query = query # Note: may get replaced by subclasses when using user ID resolution self._cursor = cursor - self._top = top + if top is not None: + replacement = f'{__name__}.TwitterSearchScraperMode.' 
+ ('TOP' if top else 'LIVE') + warnings.warn(f'`top` argument is deprecated, use `mode = {replacement}` instead of `top = {bool(top)}`', snscrape.base.DeprecatedFeatureWarning, stacklevel = 2) + mode = TwitterSearchScraperMode.TOP if top else TwitterSearchScraperMode.LIVE + self._mode = mode def get_items(self): if not self._query.strip(): @@ -1596,7 +1616,22 @@ class TwitterSearchScraper(_TwitterAPIScraper): 'send_error_codes': 'true', 'simple_quoted_tweet': 'true', 'q': self._query, - 'tweet_search_mode': 'live', + } + if self._mode is TwitterSearchScraperMode.LIVE: + paginationParams = { + **paginationParams, + 'tweet_search_mode': 'live', + } + elif self._mode is TwitterSearchScraperMode.TOP: + pass + elif self._mode is TwitterSearchScraperMode.USER: + paginationParams = { + **paginationParams, + 'result_filter': 'user', + 'query_source': '', + } + paginationParams = { + **paginationParams, 'count': '20', 'query_source': 'spelling_expansion_revert_click', 'cursor': None, @@ -1608,23 +1643,21 @@ class TwitterSearchScraper(_TwitterAPIScraper): params = paginationParams.copy() del params['cursor'] - if self._top: - del params['tweet_search_mode'] - del paginationParams['tweet_search_mode'] - for obj in self._iter_api_data('https://api.twitter.com/2/search/adaptive.json', _TwitterAPIType.V2, params, paginationParams, cursor = self._cursor): - yield from self._v2_timeline_instructions_to_tweets(obj) + yield from self._v2_timeline_instructions_to_tweets_or_users(obj) @classmethod def _cli_setup_parser(cls, subparser): subparser.add_argument('--cursor', metavar = 'CURSOR') - subparser.add_argument('--top', action = 'store_true', default = False, help = 'Enable fetching top tweets instead of live/chronological') + group = subparser.add_mutually_exclusive_group(required = False) + group.add_argument('--top', action = 'store_true', default = False, help = 'Search top tweets instead of live/chronological') + group.add_argument('--user', action = 'store_true', default = 
False, help = 'Search users instead of tweets') subparser.add_argument('--max-empty-pages', dest = 'maxEmptyPages', metavar = 'N', type = int, default = 20, help = 'Stop after N empty pages from Twitter; set to 0 to disable') subparser.add_argument('query', type = snscrape.base.nonempty_string('query'), help = 'A Twitter search string') @classmethod def _cli_from_args(cls, args): - return cls._cli_construct(args, args.query, cursor = args.cursor, top = args.top, maxEmptyPages = args.maxEmptyPages) + return cls._cli_construct(args, args.query, cursor = args.cursor, mode = TwitterSearchScraperMode._cli_from_args(args), maxEmptyPages = args.maxEmptyPages) class TwitterUserScraper(TwitterSearchScraper): From 21cf6268033110195014231eba9b510c25c651cd Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 22:10:33 +0000 Subject: [PATCH 73/94] Update list of scrapers --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bfe30d3..762c0d4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The following services are currently supported: * Mastodon: user profiles and toots (single or thread) * Reddit: users, subreddits, and searches (via Pushshift) * Telegram: channels -* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends +* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends * VKontakte: user profiles * Weibo (Sina Weibo): user profiles From d1592177abff6d2a532ccabf84c76ca37d91b1a7 Mon Sep 17 00:00:00 2001 From: kelche Date: Mon, 27 Feb 2023 22:35:21 +0300 Subject: [PATCH 74/94] feat: cashtag func --- snscrape/modules/twitter.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8cf0791..648a833 100644 --- a/snscrape/modules/twitter.py +++ 
b/snscrape/modules/twitter.py @@ -1830,6 +1830,23 @@ class TwitterHashtagScraper(TwitterSearchScraper): return cls._cli_construct(args, args.hashtag) +class TwitterCashtagScraper(TwitterSearchScraper): + name = 'twitter-cashtag' + + def __init__(self, cashtag, **kwargs): + super().__init__(f'${cashtag}', **kwargs) + self._cashtag = cashtag + + @classmethod + def _cli_setup_parser(cls, subparser): + subparser.add_argument('cashtag', type=snscrape.base.nonempty_string( + 'cashtag'), help='A Twitter cashtag (without $)') + + @classmethod + def _cli_from_args(cls, args): + return cls._cli_construct(args, args.cashtag) + + class TwitterTweetScraperMode(enum.Enum): SINGLE = 'single' SCROLL = 'scroll' From 61dbbba6b1ed242b66660cdeffa05093808e5960 Mon Sep 17 00:00:00 2001 From: kelche Date: Mon, 27 Feb 2023 22:39:31 +0300 Subject: [PATCH 75/94] feat: cashtag func --- snscrape/modules/twitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 648a833..e25b760 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -11,6 +11,7 @@ __all__ = [ 'TwitterTweetScraper', 'TwitterListPostsScraper', 'TwitterTrendsScraper', + 'TwitterCashtagScraper', ] From ea7c6786c2f4e4cd1e757d0d10e7127044158406 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:16:07 +0000 Subject: [PATCH 76/94] Handle TweetWithVisibilityResults on quoted tweets Fixes #604 --- snscrape/modules/twitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8cf0791..d59fc34 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1416,9 +1416,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = 
int(tweet['quoted_status_id_str'])) else: - if result['quotedRefResult']['result']['__typename'] != 'Tweet': + qTweet = result['quotedRefResult']['result'] + if result['quotedRefResult']['result']['__typename'] not in ('Tweet', 'TweetWithVisibilityResults'): _logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef') - kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id'])) + elif result['quotedRefResult']['result']['__typename'] == 'TweetWithVisibilityResults': + qTweet = qTweet['tweet'] + kwargs['quotedTweet'] = TweetRef(id = int(qTweet['rest_id'])) elif 'quoted_status_id_str' in tweet: # Omit the TweetRef if this is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet. if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'): From 42cb6d8170d424045518930454e7ec462f05f612 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:16:55 +0000 Subject: [PATCH 77/94] Fix crash on quotedRefResult without an actual result Fixes #740 --- snscrape/modules/twitter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index d59fc34..743e2e5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1411,7 +1411,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): #TODO Tombstones will cause a crash here. 
kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result']) if 'quoted_status_result' in result: - kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) + if 'result' not in result['quoted_status_result']: + _logger.warning(f'quoted_status_result for {tweet["quoted_status_id_str"]} without an actual result on tweet {self._get_tweet_id(tweet)}, using TweetRef') + kwargs['quotedTweet'] = TweetRef(int(tweet['quoted_status_id_str'])) + else: + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) elif result.get('quotedRefResult'): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str'])) From 03ef3debaf9c32d20cf4ff557a24276c34f12005 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:20:28 +0000 Subject: [PATCH 78/94] Fix behaviour on SIGPIPE/BrokenPipeError --- snscrape/_cli.py | 59 ++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/snscrape/_cli.py b/snscrape/_cli.py index 9f846b8..c0bb32d 100644 --- a/snscrape/_cli.py +++ b/snscrape/_cli.py @@ -6,6 +6,7 @@ import datetime import importlib.metadata import inspect import logging +import os import requests # Imported in parse_args() after setting up the logger: #import snscrape.base @@ -307,32 +308,36 @@ def main(): i = 0 with _dump_locals_on_exception(): - if args.withEntity and (entity := scraper.entity): - if args.jsonl: - print(entity.json()) + try: + if args.withEntity and (entity := scraper.entity): + if args.jsonl: + print(entity.json()) + else: + print(entity) + if args.maxResults == 
0: + logger.info('Exiting after 0 results') + return + for i, item in enumerate(scraper.get_items(), start = 1): + if args.since is not None and item.date < args.since: + logger.info(f'Exiting due to reaching older results than {args.since}') + break + if args.jsonl: + print(item.json()) + elif args.format is not None: + print(args.format.format(item)) + else: + print(item) + if args.progress and i % 100 == 0: + print(f'Scraping, {i} results so far', file = sys.stderr) + if args.maxResults and i >= args.maxResults: + logger.info(f'Exiting after {i} results') + if args.progress: + print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) + break else: - print(entity) - if args.maxResults == 0: - logger.info('Exiting after 0 results') - return - for i, item in enumerate(scraper.get_items(), start = 1): - if args.since is not None and item.date < args.since: - logger.info(f'Exiting due to reaching older results than {args.since}') - break - if args.jsonl: - print(item.json()) - elif args.format is not None: - print(args.format.format(item)) - else: - print(item) - if args.progress and i % 100 == 0: - print(f'Scraping, {i} results so far', file = sys.stderr) - if args.maxResults and i >= args.maxResults: - logger.info(f'Exiting after {i} results') + logger.info(f'Done, found {i} results') if args.progress: - print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) - break - else: - logger.info(f'Done, found {i} results') - if args.progress: - print(f'Finished, {i} results', file = sys.stderr) + print(f'Finished, {i} results', file = sys.stderr) + except BrokenPipeError: + os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()) + sys.exit(1) From 7061ad2eb5c1beaf8c2c86c611578a7a02f32964 Mon Sep 17 00:00:00 2001 From: kelche Date: Wed, 1 Mar 2023 18:09:34 +0300 Subject: [PATCH 79/94] fix: code style --- snscrape/modules/twitter.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) 
diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index e25b760..8ebb7d7 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -7,11 +7,11 @@ __all__ = [ 'TwitterUserScraper', 'TwitterProfileScraper', 'TwitterHashtagScraper', + 'TwitterCashtagScraper', 'TwitterTweetScraperMode', 'TwitterTweetScraper', 'TwitterListPostsScraper', 'TwitterTrendsScraper', - 'TwitterCashtagScraper', ] @@ -1832,20 +1832,20 @@ class TwitterHashtagScraper(TwitterSearchScraper): class TwitterCashtagScraper(TwitterSearchScraper): - name = 'twitter-cashtag' + name = 'twitter-cashtag' - def __init__(self, cashtag, **kwargs): - super().__init__(f'${cashtag}', **kwargs) - self._cashtag = cashtag + def __init__(self, cashtag, **kwargs): + super().__init__(f'${cashtag}', **kwargs) + self._cashtag = cashtag - @classmethod - def _cli_setup_parser(cls, subparser): - subparser.add_argument('cashtag', type=snscrape.base.nonempty_string( - 'cashtag'), help='A Twitter cashtag (without $)') - @classmethod - def _cli_from_args(cls, args): - return cls._cli_construct(args, args.cashtag) + @classmethod + def _cli_setup_parser(cls, subparser): + subparser.add_argument('cashtag', type = snscrape.base.nonempty_string('cashtag'), help = 'A Twitter cashtag (without $)') + + @classmethod + def _cli_from_args(cls, args): + return cls._cli_construct(args, args.cashtag) class TwitterTweetScraperMode(enum.Enum): From 3545837637a8df36d4c7513ccadbe76224e8aa4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=CE=9EVIN=20K=CE=9ELCH=CE=9E?= Date: Thu, 2 Mar 2023 19:05:16 +0000 Subject: [PATCH 80/94] fix: code style line spacing --- snscrape/modules/twitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 55e5a83..03ea0d7 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1845,7 +1845,6 @@ class TwitterCashtagScraper(TwitterSearchScraper): super().__init__(f'${cashtag}', **kwargs) 
self._cashtag = cashtag - @classmethod def _cli_setup_parser(cls, subparser): subparser.add_argument('cashtag', type = snscrape.base.nonempty_string('cashtag'), help = 'A Twitter cashtag (without $)') From 0942beedd6d431d08bcc698bd89903bdbb5d8d54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=CE=9EVIN=20K=CE=9ELCH=CE=9E?= Date: Thu, 2 Mar 2023 19:08:53 +0000 Subject: [PATCH 81/94] fix: code style line spacing --- snscrape/modules/twitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 03ea0d7..73ce4c5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1847,11 +1847,11 @@ class TwitterCashtagScraper(TwitterSearchScraper): @classmethod def _cli_setup_parser(cls, subparser): - subparser.add_argument('cashtag', type = snscrape.base.nonempty_string('cashtag'), help = 'A Twitter cashtag (without $)') + subparser.add_argument('cashtag', type = snscrape.base.nonempty_string('cashtag'), help = 'A Twitter cashtag (without $)') @classmethod def _cli_from_args(cls, args): - return cls._cli_construct(args, args.cashtag) + return cls._cli_construct(args, args.cashtag) class TwitterTweetScraperMode(enum.Enum): From c77d19da5d54d7afa1cfe4b6eed41c01345f0081 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 3 Mar 2023 00:31:30 +0000 Subject: [PATCH 82/94] Fix crash on some deleted tweets in communities --- snscrape/modules/twitter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 73ce4c5..1fdc220 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1460,7 +1460,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): raise snscrape.base.ScraperException(f'Unexpected home conversation entry ID: {item["entryId"]!r}') tweetId = int(item['entryId'].split('-tweet-', 1)[1]) if item['item']['itemContent']['itemType'] == 'TimelineTweet': - yield 
self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) + if 'result' in item['item']['itemContent']['tweet_results']: + yield self._graphql_timeline_tweet_item_result_to_tweet(item['item']['itemContent']['tweet_results']['result'], tweetId = tweetId) + else: + yield TweetRef(id = tweetId) elif includeConversationThreads and entry['entryId'].startswith('conversationthread-'): #TODO show more cursor? for item in entry['content']['items']: if item['entryId'].startswith(f'{entry["entryId"]}-tweet-'): From fd75fff202bb633b1dad2037fd67c70a968c839d Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 3 Mar 2023 00:39:08 +0000 Subject: [PATCH 83/94] Fix crash on communities without a description --- snscrape/modules/twitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 1fdc220..b48756a 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -517,7 +517,6 @@ class UserRef: class Community(snscrape.base.Item): id: int name: str - description: str created: datetime.datetime admin: typing.Union[User, UserRef] creator: typing.Union[User, UserRef] @@ -527,6 +526,7 @@ class Community(snscrape.base.Item): rules: typing.List[str] theme: str bannerUrl: str + description: typing.Optional[str] = None @dataclasses.dataclass @@ -2016,10 +2016,12 @@ class TwitterCommunityScraper(_TwitterAPIScraper): _logger.warning('Empty response or unavailable community') return None community = obj['data']['communityResults']['result'] + optKwargs = {} + if 'description' in community: + optKwargs['description'] = community['description'] return Community( id = int(community['id_str']), name = community['name'], - description = community['description'], created = datetime.datetime.fromtimestamp(community['created_at'] / 1000, tz = datetime.timezone.utc), admin = 
self._graphql_user_results_to_user(community['admin_results']), creator = self._graphql_user_results_to_user(community['creator_results']), @@ -2029,6 +2031,7 @@ class TwitterCommunityScraper(_TwitterAPIScraper): rules = [r['name'] for r in community['rules']], theme = community.get('custom_theme', community['default_theme']), bannerUrl = community.get('custom_banner_media', community['default_banner_media'])['media_info']['original_img_url'], + **optKwargs, ) def get_items(self): From 98b50ff9e920179ee3bc6a873f17f9ec658a5e1a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 3 Mar 2023 01:16:49 +0000 Subject: [PATCH 84/94] Separate warnings for empty responses and unavailable users/communities --- snscrape/modules/twitter.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index b48756a..cd40226 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1692,8 +1692,11 @@ class TwitterUserScraper(TwitterSearchScraper): endpoint = 'https://twitter.com/i/api/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId' variables = {fieldName: str(self._user), 'withSafetyModeUserFields': True, 'withSuperFollowsUserFields': True} obj = self._get_api_data(endpoint, _TwitterAPIType.GRAPHQL, params = {'variables': variables}) - if not obj['data'] or 'result' not in obj['data']['user'] or obj['data']['user']['result']['__typename'] == 'UserUnavailable': - _logger.warning('Empty response or unavailable user') + if not obj['data'] or 'result' not in obj['data']['user']: + _logger.warning('Empty response') + return None + if obj['data']['user']['result']['__typename'] == 'UserUnavailable': + _logger.warning('User unavailable') return None user = obj['data']['user']['result'] rawDescription = user['legacy']['description'] @@ -2012,8 +2015,11 @@ class TwitterCommunityScraper(_TwitterAPIScraper): }, } obj = 
self._get_api_data('https://api.twitter.com/graphql/MO8cE7aTvaenXJX_teUGcA/CommunitiesFetchOneQuery', _TwitterAPIType.GRAPHQL, params = params) - if not obj['data'] or 'result' not in obj['data']['communityResults'] or obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable': - _logger.warning('Empty response or unavailable community') + if not obj['data'] or 'result' not in obj['data']['communityResults']: + _logger.warning('Empty response') + return None + if obj['data']['communityResults']['result']['__typename'] == 'CommunityUnavailable': + _logger.warning('Community unavailable') return None community = obj['data']['communityResults']['result'] optKwargs = {} From 3f7bb0516d6af83047f601c60faedb0621a603c0 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 3 Mar 2023 01:32:07 +0000 Subject: [PATCH 85/94] Fix crash due to missing profile timeline on unavailable users (e.g. protected) --- snscrape/modules/twitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index cd40226..efa5655 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1818,6 +1818,9 @@ class TwitterProfileScraper(TwitterUserScraper): gotPinned = False for obj in self._iter_api_data('https://twitter.com/i/api/graphql/nrdle2catTyGnTyj1Qa7wA/UserTweetsAndReplies', _TwitterAPIType.GRAPHQL, params, paginationParams): + if obj['data']['user']['result']['__typename'] == 'UserUnavailable': + _logger.warning('User unavailable') + break instructions = obj['data']['user']['result']['timeline_v2']['timeline']['instructions'] if not gotPinned: for instruction in instructions: From 99050710d78b22b5c605b08d85c1513d9db44323 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 3 Mar 2023 01:34:53 +0000 Subject: [PATCH 86/94] Fix AttributeError crashes on resolving user IDs to usernames or vice-versa --- snscrape/modules/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff 
--git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index efa5655..d63f854 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1743,6 +1743,8 @@ class TwitterUserScraper(TwitterSearchScraper): def get_items(self): if self._isUserId: # Resolve user ID to username + if self.entity is None: + raise snscrape.base.ScraperException(f'Could not resolve user ID {self._user!r} to username') self._user = self.entity.username self._isUserId = False self._query = f'from:{self._user}' @@ -1772,6 +1774,8 @@ class TwitterProfileScraper(TwitterUserScraper): def get_items(self): if not self._isUserId: + if self.entity is None: + raise snscrape.base.ScraperException(f'Could not resolve username {self._user!r} to ID') userId = self.entity.id else: userId = self._user From e47fbe3d1f078582d24d4b7b04f0ab4e41ddff1c Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 14 Mar 2023 03:03:50 +0000 Subject: [PATCH 87/94] Bump user agent Fixes #760 --- snscrape/modules/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index d63f854..00c17ef 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -672,7 +672,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): self._set_random_user_agent() def _set_random_user_agent(self): - self._userAgent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.{random.randint(0, 9999)} Safari/537.{random.randint(0, 99)}' + self._userAgent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.5563.{random.randint(0, 9999)} Safari/537.{random.randint(0, 99)}' self._apiHeaders['User-Agent'] = self._userAgent def _check_guest_token_response(self, r): From b93cf2640cf218d1c7e7a8a6766d54b197b2e4a3 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 15 Mar 2023 01:14:12 +0000 Subject: [PATCH 88/94] Revise bug 
reporting instructions: add instructions and field for debug log, put less emphasis on dump files, request minimal reproducer, and reorder template --- .github/ISSUE_TEMPLATE/bug_report.yml | 23 +++++++++++++++++------ README.md | 5 ++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index fc50dc7..e052d78 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -21,6 +21,7 @@ body: label: How to reproduce description: | How to reproduce the problem. + This should be a minimal reproducible example, i.e. the shortest possible code or the smallest number of steps that still causes the error. placeholder: e.g. I can reproduce this issue by scraping the textfiles user with the twitter-user scraper. - type: textarea validations: @@ -58,12 +59,28 @@ body: attributes: label: Scraper placeholder: e.g. twitter-user, reddit-search, TwitterSearchScraper, ... + - type: dropdown + validations: + required: true + attributes: + label: How are you using snscrape? + options: ['CLI (`snscrape ...` as a command, e.g. in a terminal)', 'Module (`import snscrape.modules.something` in Python code)'] - type: textarea validations: required: false attributes: label: Backtrace description: What is the error snscrape gives you, if any? + - type: textarea + validations: + required: false + attributes: + label: Log output + description: | + Insert here the debug log of snscrape. + If you use the CLI, add the global options `-vv` to the command, e.g. `snscrape -vv twitter-search ...`. + If you use the module, set the debug level in your Python code before any use of snscrape: `import logging; logging.basicConfig(level = logging.DEBUG)`. + If you already use `logging` in your own code, you may need to adjust the level there instead. 
- type: textarea validations: required: false @@ -74,12 +91,6 @@ body: Please note that it may contain identifying info such as IP address, if the website returns that. You can also optionally request to exchange the file in private. Finally, if snscrape didn't crash, leave this field blank. - - type: dropdown - validations: - required: true - attributes: - label: How are you using snscrape? - options: ['CLI (`snscrape ...` as a command, e.g. in a terminal)', 'Module (`import snscrape.modules.something` in Python code)'] - type: textarea attributes: label: Additional context diff --git a/README.md b/README.md index 762c0d4..00cb4f9 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,10 @@ To get the latest 100 tweets with the hashtag #archiveteam: It is also possible to use snscrape as a library in Python, but this is currently undocumented. ## Issue reporting -If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If possible please run snscrape with `-vv` and `--dump-locals` and include the log output as well as the dump files referenced in the log in the issue. Note that the files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue. +If you discover an issue with snscrape, please report it at <https://github.com/JustAnotherArchivist/snscrape/issues>. If you use the CLI, please run snscrape with `-vv` and include the log output in the issue. If you use snscrape as a module, please enable debug-level logging using `import logging; logging.basicConfig(level = logging.DEBUG)` (before using snscrape at all) and include the log output in the issue. + +### Dump files +In some cases, debugging may require more information than is available in the log. The CLI has a `--dump-locals` option that enables dumping all local variables within snscrape based on important log messages (rather than, by default, only on crashes).
Note that the dump files may contain sensitive information in some cases and could potentially be used to identify you (e.g. if the service includes your IP address in its response). If you prefer to arrange a file transfer privately, just mention that in the issue. ## License This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. From edac5f38cbb5de6fcd5368bccb42c30b13a0a199 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 15 Mar 2023 01:19:53 +0000 Subject: [PATCH 89/94] Validate mode parameter on TwitterSearchScraper --- snscrape/modules/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 00c17ef..593eb42 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1583,6 +1583,8 @@ class TwitterSearchScraper(_TwitterAPIScraper): def __init__(self, query, *, cursor = None, mode = TwitterSearchScraperMode.LIVE, top = None, maxEmptyPages = 20, **kwargs): if not query.strip(): raise ValueError('empty query') + if mode not in tuple(TwitterSearchScraperMode): + raise ValueError('invalid mode, must be a TwitterSearchScraperMode') kwargs['maxEmptyPages'] = maxEmptyPages super().__init__(baseUrl = 'https://twitter.com/search?' 
+ urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs) self._query = query # Note: may get replaced by subclasses when using user ID resolution From adac052723f34060743a4e81f9e005b3838c93c4 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 20 Mar 2023 19:15:11 +0000 Subject: [PATCH 90/94] Treat 404 responses from Twitter as a block --- snscrape/modules/twitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 593eb42..33df8eb 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -678,7 +678,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _check_guest_token_response(self, r): if r.status_code != 200: self._set_random_user_agent() - return False, f'non-200 response ({r.status_code})' + return False, ('non-200 response' if r.status_code != 404 else 'blocked') + f' ({r.status_code})' return True, None def _ensure_guest_token(self, url = None): @@ -710,7 +710,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): del self._apiHeaders['x-guest-token'] def _check_api_response(self, r): - if r.status_code in (403, 429): + if r.status_code in (403, 404, 429): self._unset_guest_token() self._ensure_guest_token() return False, f'blocked ({r.status_code})' From 285d5874fc20a9d9463ed261f45c5f4118277d05 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 20 Mar 2023 19:19:19 +0000 Subject: [PATCH 91/94] Deprecate cursor argument to TwitterSearchScraper #778 --- snscrape/modules/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 33df8eb..a59e074 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1588,6 +1588,8 @@ class TwitterSearchScraper(_TwitterAPIScraper): kwargs['maxEmptyPages'] = maxEmptyPages super().__init__(baseUrl = 'https://twitter.com/search?' 
+ urllib.parse.urlencode({'f': 'live', 'lang': 'en', 'q': query, 'src': 'spelling_expansion_revert_click'}), **kwargs) self._query = query # Note: may get replaced by subclasses when using user ID resolution + if cursor is not None: + warnings.warn('the `cursor` argument is deprecated', snscrape.base.DeprecatedFeatureWarning, stacklevel = 2) self._cursor = cursor if top is not None: replacement = f'{__name__}.TwitterSearchScraperMode.' + ('TOP' if top else 'LIVE') @@ -1661,7 +1663,7 @@ class TwitterSearchScraper(_TwitterAPIScraper): @classmethod def _cli_setup_parser(cls, subparser): - subparser.add_argument('--cursor', metavar = 'CURSOR') + subparser.add_argument('--cursor', metavar = 'CURSOR', help = '(deprecated)') group = subparser.add_mutually_exclusive_group(required = False) group.add_argument('--top', action = 'store_true', default = False, help = 'Search top tweets instead of live/chronological') group.add_argument('--user', action = 'store_true', default = False, help = 'Search users instead of tweets') From 1c3a592415f59fe71ddf38c01615d263fc3189de Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 1 Apr 2023 22:08:20 +0000 Subject: [PATCH 92/94] Fix KeyError on broadcast cards with incomplete broadcaster user data Fixes #810 --- snscrape/modules/twitter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index a59e074..324bb02 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1135,7 +1135,10 @@ class _TwitterAPIScraper(snscrape.base.Scraper): keyKwargMap = {**keyKwargMap, 'id': 'id', 'url': 'url', 'title': 'title', 'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'} kwargs = _kwargs_from_map(keyKwargMap) if 'broadcaster_twitter_id' in bindingValues: - kwargs['broadcaster'] = User(id = int(bindingValues['broadcaster_twitter_id']), username = 
bindingValues['broadcaster_username'], displayname = bindingValues['broadcaster_display_name']) + if int(bindingValues['broadcaster_twitter_id']) in userRefs: + kwargs['broadcaster'] = userRefs[int(bindingValues['broadcaster_twitter_id'])] + else: + kwargs['broadcaster'] = User(id = int(bindingValues['broadcaster_twitter_id']), username = bindingValues['broadcaster_username'], displayname = bindingValues['broadcaster_display_name']) if 'siteUser' not in kwargs: kwargs['siteUser'] = None if cardName == '745291183405076480:broadcast': From 7186c833ddca39be74786c1c0d853b84405445ed Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 3 Apr 2023 02:35:05 +0000 Subject: [PATCH 93/94] Move dict remapping helper to utils module --- snscrape/modules/twitter.py | 41 +++++++++++++++++-------------------- snscrape/utils.py | 4 ++++ 2 files changed, 23 insertions(+), 22 deletions(-) create mode 100644 snscrape/utils.py diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 324bb02..8c66fe4 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -31,6 +31,7 @@ import os import re import requests.adapters import snscrape.base +import snscrape.utils import string import time import typing @@ -1004,10 +1005,6 @@ class _TwitterAPIScraper(snscrape.base.Scraper): def _make_card(self, card, apiType, tweetId): bindingValues = {} - def _kwargs_from_map(keyKwargMap): - nonlocal bindingValues - return {kwarg: bindingValues[key] for key, kwarg in keyKwargMap.items() if key in bindingValues} - userRefs = {} if apiType is _TwitterAPIType.V2: for o in card.get('users', {}).values(): @@ -1069,7 +1066,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): cardName = card['legacy']['name'] if cardName in ('summary', 'summary_large_image', 'app', 'direct_store_link_app'): - keyKwargMap = { + keyMap = { 'title': 'title', 'description': 'description', 'card_url': 'url', @@ -1077,13 +1074,13 @@ class _TwitterAPIScraper(snscrape.base.Scraper): 
'creator': 'creatorUser', } if cardName in ('app', 'direct_store_link_app'): - keyKwargMap['thumbnail_original'] = 'thumbnailUrl' - return AppCard(**_kwargs_from_map(keyKwargMap)) + keyMap['thumbnail_original'] = 'thumbnailUrl' + return AppCard(**snscrape.utils.dict_map(bindingValues, keyMap)) else: - keyKwargMap['thumbnail_image_original'] = 'thumbnailUrl' - return SummaryCard(**_kwargs_from_map(keyKwargMap)) + keyMap['thumbnail_image_original'] = 'thumbnailUrl' + return SummaryCard(**snscrape.utils.dict_map(bindingValues, keyMap)) elif any(cardName.startswith(x) for x in ('poll2choice_', 'poll3choice_', 'poll4choice_')) and cardName.split('_', 1)[1] in ('text_only', 'image', 'video'): - kwargs = _kwargs_from_map({'end_datetime_utc': 'endDate', 'last_updated_datetime_utc': 'lastUpdateDate', 'duration_minutes': 'duration', 'counts_are_final': 'finalResults'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'end_datetime_utc': 'endDate', 'last_updated_datetime_utc': 'lastUpdateDate', 'duration_minutes': 'duration', 'counts_are_final': 'finalResults'}) options = [] for key in sorted(bindingValues): @@ -1107,9 +1104,9 @@ class _TwitterAPIScraper(snscrape.base.Scraper): return PollCard(**kwargs) elif cardName == 'player': - return PlayerCard(**_kwargs_from_map({'title': 'title', 'description': 'description', 'card_url': 'url', 'player_image_original': 'imageUrl', 'site': 'siteUser'})) + return PlayerCard(**snscrape.utils.dict_map(bindingValues, {'title': 'title', 'description': 'description', 'card_url': 'url', 'player_image_original': 'imageUrl', 'site': 'siteUser'})) elif cardName in ('promo_image_convo', 'promo_video_convo'): - kwargs = _kwargs_from_map({'thank_you_text': 'thankYouText', 'thank_you_url': 'thankYouUrl', 'thank_you_shortened_url': 'thankYouTcoUrl'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'thank_you_text': 'thankYouText', 'thank_you_url': 'thankYouUrl', 'thank_you_shortened_url': 'thankYouTcoUrl'}) kwargs['actions'] = [] for l in ('one', 
'two', 'three', 'four'): if f'cta_{l}' in bindingValues: @@ -1128,12 +1125,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['medium'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds'])) return PromoConvoCard(**kwargs) elif cardName in ('745291183405076480:broadcast', '3691233323:periscope_broadcast'): - keyKwargMap = {'broadcast_state': 'state', 'broadcast_source': 'source', 'site': 'siteUser'} + keyMap = {'broadcast_state': 'state', 'broadcast_source': 'source', 'site': 'siteUser'} if cardName == '745291183405076480:broadcast': - keyKwargMap = {**keyKwargMap, 'broadcast_id': 'id', 'broadcast_url': 'url', 'broadcast_title': 'title', 'broadcast_thumbnail_original': 'thumbnailUrl'} + keyMap = {**keyMap, 'broadcast_id': 'id', 'broadcast_url': 'url', 'broadcast_title': 'title', 'broadcast_thumbnail_original': 'thumbnailUrl'} else: - keyKwargMap = {**keyKwargMap, 'id': 'id', 'url': 'url', 'title': 'title', 'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'} - kwargs = _kwargs_from_map(keyKwargMap) + keyMap = {**keyMap, 'id': 'id', 'url': 'url', 'title': 'title', 'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'} + kwargs = snscrape.utils.dict_map(bindingValues, keyMap) if 'broadcaster_twitter_id' in bindingValues: if int(bindingValues['broadcaster_twitter_id']) in userRefs: kwargs['broadcaster'] = userRefs[int(bindingValues['broadcaster_twitter_id'])] @@ -1147,17 +1144,17 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['totalParticipants'] = int(kwargs['totalParticipants']) return PeriscopeBroadcastCard(**kwargs) elif cardName == '745291183405076480:live_event': - kwargs = _kwargs_from_map({'event_id': 'id', 'event_title': 'title', 'event_category': 'category', 'event_subtitle': 'description'}) + kwargs = 
snscrape.utils.dict_map(bindingValues, {'event_id': 'id', 'event_title': 'title', 'event_category': 'category', 'event_subtitle': 'description'}) kwargs['id'] = int(kwargs['id']) kwargs['photo'] = Photo(previewUrl = bindingValues['event_thumbnail_small'], fullUrl = bindingValues.get('event_thumbnail_original') or bindingValues['event_thumbnail']) return EventCard(event = Event(**kwargs)) elif cardName == '3337203208:newsletter_publication': - kwargs = _kwargs_from_map({'newsletter_title': 'title', 'newsletter_description': 'description', 'newsletter_image_original': 'imageUrl', 'card_url': 'url', 'revue_account_id': 'revueAccountId', 'issue_count': 'issueCount'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'newsletter_title': 'title', 'newsletter_description': 'description', 'newsletter_image_original': 'imageUrl', 'card_url': 'url', 'revue_account_id': 'revueAccountId', 'issue_count': 'issueCount'}) kwargs['revueAccountId'] = int(kwargs['revueAccountId']) kwargs['issueCount'] = int(kwargs['issueCount']) return NewsletterCard(**kwargs) elif cardName == '3337203208:newsletter_issue': - kwargs = _kwargs_from_map({ + kwargs = snscrape.utils.dict_map(bindingValues, { 'newsletter_title': 'newsletterTitle', 'newsletter_description': 'newsletterDescription', 'issue_title': 'issueTitle', @@ -1179,7 +1176,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): ), ) elif cardName == 'appplayer': - kwargs = _kwargs_from_map({'title': 'title', 'app_category': 'appCategory', 'player_owner_id': 'playerOwnerId', 'site': 'siteUser'}) + kwargs = snscrape.utils.dict_map(bindingValues, {'title': 'title', 'app_category': 'appCategory', 'player_owner_id': 'playerOwnerId', 'site': 'siteUser'}) kwargs['playerOwnerId'] = int(kwargs['playerOwnerId']) variants = [] variants.append(VideoVariant(contentType = 'application/x-mpegurl', url = bindingValues['player_hls_url'], bitrate = None)) @@ -1189,7 +1186,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): kwargs['video'] = 
Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds'])) return AppPlayerCard(**kwargs) elif cardName == '3691233323:audiospace': - return SpacesCard(**_kwargs_from_map({'card_url': 'url', 'id': 'id'})) + return SpacesCard(**snscrape.utils.dict_map(bindingValues, {'card_url': 'url', 'id': 'id'})) elif cardName == '2586390716:message_me': # Note that the strings in Twitter's JS appear to have an incorrect mapping that then gets changed somewhere in the 1.8 MiB of JS! # cta_1, 3, and 4 should mean 'Message us', 'Send a private message', and 'Send me a private message', but the correct mapping is currently unknown. @@ -1197,7 +1194,7 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if bindingValues['cta'] not in ctas: _logger.warning(f'Unsupported message_me card cta on tweet {tweetId}: {bindingValues["cta"]!r}') return - return MessageMeCard(**_kwargs_from_map({'recipient': 'recipient', 'card_url': 'url'}), buttonText = ctas[bindingValues['cta']]) + return MessageMeCard(**snscrape.utils.dict_map(bindingValues, {'recipient': 'recipient', 'card_url': 'url'}), buttonText = ctas[bindingValues['cta']]) elif cardName == 'unified_card': o = json.loads(bindingValues['unified_card']) kwargs = {} diff --git a/snscrape/utils.py b/snscrape/utils.py new file mode 100644 index 0000000..b184058 --- /dev/null +++ b/snscrape/utils.py @@ -0,0 +1,4 @@ +def dict_map(input, keyMap): + '''Return a new dict from an input dict and a {'input_key': 'output_key'} mapping''' + + return {outputKey: input[inputKey] for inputKey, outputKey in keyMap.items() if inputKey in input} From 3dd9c28e31b8babeb2a187fbae994d9717ded168 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 3 Apr 2023 02:35:26 +0000 Subject: [PATCH 94/94] Add snake_to_camel helper --- snscrape/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/snscrape/utils.py b/snscrape/utils.py index b184058..3150924 100644 
--- a/snscrape/utils.py +++ b/snscrape/utils.py @@ -2,3 +2,15 @@ def dict_map(input, keyMap): '''Return a new dict from an input dict and a {'input_key': 'output_key'} mapping''' return {outputKey: input[inputKey] for inputKey, outputKey in keyMap.items() if inputKey in input} + + +def snake_to_camel(**kwargs): + '''Return a new dict from kwargs with snake_case keys replaced by camelCase''' + + out = {} + for key, value in kwargs.items(): + keyParts = key.split('_') + for i in range(1, len(keyParts)): + keyParts[i] = keyParts[i][:1].upper() + keyParts[i][1:] + out[''.join(keyParts)] = value + return out