From 21cf6268033110195014231eba9b510c25c651cd Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 21 Feb 2023 22:10:33 +0000 Subject: [PATCH 1/4] Update list of scrapers --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bfe30d3..762c0d4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The following services are currently supported: * Mastodon: user profiles and toots (single or thread) * Reddit: users, subreddits, and searches (via Pushshift) * Telegram: channels -* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends +* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends * VKontakte: user profiles * Weibo (Sina Weibo): user profiles From ea7c6786c2f4e4cd1e757d0d10e7127044158406 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:16:07 +0000 Subject: [PATCH 2/4] Handle TweetWithVisibilityResults on quoted tweets Fixes #604 --- snscrape/modules/twitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8cf0791..d59fc34 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1416,9 +1416,12 @@ class _TwitterAPIScraper(snscrape.base.Scraper): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str'])) else: - if result['quotedRefResult']['result']['__typename'] != 'Tweet': + qTweet = result['quotedRefResult']['result'] + if result['quotedRefResult']['result']['__typename'] not in ('Tweet', 'TweetWithVisibilityResults'): _logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef') - kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id'])) + elif result['quotedRefResult']['result']['__typename'] == 'TweetWithVisibilityResults': + qTweet = qTweet['tweet'] + kwargs['quotedTweet'] = TweetRef(id = int(qTweet['rest_id'])) elif 'quoted_status_id_str' in tweet: # Omit the TweetRef if this is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet. if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'): From 42cb6d8170d424045518930454e7ec462f05f612 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:16:55 +0000 Subject: [PATCH 3/4] Fix crash on quotedRefResult without an actual result Fixes #740 --- snscrape/modules/twitter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index d59fc34..743e2e5 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -1411,7 +1411,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper): #TODO Tombstones will cause a crash here. kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result']) if 'quoted_status_result' in result: - kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) + if 'result' not in result['quoted_status_result']: + _logger.warning(f'quoted_status_result for {tweet["quoted_status_id_str"]} without an actual result on tweet {self._get_tweet_id(tweet)}, using TweetRef') + kwargs['quotedTweet'] = TweetRef(int(tweet['quoted_status_id_str'])) + else: + kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str'])) elif result.get('quotedRefResult'): if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone': kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str'])) From 03ef3debaf9c32d20cf4ff557a24276c34f12005 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 28 Feb 2023 20:20:28 +0000 Subject: [PATCH 4/4] Fix behaviour on SIGPIPE/BrokenPipeError --- snscrape/_cli.py | 59 ++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/snscrape/_cli.py b/snscrape/_cli.py index 9f846b8..c0bb32d 100644 --- a/snscrape/_cli.py +++ b/snscrape/_cli.py @@ -6,6 +6,7 @@ import datetime import importlib.metadata import inspect import logging +import os import requests # Imported in parse_args() after setting up the logger: #import snscrape.base @@ -307,32 +308,36 @@ def main(): i = 0 with _dump_locals_on_exception(): - if args.withEntity and (entity := scraper.entity): - if args.jsonl: - print(entity.json()) + try: + if args.withEntity and (entity := scraper.entity): + if args.jsonl: + print(entity.json()) + else: + print(entity) + if args.maxResults == 0: + logger.info('Exiting after 0 results') + return + for i, item in enumerate(scraper.get_items(), start = 1): + if args.since is not None and item.date < args.since: + logger.info(f'Exiting due to reaching older results than {args.since}') + break + if args.jsonl: + print(item.json()) + elif args.format is not None: + print(args.format.format(item)) + else: + print(item) + if args.progress and i % 100 == 0: + print(f'Scraping, {i} results so far', file = sys.stderr) + if args.maxResults and i >= args.maxResults: + logger.info(f'Exiting after {i} results') + if args.progress: + print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) + break else: - print(entity) - if args.maxResults == 0: - logger.info('Exiting after 0 results') - return - for i, item in enumerate(scraper.get_items(), start = 1): - if args.since is not None and item.date < args.since: - logger.info(f'Exiting due to reaching older results than {args.since}') - break - if args.jsonl: - print(item.json()) - elif args.format is not None: - print(args.format.format(item)) - else: - print(item) - if args.progress and i % 100 == 0: - print(f'Scraping, {i} results so far', file = sys.stderr) - if args.maxResults and i >= args.maxResults: - logger.info(f'Exiting after {i} results') + logger.info(f'Done, found {i} results') if args.progress: - print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr) - break - else: - logger.info(f'Done, found {i} results') - if args.progress: - print(f'Finished, {i} results', file = sys.stderr) + print(f'Finished, {i} results', file = sys.stderr) + except BrokenPipeError: + os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno()) + sys.exit(1)