Merge branch 'JustAnotherArchivist:master' into master

This commit is contained in:
KΞVIN KΞLCHΞ
2023-03-01 22:49:43 +03:00
committed by GitHub
3 changed files with 43 additions and 31 deletions

View File

@@ -8,7 +8,7 @@ The following services are currently supported:
* Mastodon: user profiles and toots (single or thread)
* Reddit: users, subreddits, and searches (via Pushshift)
* Telegram: channels
* Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
* Twitter: users, user profiles, hashtags, searches (live tweets, top tweets, and users), tweets (single or surrounding thread), list posts, communities, and trends
* VKontakte: user profiles
* Weibo (Sina Weibo): user profiles

View File

@@ -6,6 +6,7 @@ import datetime
import importlib.metadata
import inspect
import logging
import os
import requests
# Imported in parse_args() after setting up the logger:
#import snscrape.base
@@ -307,32 +308,36 @@ def main():
i = 0
with _dump_locals_on_exception():
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(entity.json())
try:
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(entity.json())
else:
print(entity)
if args.maxResults == 0:
logger.info('Exiting after 0 results')
return
for i, item in enumerate(scraper.get_items(), start = 1):
if args.since is not None and item.date < args.since:
logger.info(f'Exiting due to reaching older results than {args.since}')
break
if args.jsonl:
print(item.json())
elif args.format is not None:
print(args.format.format(item))
else:
print(item)
if args.progress and i % 100 == 0:
print(f'Scraping, {i} results so far', file = sys.stderr)
if args.maxResults and i >= args.maxResults:
logger.info(f'Exiting after {i} results')
if args.progress:
print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
break
else:
print(entity)
if args.maxResults == 0:
logger.info('Exiting after 0 results')
return
for i, item in enumerate(scraper.get_items(), start = 1):
if args.since is not None and item.date < args.since:
logger.info(f'Exiting due to reaching older results than {args.since}')
break
if args.jsonl:
print(item.json())
elif args.format is not None:
print(args.format.format(item))
else:
print(item)
if args.progress and i % 100 == 0:
print(f'Scraping, {i} results so far', file = sys.stderr)
if args.maxResults and i >= args.maxResults:
logger.info(f'Exiting after {i} results')
logger.info(f'Done, found {i} results')
if args.progress:
print(f'Stopped scraping after {i} results due to --max-results', file = sys.stderr)
break
else:
logger.info(f'Done, found {i} results')
if args.progress:
print(f'Finished, {i} results', file = sys.stderr)
print(f'Finished, {i} results', file = sys.stderr)
except BrokenPipeError:
os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
sys.exit(1)

View File

@@ -1412,14 +1412,21 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
#TODO Tombstones will cause a crash here.
kwargs['retweetedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(tweet['retweeted_status_result']['result'])
if 'quoted_status_result' in result:
kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str']))
if 'result' not in result['quoted_status_result']:
_logger.warning(f'quoted_status_result for {tweet["quoted_status_id_str"]} without an actual result on tweet {self._get_tweet_id(tweet)}, using TweetRef')
kwargs['quotedTweet'] = TweetRef(int(tweet['quoted_status_id_str']))
else:
kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quoted_status_result']['result'], tweetId = int(tweet['quoted_status_id_str']))
elif result.get('quotedRefResult'):
if result['quotedRefResult']['result']['__typename'] == 'TweetTombstone':
kwargs['quotedTweet'] = self._graphql_timeline_tweet_item_result_to_tweet(result['quotedRefResult']['result'], tweetId = int(tweet['quoted_status_id_str']))
else:
if result['quotedRefResult']['result']['__typename'] != 'Tweet':
qTweet = result['quotedRefResult']['result']
if result['quotedRefResult']['result']['__typename'] not in ('Tweet', 'TweetWithVisibilityResults'):
_logger.warning(f'Unknown quotedRefResult type {result["quotedRefResult"]["result"]["__typename"]!r} on tweet {self._get_tweet_id(tweet)}, using TweetRef')
kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id']))
elif result['quotedRefResult']['result']['__typename'] == 'TweetWithVisibilityResults':
qTweet = qTweet['tweet']
kwargs['quotedTweet'] = TweetRef(id = int(qTweet['rest_id']))
elif 'quoted_status_id_str' in tweet:
# Omit the TweetRef if this is a retweet and the quoted tweet ID matches the tweet quoted in the retweeted tweet.
if tweet['quoted_status_id_str'] != tweet.get('retweeted_status_result', {}).get('result', {}).get('quoted_status_result', {}).get('result', {}).get('rest_id'):