diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py new file mode 100644 index 0000000..4f6d322 --- /dev/null +++ b/snscrape/modules/reddit.py @@ -0,0 +1,231 @@ +import datetime +import logging +import re +import snscrape.base +import snscrape.version +import string +import time +import typing + + +logger = logging.getLogger(__name__) + + +# Most of these fields should never be None, but due to broken data, they sometimes are anyway... + +class Submission(typing.NamedTuple, snscrape.base.Item): + author: typing.Optional[str] # E.g. submission hf7k6 + created: datetime.datetime + id: str + link: typing.Optional[str] + selftext: typing.Optional[str] + subreddit: typing.Optional[str] # E.g. submission 617p51 + title: str + url: str + + def __str__(self): + return self.url + + +class Comment(typing.NamedTuple, snscrape.base.Item): + author: typing.Optional[str] + body: str + created: datetime.datetime + id: str + parentId: typing.Optional[str] + subreddit: typing.Optional[str] + url: str + + def __str__(self): + return self.url + + +class RedditPushshiftScraper(snscrape.base.Scraper): + def __init__(self, submissions = True, comments = True, before = None, after = None, **kwargs): + super().__init__(**kwargs) + self._submissions = submissions + self._comments = comments + self._before = before + self._after = after + + if not self._submissions and not self._comments: + raise ValueError('At least one of submissions and comments must be True') + + def _handle_rate_limiting(self, r): + if r.status_code == 429: + logger.info('Got 429 response, sleeping') + time.sleep(10) + return False, 'rate-limited' + if r.status_code != 200: + return False, 'non-200 status code' + return True, None + + def _cmp_id(self, id1, id2): + '''Compare two Reddit IDs. Returns -1 if id1 is less than id2, 0 if they are equal, and 1 if id1 is greater than id2. + + id1 and id2 may have prefixes like t1_, but if included, they must be present on both and equal.''' + + if id1.startswith('t') and '_' in id1: + prefix, id1 = id1.split('_', 1) + if not id2.startswith(f'{prefix}_'): + raise ValueError('id2 must have the same prefix as id1') + _, id2 = id2.split('_', 1) + if id1.strip(string.ascii_lowercase + string.digits) != '': + raise ValueError('invalid characters in id1') + if id2.strip(string.ascii_lowercase + string.digits) != '': + raise ValueError('invalid characters in id2') + if len(id1) < len(id2): + return -1 + if len(id1) > len(id2): + return 1 + if id1 < id2: + return -1 + if id1 > id2: + return 1 + return 0 + + def _iter_api(self, url, params = None): + '''Iterate through the Pushshift API using the 'before' parameter and yield the items.''' + lowestIdSeen = None + if params is None: + params = {} + if self._before is not None: + params['before'] = self._before + if self._after is not None: + params['after'] = self._after + params['sort'] = 'desc' + while True: + r = self._get(url, params = params, headers = {'User-Agent': f'snscrape/{snscrape.version.__version__}'}, responseOkCallback = self._handle_rate_limiting) + if r.status_code != 200: + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') + obj = r.json() + if not obj['data'] or (lowestIdSeen is not None and all(self._cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination + break + for d in obj['data']: + if lowestIdSeen is None or self._cmp_id(d['id'], lowestIdSeen) == -1: + yield self._api_obj_to_item(d) + lowestIdSeen = d['id'] + params['before'] = obj["data"][-1]["created_utc"] + 1 + + def _api_obj_to_item(self, d): + cls = Submission if 'title' in d else Comment + + # Pushshift doesn't always return a permalink; sometimes, there's a permalink_url instead, and sometimes there's nothing at all + permalink = d.get('permalink') + if permalink is None: + # E.g. comment dovj2v7 + permalink = d.get('permalink_url') + if permalink is None: + if 'link_id' in d and d['link_id'].startswith('t3_'): # E.g. comment doraazf + if 'subreddit' in d: + permalink = f'/r/{d["subreddit"]}/comments/{d["link_id"][3:]}/_/{d["id"]}/' + else: # E.g. submission 617p51 but can likely happen for comments as well + permalink = f'/comments/{d["link_id"][3:]}/_/{d["id"]}/' + else: + logger.warning(f'Unable to find or construct permalink') + permalink = '/' + + kwargs = { + 'author': d.get('author'), + 'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc), + 'url': f'https://old.reddit.com{permalink}', + 'subreddit': d.get('subreddit'), + } + if cls is Submission: + kwargs['selftext'] = d.get('selftext') or None + kwargs['link'] = (d['url'] if not d['url'].startswith('/') else f'https://old.reddit.com{d["url"]}') if not kwargs['selftext'] else None + if kwargs['link'] == kwargs['url'] or kwargs['url'].replace('//old.reddit.com/', '//www.reddit.com/') == kwargs['link']: + kwargs['link'] = None + kwargs['title'] = d['title'] + kwargs['id'] = f't3_{d["id"]}' + else: + kwargs['body'] = d['body'] + kwargs['parentId'] = d.get('parent_id') + kwargs['id'] = f't1_{d["id"]}' + + return cls(**kwargs) + + def _iter_api_submissions_and_comments(self, params: dict): + # Retrieve both submissions and comments, interleave the results to get a reverse-chronological order + params['size'] = '1000' + if self._submissions: + submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict + else: + submissionsIter = iter(()) + if self._comments: + commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy()) + else: + commentsIter = iter(()) + + try: + tipSubmission = next(submissionsIter) + except StopIteration: + # There are no submissions, just yield comments and return + yield from commentsIter + return + try: + tipComment = next(commentsIter) + except StopIteration: + # There are no comments, just yield submissions and return + yield tipSubmission + yield from submissionsIter + return + + while True: + # Return newer first; if both have the same creation datetime, return the comment first + if tipSubmission.created > tipComment.created: + yield tipSubmission + try: + tipSubmission = next(submissionsIter) + except StopIteration: + # Reached the end of submissions, just yield the remaining comments and stop + yield tipComment + yield from commentsIter + break + else: + yield tipComment + try: + tipComment = next(commentsIter) + except StopIteration: + yield tipSubmission + yield from submissionsIter + break + + @classmethod + def _setup_parser_opts(cls, subparser): + subparser.add_argument('--no-submissions', dest = 'noSubmissions', action = 'store_true', default = False, help = 'Don\'t list submissions') + subparser.add_argument('--no-comments', dest = 'noComments', action = 'store_true', default = False, help = 'Don\'t list comments') + subparser.add_argument('--before', metavar = 'TIMESTAMP', type = int, help = 'Fetch results before a Unix timestamp') + subparser.add_argument('--after', metavar = 'TIMESTAMP', type = int, help = 'Fetch results after a Unix timestamp') + + +def _make_scraper(name_, validationFunc, apiField): + class Scraper(RedditPushshiftScraper): + name = f'reddit-{name_}' + + def __init__(self, name, **kwargs): + super().__init__(**kwargs) + self._name = name + if not validationFunc(self._name): + raise ValueError(f'invalid {name_} name') + + def get_items(self): + yield from self._iter_api_submissions_and_comments({apiField: self._name}) + + @classmethod + def setup_parser(cls, subparser): + super()._setup_parser_opts(subparser) + subparser.add_argument(name_) + + @classmethod + def from_args(cls, args): + return cls(getattr(args, name_), submissions = not args.noSubmissions, comments = not args.noComments, before = args.before, after = args.after, retries = args.retries) + + Scraper.__name__ = f'Reddit{name_.capitalize()}Scraper' + Scraper.__qualname__ = Scraper.__name__ + globals()[Scraper.__name__] = Scraper + + +_make_scraper('user', lambda x: re.match('^[A-Za-z0-9_-]{3,20}$', x), 'author') +_make_scraper('subreddit', lambda x: re.match('^[A-Za-z0-9][A-Za-z0-9_]{2,20}$', x), 'subreddit') +_make_scraper('search', lambda x: True, 'q')