From 8702a9c7e2b77e8701f47346cbd10306e116a315 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 7 Feb 2022 04:43:54 +0000 Subject: [PATCH] Add Reddit submission scraper Closes #312 --- snscrape/modules/reddit.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/reddit.py b/snscrape/modules/reddit.py index 07114d7..32b9d6a 100644 --- a/snscrape/modules/reddit.py +++ b/snscrape/modules/reddit.py @@ -1,4 +1,4 @@ -__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper'] +__all__ = ['Submission', 'Comment', 'RedditUserScraper', 'RedditSubredditScraper', 'RedditSearchScraper', 'RedditSubmissionScraper'] import dataclasses @@ -244,3 +244,38 @@ class RedditSearchScraper(_RedditPushshiftSearchScraper): name = 'reddit-search' _validationFunc = lambda x: True _apiField = 'q' + + +class RedditSubmissionScraper(_RedditPushshiftScraper): + name = 'reddit-submission' + + def __init__(self, submissionId, **kwargs): + if (submissionId[3:] if submissionId.startswith('t3_') else submissionId).strip(string.ascii_lowercase + string.digits) != '': + raise ValueError('invalid submissionId') + super().__init__(**kwargs) + self._submissionId = submissionId + + def get_items(self): + obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}') + if not obj['data']: + return + if len(obj['data']) != 1: + raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1') + yield self._api_obj_to_item(obj['data'][0]) + + obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}') + if not obj['data']: + return + commentIds = obj['data'] + for i in range(0, len(commentIds), 500): + ids = commentIds[i : i + 500] + obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}') + yield from map(self._api_obj_to_item, obj['data']) + + @classmethod + def _cli_setup_parser(cls, subparser): + subparser.add_argument('submissionId', type = snscrape.base.nonempty_string('submissionId')) + + @classmethod + def _cli_from_args(cls, args): + return cls._cli_construct(args, args.submissionId)