Port Reddit scraper to new Pushshift API

Fixes #619
2026-06-11 03:48:29 +03:00 · 2023-01-13 10:07:58 +00:00
parent 564a5eca77
commit d81d247a87
1 changed files with 26 additions and 32 deletions
--- a/snscrape/modules/reddit.py
+++ b/snscrape/modules/reddit.py
@@ -133,6 +133,21 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):

 		return cls(**kwargs)

+	def _iter_api(self, url, params = None):
+		'''Iterate through the Pushshift API using the 'until' parameter and yield the items.'''
+		lowestIdSeen = None
+		if params is None:
+			params = {}
+		while True:
+			obj = self._get_api(url, params = params)
+			if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination
+				break
+			for d in obj['data']:
+				if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
+					yield self._api_obj_to_item(d)
+					lowestIdSeen = d['id']
+			params['until'] = obj["data"][-1]["created_utc"] + 1
+

 class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
 	def __init__(self, name, *, submissions = True, comments = True, before = None, after = None, **kwargs):
@@ -148,35 +163,20 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
 		if not self._submissions and not self._comments:
 			raise ValueError('At least one of submissions and comments must be True')

-	def _iter_api(self, url, params = None):
-		'''Iterate through the Pushshift API using the 'before' parameter and yield the items.'''
-		lowestIdSeen = None
-		if params is None:
-			params = {}
-		if self._before is not None:
-			params['before'] = self._before
-		if self._after is not None:
-			params['after'] = self._after
-		params['sort'] = 'desc'
-		while True:
-			obj = self._get_api(url, params = params)
-			if not obj['data'] or (lowestIdSeen is not None and all(_cmp_id(d['id'], lowestIdSeen) >= 0 for d in obj['data'])): # end of pagination
-				break
-			for d in obj['data']:
-				if lowestIdSeen is None or _cmp_id(d['id'], lowestIdSeen) == -1:
-					yield self._api_obj_to_item(d)
-					lowestIdSeen = d['id']
-			params['before'] = obj["data"][-1]["created_utc"] + 1
-
 	def _iter_api_submissions_and_comments(self, params: dict):
 		# Retrieve both submissions and comments, interleave the results to get a reverse-chronological order
-		params['size'] = '1000'
+		params['limit'] = '1000'
+		if self._before is not None:
+			params['until'] = self._before
+		if self._after is not None:
+			params['since'] = self._after
+
 		if self._submissions:
-			submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission/', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
+			submissionsIter = self._iter_api('https://api.pushshift.io/reddit/search/submission', params.copy()) # Pass copies to prevent the two iterators from messing each other up by using the same dict
 		else:
 			submissionsIter = iter(())
 		if self._comments:
-			commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment/', params.copy())
+			commentsIter = self._iter_api('https://api.pushshift.io/reddit/search/comment', params.copy())
 		else:
 			commentsIter = iter(())

@@ -260,21 +260,15 @@ class RedditSubmissionScraper(_RedditPushshiftScraper):
 		self._submissionId = submissionId

 	def get_items(self):
-		obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission/?ids={self._submissionId}')
+		obj = self._get_api(f'https://api.pushshift.io/reddit/search/submission?ids={self._submissionId}')
 		if not obj['data']:
 			return
 		if len(obj['data']) != 1:
 			raise snscrape.base.ScraperException(f'Got {len(obj["data"])} results instead of 1')
 		yield self._api_obj_to_item(obj['data'][0])

-		obj = self._get_api(f'https://api.pushshift.io/reddit/submission/comment_ids/{self._submissionId}')
-		if not obj['data']:
-			return
-		commentIds = obj['data']
-		for i in range(0, len(commentIds), 500):
-			ids = commentIds[i : i + 500]
-			obj = self._get_api(f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}')
-			yield from map(self._api_obj_to_item, obj['data'])
+		# Upstream bug: link_id must be provided in decimal https://old.reddit.com/r/pushshift/comments/zkggt0/update_on_colo_switchover_bug_fixes_reindexing/
+		yield from self._iter_api('https://api.pushshift.io/reddit/search/comment', {'link_id': int(self._submissionId, 36), 'limit': 1000})

 	@classmethod
 	def _cli_setup_parser(cls, subparser):