Work around geoblocked posts on VK

To get around the block, try iterating over post offsets one at a time instead of in steps of 10. This means we should get every post that isn't blocked, as long as there are at least 10 posts between two blocked ones. Fixes #68.
2026-06-10 19:38:29 +03:00 · 2020-09-12 01:24:19 +00:00
parent f8efe98608
commit 8265ffc19e
1 changed files with 44 additions and 27 deletions
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -106,34 +106,9 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 			fixedPost = None

 		lastPostID = float('infinity')
-		for item in self._soup_to_items(soup.find(id = 'page_wall_posts')):
-			postID = int(item.url.rsplit('_', 1)[1])
-			if postID < lastPostID:
-				if fixedPost is not None and fixedPostID > postID:
-					yield fixedPost
-					fixedPost = None
-				yield item
-				lastPostID = postID

-		headers = self._headers.copy()
-		headers['X-Requested-With'] = 'XMLHttpRequest'
-		for offset in itertools.count(start = 10, step = 10):
-			logger.info('Retrieving next page')
-			r = self._post(
-			  'https://vk.com/al_wall.php',
-			  data = [('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset), ('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset)],
-			  headers = headers
-			 )
-			if r.status_code != 200:
-				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
-			# Convert to JSON and read the HTML payload.  Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
-			posts = r.json()['payload'][1][0]
-			if posts.startswith('<div class="page_block no_posts">'):
-				# Reached the end
-				break
-			if not posts.startswith('<div id="post'):
-				raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
-			soup = bs4.BeautifulSoup(posts, 'lxml')
+		def _process_soup(soup):
+			nonlocal fixedPost, lastPostID
 			for item in self._soup_to_items(soup):
 				postID = int(item.url.rsplit('_', 1)[1])
 				if postID < lastPostID:
@@ -143,6 +118,48 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 					yield item
 					lastPostID = postID

+		yield from _process_soup(soup.find(id = 'page_wall_posts'))
+
+		lastWorkingOffset = 0
+		for offset in itertools.count(start = 10, step = 10):
+			posts = self._get_wall_offset(fixedPostID, ownerID, offset)
+			if posts.startswith('<div class="page_block no_posts">'):
+				# Reached the end
+				break
+			if not posts.startswith('<div id="post'):
+				if posts == '"\\/blank.php?block=119910902"':
+					logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
+					for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
+						geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
+						if geoPosts.startswith('<div class="page_block no_posts">'):
+							# No breaking the outer loop, it'll just make one extra request and exit as well
+							break
+						if not geoPosts.startswith('<div id="post'):
+							if geoPosts == '"\\/blank.php?block=119910902"':
+								continue
+							raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
+						yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
+					continue
+				raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
+			lastWorkingOffset = offset
+			soup = bs4.BeautifulSoup(posts, 'lxml')
+			yield from _process_soup(soup)
+
+	def _get_wall_offset(self, fixedPostID, ownerID, offset):
+		headers = self._headers.copy()
+		headers['X-Requested-With'] = 'XMLHttpRequest'
+		logger.info(f'Retrieving page offset {offset}')
+		r = self._post(
+		  'https://vk.com/al_wall.php',
+		  data = [('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset), ('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset)],
+		  headers = headers
+		 )
+		if r.status_code != 200:
+			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
+		# Convert to JSON and read the HTML payload.  Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
+		posts = r.json()['payload'][1][0]
+		return posts
+
 	def _get_entity(self):
 		r, soup = self._initial_page()
 		if r.status_code != 200: