From 8265ffc19e889775d0562b14ce43d4b3a0161825 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 12 Sep 2020 01:24:19 +0000 Subject: [PATCH] Work around geoblocked posts on VK To get around the block, try to iterate over post offsets individually instead of in 10-steps. This means we should get every post that isn't blocked as long as there are at least 10 posts between two blocked ones. Fixes #68 --- snscrape/modules/vkontakte.py | 71 ++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 57eaa6d..c2b95f4 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -106,34 +106,9 @@ class VKontakteUserScraper(snscrape.base.Scraper): fixedPost = None lastPostID = float('infinity') - for item in self._soup_to_items(soup.find(id = 'page_wall_posts')): - postID = int(item.url.rsplit('_', 1)[1]) - if postID < lastPostID: - if fixedPost is not None and fixedPostID > postID: - yield fixedPost - fixedPost = None - yield item - lastPostID = postID - headers = self._headers.copy() - headers['X-Requested-With'] = 'XMLHttpRequest' - for offset in itertools.count(start = 10, step = 10): - logger.info('Retrieving next page') - r = self._post( - 'https://vk.com/al_wall.php', - data = [('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset), ('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset)], - headers = headers - ) - if r.status_code != 200: - raise snscrape.base.ScraperException(f'Got status code {r.status_code}') - # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes. - posts = r.json()['payload'][1][0] - if posts.startswith('
'): - # Reached the end - break - if not posts.startswith('
'): + # Reached the end + break + if not posts.startswith('
'): + # No breaking the outer loop, it'll just make one extra request and exit as well + break + if not geoPosts.startswith('