mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-10 19:38:29 +03:00
Work around geoblocked posts on VK
To get around the block, try iterating over post offsets individually instead of in steps of 10. This means we should get every post that isn't blocked, as long as there are at least 10 posts between two blocked ones. Fixes #68
This commit is contained in:
@@ -106,34 +106,9 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
fixedPost = None
|
||||
|
||||
lastPostID = float('infinity')
|
||||
for item in self._soup_to_items(soup.find(id = 'page_wall_posts')):
|
||||
postID = int(item.url.rsplit('_', 1)[1])
|
||||
if postID < lastPostID:
|
||||
if fixedPost is not None and fixedPostID > postID:
|
||||
yield fixedPost
|
||||
fixedPost = None
|
||||
yield item
|
||||
lastPostID = postID
|
||||
|
||||
headers = self._headers.copy()
|
||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||
for offset in itertools.count(start = 10, step = 10):
|
||||
logger.info('Retrieving next page')
|
||||
r = self._post(
|
||||
'https://vk.com/al_wall.php',
|
||||
data = [('act', 'get_wall'), ('al', 1), ('fixed', fixedPostID), ('offset', offset), ('onlyCache', 'false'), ('owner_id', ownerID), ('type', 'own'), ('wall_start_from', offset)],
|
||||
headers = headers
|
||||
)
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
# Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
|
||||
posts = r.json()['payload'][1][0]
|
||||
if posts.startswith('<div class="page_block no_posts">'):
|
||||
# Reached the end
|
||||
break
|
||||
if not posts.startswith('<div id="post'):
|
||||
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||
def _process_soup(soup):
|
||||
nonlocal fixedPost, lastPostID
|
||||
for item in self._soup_to_items(soup):
|
||||
postID = int(item.url.rsplit('_', 1)[1])
|
||||
if postID < lastPostID:
|
||||
@@ -143,6 +118,48 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
yield item
|
||||
lastPostID = postID
|
||||
|
||||
yield from _process_soup(soup.find(id = 'page_wall_posts'))
|
||||
|
||||
lastWorkingOffset = 0
|
||||
for offset in itertools.count(start = 10, step = 10):
|
||||
posts = self._get_wall_offset(fixedPostID, ownerID, offset)
|
||||
if posts.startswith('<div class="page_block no_posts">'):
|
||||
# Reached the end
|
||||
break
|
||||
if not posts.startswith('<div id="post'):
|
||||
if posts == '"\\/blank.php?block=119910902"':
|
||||
logger.warning(f'Encountered geoblock on offset {offset}, trying to work around the block but might be missing content')
|
||||
for geoblockOffset in range(lastWorkingOffset + 1, offset + 10):
|
||||
geoPosts = self._get_wall_offset(fixedPostID, ownerID, geoblockOffset)
|
||||
if geoPosts.startswith('<div class="page_block no_posts">'):
|
||||
# No breaking the outer loop, it'll just make one extra request and exit as well
|
||||
break
|
||||
if not geoPosts.startswith('<div id="post'):
|
||||
if geoPosts == '"\\/blank.php?block=119910902"':
|
||||
continue
|
||||
raise snscrape.base.ScraperException(f'Got an unknown response: {geoPosts[:200]!r}...')
|
||||
yield from _process_soup(soup = bs4.BeautifulSoup(geoPosts, 'lxml'))
|
||||
continue
|
||||
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
||||
lastWorkingOffset = offset
|
||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||
yield from _process_soup(soup)
|
||||
|
||||
def _get_wall_offset(self, fixedPostID, ownerID, offset):
    """Request one wall page from VK at the given offset and return its payload.

    POSTs a ``get_wall`` action to ``https://vk.com/al_wall.php`` using a copy
    of this scraper's headers with ``X-Requested-With: XMLHttpRequest`` added.
    ``offset`` is sent both as ``offset`` and as ``wall_start_from``.

    Returns ``payload[1][0]`` of the decoded JSON response; callers inspect
    what that string starts with to tell posts from other responses.

    Raises snscrape.base.ScraperException if the HTTP status is not 200.
    """
    # Work on a copy so the extra AJAX header never leaks into self._headers.
    requestHeaders = self._headers.copy()
    requestHeaders['X-Requested-With'] = 'XMLHttpRequest'
    logger.info(f'Retrieving page offset {offset}')

    formData = [
        ('act', 'get_wall'),
        ('al', 1),
        ('fixed', fixedPostID),
        ('offset', offset),
        ('onlyCache', 'false'),
        ('owner_id', ownerID),
        ('type', 'own'),
        ('wall_start_from', offset),
    ]
    response = self._post('https://vk.com/al_wall.php', data = formData, headers = requestHeaders)
    if response.status_code != 200:
        raise snscrape.base.ScraperException(f'Got status code {response.status_code}')

    # Decoding the JSON body also turns the data into a Python (Unicode) string,
    # away from the windows-1251-encoded bytes of the raw response.
    return response.json()['payload'][1][0]
|
||||
|
||||
def _get_entity(self):
|
||||
r, soup = self._initial_page()
|
||||
if r.status_code != 200:
|
||||
|
||||
Reference in New Issue
Block a user