mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Fix vkontakte-user pagination
This commit is contained in:
@@ -80,14 +80,15 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
logger.error(f'Got status code {r.status_code}')
|
||||||
return
|
return
|
||||||
fields = r.content.split(b'<!>')
|
# Parse the response as JSON and read the HTML payload. Note that this implicitly decodes the data to a Python string (i.e., Unicode) instead of windows-1251-encoded bytes.
|
||||||
if fields[5].startswith(b'<div class="page_block no_posts">'):
|
posts = r.json()['payload'][1][0]
|
||||||
|
if posts.startswith('<div class="page_block no_posts">'):
|
||||||
# Reached the end
|
# Reached the end
|
||||||
break
|
break
|
||||||
if not fields[5].startswith(b'<div id="post'):
|
if not posts.startswith('<div id="post'):
|
||||||
logger.error(f'Got an unknown response: {fields[5][:200]!r}...')
|
logger.error(f'Got an unknown response: {posts[:200]!r}...')
|
||||||
break
|
break
|
||||||
soup = bs4.BeautifulSoup(fields[5], 'lxml', from_encoding = r.encoding)
|
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||||
yield from self._soup_to_items(soup, baseUrl)
|
yield from self._soup_to_items(soup, baseUrl)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
Reference in New Issue
Block a user