mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Fix post order on VK: reinsert pinned post at the correct location in the stream
This commit is contained in:
@@ -51,15 +51,20 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
self._initialPage = None
|
self._initialPage = None
|
||||||
self._initialPageSoup = None
|
self._initialPageSoup = None
|
||||||
|
|
||||||
|
def _post_div_to_item(self, post):
|
||||||
|
url = urllib.parse.urljoin(self._baseUrl, post.find('a', class_ = 'post_link')['href'])
|
||||||
|
assert url.startswith('https://vk.com/wall') and '_' in url and url[-1] != '_' and url.rsplit('_', 1)[1].strip('0123456789') == ''
|
||||||
|
dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
|
||||||
|
textDiv = post.find('div', class_ = 'wall_post_text')
|
||||||
|
return VKontaktePost(
|
||||||
|
url = url,
|
||||||
|
date = datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc) if 'time' in dateSpan else None,
|
||||||
|
content = textDiv.text if textDiv else None,
|
||||||
|
)
|
||||||
|
|
||||||
def _soup_to_items(self, soup):
|
def _soup_to_items(self, soup):
|
||||||
for post in soup.find_all('div', class_ = 'post'):
|
for post in soup.find_all('div', class_ = 'post'):
|
||||||
dateSpan = post.find('div', class_ = 'post_date').find('span', class_ = 'rel_date')
|
yield self._post_div_to_item(post)
|
||||||
textDiv = post.find('div', class_ = 'wall_post_text')
|
|
||||||
yield VKontaktePost(
|
|
||||||
url = urllib.parse.urljoin(self._baseUrl, post.find('a', class_ = 'post_link')['href']),
|
|
||||||
date = datetime.datetime.fromtimestamp(int(dateSpan['time']), datetime.timezone.utc) if 'time' in dateSpan else None,
|
|
||||||
content = textDiv.text if textDiv else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _initial_page(self):
|
def _initial_page(self):
|
||||||
if self._initialPage is None:
|
if self._initialPage is None:
|
||||||
@@ -92,13 +97,23 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
logger.info('Wall has no posts')
|
logger.info('Wall has no posts')
|
||||||
return
|
return
|
||||||
ownerID = newestPost.attrs['data-post-id'].split('_')[0]
|
ownerID = newestPost.attrs['data-post-id'].split('_')[0]
|
||||||
# If there is a pinned post, we need its ID for the pagination requests
|
# If there is a pinned post, we need its ID for the pagination requests; we also need to keep the post around so it can be inserted into the stream at the right point
|
||||||
if 'post_fixed' in newestPost.attrs['class']:
|
if 'post_fixed' in newestPost.attrs['class']:
|
||||||
fixedPostID = newestPost.attrs['id'].split('_')[1]
|
fixedPostID = int(newestPost.attrs['id'].split('_')[1])
|
||||||
|
fixedPost = self._post_div_to_item(newestPost)
|
||||||
else:
|
else:
|
||||||
fixedPostID = ''
|
fixedPostID = ''
|
||||||
|
fixedPost = None
|
||||||
|
|
||||||
yield from self._soup_to_items(soup)
|
lastPostID = float('infinity')
|
||||||
|
for item in self._soup_to_items(soup.find(id = 'page_wall_posts')):
|
||||||
|
postID = int(item.url.rsplit('_', 1)[1])
|
||||||
|
if postID < lastPostID:
|
||||||
|
if fixedPost is not None and fixedPostID > postID:
|
||||||
|
yield fixedPost
|
||||||
|
fixedPost = None
|
||||||
|
yield item
|
||||||
|
lastPostID = postID
|
||||||
|
|
||||||
headers = self._headers.copy()
|
headers = self._headers.copy()
|
||||||
headers['X-Requested-With'] = 'XMLHttpRequest'
|
headers['X-Requested-With'] = 'XMLHttpRequest'
|
||||||
@@ -119,7 +134,14 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
if not posts.startswith('<div id="post'):
|
if not posts.startswith('<div id="post'):
|
||||||
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
||||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||||
yield from self._soup_to_items(soup)
|
for item in self._soup_to_items(soup):
|
||||||
|
postID = int(item.url.rsplit('_', 1)[1])
|
||||||
|
if postID < lastPostID:
|
||||||
|
if fixedPost is not None and fixedPostID > postID:
|
||||||
|
yield fixedPost
|
||||||
|
fixedPost = None
|
||||||
|
yield item
|
||||||
|
lastPostID = postID
|
||||||
|
|
||||||
def _get_entity(self):
|
def _get_entity(self):
|
||||||
r, soup = self._initial_page()
|
r, soup = self._initial_page()
|
||||||
|
|||||||
Reference in New Issue
Block a user