Refactor post extraction of VK again to work around their weird behaviours

VK doesn't always return posts in chronological order, so post ordering can't be used to filter out duplicates. Instead, remember the last 1000 post IDs and filter against those. This should catch the vast majority of duplicates. (Also, duplicates don't only happen in the geoblocking workaround; sometimes, VK simply returns the same post again for no obvious reason.)
This commit is contained in:
JustAnotherArchivist
2020-09-12 02:00:50 +00:00
parent 8265ffc19e
commit f296f9d21d

View File

@@ -1,4 +1,5 @@
import bs4 import bs4
import collections
import datetime import datetime
import itertools import itertools
import logging import logging
@@ -100,25 +101,20 @@ class VKontakteUserScraper(snscrape.base.Scraper):
# If there is a pinned post, we need its ID for the pagination requests; we also need to keep the post around so it can be inserted into the stream at the right point # If there is a pinned post, we need its ID for the pagination requests; we also need to keep the post around so it can be inserted into the stream at the right point
if 'post_fixed' in newestPost.attrs['class']: if 'post_fixed' in newestPost.attrs['class']:
fixedPostID = int(newestPost.attrs['id'].split('_')[1]) fixedPostID = int(newestPost.attrs['id'].split('_')[1])
fixedPost = self._post_div_to_item(newestPost)
else: else:
fixedPostID = '' fixedPostID = ''
fixedPost = None
lastPostID = float('infinity') last1000PostIDs = collections.deque(maxlen = 1000)
def _process_soup(soup): def _process_soup(soup):
nonlocal fixedPost, lastPostID nonlocal last1000PostIDs
for item in self._soup_to_items(soup): for item in self._soup_to_items(soup):
postID = int(item.url.rsplit('_', 1)[1]) postID = int(item.url.rsplit('_', 1)[1])
if postID < lastPostID: if postID not in last1000PostIDs:
if fixedPost is not None and fixedPostID > postID:
yield fixedPost
fixedPost = None
yield item yield item
lastPostID = postID last1000PostIDs.append(postID)
yield from _process_soup(soup.find(id = 'page_wall_posts')) yield from _process_soup(soup)
lastWorkingOffset = 0 lastWorkingOffset = 0
for offset in itertools.count(start = 10, step = 10): for offset in itertools.count(start = 10, step = 10):