Refactor post extraction of VK again to work around their weird behaviours

VK doesn't always return posts in chronological order, so ordering can't be used to filter out duplicates. Instead, remember the last 1000 post IDs and filter against those. This should catch the vast majority of duplicates. (Also, duplicates don't only happen in the geoblocking workaround; sometimes VK simply returns the same post again for no obvious reason.)
This commit is contained in:
JustAnotherArchivist
2020-09-12 02:00:50 +00:00
parent 8265ffc19e
commit f296f9d21d

View File

@@ -1,4 +1,5 @@
import bs4
import collections
import datetime
import itertools
import logging
@@ -100,25 +101,20 @@ class VKontakteUserScraper(snscrape.base.Scraper):
# If there is a pinned post, we need its ID for the pagination requests; we also need to keep the post around so it can be inserted into the stream at the right point
if 'post_fixed' in newestPost.attrs['class']:
fixedPostID = int(newestPost.attrs['id'].split('_')[1])
fixedPost = self._post_div_to_item(newestPost)
else:
fixedPostID = ''
fixedPost = None
lastPostID = float('infinity')
last1000PostIDs = collections.deque(maxlen = 1000)
def _process_soup(soup):
nonlocal fixedPost, lastPostID
nonlocal last1000PostIDs
for item in self._soup_to_items(soup):
postID = int(item.url.rsplit('_', 1)[1])
if postID < lastPostID:
if fixedPost is not None and fixedPostID > postID:
yield fixedPost
fixedPost = None
if postID not in last1000PostIDs:
yield item
lastPostID = postID
last1000PostIDs.append(postID)
yield from _process_soup(soup.find(id = 'page_wall_posts'))
yield from _process_soup(soup)
lastWorkingOffset = 0
for offset in itertools.count(start = 10, step = 10):