From cb429909d053e2e7ea928884dd87905247a90dce Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 5 Jul 2022 10:21:59 -0500 Subject: [PATCH] added User dataclass as argument to VKontaktePost dataclass --- snscrape/modules/vkontakte.py | 78 +++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index 3193abe..021a395 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -38,11 +38,35 @@ _datePattern = re.compile(r'^(?Ptoday' r'\s+at\s+(?P\d+):(?P\d+)\s+(?P[ap]m)$') +@dataclasses.dataclass +class User(snscrape.base.Entity): + username: str + name: str + verified: bool + description: typing.Optional[str] = None + websites: typing.Optional[typing.List[str]] = None + followers: typing.Optional[snscrape.base.IntWithGranularity] = None + posts: typing.Optional[snscrape.base.IntWithGranularity] = None + photos: typing.Optional[snscrape.base.IntWithGranularity] = None + tags: typing.Optional[snscrape.base.IntWithGranularity] = None + following: typing.Optional[snscrape.base.IntWithGranularity] = None + + followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity') + postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity') + photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity') + tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity') + followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity') + + def __str__(self): + return f'https://vk.com/{self.username}' + + @dataclasses.dataclass class VKontaktePost(snscrape.base.Item): url: str date: typing.Optional[typing.Union[datetime.datetime, datetime.date]] content: str + user: User outlinks: typing.Optional[typing.List[str]] = None photos: typing.Optional[typing.List['Photo']] = None video: typing.Optional['Video'] = None @@ -74,29 +98,6 @@ class Video: thumbUrl: str -@dataclasses.dataclass -class User(snscrape.base.Entity): - username: str - name: str - verified: bool - description: typing.Optional[str] = None - websites: typing.Optional[typing.List[str]] = None - followers: typing.Optional[snscrape.base.IntWithGranularity] = None - posts: typing.Optional[snscrape.base.IntWithGranularity] = None - photos: typing.Optional[snscrape.base.IntWithGranularity] = None - tags: typing.Optional[snscrape.base.IntWithGranularity] = None - following: typing.Optional[snscrape.base.IntWithGranularity] = None - - followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity') - postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity') - photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity') - tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity') - followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity') - - def __str__(self): - return f'https://vk.com/{self.username}' - - class VKontakteUserScraper(snscrape.base.Scraper): name = 'vkontakte-user' @@ -216,14 +217,24 @@ class VKontakteUserScraper(snscrape.base.Scraper): photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None photos.append(Photo(variants = photoVariants, url = photoUrl)) quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None + authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author']) + authorLink = authorHeading.find('a', class_ = ['author', 'copy_author']) + username = authorLink['href'].split('/')[-1] + name = authorLink.text + if authorHeading.find('div', class_ = 'page_verified') is not None: + verified = True + else: + verified = False + user = User(username = username, name = name, verified = verified) return VKontaktePost( - url = url, - date = self._date_span_to_date(dateSpan), - content = textDiv.text if textDiv else None, - outlinks = outlinks or None, - photos = photos or None, - video = video or None, - quotedPost = quotedPost, + url = url, + date = self._date_span_to_date(dateSpan), + content = textDiv.text if textDiv else None, + user = user, + outlinks = outlinks or None, + photos = photos or None, + video = video or None, + quotedPost = quotedPost, ) def _soup_to_items(self, soup): @@ -380,6 +391,13 @@ class VKontakteUserScraper(snscrape.base.Scraper): if (followersDiv := soup.find('div', id = 'public_followers')): if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers': kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text)) + # On community groups, this is where followers are listed + elif (followersDiv := soup.find('div', class_ = 'group_friends_text')): + kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text)) + # On public groups, this is where followers are listed + elif (followersDiv := soup.find('div', id = 'group_followers')): + if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members': + kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text)) return User(**kwargs)