Introduce dedicated IntWithGranularity type and deprecate the direct *Granularity fields

This commit is contained in:
JustAnotherArchivist
2020-10-16 18:20:47 +00:00
parent 1bbe25647a
commit 16ebe8bf48
4 changed files with 50 additions and 38 deletions

View File

@@ -37,9 +37,13 @@ class _JSONDataclass:
def json(self):
'''Convert the object to a JSON string'''
out = dataclasses.asdict(self)
# NOTE(review): the next two loop headers look like the removed and the added side of one diff line; only one of them can belong to the final code -- presumably the list(...) variant, TODO confirm.
for key, value in out.items():
for key, value in list(out.items()): # Modifying the dict below, so make a copy first
if isinstance(value, _JSONDataclass):
# A _JSONDataclass value is replaced by the result of its own json() call.
out[key] = value.json()
elif isinstance(value, IntWithGranularity):
# The plain integer stays under the original key; its granularity goes into a sibling '<key>.granularity' entry, which must not exist yet.
out[key] = int(value)
assert f'{key}.granularity' not in out, f'Granularity collision on {key}.granularity'
out[f'{key}.granularity'] = value.granularity
# json.dumps calls default for values it cannot serialise itself; _json_serialise_datetime is defined outside this view.
return json.dumps(out, default = _json_serialise_datetime)
@@ -65,10 +69,18 @@ class Entity(_JSONDataclass):
pass
Granularity = int
'''Type of fields storing the unit/granularity of numbers.

For example, a granularity of 1000 means that the SNS returned something like '42k' and the last three significant digits are unknown.'''


class IntWithGranularity(int):
    '''A number with an associated granularity

    For example, an IntWithGranularity(42000, 1000) represents a number on the order of 42000 with two significant digits, i.e. something counted with a granularity of 1000.

    The object behaves like the plain int it wraps; the granularity is available as the instance attribute `granularity`.
    '''

    def __new__(cls, value, granularity, *args, **kwargs):
        '''Create the integer from value and attach granularity to it.

        Any further positional or keyword arguments are passed through unchanged to int.__new__.
        '''
        obj = super().__new__(cls, value, *args, **kwargs)
        obj.granularity = granularity
        return obj

    def __reduce__(self):
        '''Return the (callable, args) pair that copy and pickle use to rebuild this object.

        This is needed because __new__ requires the granularity argument, which the default reconstruction of an int subclass would not supply.
        '''
        # type(self) instead of a hard-coded IntWithGranularity, so that an instance of a subclass is rebuilt as that subclass rather than as the base class.
        return (type(self), (int(self), self.granularity))
class URLItem(Item):

View File

@@ -33,12 +33,13 @@ class InstagramPost(snscrape.base.Item):
class User(snscrape.base.Entity):
username: str
name: typing.Optional[str]
followers: int
followersGranularity: snscrape.base.Granularity
following: int
followingGranularity: snscrape.base.Granularity
posts: int
postsGranularity: snscrape.base.Granularity
followers: snscrape.base.IntWithGranularity
following: snscrape.base.IntWithGranularity
posts: snscrape.base.IntWithGranularity
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
def __str__(self):
'''Return the www.instagram.com URL built from this user's username'''
return f'https://www.instagram.com/{self.username}/'
@@ -204,18 +205,15 @@ class InstagramUserScraper(InstagramCommonScraper):
else:
return int(s.replace(',', '')), 1
followers, followersGranularity = parse_num(m.group(1))
following, followingGranularity = parse_num(m.group(2))
posts, postsGranularity = parse_num(m.group(3))
followers = snscrape.base.IntWithGranularity(*parse_num(m.group(1)))
following = snscrape.base.IntWithGranularity(*parse_num(m.group(2)))
posts = snscrape.base.IntWithGranularity(*parse_num(m.group(3)))
return User(
username = m.group(5) or m.group(6),
name = m.group(4) or None,
followers = followers,
followersGranularity = followersGranularity,
following = following,
followingGranularity = followingGranularity,
posts = posts,
postsGranularity = postsGranularity,
)

View File

@@ -43,14 +43,15 @@ class Channel(snscrape.base.Entity):
photo: str
description: typing.Optional[str] = None
members: typing.Optional[int] = None
photos: typing.Optional[int] = None
photosGranularity: typing.Optional[snscrape.base.Granularity] = None
videos: typing.Optional[int] = None
videosGranularity: typing.Optional[snscrape.base.Granularity] = None
links: typing.Optional[int] = None
linksGranularity: typing.Optional[snscrape.base.Granularity] = None
files: typing.Optional[int] = None
filesGranularity: typing.Optional[snscrape.base.Granularity] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
videos: typing.Optional[snscrape.base.IntWithGranularity] = None
links: typing.Optional[snscrape.base.IntWithGranularity] = None
files: typing.Optional[snscrape.base.IntWithGranularity] = None
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
videosGranularity = snscrape.base._DeprecatedProperty('videosGranularity', lambda self: self.videos.granularity, 'videos.granularity')
linksGranularity = snscrape.base._DeprecatedProperty('linksGranularity', lambda self: self.links.granularity, 'links.granularity')
filesGranularity = snscrape.base._DeprecatedProperty('filesGranularity', lambda self: self.files.granularity, 'files.granularity')
def __str__(self):
'''Return the t.me/s/ URL built from this channel's username'''
return f'https://t.me/s/{self.username}'
@@ -186,7 +187,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
# Already extracted more accurately from /channel, skip
continue
elif type_ in ('photos', 'videos', 'links', 'files'):
kwargs[type_], kwargs[f'{type_}Granularity'] = value, granularity
kwargs[type_] = snscrape.base.IntWithGranularity(value, granularity)
return Channel(**kwargs)

View File

@@ -29,16 +29,17 @@ class User(snscrape.base.Entity):
verified: bool
description: typing.Optional[str] = None
websites: typing.Optional[typing.List[str]] = None
followers: typing.Optional[int] = None
followersGranularity: typing.Optional[snscrape.base.Granularity] = None
posts: typing.Optional[int] = None
postsGranularity: typing.Optional[snscrape.base.Granularity] = None
photos: typing.Optional[int] = None
photosGranularity: typing.Optional[snscrape.base.Granularity] = None
tags: typing.Optional[int] = None
tagsGranularity: typing.Optional[snscrape.base.Granularity] = None
following: typing.Optional[int] = None
followingGranularity: typing.Optional[snscrape.base.Granularity] = None
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
following: typing.Optional[snscrape.base.IntWithGranularity] = None
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
def __str__(self):
'''Return the vk.com URL built from this user's username'''
return f'https://vk.com/{self.username}'
@@ -204,16 +205,16 @@ class VKontakteUserScraper(snscrape.base.Scraper):
if label in ('follower', 'post', 'photo', 'tag'):
label = f'{label}s'
if label in ('followers', 'posts', 'photos', 'tags'):
kwargs[label], kwargs[f'{label}Granularity'] = count, granularity
kwargs[label] = snscrape.base.IntWithGranularity(count, granularity)
if (idolsDiv := soup.find('div', id = 'profile_idols')):
if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
kwargs['following'], kwargs['followingGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
kwargs['following'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
# On public pages, this is where followers are listed
if (followersDiv := soup.find('div', id = 'public_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
kwargs['followers'], kwargs['followersGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
return User(**kwargs)