diff --git a/cisticola/base.py b/cisticola/base.py index c72edd1..aad30b8 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -229,12 +229,27 @@ class Post: #: The ID of the Channel that the post was forwarded or quoted from forwarded_from: int = None - #: The ID of the Post that this Post is a reply to or reblog of + #: The ID of the Post that this Post is a reply to reply_to: int = None #: Other users mentioned in the post mentions: list = field(default_factory=list) + #: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.) + likes: int = None + + #: Number of times the post was forwarded/retweeted/shared + forwards: int = None + + #: Number of times the post was viewed + views: int = None + + #: Video title, if post is a video + video_title: str = None + + #: Video duration in seconds, if post is a video + video_duration: int = None + def hydrate(self): URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(? Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) - soup = BeautifulSoup(raw['body'], features = 'html.parser') - content = soup.find_all('p')[-1].text + if raw['category'] == 'comment': + if raw['parent_id'] is None: + reply_to_id = raw['thread_id'] + else: + reply_to_id = raw['parent_id'] + post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() + if post is None: + if raw['parent_id'] is not None: + # this block is for comments whose parent_ids correspond to deleted comments + post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first() + reply_to = post.id + else: + reply_to = -1 + else: + reply_to = post.id + content = raw['body'].strip() + else: + reply_to = -1 + soup = BeautifulSoup(raw['body'], features = 'html.parser') + soup.find('div', {'class': 'teaser'}).decompose() + soup.find('span', {'class': 'more'}).decompose() + soup.find('span', {'class': 'less hidden'}).decompose() + content = soup.text.strip() transformed = Post( raw_id=data.id, @@ -72,12 +93,19 @@ class BitchuteTransformer(Transformer): date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), - url=raw['url'], + url=raw['url'] if raw['url'] else None, content=content, author_id=raw['author_id'], - author_username=raw['author']) + author_username=raw['author'], + reply_to=reply_to, + hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])), + likes = raw['likes'], + views = int(raw['views']) if raw.get('views') else None, + video_title = raw['subject'], + video_duration = parse_duration_str(raw['length'])) transformed = insert(transformed) + session.flush() def parse_created(created: str, date_archived: datetime) -> datetime: """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime @@ -93,4 +121,11 @@ def parse_created(created: str, date_archived: datetime) -> datetime: _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} - return date_archived - relativedelta(**kwargs) \ No newline at end of file + return date_archived - relativedelta(**kwargs) + +def parse_duration_str(duration_str: str) -> int: + if not duration_str: + return None + else: + duration_list = duration_str.split(':') + return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) \ No newline at end of file diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index 603c05c..275be48 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -3,6 +3,8 @@ from loguru import logger from typing import Generator, Union, Callable import dateutil.parser from datetime import datetime, timezone +from gogettr import PublicClient +from gogettr.api import GettrApiError from cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel @@ -49,7 +51,34 @@ class GettrTransformer(Transformer): raw = json.loads(data.raw_data) if raw["activity"]["action"] == "shares_pst": - forwarded_from = raw["activity"]["uid"] + fwd_from = str(raw["activity"]["uid"]) + channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first() + if channel is None: + try: + client = PublicClient() + profile = client.user_info(fwd_from.lower()) + screenname = profile.get('_id') + channel = Channel( + name=profile.get('nickname'), + platform_id=screenname, + platform=data.platform, + url="https://gettr.com/user/" + screenname, + screenname=screenname, + category='forwarded', + source=self.__version__, + ) + except GettrApiError: + channel = Channel( + name=None, + platform_id=fwd_from, + platform=data.platform, + url="https://gettr.com/user/" + fwd_from, + screenname=fwd_from, + category='forwarded', + source=self.__version__, + ) + channel = insert(channel) + forwarded_from = channel.id else: forwarded_from = None @@ -69,7 +98,11 @@ class GettrTransformer(Transformer): author_username=raw["uid"], hashtags=raw.get("htgs", []), outlinks = list(filter(None, [raw.get("prevsrc")])), - forwarded_from = forwarded_from) + forwarded_from = forwarded_from, + likes = raw.get('lkbpst'), + forwards = raw.get("shbpst"), + views = raw.get('vfpst') + ) insert(transformed) diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index 91ef244..f9671c9 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -61,10 +61,19 @@ class RumbleTransformer(Transformer): url=raw['link'], content=raw['content'], author_id=raw['author_id'], - author_username=raw['author_name']) + author_username=raw['author_name'], + views = _process_number(raw.get('views')), + likes = _process_number(raw.get('rumbles'))) insert(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: - # insert(m) \ No newline at end of file + # insert(m) + +def _process_number(s): + + if s is None: + return None + else: + return int(s.replace(',', '')) \ No newline at end of file diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index 0840cd7..3f9ac02 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -220,7 +220,9 @@ class TelegramTelethonTransformer(Transformer): author_username=author_username, forwarded_from=fwd_from, reply_to=reply_to, - mentions = mentions + mentions = mentions, + forwards = raw.get('forwards'), + views = raw.get('views') ) transformed = insert(transformed)