diff --git a/app.py b/app.py index 0ede270..daaedeb 100644 --- a/app.py +++ b/app.py @@ -14,7 +14,13 @@ from cisticola.scraper import ( BitchuteScraper, RumbleScraper, ) -from cisticola.transformer import (ETLController, TelegramTelethonTransformer) +from cisticola.transformer import ( + ETLController, + TelegramTelethonTransformer, + GettrTransformer, + RumbleTransformer, + BitchuteTransformer) + from sync_with_gsheet import sync_channels def get_db_session(): @@ -49,7 +55,10 @@ def get_transformer_controller(): controller = ETLController() controller.connect_to_db(engine) - transformers = [TelegramTelethonTransformer()] + transformers = [TelegramTelethonTransformer(), + BitchuteTransformer(), + GettrTransformer(), + RumbleTransformer()] controller.register_transformers(transformers) diff --git a/cisticola/base.py b/cisticola/base.py index aad30b8..28ff9f8 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -256,6 +256,7 @@ class Post: # replace is here in order to prevent catastrophic backtracking urls = re.findall(URL_REGEX, self.content.replace("::::::::", "")) self.outlinks += urls + self.outlinks = list(dict.fromkeys(self.outlinks)) HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)" diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index b14f756..c8dff36 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -69,7 +69,10 @@ class BitchuteTransformer(Transformer): if raw['parent_id'] is not None: # this block is for comments whose parent_ids correspond to deleted comments post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first() - reply_to = post.id + if post is None: + reply_to = -1 + else: + reply_to = post.id else: reply_to = -1 else: @@ -102,7 +105,7 @@ class BitchuteTransformer(Transformer): likes = raw['likes'], views = int(raw['views']) if raw.get('views') else None, video_title = raw['subject'], - video_duration = 
parse_duration_str(raw['length'])) + video_duration = _parse_duration_str(raw['length'])) transformed = insert(transformed) session.flush() @@ -123,7 +126,9 @@ def parse_created(created: str, date_archived: datetime) -> datetime: return date_archived - relativedelta(**kwargs) -def parse_duration_str(duration_str: str) -> int: +def _parse_duration_str(duration_str: str) -> int: + """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824). + """ if not duration_str: return None else: diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index 275be48..0e0a4d0 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -3,6 +3,7 @@ from loguru import logger from typing import Generator, Union, Callable import dateutil.parser from datetime import datetime, timezone +from sqlalchemy import func from gogettr import PublicClient from gogettr.api import GettrApiError @@ -46,42 +47,55 @@ class GettrTransformer(Transformer): transformed = insert(transformed) + def _get_channel_id(self, username: str, category: str, insert: Callable, session): + + channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), Channel.platform == 'Gettr').first() + + if channel is None: + try: + client = PublicClient() + profile = client.user_info(username.lower()) + screenname = profile.get('_id') + channel = Channel( + name=profile.get('nickname'), + platform_id=screenname, + platform='Gettr', + url="https://gettr.com/user/" + screenname, + screenname=screenname, + category=category, + source=self.__version__, + ) + except GettrApiError: + channel = Channel( + name = None, + platform_id = None, + platform = 'Gettr', + url = None, + screenname=username, + category=category, + source=self.__version__, + notes='GettrApiError' + ) + + channel = insert(channel) + + return channel.id def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw 
= json.loads(data.raw_data) if raw["activity"]["action"] == "shares_pst": - fwd_from = str(raw["activity"]["uid"]) - channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first() - if channel is None: - try: - client = PublicClient() - profile = client.user_info(fwd_from.lower()) - screenname = profile.get('_id') - channel = Channel( - name=profile.get('nickname'), - platform_id=screenname, - platform=data.platform, - url="https://gettr.com/user/" + screenname, - screenname=screenname, - category='forwarded', - source=self.__version__, - ) - except GettrApiError: - channel = Channel( - name=None, - platform_id=fwd_from, - platform=data.platform, - url="https://gettr.com/user/" + fwd_from, - screenname=fwd_from, - category='forwarded', - source=self.__version__, - ) - channel = insert(channel) - forwarded_from = channel.id + forwarded_from = self._get_channel_id( + username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session) else: forwarded_from = None + mentions = [] + for mentioned_user in raw.get("utgs") or []: + mentioned_id = self._get_channel_id( + username = mentioned_user, category = 'mentioned', insert = insert, session = session) + mentions.append(mentioned_id) + transformed = Post( raw_id=data.id, platform_id=raw["_id"], @@ -99,6 +113,7 @@ class GettrTransformer(Transformer): hashtags=raw.get("htgs", []), outlinks = list(filter(None, [raw.get("prevsrc")])), forwarded_from = forwarded_from, + mentions = mentions, likes = raw.get('lkbpst'), forwards = raw.get("shbpst"), views = raw.get('vfpst') diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index f9671c9..79289f5 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -3,6 +3,7 @@ from loguru import logger from typing import Generator, Union, Callable import dateutil.parser from datetime import datetime, timezone +from sqlalchemy import func, JSON, String, cast, text from 
cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel @@ -22,19 +23,30 @@ class RumbleTransformer(Transformer): def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) + if 'id' not in raw: + # The first version of the Rumble ChannelInfo scraper didn't return + # the platform_id, so this is a workaround. + channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first() + if channel is None: + platform_id = None + else: + platform_id = json.loads(channel.raw_data).get('id') + else: + platform_id = raw['id'] + transformed = ChannelInfo( raw_channel_info_id=data.id, channel=data.channel, - platform_id=raw['id'], + platform_id=platform_id, platform=data.platform, scraper=data.scraper, transformer=self.__version__, - screenname=raw['id'], + screenname=platform_id, name=raw['name'], description='', # does not exist for Rumble description_url='', # does not exist for Rumble description_location='', # does not exist for Rumble - followers=raw['subscribers'], + followers=_process_number(raw['subscribers']), following=-1, # does not exist for Rumble verified=raw['verified'], date_created=None, # does not exist for Rumble @@ -63,7 +75,9 @@ class RumbleTransformer(Transformer): author_id=raw['author_id'], author_username=raw['author_name'], views = _process_number(raw.get('views')), - likes = _process_number(raw.get('rumbles'))) + likes = _process_number(raw.get('rumbles')), + video_title = raw['title'], + video_duration=_parse_duration_str(raw['duration'])) insert(transformed) @@ -76,4 +90,18 @@ def _process_number(s): if s is None: return None else: - return int(s.replace(',', '')) \ No newline at end of file + s = 
s.replace(' ', '').replace(',', '') + if s.endswith('M'): + return int(round(float(s[:-1]) * 1e6)) + elif s.endswith('K'): + return int(round(float(s[:-1]) * 1000)) + return int(s) + +def _parse_duration_str(duration_str: str) -> int: + """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824); returns None for an empty or missing string. + """ + if not duration_str: + return None + else: + duration_list = duration_str.split(':') + return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) \ No newline at end of file diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index 3f9ac02..eb48d06 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -131,7 +131,7 @@ class TelegramTelethonTransformer(Transformer): fwd_from = None if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: - channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first() + channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() if channel is None: (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])