diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index e120d3d..159550b 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -23,11 +23,16 @@ from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Ima class TelegramTelethonTransformer(Transformer): __version__ = 'TelegramTelethonTransformer 0.0.4' + # TODO cache + # cache channels for which we cannot get the name from the web interface bad_channels = {} + + # cache channels for which we have already looked up the name channels_cache_by_platformid = {} channels_cache_by_id = {} channels_cache_by_screenname = {} + # cache the ID of posts that get replies, avoiding database lookups when possible posts_cache = {} get_screenname_cache = {} @@ -159,18 +164,19 @@ class TelegramTelethonTransformer(Transformer): platform_id=chat["id"], category=channel.category, # this should be the same as the "parent" platform=channel.platform, # this should be the same as the "parent" - url="", + url=("https://t.me/s/" + chat["username"]) if "username" in chat else "", screenname=chat["username"] if "username" in chat else "", country=channel.country, # this should be the same as the "parent" influencer=channel.influencer, # this should be the same as the "parent" public=None, - chat=None, + chat=not chat["broadcast"], notes=channel.id, # this should be the channel ID of the parent source="linked_channel" ) insert(new_chat) + # TODO this method API is chaotic and could be cleaned up def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) @@ -181,7 +187,7 @@ class TelegramTelethonTransformer(Transformer): fwd_from = None if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: - # use cache rather than a DB request if possible + # use cache to look up channel instead of a DB request if possible if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid: channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() @@ -215,11 +221,11 @@ class TelegramTelethonTransformer(Transformer): reply_to = None if raw['reply_to']: reply_to_id = str(raw['reply_to']['reply_to_msg_id']) - # use cache rather than a DB request if possible + # use cache to find post ID instead of a DB request, if possible if (data.channel, reply_to_id) not in self.posts_cache: session.commit() - flush_posts() + flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() if post is None: reply_to = -1 @@ -258,6 +264,7 @@ class TelegramTelethonTransformer(Transformer): logger.info(f"Added {channel}") self.channels_cache_by_screenname[screenname.lower()] = channel + channel = self.channels_cache_by_screenname[screenname.lower()] mentions.append(channel.id) @@ -316,14 +323,6 @@ def stripped(s): return lstripped + rstripped -def stripped(s): - """https://stackoverflow.com/a/29933716""" - - lstripped = ''.join(takewhile(str.isspace, s)) - rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s))))) - - return lstripped + rstripped - def add_markdown_links(raw_post): """This function is necessary because Telethon's markdown.unparse doesn't correctly handle trailing whitespace or multi-line links""" diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index b1bbc01..501bd6a 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -97,17 +97,29 @@ def sync_channels(args, session): logger.info(f"Channel found, updating channel {channel}") was_researcher = channel.source == "researcher" - channel.name = c["name"] - channel.category = c["category"] - channel.platform = c["platform"] - channel.url = c["url"] - channel.screenname = c["screenname"] - channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/'))) - channel.influencer = c["influencer"] - channel.public = c["public"] - channel.chat = c["chat"] - channel.notes = c["notes"] - channel.source = c["source"] + # Update only non-empty/none values from the sheet + if c["name"]: + channel.name = c["name"] + if c["category"]: + channel.category = c["category"] + if c["platform"]: + channel.platform = c["platform"] + if c["url"]: + channel.url = c["url"] + if c["screenname"]: + channel.screenname = c["screenname"] + if c["country"]: + channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/'))) + if c["influencer"]: + channel.influencer = c["influencer"] + if c["public"]: + channel.public = c["public"] + if c["chat"]: + channel.chat = c["chat"] + if c["notes"]: + channel.notes = c["notes"] + if c["source"]: + channel.source = c["source"] session.flush() session.commit()