Merge branch 'main' into tests-and-docs

2026-06-07 19:08:35 +03:00 · 2023-08-04 09:33:23 -05:00
parent 30bb4e43e4 d55c13c95d
commit 8421fe7c48
2 changed files with 35 additions and 24 deletions
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -23,11 +23,16 @@ from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Ima
 class TelegramTelethonTransformer(Transformer):
    __version__ = 'TelegramTelethonTransformer 0.0.4'

+    # TODO cache
+    # cache channels for which we cannot get the name from the web interface
    bad_channels = {}
+
+    # cache channels for which we have already looked up the name
    channels_cache_by_platformid = {}
    channels_cache_by_id = {}
    channels_cache_by_screenname = {}

+    # cache the ID of posts that get replies, avoiding database lookups when possible
    posts_cache = {}

    get_screenname_cache = {}
@@ -159,18 +164,19 @@ class TelegramTelethonTransformer(Transformer):
                    platform_id=chat["id"],
                    category=channel.category, # this should be the same as the "parent"
                    platform=channel.platform, # this should be the same as the "parent"
-                    url="",
+                    url=("https://t.me/s/" + chat["username"]) if "username" in chat else "",
                    screenname=chat["username"] if "username" in chat else "",
                    country=channel.country, # this should be the same as the "parent"
                    influencer=channel.influencer, # this should be the same as the "parent"
                    public=None,
-                    chat=None,
+                    chat=not chat["broadcast"],
                    notes=channel.id, # this should be the channel ID of the parent
                    source="linked_channel"
                )

                insert(new_chat)

+    # TODO: this method's API is chaotic and should be cleaned up
    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

@@ -181,7 +187,7 @@ class TelegramTelethonTransformer(Transformer):
        fwd_from = None

        if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
-            # use cache rather than a DB request if possible
+            # use cache to look up channel instead of a DB request if possible
            if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid:
                channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()

@@ -215,11 +221,11 @@ class TelegramTelethonTransformer(Transformer):
        reply_to = None
        if raw['reply_to']:
            reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
-            # use cache rather than a DB request if possible

+            # use cache to find post ID instead of a DB request, if possible
            if (data.channel, reply_to_id) not in self.posts_cache:
                session.commit()
-                flush_posts()
+                flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
                post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
                if post is None:
                    reply_to = -1
@@ -258,6 +264,7 @@ class TelegramTelethonTransformer(Transformer):
                    logger.info(f"Added {channel}")
                
                self.channels_cache_by_screenname[screenname.lower()] = channel
+
            channel = self.channels_cache_by_screenname[screenname.lower()]

            mentions.append(channel.id)
@@ -316,14 +323,6 @@ def stripped(s):

    return lstripped + rstripped

-def stripped(s):
-    """https://stackoverflow.com/a/29933716"""
-
-    lstripped = ''.join(takewhile(str.isspace, s))
-    rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
-
-    return lstripped + rstripped
-
 def add_markdown_links(raw_post):
    """This function is necessary because Telethon's markdown.unparse doesn't 
    correctly handle trailing whitespace or multi-line links"""
--- a/sync_with_gsheet.py
+++ b/sync_with_gsheet.py
@@ -97,17 +97,29 @@ def sync_channels(args, session):
                logger.info(f"Channel found, updating channel {channel}")
                was_researcher = channel.source == "researcher"

-                channel.name = c["name"]
-                channel.category = c["category"]
-                channel.platform = c["platform"]
-                channel.url = c["url"]
-                channel.screenname = c["screenname"]
-                channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
-                channel.influencer = c["influencer"]
-                channel.public = c["public"]
-                channel.chat = c["chat"]
-                channel.notes = c["notes"]
-                channel.source = c["source"]
+                # Update only fields whose sheet value is truthy; None, empty, or False values keep the existing value
+                if c["name"]:
+                    channel.name = c["name"]
+                if c["category"]:
+                    channel.category = c["category"]
+                if c["platform"]:
+                    channel.platform = c["platform"]
+                if c["url"]:
+                    channel.url = c["url"]
+                if c["screenname"]:
+                    channel.screenname = c["screenname"]
+                if c["country"]:
+                    channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
+                if c["influencer"]:
+                    channel.influencer = c["influencer"]
+                if c["public"]:
+                    channel.public = c["public"]
+                if c["chat"]:
+                    channel.chat = c["chat"]
+                if c["notes"]:
+                    channel.notes = c["notes"]
+                if c["source"]:
+                    channel.source = c["source"]

                session.flush()
                session.commit()