Merge branch 'main' into tests-and-docs

This commit is contained in:
Tristan Lee
2023-08-04 09:33:23 -05:00
2 changed files with 35 additions and 24 deletions

View File

@@ -23,11 +23,16 @@ from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Ima
class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.4'
# TODO cache
# cache channels for which we cannot get the name from the web interface
bad_channels = {}
# cache channels for which we have already looked up the name
channels_cache_by_platformid = {}
channels_cache_by_id = {}
channels_cache_by_screenname = {}
# cache the ID of posts that get replies, avoiding database lookups when possible
posts_cache = {}
get_screenname_cache = {}
@@ -159,18 +164,19 @@ class TelegramTelethonTransformer(Transformer):
platform_id=chat["id"],
category=channel.category, # this should be the same as the "parent"
platform=channel.platform, # this should be the same as the "parent"
url="",
url=("https://t.me/s/" + chat["username"]) if "username" in chat else "",
screenname=chat["username"] if "username" in chat else "",
country=channel.country, # this should be the same as the "parent"
influencer=channel.influencer, # this should be the same as the "parent"
public=None,
chat=None,
chat=not chat["broadcast"],
notes=channel.id, # this should be the channel ID of the parent
source="linked_channel"
)
insert(new_chat)
# TODO this method API is chaotic and could be cleaned up
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
@@ -181,7 +187,7 @@ class TelegramTelethonTransformer(Transformer):
fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
# use cache rather than a DB request if possible
# use cache to look up channel instead of a DB request if possible
if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid:
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
@@ -215,11 +221,11 @@ class TelegramTelethonTransformer(Transformer):
reply_to = None
if raw['reply_to']:
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
# use cache rather than a DB request if possible
# use cache to find post ID instead of a DB request, if possible
if (data.channel, reply_to_id) not in self.posts_cache:
session.commit()
flush_posts()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
if post is None:
reply_to = -1
@@ -258,6 +264,7 @@ class TelegramTelethonTransformer(Transformer):
logger.info(f"Added {channel}")
self.channels_cache_by_screenname[screenname.lower()] = channel
channel = self.channels_cache_by_screenname[screenname.lower()]
mentions.append(channel.id)
@@ -316,14 +323,6 @@ def stripped(s):
return lstripped + rstripped
def stripped(s):
"""https://stackoverflow.com/a/29933716"""
lstripped = ''.join(takewhile(str.isspace, s))
rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
return lstripped + rstripped
def add_markdown_links(raw_post):
"""This function is necessary because Telethon's markdown.unparse doesn't
correctly handle trailing whitespace or multi-line links"""

View File

@@ -97,17 +97,29 @@ def sync_channels(args, session):
logger.info(f"Channel found, updating channel {channel}")
was_researcher = channel.source == "researcher"
channel.name = c["name"]
channel.category = c["category"]
channel.platform = c["platform"]
channel.url = c["url"]
channel.screenname = c["screenname"]
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
channel.influencer = c["influencer"]
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
channel.source = c["source"]
# Update only non-empty/none values from the sheet
if c["name"]:
channel.name = c["name"]
if c["category"]:
channel.category = c["category"]
if c["platform"]:
channel.platform = c["platform"]
if c["url"]:
channel.url = c["url"]
if c["screenname"]:
channel.screenname = c["screenname"]
if c["country"]:
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
if c["influencer"]:
channel.influencer = c["influencer"]
if c["public"]:
channel.public = c["public"]
if c["chat"]:
channel.chat = c["chat"]
if c["notes"]:
channel.notes = c["notes"]
if c["source"]:
channel.source = c["source"]
session.flush()
session.commit()