From d80ad442da19d0ba68775912844cfb4bc6ac10b6 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Sun, 25 Dec 2022 14:59:11 -0800 Subject: [PATCH] specified columns to update, skipped channel lookups to increase speed --- app.py | 2 +- cisticola/base.py | 2 +- cisticola/transformer/telegram_telethon.py | 112 ++++++++++----------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/app.py b/app.py index 892bd30..68b0542 100644 --- a/app.py +++ b/app.py @@ -122,7 +122,7 @@ def retransform(args): logger.info(f"Transforming untransformed posts") controller = get_transformer_controller() - controller.retransform_all(query_kwargs = {'platform': 'Telegram'}) + controller.retransform_all(query_kwargs = {'platform': 'Telegram'}, columns = ['content', 'outlinks']) def init_db(): engine = create_engine(os.environ["DB"]) diff --git a/cisticola/base.py b/cisticola/base.py index 6825569..d94bfd6 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -301,7 +301,7 @@ class Post: elif self.detected_language == 'nl': nlp = nlp_nl else: - logger.info(f"No language model for {self.detected_language}") + # logger.info(f"No language model for {self.detected_language}") nlp = nlp_xx ner_only = True diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index f814712..125ad77 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -132,79 +132,79 @@ class TelegramTelethonTransformer(Transformer): fwd_from = None - if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: - channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() + # if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: + # channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() - if channel is None: - (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) + # if channel is None: + # (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) - if name == "": - logger.info("Trying fallback web interface") - orig_channel = session.query(Channel).filter_by(id=data.channel).first() - if orig_channel.screenname is not None: - name = self.get_name_from_web_interface(orig_channel.screenname, raw['id']) + # if name == "": + # logger.info("Trying fallback web interface") + # orig_channel = session.query(Channel).filter_by(id=data.channel).first() + # if orig_channel.screenname is not None: + # name = self.get_name_from_web_interface(orig_channel.screenname, raw['id']) - channel = Channel( - name=name, - platform_id=raw['fwd_from']['from_id']['channel_id'], - platform=data.platform, - url="https://t.me/s/" + screenname if screenname is not None else "", - screenname=screenname, - category='forwarded', - source=self.__version__, - notes=notes - ) + # channel = Channel( + # name=name, + # platform_id=raw['fwd_from']['from_id']['channel_id'], + # platform=data.platform, + # url="https://t.me/s/" + screenname if screenname is not None else "", + # screenname=screenname, + # category='forwarded', + # source=self.__version__, + # notes=notes + # ) - channel = insert(channel) - logger.info(f"Added {channel}") + # channel = insert(channel) + # logger.info(f"Added {channel}") - fwd_from = channel.id + # fwd_from = channel.id reply_to = None - if raw.get('reply_to'): - reply_to_id = str(raw['reply_to']['reply_to_msg_id']) - post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() - if post is None: - reply_to = -1 - else: - reply_to = post.id + # if raw.get('reply_to'): + # reply_to_id = str(raw['reply_to']['reply_to_msg_id']) + # post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() + # if post is None: + # reply_to = -1 + # else: + # reply_to = post.id mentions = [] - for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: + # for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: - offset = mention_entity['offset'] - length = mention_entity['length'] + # offset = mention_entity['offset'] + # length = mention_entity['length'] - screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() + # screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() - channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() + # channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() - if channel is None: + # if channel is None: - channel = Channel( - name = None, - platform_id = None, - platform = 'Telegram', - url="https://t.me/s/" + screenname, - screenname=screenname, - category='mentioned', - source=self.__version__, - ) + # channel = Channel( + # name = None, + # platform_id = None, + # platform = 'Telegram', + # url="https://t.me/s/" + screenname, + # screenname=screenname, + # category='mentioned', + # source=self.__version__, + # ) - channel = insert(channel) - logger.info(f"Added {channel}") + # channel = insert(channel) + # logger.info(f"Added {channel}") - mentions.append(channel.id) + # mentions.append(channel.id) - channel = session.query(Channel).filter_by(id=int(data.channel)).first() + # channel = session.query(Channel).filter_by(id=int(data.channel)).first() - if channel is not None and channel.url: - url = channel.url.strip('/') + f"/{raw['id']}" - author_username = channel.screenname - else: - url = "" - author_username = "" + # if channel is not None and channel.url: + # url = channel.url.strip('/') + f"/{raw['id']}" + # author_username = channel.screenname + # else: + # url = "" + # author_username = "" return Post( raw_id = data.id, @@ -216,10 +216,10 @@ class TelegramTelethonTransformer(Transformer): date=dateutil.parser.parse(raw['date']), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), - url=url, + url=None, content=add_markdown_links(raw), author_id=raw.get('peer_id', {}).get('channel_id'), - author_username=author_username, + author_username=None, forwarded_from=fwd_from, reply_to=reply_to, mentions = mentions,