From 3bb5af11e6e5c8e59412f74f0ffad30415b630e3 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 26 Oct 2022 08:16:49 -0500 Subject: [PATCH 1/3] changed ORM and Google Sheet sync to reflect converting channels.country to JSONB array, added index for detected_language --- cisticola/base.py | 5 +++-- sync_with_gsheet.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index 01df6f4..4e0b77f 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -7,6 +7,7 @@ import io from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index +from sqlalchemy.dialects.postgresql import JSONB import pytesseract import PIL import exiftool @@ -475,7 +476,7 @@ channel_table = Table('channels', mapper_registry.metadata, Column('platform', String), Column('url', String), Column('screenname', String), - Column('country', String), + Column('country', JSONB), Column('influencer', String), Column('public', Boolean), Column('chat', Boolean), @@ -511,7 +512,7 @@ post_table = Table('posts', mapper_registry.metadata, Column('views', Integer), Column('video_title', String), Column('video_duration', Integer), - Column('detected_language', String), + Column('detected_language', String, index = True), Column('normalized_content', String) ) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index 1c15fbe..fcd848a 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -4,6 +4,14 @@ from loguru import logger from cisticola.base import Channel, ChannelInfo +def standardize_country(s): + _s = s.split('(')[0].split('?')[0] + if _s == 'AUS': + return 'AU' + else: + return _s.strip() + + def sync_channels(args, session): logger.info("Synchronizing channels") @@ -73,7 +81,7 @@ def sync_channels(args, session): channel.platform = c["platform"] channel.url = c["url"] channel.screenname = c["screenname"] - channel.country = c["country"] + channel.country = list(map(standardize_country, c["country"].split('/'))) channel.influencer = c["influencer"] channel.public = c["public"] channel.chat = c["chat"] @@ -114,7 +122,7 @@ def sync_channels(args, session): channel.platform = c["platform"] channel.url = c["url"] channel.screenname = c["screenname"] - channel.country = c["country"] + channel.country = list(map(standardize_country, c["country"].split('/'))) channel.influencer = c["influencer"] channel.public = c["public"] channel.chat = c["chat"] From 5a53ebacd0b45e2dfb2ed16564d8e0b2010d96b4 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 26 Oct 2022 08:22:13 -0500 Subject: [PATCH 2/3] removed special case --- sync_with_gsheet.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sync_with_gsheet.py b/sync_with_gsheet.py index fcd848a..5f939f1 100644 --- a/sync_with_gsheet.py +++ b/sync_with_gsheet.py @@ -6,10 +6,7 @@ from cisticola.base import Channel, ChannelInfo def standardize_country(s): _s = s.split('(')[0].split('?')[0] - if _s == 'AUS': - return 'AU' - else: - return _s.strip() + return _s.strip() def sync_channels(args, session): From d9e2250c5abbc0a0f18aacfd393169572afa41af Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 26 Oct 2022 08:42:35 -0500 Subject: [PATCH 3/3] added country index --- cisticola/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cisticola/base.py b/cisticola/base.py index 4e0b77f..6825569 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -476,7 +476,7 @@ channel_table = Table('channels', mapper_registry.metadata, Column('platform', String), Column('url', String), Column('screenname', String), - Column('country', JSONB), + Column('country', JSONB, index = True), Column('influencer', String), Column('public', Boolean), Column('chat', Boolean),