Merge branch 'main' of https://github.com/bellingcat/cisticola into main

2026-06-08 03:18:34 +03:00 · 2022-06-08 08:11:34 +00:00
parent 3b8b03283a bf96f248c1
commit 708d952937
11 changed files with 1833 additions and 419 deletions
--- a/16
+++ b/16
@@ -24,6 +24,10 @@ ratelimit = "*"
 pytz = "*"
 langdetect = "*"
 spacy = "==3.2.4"
+ocrd-pyexiftool = "*"
+gabber = {git = "https://github.com/stanfordio/gabber.git"}
+snscrape = {git = "https://github.com/bellingcat/snscrape"}
+polyphemus = {git = "https://github.com/bellingcat/polyphemus"}

 [dev-packages]
 pytest = "*"
@@ -39,15 +43,3 @@ python_version = "3.9"

 [pipenv]
 allow_prereleases = true
-
-[packages.polyphemus]
-git = "https://github.com/bellingcat/polyphemus.git"
-
-[packages.PyExifTool]
-git = "https://github.com/smarnach/pyexiftool.git"
-
-[packages.gabber]
-git = "https://github.com/stanfordio/gabber.git"
-
-[packages.snscrape]
-git = "https://github.com/bellingcat/snscrape"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/app.py
+++ b/app.py
@@ -1,13 +1,11 @@
 import argparse
 from loguru import logger
-import gspread
-from sqlalchemy import create_engine, func
+from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 import os
-import time
 import sys

-from cisticola.base import Channel, mapper_registry
+from cisticola.base import mapper_registry
 from cisticola.scraper import (
    ScraperController,
    VkontakteScraper,
@@ -17,85 +15,7 @@ from cisticola.scraper import (
    RumbleScraper,
 )
 from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
-
-
-def sync_channels(args):
-    logger.info("Synchronizing channels")
-
-    session = get_db_session()
-
-    gc = gspread.service_account(filename="service_account.json")
-
-    # Open a sheet from a spreadsheet in one go
-    wks = gc.open_by_url(args.gsheet).worksheet("channels")
-    channels = wks.get_all_records()
-    row = 2
-
-    for c in channels:
-        if c["public"] == "":
-            c["public"] = False
-        if c["chat"] == "":
-            c["chat"] = False
-
-        for k in c.keys():
-            if c[k] == "TRUE" or c[k] == "yes":
-                c[k] = True
-            if c[k] == "FALSE" or c[k] == "no":
-                c[k] = False
-
-            if c[k] == "":
-                c[k] = None
-
-        del c["followers"]
-
-        # add new channel
-        if c["id"] == "" or c["id"] is None:
-            del c["id"]
-
-            # check to see if this already exists,
-            platform_id = None
-            if c["platform_id"] != "":
-                platform_id = c["platform_id"]
-
-            channel = (
-                session.query(Channel)
-                .filter_by(
-                    platform_id=str(platform_id), platform=c["platform"], url=c["url"]
-                )
-                .first()
-            )
-
-            if not channel:
-                channel = Channel(**c, source="researcher")
-                logger.debug(f"{channel} does not exist, adding")
-                session.add(channel)
-                session.flush()
-                session.commit()
-
-                wks.update_cell(row, 1, channel.id)
-                time.sleep(1)
-        else:
-            channel = session.query(Channel).filter_by(id=int(c["id"])).first()
-
-            logger.info(f"Updating channel {channel}")
-            channel.name = c["name"]
-            channel.category = c["category"]
-            channel.platform = c["platform"]
-            channel.url = c["url"]
-            channel.screenname = c["screenname"]
-            channel.country = c["country"]
-            channel.influencer = c["influencer"]
-            channel.public = c["public"]
-            channel.chat = c["chat"]
-            channel.notes = c["notes"]
-
-            session.flush()
-            session.commit()
-
-        row += 1
-
-    session.commit()
-
+from sync_with_gsheet import sync_channels

 def get_db_session():
    engine = create_engine(os.environ["DB"])
@@ -162,6 +82,11 @@ def transform(args):
    controller = get_transformer_controller()
    controller.transform_all_untransformed()

+def transform_info(args):
+    logger.info(f"Transforming untransformed channel info")
+
+    controller = get_transformer_controller()
+    controller.transform_all_untransformed_info()

 def init_db():
    engine = create_engine(os.environ["DB"])
@@ -191,7 +116,7 @@ if __name__ == "__main__":
        init_db()
    elif args.command == "sync-channels":
        logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB")
-        sync_channels(args)
+        sync_channels(args, get_db_session())
    elif args.command == "scrape-channels":
        logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB")
        scrape_channels(args)
@@ -204,5 +129,8 @@ if __name__ == "__main__":
    elif args.command == "transform":
        logger.add("logs/transform.log", level="TRACE", rotation="100 MB")
        transform(args)
+    elif args.command == "transform-info":
+        logger.add("logs/transform-info.log", level="TRACE", rotation="100 MB")
+        transform_info(args)
    else:
        logger.error(f"Unrecognized command {args.command}")
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -114,6 +114,49 @@ class RawChannelInfo:
    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime

+@dataclass
+class ChannelInfo:
+    """A processed set of information about a channel.
+    """
+
+    # Foreign key from the raw_channel_info table
+    raw_channel_info_id: int
+
+    # Foreign ckey from the channels table
+    channel: int
+
+    # platform specific ID of the channel
+    platform_id: str
+
+    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
+    platform: str
+
+    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
+    scraper: str
+
+    #: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
+    transformer: str
+
+    #: attributes extracted from the raw channel info object
+    screenname: str
+    name: str
+    description: str
+    description_url: str
+    description_location: str
+    followers: int
+    following: int
+    verified: bool
+    date_created: datetime
+
+    #: Datetime (relative to UTC) that the scraped channel info was archived at.
+    date_archived: datetime
+    
+    #: Datetime (UTC) that the scraped channel info was transformed at.
+    date_transformed: datetime
+
+    def hydrate(self):
+        pass
+
 nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
 nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
 nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
@@ -149,6 +192,9 @@ class Post:

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime
+
+    #: Datetime (UTC) that the scraped post was transformed at.
+    date_transformed: datetime
    
    #: URL of the original post
    url: str
@@ -189,20 +235,21 @@ class Post:
    def hydrate(self):
        URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

-        urls = re.findall(URL_REGEX, self.content)
+        # replace is here in order to prevent catastrophic backtracking
+        urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
        self.outlinks = urls

        HASHTAG_REGEX = r"(?:^|\s)[＃#]{1}(\w+)"
        
        hashtags = re.findall(HASHTAG_REGEX, self.content)
        self.hashtags = hashtags
-        
+
        # regex patterns for finding crypto addresses
        BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
        ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
        
        self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
-        
+
        try:
            self.detected_language = detect(self.content)
        except LangDetectException:
@@ -330,7 +377,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
                       Column('scraper', String),
                       Column('platform', String),
                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
-                       Column('platform_id', String),
+                       Column('platform_id', String, index=True),
                       Column('date', DateTime, index=True),
                       Column('raw_data', String),
                       Column('date_archived', DateTime, index=True),
@@ -345,6 +392,28 @@ raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                    Column('raw_data', String),
                    Column('date_archived', DateTime, index=True))

+channel_info_table = Table('channel_info', mapper_registry.metadata,
+                    Column('id', Integer, primary_key=True, autoincrement=True),
+                    Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
+                    Column('channel', Integer, ForeignKey('channels.id'), index=True),
+                    Column('platform_id', String),
+                    Column('scraper', String),
+                    Column('platform', String),
+                    Column('transformer', String),
+                    Column('platform', String),
+                    Column('screenname', String),
+                    Column('name', String),
+                    Column('description', String),
+                    Column('description_url', String),
+                    Column('description_location', String),
+                    Column('followers', Integer),
+                    Column('following', Integer),
+                    Column('verified', Boolean),
+                    Column('date_created', DateTime),
+                    Column('date_archived', DateTime, index=True),
+                    Column('date_transformed', DateTime, index=True),
+                    )
+
 channel_table = Table('channels', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True, autoincrement=True),
                    Column('name', String),
@@ -365,19 +434,20 @@ post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
-                       Column('platform_id', Integer),
+                       Column('platform_id', Integer, index=True),
                       Column('scraper', String),
                       Column('transformer', String),
                       Column('platform', String),
                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
                       Column('date', DateTime, index=True),
                       Column('date_archived', DateTime, index=True),
+                       Column('date_transformed', DateTime, index=True),
                       Column('url', String),
                       Column('author_id', String),
                       Column('author_username', String),
                       Column('content', String),
                       Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
-                       Column('reply_to', Integer, ForeignKey('posts.id'), index=True),
+                       Column('reply_to', Integer, ForeignKey('posts.id', ondelete="CASCADE"), index=True),
                       Column('named_entities', JSON),
                       Column('cryptocurrency_addresses', JSON),
                       Column('hashtags', JSON),
@@ -401,6 +471,7 @@ mapper_registry.map_imperatively(Post, post_table)
 mapper_registry.map_imperatively(Channel, channel_table)
 mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
 mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
+mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
 mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
 mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
 mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -6,8 +6,8 @@ from urllib.parse import urlparse
 import requests
 from loguru import logger

-from polyphemus.base import OdyseeChannel
-from polyphemus.api import get_auth_token
+from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
+from polyphemus.api import get_auth_token, get_all_comments
 from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper

@@ -29,40 +29,43 @@ class OdyseeScraper(Scraper):
    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:

        username = self.get_username_from_url(channel.url)
-        odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
+        scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
        
-        all_videos = odysee_channel.get_all_videos()
+        all_videos = scraper.get_all_videos()

        for video in all_videos:
-            if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
+            if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date:
                break

-            url = video.info['streaming_url']
+            url = video.streaming_url
+            if url is None:
+                archived_urls = {}
+            else:
+                archived_urls = {url: None}

-            archived_urls = {url: None}
+                if archive_media:

-            if archive_media:
+                    # Check if file is a video file or an m3u8 file
+                    r = requests.head(url)
+                    if r.headers['Content-Type'] == 'text/html; charset=utf-8':
+                        media_blob, content_type, key = self.m3u8_url_to_blob(url)
+                    else:
+                        media_blob, content_type, key = self.url_to_blob(url)

-                # Check if file is a video file or an m3u8 file
-                r = requests.head(url)
-                if r.headers['Content-Type'] == 'text/html; charset=utf-8':
-                    media_blob, content_type, key = self.m3u8_url_to_blob(url)
-                else:
-                    media_blob, content_type, key = self.url_to_blob(url)
+                    archived_url = self.archive_blob(media_blob, content_type, key)
+                    archived_urls[url] = archived_url

-                archived_url = self.archive_blob(media_blob, content_type, key)
-                archived_urls[url] = archived_url
-
-            all_comments = video.get_all_comments()
+            raw_comment_info_list = get_all_comments(video_id=video.claim_id)
+            all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Odysee",
                channel=channel.id,
-                platform_id=video.info['claim_id'],
-                date=datetime.fromtimestamp(video.info['created']),
+                platform_id=video.claim_id,
+                date=video.created.replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(video.info),
+                raw_data=json.dumps(video.__dict__, default = str),
                archived_urls=archived_urls,
                media_archived=datetime.now(timezone.utc) if archive_media else None)

@@ -72,10 +75,10 @@ class OdyseeScraper(Scraper):
                    scraper=self.__version__,
                    platform="Odysee",
                    channel=channel.id,
-                    platform_id=comment.info['claim_id'],
-                    date=datetime.fromtimestamp(comment.info['created']),
+                    platform_id=comment.claim_id,
+                    date=comment.created.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(),
-                    raw_data=json.dumps(comment.info),
+                    raw_data=json.dumps(comment.__dict__, default = str),
                    archived_urls={},
                    media_archived=datetime.now(timezone.utc))

@@ -109,11 +112,11 @@ class OdyseeScraper(Scraper):
    def get_profile(self, channel: Channel) -> RawChannelInfo:

        username = self.get_username_from_url(channel.url)
-        odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
-        profile = odysee_channel.info
+        scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
+        profile = scraper.get_entity().__dict__

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
-            raw_data=json.dumps(profile),
+            raw_data=json.dumps(profile, default = str),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -20,12 +20,24 @@ class TelegramTelethonScraper(Scraper):
    """An implementation of a Scraper for Telegram, using Telethon library"""
    __version__ = "TelegramTelethonScraper 0.0.2"

-    def get_username_from_url(self, url):
+    def get_username_from_url(url):
        username = url.split('https://t.me/')[1]
        if username.startswith('s/'):
            username = username.split('s/')[1]
        return username

+    def get_channel_identifier(channel: Channel):
+        identifier = channel.platform_id
+        
+        if identifier is None:
+            identifier = channel.screenname
+            if identifier is None:
+                identifier = TelegramTelethonScraper.get_username_from_url(channel.url)
+        else:
+            identifier = int(identifier)
+
+        return identifier
+
    @logger.catch
    def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
        if len(result.archived_urls.keys()) == 0:
@@ -108,14 +120,12 @@ class TelegramTelethonScraper(Scraper):
                return (blob, output_file_with_ext)

    def can_handle(self, channel):
-        if channel.platform == "Telegram" and channel.public:
+        if channel.platform == "Telegram":
            return True

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
-        username = channel.screenname
-        if username is None:
-            username = self.get_username_from_url(channel.url)
+        username = TelegramTelethonScraper.get_channel_identifier(channel)

        api_id = os.environ['TELEGRAM_API_ID']
        api_hash = os.environ['TELEGRAM_API_HASH']
@@ -156,9 +166,7 @@ class TelegramTelethonScraper(Scraper):

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
-        username = channel.screenname
-        if username is None:
-            username = self.get_username_from_url(channel.url)
+        username = TelegramTelethonScraper.get_channel_identifier(channel)

        api_id = os.environ['TELEGRAM_API_ID']
        api_hash = os.environ['TELEGRAM_API_HASH']
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -2,9 +2,11 @@ from typing import List, Generator, Union, Callable
 from loguru import logger
 from sqlalchemy.orm import sessionmaker, make_transient
 from sqlalchemy.engine.base import Engine
+from sqlalchemy.sql.expression import func
 from collections import defaultdict
+from datetime import datetime

-from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry


 class Transformer:
@@ -94,7 +96,7 @@ class ETLController:

        # This is using some adhoc unique constraints that might be worth formalizing at some point
        if type(obj) == Channel:
-            instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
+            instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
            
        elif type(obj) == Post:
            instance = None
@@ -130,9 +132,9 @@ class ETLController:
        if hydrate:
            obj.hydrate()

-        logger.info(f"Inserting new object {obj}")
        session.add(obj)
-        session.flush()
+        logger.trace(f"Inserted new object {obj}")
+
        return obj

    @logger.catch(reraise=True)
@@ -151,21 +153,24 @@ class ETLController:
            logger.error("No DB session")
            return

+        session = self.session()
+
        for result in results:
-            for transformer in self.transformers:
-                handled = False
+            if result.scraper is not None and result.platform is not None:
+                for transformer in self.transformers:
+                    handled = False

-                if transformer.can_handle(result):
-                    logger.trace(f"{transformer} is handling result {result}")
-                    handled = True
-                    session = self.session()
+                    if transformer.can_handle(result):
+                        logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
+                        handled = True

-                    transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
-                    session.commit()
-                    break
+                        transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)

-                if handled == False:
-                    logger.warning(f"No Transformer could handle {result}")
+                        session.commit()
+                        break
+
+                    if handled == False:
+                        logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")

    @logger.catch(reraise=True)
    def transform_all_untransformed(self, hydrate: bool = True):
@@ -183,13 +188,77 @@ class ETLController:
            return

        session = self.session()
-        untransformed = (
-            session.query(ScraperResult)
+
+        BATCH_SIZE = 50000
+        offset = 0
+        batch = []
+
+        query = (session.query(ScraperResult)
            .join(Post, isouter=True)
            .where(Post.raw_id == None)
            .order_by(ScraperResult.date.asc())
-            .all()
        )
-        logger.info(f"Found {len(untransformed)} items to ETL")

-        self.transform_results(untransformed, hydrate=hydrate)
+        while len(batch) > 0 or offset == 0:
+            logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
+
+            batch = query.slice(offset, offset + BATCH_SIZE).all()
+            offset += BATCH_SIZE
+
+            logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
+
+            self.transform_results(batch, hydrate=hydrate)
+
+    @logger.catch(reraise=True)
+    def transform_info(self, results: List[ChannelInfo]):
+        if self.session is None:
+            logger.error("No DB session")
+            return
+
+        session = self.session()
+
+        for result in results:
+            if result.scraper is not None and result.platform is not None:
+                for transformer in self.transformers:
+                    handled = False
+
+                    if transformer.can_handle(result):
+                        logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
+                        handled = True
+
+                        transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session)
+
+                        session.commit()
+                        break
+
+                    if handled == False:
+                        logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
+
+
+    @logger.catch(reraise=True)
+    def transform_all_untransformed_info(self):
+        if self.session is None:
+            logger.error("No DB session")
+            return
+
+        session = self.session()
+
+        BATCH_SIZE = 50000
+        offset = 0
+        batch = []
+
+        query = (session.query(RawChannelInfo)
+            .join(ChannelInfo, isouter=True)
+            .where(ChannelInfo.raw_channel_info_id == None)
+            .order_by(RawChannelInfo.date_archived.asc())
+        )
+
+        while len(batch) > 0 or offset == 0:
+            logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}")
+
+            batch = query.slice(offset, offset + BATCH_SIZE).all()
+            offset += BATCH_SIZE
+
+            logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)")
+
+            self.transform_info(batch)
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -3,15 +3,20 @@ from loguru import logger
 from typing import Generator, Union, Callable
 import dateutil.parser
 from bs4 import BeautifulSoup
+from psycopg2 import DatabaseError
 import requests
 import time
+from telethon.sync import TelegramClient
+from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
+import os
+from datetime import datetime, timezone

 from cisticola.transformer.base import Transformer 
-from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel


 class TelegramTelethonTransformer(Transformer):
-    __version__ = 'TelegramTelethonTransformer 0.0.1'
+    __version__ = 'TelegramTelethonTransformer 0.0.2'

    bad_channels = {}

@@ -22,39 +27,93 @@ class TelegramTelethonTransformer(Transformer):

        return False   

-    def get_screenname_from_id(self, orig_screenname, id):
+    def get_screenname_from_id(self, channel_id):
+        api_id = os.environ['TELEGRAM_API_ID']
+        api_hash = os.environ['TELEGRAM_API_HASH']
+
+        try:
+            with TelegramClient("transform.session", api_id, api_hash) as client:
+                data = client.get_entity(channel_id)
+
+                return (data.username, data.title, "")
+        except ChannelPrivateError:
+            logger.info("ChannelPrivateError")
+            return ("", "", "ChannelPrivateError")
+        except ChannelInvalidError:
+            logger.info("ChannelInvalidError")
+            return ("", "", "ChannelInvalidError")
+        except ValueError:
+            logger.info("ValueError")
+            return ("", "", "ValueError")
+
+    def get_name_from_web_interface(self, orig_screenname, id):
+        url = "https://t.me/s/" + orig_screenname + "/" + str(id)
+
+        # this doesn't work for chat channels
        if orig_screenname in self.bad_channels:
            logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}")
-            return ("", "")
-
-        url = "https://t.me/s/" + orig_screenname + "/" + str(id)
+            return ""

        logger.info(f"Finding channel from URL {url}")
        r = requests.get(url)

        if r.url != url:
            self.bad_channels[orig_screenname] = True
-            return ("", "")
+            return ""

        soup = BeautifulSoup(r.content)
        post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
+        name = ""
+
+        # multiple posts can be combined into one result in the web interface
+        decrement = 0
+        while len(post) == 0:
+            decrement += 1
+            if decrement > 8:
+                break
+
+            logger.info(f"Could not find post from {url}, looking for id {id - decrement}")
+            post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)})
+
        if len(post) == 0:
            logger.warning(f"Could not find post from {url}")
-            screenname = ""
-            name = ""
        else:
            fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})

-            if len(fwd_tag) > 0:
-                fwd_tag = fwd_tag[0]
-                name = fwd_tag.text
-                screenname = fwd_tag['href'].split('/')[-2]
-            else:
+            if len(fwd_tag) == 0:
                fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"})
+            
+            if len(fwd_tag) >= 1:
                name = fwd_tag[0].text
-                screenname = ""

-        return (screenname, name)
+        return name
+
+    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        raw = json.loads(data.raw_data)
+
+        chat_raw = raw['chats'][0]
+
+        transformed = ChannelInfo(
+            raw_channel_info_id=data.id,
+            channel=data.channel,
+            platform_id=raw['full_chat']['id'],
+            platform=data.platform,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            screenname=chat_raw['username'],
+            name=chat_raw['title'],
+            description=raw['full_chat']['about'],
+            description_url='', # does not exist for Telegram
+            description_location='', # does not exist for Telegram
+            followers=raw['full_chat']['participants_count'],
+            following=-1, # does not exist for Telegram
+            verified=False, #does not exist for Telegram
+            date_created=dateutil.parser.parse(chat_raw['date']),
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc)
+        )
+
+        transformed = insert(transformed)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)
@@ -66,32 +125,30 @@ class TelegramTelethonTransformer(Transformer):
        fwd_from = None

        if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
-            channel = session.query(Channel).filter_by(platform_id=raw['fwd_from']['from_id']['channel_id']).first()
+            channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()

            if channel is None:
-                orig_channel = session.query(Channel).filter_by(id=data.channel).first()
-                (screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
+                (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
+
+                if name == "":
+                    logger.info("Trying fallback web interface")
+                    orig_channel = session.query(Channel).filter_by(id=data.channel).first()
+                    if orig_channel.screenname is not None:
+                        name = self.get_name_from_web_interface(orig_channel.screenname, raw['id'])

                channel = Channel(
                    name=name,
                    platform_id=raw['fwd_from']['from_id']['channel_id'],
                    platform=data.platform,
-                    url="https://t.me/s/" + screenname,
+                    url="https://t.me/s/" + screenname if screenname is not None else "",
                    screenname=screenname,
                    category='forwarded',
-                    source=self.__version__
+                    source=self.__version__,
+                    notes=notes
                    )

                channel = insert(channel)
-            elif channel.screenname == "":
-                # if the screenname is empty, we can fill it in
-                orig_channel = session.query(Channel).filter_by(id=data.channel).first()
-                (screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
-
-                channel.screenname = screenname
-                channel.name = name
-                channel.url = "https://t.me/s/" + screenname
-                session.flush()
+                logger.info(f"Added {channel}")

            fwd_from = channel.id

@@ -113,6 +170,7 @@ class TelegramTelethonTransformer(Transformer):
            channel=data.channel,
            date=dateutil.parser.parse(raw['date']),
            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc),
            url="",
            content=raw['message'],
            author_id=raw['post_author'],
@@ -123,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):

        transformed = insert(transformed)

-        for k in data.archived_urls:
-            if data.archived_urls[k]:
-                archived_url = data.archived_urls[k]
-                ext = archived_url.split('.')[-1]
+        # for k in data.archived_urls:
+        #     if data.archived_urls[k]:
+        #         archived_url = data.archived_urls[k]
+        #         ext = archived_url.split('.')[-1]

-                if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
-                    insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
-                else:
-                    insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
+        #         if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
+        #             insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
+        #         else:
+        #             insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))

        
--- a/russian_telegram_ingest.py
+++ b/russian_telegram_ingest.py
@@ -1,138 +0,0 @@
-import sys
-
-from sqlalchemy import create_engine
-from loguru import logger
-
-from cisticola.base import Channel
-from cisticola.scraper import (
-    ScraperController,
-    TelegramSnscrapeScraper)
-
-logger.remove()
-logger.add(sys.stderr, level="INFO")
-logger.add("../russian_telegram_ingest.log")
-
-test_channels = [
-    Channel(
-        id=0, 
-        name="QAnon Россия", 
-        platform_id=-1001319637748,
-        category="Qanon", 
-        followers=94048, 
-        platform="Telegram",
-        url="https://t.me/qanonrus", 
-        screenname="qanonrus", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False,
-        notes=""),
-    Channel(
-        id=1, 
-        name="The Great Awakening | Q", 
-        platform_id=-1001325597521,
-        category="Qanon", 
-        followers=5715,
-        platform="Telegram",
-        url="https://t.me/greatawakin", 
-        screenname="greatawakin", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=2, 
-        name="Великое Пробуждение", 
-        platform_id=-1001285898079,
-        category="Qanon", 
-        followers=5861, 
-        platform="Telegram",
-        url="https://t.me/greatawakeningrus", 
-        screenname="greatawakeningrus", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=3, 
-        name="T🕊Редакция Президент Гордон🕊", 
-        platform_id=-1001101170442,
-        category="Qanon", 
-        followers=5743, 
-        platform="Telegram",
-        url="https://t.me/prezidentgordonteam", 
-        screenname="prezidentgordonteam", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=4, 
-        name="ПРОЕКТ АВРОРА", 
-        platform_id=-1001279171101,
-        category="Qanon", 
-        followers=5930, 
-        platform="Telegram",
-        url="https://t.me/project_aurora", 
-        screenname="project_aurora", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=5, 
-        name="Сон Разума", 
-        platform_id=-1001202338312,
-        category="Qanon", 
-        followers=27099, 
-        platform="Telegram",
-        url="https://t.me/error_288", 
-        screenname="error_288", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=6, 
-        name="Пробуждающий Мир - официальный канал", 
-        platform_id=-1001492521207,
-        category="Qanon", 
-        followers=19097, 
-        platform="Telegram",
-        url="https://t.me/promirru", 
-        screenname="promirru", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=7, 
-        name="ЦЕЛЬНОЗОР", 
-        platform_id=-1001642737506,
-        category="Qanon", 
-        followers=13654, 
-        platform="Telegram",
-        url="https://t.me/tselnozor", 
-        screenname="tselnozor", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),]
-
-controller = ScraperController()
-
-telegram = TelegramSnscrapeScraper()
-controller.register_scraper(telegram)
-
-engine = create_engine('sqlite:///russian_telegram.db')
-controller.connect_to_db(engine)
-
-controller.scrape_channels(test_channels, archive_media = False)
-
--- a/spacy_setup.sh
+++ b/spacy_setup.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pipenv run python -m spacy download xx_ent_wiki_sm
+pipenv run python -m spacy download fr_core_news_sm
+pipenv run python -m spacy download de_core_news_sm
+pipenv run python -m spacy download nl_core_news_sm
+pipenv run python -m spacy download it_core_news_sm
+pipenv run python -m spacy download ru_core_news_sm
+pipenv run python -m spacy download en_core_web_sm
--- a/sync_with_gsheet.py
+++ b/sync_with_gsheet.py
@@ -0,0 +1,134 @@
+import gspread
+import time
+from loguru import logger
+
+from cisticola.base import Channel, ChannelInfo
+
+def sync_channels(args, session):
+    logger.info("Synchronizing channels")
+
+    gc = gspread.service_account(filename="service_account.json")
+
+    # Open a sheet from a spreadsheet in one go
+    wks = gc.open_by_url(args.gsheet).worksheet("channels")
+    channels = wks.get_all_records()
+    row = 2
+
+    for c in channels:
+        # defaults for unset values
+        if c["public"] == "":
+            c["public"] = True
+        if c["chat"] == "":
+            c["chat"] = False
+
+        # normalize the values slightly from the Google Sheet
+        for k in c.keys():
+            if c[k] == "TRUE" or c[k] == "yes":
+                c[k] = True
+            if c[k] == "FALSE" or c[k] == "no":
+                c[k] = False
+
+            if c[k] == "":
+                c[k] = None
+
+        # add new channel
+        if c["id"] == "" or c["id"] is None:
+            del c["id"]
+
+            # check to see if this already exists,
+            platform_id = None
+            if c["platform_id"] != "":
+                platform_id = c["platform_id"]
+
+            channel = (
+                session.query(Channel)
+                .filter_by(
+                    platform_id=str(platform_id), platform=str(c["platform"])
+                )
+                .first()
+            )
+
+            if not channel:
+                channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first()
+
+            if not channel and c["screenname"] != '' and c["screenname"] is not None:
+                channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
+
+            if not channel:
+                channel = Channel(**c, source="researcher")
+                logger.debug(f"{channel} does not exist, adding")
+                session.add(channel)
+                session.flush()
+                session.commit()
+
+                wks.update_cell(row, 1, channel.id)
+                time.sleep(1)
+            else:
+                logger.info(f"Channel found, updating channel {channel}")
+                was_researcher = channel.source == "researcher"
+
+                channel.name = c["name"]
+                channel.category = c["category"]
+                channel.platform = c["platform"]
+                channel.url = c["url"]
+                channel.screenname = c["screenname"]
+                channel.country = c["country"]
+                channel.influencer = c["influencer"]
+                channel.public = c["public"]
+                channel.chat = c["chat"]
+                channel.notes = c["notes"]
+                channel.source = "researcher"
+
+                session.flush()
+                session.commit()
+
+                wks.update_cell(row, 1, channel.id)
+
+                # this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
+                if was_researcher:
+                    logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.")
+                    
+                    wks.format(f"A{str(row)}:A{str(row)}", {
+                        "backgroundColor": {
+                            "red": 1.0,
+                            "green": 0.0,
+                            "blue": 0.0
+                    }})
+
+                time.sleep(1)
+
+        # channel has ID
+        else:
+            cid = int(c["id"])
+
+            channel = session.query(Channel).filter_by(id=cid).first()
+            channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first()
+
+            logger.info(f"Updating channel {channel}")
+            logger.info(channel_info)
+
+            channel.name = c["name"]
+            channel.category = c["category"]
+            channel.platform = c["platform"]
+            channel.url = c["url"]
+            channel.screenname = c["screenname"]
+            channel.country = c["country"]
+            channel.influencer = c["influencer"]
+            channel.public = c["public"]
+            channel.chat = c["chat"]
+            channel.notes = c["notes"]
+            channel.source = "researcher"
+
+            if channel_info:
+                channel.screenname = channel_info.screenname
+                wks.update_cell(row, 7, channel_info.screenname)
+
+                channel.platform_id = channel_info.platform_id
+                wks.update_cell(row, 3, channel_info.platform_id)
+
+            session.flush()
+            session.commit()
+
+        row += 1
+
+    session.commit()