Merge pull request #62 from bellingcat/other-transformer-fixes

Fixed broken channel_info transformers, added Telegram post transformer fields
2026-06-08 03:18:34 +03:00 · 2022-06-24 11:00:50 +02:00
parent 6e962de244 289a47d7b1
commit fe0f4f9e2c
18 changed files with 438 additions and 86 deletions
--- a/app.py
+++ b/app.py
@@ -14,7 +14,14 @@ from cisticola.scraper import (
    BitchuteScraper,
    RumbleScraper,
 )
-from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
+from cisticola.transformer import (
+    ETLController, 
+    TelegramTelethonTransformer,
+    GettrTransformer, 
+    RumbleTransformer, 
+    BitchuteTransformer,
+    VkontakteTransformer)
+
 from sync_with_gsheet import sync_channels

 def get_db_session():
@@ -49,7 +56,11 @@ def get_transformer_controller():
    controller = ETLController()
    controller.connect_to_db(engine)

-    transformers = [TelegramTelethonTransformer()]
+    transformers = [VkontakteTransformer(),
+        TelegramTelethonTransformer(),
+        GettrTransformer(),
+        BitchuteTransformer(),
+        RumbleTransformer()]

    controller.register_transformers(transformers)

--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -229,20 +229,40 @@ class Post:
    #: The ID of the Channel that the post was forwarded or quoted from
    forwarded_from: int = None
      
-    #: The ID of the Post that this Post is a reply to or reblog of
+    #: The ID of the Post that this Post is a reply to
    reply_to: int = None

+    #: Other users mentioned in the post
+    mentions: list = field(default_factory=list)
+
+    #: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.)
+    likes: int = None
+
+    #: Number of times the post was forwarded/retweeted/shared
+    forwards: int = None
+
+    #: Number of times the post was viewed
+    views: int = None
+
+    #: Video title, if post is a video
+    video_title: str = None
+
+    #: Video duration in seconds, if post is a video
+    video_duration: int = None
+
    def hydrate(self):
        URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|z
m|zw)\b/?(?!@)))"""

        # replace is here in order to prevent catastrophic backtracking
        urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
        self.outlinks += urls
+        self.outlinks =  list(set(outlink for outlink in self.outlinks))

        HASHTAG_REGEX = r"(?:^|\s)[＃#]{1}(\w+)"
        
        hashtags = re.findall(HASHTAG_REGEX, self.content)
        self.hashtags += hashtags
+        self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))

        # regex patterns for finding crypto addresses
        BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
@@ -398,7 +418,6 @@ channel_info_table = Table('channel_info', mapper_registry.metadata,
                    Column('channel', Integer, ForeignKey('channels.id'), index=True),
                    Column('platform_id', String),
                    Column('scraper', String),
-                    Column('platform', String),
                    Column('transformer', String),
                    Column('platform', String),
                    Column('screenname', String),
@@ -434,7 +453,7 @@ post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
-                       Column('platform_id', Integer, index=True),
+                       Column('platform_id', String, index=True),
                       Column('scraper', String),
                       Column('transformer', String),
                       Column('platform', String),
@@ -452,6 +471,12 @@ post_table = Table('posts', mapper_registry.metadata,
                       Column('cryptocurrency_addresses', JSON),
                       Column('hashtags', JSON),
                       Column('outlinks', JSON),
+                       Column('mentions', JSON),
+                       Column('likes', Integer),
+                       Column('forwards', Integer),
+                       Column('views', Integer),
+                       Column('video_title', String),
+                       Column('video_duration', Integer),
                       Column('detected_language', String),
                       Column('normalized_content', String)
                       )
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -105,7 +105,7 @@ class BitchuteScraper(Scraper):
        profile = {
            'description' : description_soup.text.strip(),
            'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
-            'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
+            'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
            'videos' : int(info_list[1].text.split('videos')[0].strip()),
            'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
            'owner_name' : owner_name,
@@ -483,16 +483,4 @@ def decode_cfemail(cfemail):
    for i in range(2, len(cfemail)-1, 2):
        email += chr(int(cfemail[i:i+2], 16)^k)

-    return email
-
-#---------------------------------------------------------------------------#
-
-def parse_created(created):
-
-    period_list = ['year', 'month', 'week', 'day']
-    
-    periods = [period.strip() for period in created.split('ago')[0].strip().split(',')]
-    _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
-    kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} 
-    
-    return datetime.now() - relativedelta(**kwargs)
+    return email
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -166,13 +166,7 @@ class TelegramTelethonScraper(Scraper):
    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        username = TelegramTelethonScraper.get_channel_identifier(channel)
-
-        api_id = os.environ['TELEGRAM_API_ID']
-        api_hash = os.environ['TELEGRAM_API_HASH']
-        phone = os.environ['TELEGRAM_PHONE']
-
-        with TelegramClient(phone, api_id, api_hash) as client:
-            full_channel = client(GetFullChannelRequest(channel = username))
+        full_channel = self.client(GetFullChannelRequest(channel = username))
        profile = full_channel.to_dict()

        return RawChannelInfo(scraper=self.__version__,
--- a/cisticola/transformer/init.py
+++ b/cisticola/transformer/init.py
@@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer
 from .telegram_telethon import TelegramTelethonTransformer
 from .rumble import RumbleTransformer
 from .gettr import GettrTransformer
+from .vkontakte import VkontakteTransformer
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -96,7 +96,7 @@ class ETLController:

        # This is using some adhoc unique constraints that might be worth formalizing at some point
        if type(obj) == Channel:
-            instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
+            instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first()
            
        elif type(obj) == Post:
            instance = None
@@ -133,6 +133,8 @@ class ETLController:
            obj.hydrate()

        session.add(obj)
+        session.flush()
+
        logger.trace(f"Inserted new object {obj}")

        return obj
@@ -169,8 +171,8 @@ class ETLController:
                        session.commit()
                        break

-                    if handled == False:
-                        logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
+                if handled == False:
+                    logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")

    @logger.catch(reraise=True)
    def transform_all_untransformed(self, hydrate: bool = True):
--- a/cisticola/transformer/bitchute.py
+++ b/cisticola/transformer/bitchute.py
@@ -2,7 +2,7 @@ import json
 from loguru import logger
 from typing import Generator, Union, Callable
 from datetime import datetime, timezone
-import dateutil.parser
+from dateutil.relativedelta import relativedelta

 from bs4 import BeautifulSoup 

@@ -12,7 +12,7 @@ from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Me
 class BitchuteTransformer(Transformer):
    """A Bitchute specific ScraperResult, with a method ETL/transforming"""

-    __version__ = "BitchuteTransformer 0.0.1"
+    __version__ = "BitchuteTransformer 0.0.2"

    def can_handle(self, data: ScraperResult) -> bool:
        scraper = data.scraper.split(' ')
@@ -49,7 +49,7 @@ class BitchuteTransformer(Transformer):
            followers=raw['subscribers'],
            following=-1, # does not exist for Bitchute
            verified=False, # does not exist for Bitchute
-            date_created=dateutil.parser.parse(raw['created']),
+            date_created=parse_created(raw['created'], data.date_archived),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )
@@ -59,8 +59,32 @@ class BitchuteTransformer(Transformer):
    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

-        soup = BeautifulSoup(raw['body'], features = 'html.parser')
-        content = soup.find_all('p')[-1].text
+        if raw['category'] == 'comment':
+            if raw['parent_id'] is None:
+                reply_to_id = raw['thread_id']
+            else:
+                reply_to_id = raw['parent_id']
+            post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
+            if post is None:
+                if raw['parent_id'] is not None:
+                    # this block is for comments whose parent_ids correspond to deleted comments 
+                    post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
+                    if post is None:
+                        reply_to = -1
+                    else:
+                        reply_to = post.id
+                else:
+                    reply_to = -1
+            else:
+                reply_to = post.id
+            content = raw['body'].strip()
+        else:
+            reply_to = -1
+            soup = BeautifulSoup(raw['body'], features = 'html.parser')
+            soup.find('div', {'class': 'teaser'}).decompose()
+            soup.find('span', {'class': 'more'}).decompose()
+            soup.find('span', {'class': 'less hidden'}).decompose()
+            content = soup.text.strip()

        transformed = Post(
            raw_id=data.id,
@@ -72,9 +96,41 @@ class BitchuteTransformer(Transformer):
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
-            url=raw['url'],
+            url=raw['url'] if raw['url'] else None,
            content=content,
            author_id=raw['author_id'],
-            author_username=raw['author'])
+            author_username=raw['author'],
+            reply_to=reply_to,
+            hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
+            likes = raw['likes'],
+            views = int(raw['views']) if raw.get('views') else None,
+            video_title = raw['subject'],
+            video_duration = _parse_duration_str(raw['length']))

-        transformed = insert(transformed)
+        transformed = insert(transformed)
+        session.flush()
+
+def parse_created(created: str, date_archived: datetime) -> datetime:
+    """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
+    relative to ``date_archived``; an ISO-format string is parsed directly.
+    """
+    try:
+        # handle case where `created` string has already been parsed into a datetime
+        return datetime.fromisoformat(created)
+    except ValueError:
+        delta_kwargs = {}
+        for period in created.split('ago')[0].split(','):
+            number, unit = period.split()
+            # relativedelta treats singular names (hour=1) as absolute values, so
+            # every unit is pluralised to obtain a relative offset (hours=1).
+            delta_kwargs[unit if unit.endswith('s') else unit + 's'] = int(number)
+        return date_archived - relativedelta(**delta_kwargs)
+
+def _parse_duration_str(duration_str: str) -> int:
+    """Return the length in seconds of a clock-style duration such as '2:27:04'
+    (8824); a falsy duration (``None`` or ``''``) yields ``None``."""
+    if not duration_str:
+        return None
+    # Walk the fields right-to-left so seconds, minutes, hours pair with 1, 60, 3600.
+    fields = duration_str.split(':')[::-1]
+    return sum(int(weight) * int(value) for weight, value in zip((1, 60, 3600), fields))
--- a/cisticola/transformer/gettr.py
+++ b/cisticola/transformer/gettr.py
@@ -3,6 +3,9 @@ from loguru import logger
 from typing import Generator, Union, Callable
 import dateutil.parser
 from datetime import datetime, timezone
+from sqlalchemy import func
+from gogettr import PublicClient
+from gogettr.api import GettrApiError

 from cisticola.transformer.base import Transformer 
 from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -31,28 +34,68 @@ class GettrTransformer(Transformer):
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
-            description=raw['dsc'],
-            description_url=raw['website'],
-            description_location=raw['location'],
-            followers=raw['flg'],
-            following=raw['flw'],
+            description=raw.get('dsc'),
+            description_url=raw.get('website'),
+            description_location=raw.get('location'),
+            followers=int(raw['flg']),
+            following=int(raw['flw']),
            verified=True if raw.get('infl') else False,
-            date_created=datetime.fromtimestamp(raw['cdate']*0.001),
+            date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )

        transformed = insert(transformed)

+    def _get_channel_id(self, username: str, category: str, insert: Callable, session):
+        """Return the id of the Gettr Channel whose screenname matches ``username`` (case-insensitive), inserting a new Channel if none exists."""
+        # Query.filter() accepts only positional SQL expressions (keyword
+        # criteria belong to filter_by()), so platform is compared explicitly.
+        channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), Channel.platform == 'Gettr').first()
+        if channel is None:
+            try:
+                client = PublicClient()
+                profile = client.user_info(username.lower())
+                screenname = profile.get('_id')
+                channel = Channel(
+                    name=profile.get('nickname'),
+                    platform_id=screenname,
+                    platform='Gettr',
+                    url="https://gettr.com/user/" + screenname,
+                    screenname=screenname,
+                    category=category,
+                    source=self.__version__,
+                    )
+            except GettrApiError:
+                # Profile lookup failed: store a placeholder Channel flagged via ``notes``.
+                channel = Channel(
+                    name = None,
+                    platform_id = None,
+                    platform = 'Gettr',
+                    url = None,
+                    screenname=username,
+                    category=category,
+                    source=self.__version__,
+                    notes='GettrApiError'
+                    )
+            channel = insert(channel)
+        return channel.id

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

        if raw["activity"]["action"] == "shares_pst":
-            forwarded_from = raw["activity"]["uid"]
+            forwarded_from = self._get_channel_id(
+                username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
        else:
            forwarded_from = None

+        mentions = []
+        for mentioned_user in raw.get("utgs", []):
+            mentioned_id = self._get_channel_id(
+                username = mentioned_user, category = 'mentioned', insert = insert, session = session)
+            mentions.append(mentioned_id)
+            
        transformed = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
@@ -69,7 +112,12 @@ class GettrTransformer(Transformer):
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            outlinks = list(filter(None, [raw.get("prevsrc")])),
-            forwarded_from = forwarded_from)
+            forwarded_from = forwarded_from,
+            mentions = mentions,
+            likes = raw.get('lkbpst'),
+            forwards = raw.get("shbpst"),
+            views = raw.get('vfpst')
+            )

        insert(transformed)

--- a/cisticola/transformer/rumble.py
+++ b/cisticola/transformer/rumble.py
@@ -3,6 +3,7 @@ from loguru import logger
 from typing import Generator, Union, Callable
 import dateutil.parser
 from datetime import datetime, timezone
+from sqlalchemy import func, JSON, String, cast, text

 from cisticola.transformer.base import Transformer 
 from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -22,19 +23,30 @@ class RumbleTransformer(Transformer):
    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

+        if 'id' not in raw:
+            # The first version of the Rumble ChannelInfo scraper didn't return
+            # the platform_id, so this is a workaround.
+            channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
+            if channel is None:
+                platform_id = None
+            else:
+                platform_id = json.loads(channel.raw_data)['id']
+        else:
+            platform_id = raw['id']
+
        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
-            platform_id=raw['id'],
+            platform_id=platform_id,
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
-            screenname=raw['id'],
+            screenname=platform_id,
            name=raw['name'],
            description='', # does not exist for Rumble
            description_url='', # does not exist for Rumble
            description_location='', # does not exist for Rumble
-            followers=raw['subscribers'],
+            followers=_process_number(raw['subscribers']),
            following=-1, # does not exist for Rumble
            verified=raw['verified'],
            date_created=None, # does not exist for Rumble
@@ -61,10 +73,35 @@ class RumbleTransformer(Transformer):
            url=raw['link'],
            content=raw['content'],
            author_id=raw['author_id'],
-            author_username=raw['author_name'])
+            author_username=raw['author_name'],
+            views = _process_number(raw.get('views')),
+            likes = _process_number(raw.get('rumbles')),
+            video_title = raw['title'],
+            video_duration=_parse_duration_str(raw['duration']))

        insert(transformed)

        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
-        #     insert(m)
+        #     insert(m)
+
+def _process_number(s):
+    """Convert a count such as ``'1.2K'``, ``'3M'`` or ``'1 234'`` to an int.
+    ``None`` is passed through unchanged."""
+    if s is None:
+        return None
+    s = str(s).replace(' ', '')  # str() also tolerates counts already stored as ints
+    for suffix, multiplier in (('M', 1_000_000), ('K', 1_000)):
+        if s.endswith(suffix):
+            # round() rather than int(): 1.005 * 1000 is 1004.9999999999999 in binary floats
+            return round(float(s[:-1]) * multiplier)
+    return int(s)
+
+def _parse_duration_str(duration_str: str) -> int:
+    """Translate an ``[[H:]M:]S`` duration (e.g. '2:27:04') into seconds (e.g. 8824).
+    Falsy input (``None`` or ``''``) gives ``None``."""
+    if not duration_str:
+        return None
+    parts = duration_str.split(':')
+    parts.reverse()  # seconds first, then minutes, then hours
+    return sum(int(unit) * int(part) for unit, part in zip([1, 60, 3600], parts))
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -8,15 +8,19 @@ import requests
 import time
 from telethon.sync import TelegramClient
 from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
+from telethon.tl import types
+from telethon.helpers import add_surrogate
+
 import os
 from datetime import datetime, timezone
+from sqlalchemy import func

 from cisticola.transformer.base import Transformer 
 from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel


 class TelegramTelethonTransformer(Transformer):
-    __version__ = 'TelegramTelethonTransformer 0.0.2'
+    __version__ = 'TelegramTelethonTransformer 0.0.3'

    bad_channels = {}

@@ -34,8 +38,10 @@ class TelegramTelethonTransformer(Transformer):
        try:
            with TelegramClient("transform.session", api_id, api_hash) as client:
                data = client.get_entity(channel_id)
-
-                return (data.username, data.title, "")
+                if isinstance(data, types.User):
+                    return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "")
+                else:
+                    return (data.username, data.title, "")
        except ChannelPrivateError:
            logger.info("ChannelPrivateError")
            return ("", "", "ChannelPrivateError")
@@ -125,7 +131,7 @@ class TelegramTelethonTransformer(Transformer):
        fwd_from = None

        if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
-            channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
+            channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()

            if channel is None:
                (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
@@ -154,13 +160,50 @@ class TelegramTelethonTransformer(Transformer):

        reply_to = None
        if raw['reply_to']:
-            reply_to_id = raw['reply_to']['reply_to_msg_id']
+            reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
            post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
            if post is None:
                reply_to = -1
            else:
                reply_to = post.id

+        mentions = []
+
+        for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']:
+
+            offset = mention_entity['offset']
+            length = mention_entity['length']
+
+            screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip()
+
+            channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first()
+
+            if channel is None:
+
+                channel = Channel(
+                    name = None,
+                    platform_id = None,
+                    platform = 'Telegram',
+                    url="https://t.me/s/" + screenname,
+                    screenname=screenname,
+                    category='mentioned',
+                    source=self.__version__,
+                    )
+
+                channel = insert(channel)
+                logger.info(f"Added {channel}")
+
+            mentions.append(channel.id)
+
+        channel = session.query(Channel).filter_by(id=int(data.channel)).first()
+
+        if channel is not None and channel.url:
+            url = channel.url.strip('/') + f"/{raw['id']}"
+            author_username = channel.screenname
+        else:
+            url = ""
+            author_username = ""
+
        transformed = Post(
            raw_id = data.id,
            platform_id = raw['id'],
@@ -171,24 +214,47 @@ class TelegramTelethonTransformer(Transformer):
            date=dateutil.parser.parse(raw['date']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
-            url="",
-            content=raw['message'],
-            author_id=raw['post_author'],
-            author_username="",
+            url=url,
+            content=add_markdown_links(raw),
+            author_id=raw.get('peer_id', {}).get('channel_id'),
+            author_username=author_username,
            forwarded_from=fwd_from,
-            reply_to=reply_to
+            reply_to=reply_to,
+            mentions = mentions,
+            forwards = raw.get('forwards'),
+            views = raw.get('views')
        )

        transformed = insert(transformed)

-        for k in data.archived_urls:
-            if data.archived_urls[k]:
-                archived_url = data.archived_urls[k]
-                ext = archived_url.split('.')[-1]
+        # for k in data.archived_urls:
+        #     if data.archived_urls[k]:
+        #         archived_url = data.archived_urls[k]
+        #         ext = archived_url.split('.')[-1]

-                if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
-                    insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
-                else:
-                    insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
+        #         if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
+        #             insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
+        #         else:
+        #             insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))

-        
+def add_markdown_links(raw_post):
+    """Return ``raw_post['message']`` with every ``MessageEntityTextUrl`` entity
+    rewritten as a Markdown link (``[text](url)``).
+
+    Whitespace at the edges of the linked text stays outside the brackets, and
+    a message without such entities (or with ``entities`` unset) is returned as is.
+    """
+    message = raw_post['message']
+    links = [e for e in (raw_post.get('entities') or []) if e['_'] == 'MessageEntityTextUrl']
+    if not links:
+        return message
+
+    # Telegram expresses entity offsets and lengths in UTF-16 code units, so the
+    # message is sliced in its UTF-16 encoding (two bytes per unit) rather than
+    # by Python code point; otherwise links after an emoji would be misplaced.
+    encoded = message.encode('utf-16-le', 'surrogatepass')
+    pieces, cursor = [], 0
+    for link in sorted(links, key=lambda e: e['offset']):
+        start = 2 * link['offset']
+        end = start + 2 * link['length']
+        if start < cursor:
+            # Overlaps a link that was already rewritten; leave that text untouched.
+            continue
+        anchor = encoded[start:end].decode('utf-16-le', 'surrogatepass')
+        text = anchor.strip()
+        # Only edge whitespace is moved outside the brackets; interior spaces
+        # remain part of the link text and are not duplicated.
+        lead = anchor[:len(anchor) - len(anchor.lstrip())]
+        trail = anchor[len(lead) + len(text):]
+        pieces.append(encoded[cursor:start].decode('utf-16-le', 'surrogatepass'))
+        pieces.append(f"{lead}[{text}]({link['url']}){trail}")
+        cursor = end
+    pieces.append(encoded[cursor:].decode('utf-16-le', 'surrogatepass'))
+    return ''.join(pieces)
--- a/cisticola/transformer/vkontakte.py
+++ b/cisticola/transformer/vkontakte.py
@@ -0,0 +1,73 @@
+import json
+from loguru import logger
+from typing import Generator, Union, Callable
+import dateutil.parser
+from datetime import datetime, timezone
+from sqlalchemy import func
+
+from cisticola.transformer.base import Transformer 
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
+
+class VkontakteTransformer(Transformer):
+    """Transformer for raw data produced by ``VkontakteScraper``."""
+
+    __version__ = "VkontakteTransformer 0.0.1"
+
+    def can_handle(self, data: ScraperResult) -> bool:
+        """Return True when the scraper name (text before the first space) is ``VkontakteScraper``."""
+        scraper_name = data.scraper.split(' ')[0]
+        return scraper_name == "VkontakteScraper"
+
+    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a ChannelInfo from the JSON in ``data.raw_data`` and hand it to ``insert``."""
+        profile = json.loads(data.raw_data)
+
+        channel_info = ChannelInfo(
+            raw_channel_info_id=data.id,
+            channel=data.channel,
+            platform_id=profile['username'],
+            platform=data.platform,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            screenname=profile['username'],
+            name=profile['name'],
+            description=profile.get('description'),
+            description_url=profile.get('websites'),
+            description_location=None,
+            # a falsy follower count (None, empty string or 0) is stored as None
+            followers=int(profile['followers']) if profile['followers'] else None,
+            following=-1,
+            verified=profile['verified'],
+            date_archived=data.date_archived,
+            date_created=None,
+            date_transformed=datetime.now(timezone.utc),
+        )
+        insert(channel_info)
+
+    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a Post from the JSON in ``data.raw_data`` and hand it to ``insert``."""
+        item = json.loads(data.raw_data)
+
+        post = Post(
+            raw_id=data.id,
+            platform_id=data.platform_id,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            date=data.date,
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc),
+            url=item['url'],
+            content=item['content'] or '',
+            author_id=None,
+            author_username=None,
+            # keep only truthy outlinks; a falsy ``outlinks`` value becomes an empty list
+            outlinks=[link for link in item['outlinks'] if link] if item['outlinks'] else [],
+        )
+        insert(post)
+
+        # Media handling is currently disabled:
+        # media = self.process_media(item, post.id, data)
+        # for m in media:
+        #     insert(m)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,7 +18,8 @@ BITCHUTE_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 GAB_CHANNEL_KWARGS = {
    'name': 'Capt. Marc Simon (test)',
@@ -31,7 +32,8 @@ GAB_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 GAB_GROUP_KWARGS = {
    'name': 'iran group (test)',
@@ -44,7 +46,8 @@ GAB_GROUP_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': True,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 GETTR_CHANNEL_KWARGS = {
    'name': 'LizardRepublic (test)',
@@ -57,7 +60,8 @@ GETTR_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 INSTAGRAM_CHANNEL_KWARGS = {
    'name': 'borland.88 (test)',
@@ -70,7 +74,8 @@ INSTAGRAM_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 ODYSEE_CHANNEL_KWARGS = {
    'name': "Mak1n' Bacon (test)",
@@ -83,7 +88,8 @@ ODYSEE_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 RUMBLE_CHANNEL_KWARGS = {
    'name': 'we are uploading videos wow products',
@@ -96,7 +102,8 @@ RUMBLE_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 TELEGRAM_CHANNEL_KWARGS = {
    'name': 'South West Ohio Proud Boys (test)',
@@ -109,8 +116,9 @@ TELEGRAM_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
-
+    'notes': '',
+    'source': 'researcher'}
+    
 TWITTER_CHANNEL_KWARGS = {
    'name': 'L Weber (test)',
    'platform_id': 1424979017749442595,
@@ -122,7 +130,8 @@ TWITTER_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 VKONTAKTE_CHANNEL_KWARGS = {
    'name': 'Wwg1wgA (test)',
@@ -135,7 +144,8 @@ VKONTAKTE_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 YOUTUBE_CHANNEL_KWARGS = {
    'name': 'AnEs87 (test)',
@@ -148,7 +158,8 @@ YOUTUBE_CHANNEL_KWARGS = {
    'influencer': None,
    'public': True,
    'chat': False,
-    'notes': ''}
+    'notes': '',
+    'source': 'researcher'}

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

--- a/tests/transformer/bitchute.py
+++ b/tests/transformer/bitchute.py
@@ -15,7 +15,8 @@ def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs)
    channels = [Channel(**channel_kwargs['bitchute'])]
    controller.register_scraper(scraper = BitchuteScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
-
+    controller.scrape_all_channel_info()
+    
    etl_controller.register_transformer(BitchuteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()
--- a/tests/transformer/gettr.py
+++ b/tests/transformer/gettr.py
@@ -15,6 +15,7 @@ def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper = GettrScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+    controller.scrape_all_channel_info()

    etl_controller.register_transformer(GettrTransformer())
    etl_controller.transform_all_untransformed()
--- a/tests/transformer/rumble.py
+++ b/tests/transformer/rumble.py
@@ -15,7 +15,8 @@ def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper = RumbleScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
-
+    controller.scrape_all_channel_info()
+    
    etl_controller.register_transformer(RumbleTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()
--- a/tests/transformer/telegram_telethon.py
+++ b/tests/transformer/telegram_telethon.py
@@ -15,6 +15,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
    channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper = TelegramTelethonScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+    controller.scrape_all_channel_info()

    etl_controller.register_transformer(TelegramTelethonTransformer())
    etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
    media = session.query(Media).all()

    assert len(posts) == 19
-    assert len(media) == 13
+    # assert len(media) == 13

    assert posts[16].content == "Taking pre-orders now"
-    assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"
+    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"
--- a/tests/transformer/twitter.py
+++ b/tests/transformer/twitter.py
@@ -15,6 +15,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper = TwitterScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+    controller.scrape_all_channel_info()

    etl_controller.register_transformer(TwitterTransformer())
    etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    media = session.query(Media).all()

    assert len(posts) == 12
-    assert len(media) == 4
+    assert len(media) == 8

    assert posts[2].content == "BARN"
    assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
--- a/tests/transformer/vkontakte.py
+++ b/tests/transformer/vkontakte.py
@@ -0,0 +1,35 @@
+from sqlalchemy.orm import sessionmaker
+import json
+
+import pytest
+
+from cisticola.base import Channel
+from cisticola.scraper import VkontakteScraper
+from cisticola.transformer import VkontakteTransformer
+from cisticola.base import Post, Media
+
+@pytest.mark.media
+def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
+    """Scrape the VKontakte test channel, run the transformer and check the stored posts."""
+    controller.reset_db()
+
+    # Scrape posts (with archive_media enabled) and channel info.
+    vk_channel = Channel(**channel_kwargs['vkontakte'])
+    controller.register_scraper(scraper=VkontakteScraper())
+    controller.scrape_channels(channels=[vk_channel], archive_media=True)
+    controller.scrape_all_channel_info()
+
+    # Register the transformer and run both transform passes.
+    etl_controller.register_transformer(VkontakteTransformer())
+    etl_controller.transform_all_untransformed()
+    etl_controller.transform_all_untransformed_info()
+
+    session = sessionmaker(bind=engine)()
+
+    posts = session.query(Post).all()
+    media = session.query(Media).all()
+
+    assert len(posts) == 23
+    assert 'Nigerian gender studies' in posts[-1].content
+
+    # Media assertions are disabled for now:
+    # assert len(media) == 0
+    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"