Merge pull request #17 from bellingcat/channel-db

Add Channel object to ORM, store in DB
2026-06-11 21:08:34 +03:00 · 2022-03-24 13:07:03 -05:00
parent d5bf3629c2 2a3b5c8200
commit d68cbd207a
11 changed files with 314 additions and 167 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -9,10 +9,12 @@ docs/source/_*
 *.db
 .env
 *.session
+service_account.json
+.vscode/

 # Unit test / coverage reports
 reports
 .coverage
 .cache
 .pytest_cache/
-cover/
+cover/
--- a/Pipfile
+++ b/Pipfile
@@ -20,14 +20,19 @@ telethon = "*"
 pytesseract = "*"
 pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
 instaloader = "*"
+gspread = "*"

 [dev-packages]
 pytest = "*"
 pytest-cov = "*"
 pytest-html = "*"
 pytest-metadata = "*"
+black = "*"
 sphinx = "*"
 sphinx_rtd_theme = "*"

 [requires]
 python_version = "3.9"
+
+[pipenv]
+allow_prereleases = true
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -6,7 +6,7 @@ import json
 import io

 from sqlalchemy.orm import registry
-from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
+from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
 import pytesseract
 import PIL
 import exiftool
@@ -24,8 +24,7 @@ class ScraperResult:
    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

-    #TODO there is probably a way of making this a Channel object foreign key
-    #: User-specified integer that uniquely identifies a channel, e.g. ``15``.
+    #: Foreign key of channel ID that this was scraped from
    channel: int

    #: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
@@ -42,27 +41,33 @@ class ScraperResult:

    #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. 
    archived_urls: dict
+      
+raw_data_table = Table('raw_data', mapper_registry.metadata,
+                       Column('id', Integer, primary_key=True,
+                              autoincrement=True),
+                       Column('scraper', String),
+                       Column('platform', String),
+                       Column('channel', Integer, ForeignKey('channels.id')),
+                       Column('platform_id', String),
+                       Column('date', DateTime),
+                       Column('raw_data', String),
+                       Column('date_archived', DateTime),
+                       Column('archived_urls', JSON))

@dataclass
 class Channel:
    """Information about a specific channel to be scraped.
    """

-    #: User-specified integer that uniquely identifies a channel, e.g. ``15``.
-    id: int
-
    #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``.
    name: str

    #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
    platform_id: str

-    #: User-specified category for the channel, e.g. ``"qanon-adjacent"``.
+    #: User-specified category for the channel, e.g. ``"explicit_qanon"``.
    category: str

-    #: Number of followers the channel has on the given platform, e.e. ``"1465"``.
-    followers: int
-
    #: Name of platform the given channel is on, e.g. ``"Telegram"``.
    platform: str

@@ -71,28 +76,55 @@ class Channel:

    #: Screen name/username of channel.
    screenname: str
-
+      
    #: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
-    country: str
-
-    #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
-    influencer: str
-
+    country: str = None
+    
+    #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.    
+    influencer: str = None
+      
    #: Whether or not the channel is publicly-accessible. 
-    public: bool
-
+    public: bool = None
+      
    #: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
-    chat: bool
-
+    chat: bool = None
+      
    #: Any other additional notes about the channel.
-    notes: str
+    notes: str = ""
+      
+    #: Did the channel come from a researcher or a scraping process?
+    source: str = None
+
+    def hydrate(self):
+        pass
+
+channel_table = Table('channels', mapper_registry.metadata,
+                    Column('id', Integer, primary_key=True, autoincrement=True),
+                    Column('name', String),
+                    Column('platform_id', Integer),
+                    Column('category', String),
+                    Column('platform', String),
+                    Column('url', String),
+                    Column('screenname', String),
+                    Column('country', String),
+                    Column('influencer', String),
+                    Column('public', Boolean),
+                    Column('chat', Boolean),
+                    Column('notes', String),
+                    Column('source', String)
+                    )
+
+mapper_registry.map_imperatively(Channel, channel_table)

@dataclass
-class TransformedResult:
+class Post:
    """An object with fields for columns in the analysis table"""

    #: ID number of the scraped post in the ``raw_data`` table
    raw_id: int
+      
+    #: Platform specific post ID
+    platform_id: str

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str
@@ -111,19 +143,49 @@ class TransformedResult:

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime
-
+    
    #: URL of the original post
    url: str

    #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
    author_id: str
-
+    
    #: Username of author who made post.
    author_username: str
-
+      
    #: Text of the original post
    content: str

+    #: The ID of the Channel that the post was forwarded or quoted from
+    forwarded_from: int = None
+      
+    #: The ID of the Post that this Post is a reply to or reblog of
+    reply_to: int = None
+
+    def hydrate(self):
+        pass
+
+post_table = Table('posts', mapper_registry.metadata,
+                       Column('id', Integer, primary_key=True,
+                              autoincrement=True),
+                       Column('raw_id', Integer, ForeignKey('raw_data.id')),
+                       Column('platform_id', Integer),
+                       Column('scraper', String),
+                       Column('transformer', String),
+                       Column('platform', String),
+                       Column('channel', Integer, ForeignKey('channels.id')),
+                       Column('date', DateTime),
+                       Column('date_archived', DateTime),
+                       Column('url', String),
+                       Column('author_id', String),
+                       Column('author_username', String),
+                       Column('content', String),
+                       Column('forwarded_from', Integer, ForeignKey('channels.id')),
+                       Column('reply_to', Integer, ForeignKey('posts.id'))
+                       )
+
+mapper_registry.map_imperatively(Post, post_table)
+
@dataclass
 class Media:
    """Base class for organizing information about a media file.
@@ -239,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
                              autoincrement=True),
                       Column('type', String),
                       Column('raw_id', Integer, ForeignKey('raw_data.id')),
-                       Column('post', Integer, ForeignKey('analysis.id')),
+                       Column('post', Integer, ForeignKey('posts.id')),
                       Column('url', String),
                       Column('original_url', String),
                       Column('exif', String),
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -291,6 +291,17 @@ class ScraperController:
        """Register a list of Scraper instances to the controller.
        """
        self.scrapers.extend(scraper)
+
+    def scrape_all_channels(self, archive_media: bool = True):
+        if self.session is None:
+            logger.error("No DB session")
+            return
+
+        session = self.session()
+
+        channels = session.query(Channel).where(Channel.source=='researcher').all()
+
+        return self.scrape_channels(channels, archive_media=archive_media)
    
    @logger.catch(reraise = True)
    def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
@@ -314,7 +325,6 @@ class ScraperController:

            for scraper in self.scrapers:
                if scraper.can_handle(channel):
-                    session = self.session()
                    handled = True
                    added = 0

--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -29,25 +29,33 @@ class TwitterScraper(Scraper):
            archived_urls = {}

            if archive_media:
-
+                media_list = []
                if tweet.media:
-                    for media in tweet.media:
-                        if type(media) == Video:
-                            variant = max(
-                                [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
-                            url = variant.url
-                        elif type(media) == Gif:
-                            url = media.variants[0].url
-                        elif type(media) == Photo:
-                            url = media.fullUrl
-                        else:
-                            logger.warning(f"Could not get media URL of {media}")
-                            url = None
+                    media_list += tweet.media

-                        if url is not None:
-                            media_blob, content_type, key = self.url_to_blob(url)
-                            archived_url = self.archive_blob(media_blob, content_type, key)
-                            archived_urls[url] = archived_url
+                if tweet.retweetedTweet and tweet.retweetedTweet.media:
+                    media_list += tweet.retweetedTweet.media
+
+                if tweet.quotedTweet and tweet.quotedTweet.media:
+                    media_list += tweet.quotedTweet.media
+
+                for media in media_list:
+                    if type(media) == Video:
+                        variant = max(
+                            [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
+                        url = variant.url
+                    elif type(media) == Gif:
+                        url = media.variants[0].url
+                    elif type(media) == Photo:
+                        url = media.fullUrl
+                    else:
+                        logger.warning(f"Could not get media URL of {media}")
+                        url = None
+
+                    if url is not None and url not in archived_urls:
+                        media_blob, content_type, key = self.url_to_blob(url)
+                        archived_url = self.archive_blob(media_blob, content_type, key)
+                        archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -1,9 +1,11 @@
-from typing import List, Generator
+from typing import List, Generator, Union, Callable
 from loguru import logger
-from sqlalchemy.orm import sessionmaker
+from sqlalchemy.orm import sessionmaker, make_transient
 from sqlalchemy.engine.base import Engine
+from collections import defaultdict
+
+from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry

-from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry

 class Transformer:
    """Interface class for transformers."""
@@ -16,12 +18,12 @@ class Transformer:
    def can_handle(data: ScraperResult) -> bool:
        """Specifies whether or not a Transformer is capable of handling a particular
        piece of scraped data.
-        
+
        Parameters
        ----------
        data : ScraperResult
            The ScraperResult object to check for ability to handle.
-            
+
        Returns
        -------
        bool
@@ -30,39 +32,18 @@ class Transformer:

        pass

-    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
-        """Yields Media objects from each piece of media present in a raw ScraperResult.
-        
-        Parameters
-        ----------
-        data : ScraperResult
-            The ScraperResult object to process
-        transformed : TransformedResult
-            The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`)
+    def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Transform a ScraperResult into objects with additional parameters for analysis. This function can
+        yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
+        objects and provide all of these to be inserted into the database.

-        Yields
-        ------
-        Media
-            A media object generated from the ScraperResult. One ScraperResult can have multiple pieces
-            of media contained within it, so this can generate an arbitrary number of Media objects
-            (or their subclasses.) These Media objects are not fully hydrated.
-        """
-        
-        pass
-
-    def transform(data: ScraperResult) -> TransformedResult:
-        """Transform a ScraperResult into a TransformedResult object. This extracts additional attributes
-        that can be used directly for analysis.
-        
        Parameters
        ----------
        data : ScraperResult
            The ScraperResult object to process.
-        
-        Returns
-        -------
-        TransformedResult
-            A TransformedResult representation of the `data` object.
+        insert : Callable
+            A function that either inserts the object into a database or finds an object with the
+            relevant unique constraints if applicable.
        """

        pass
@@ -78,7 +59,7 @@ class ETLController:

    def register_transformer(self, transformer: Transformer):
        """Adds a Transformer to the list of available Transformers.
-        
+
        Parameters
        ----------
        transformer : Transformer
@@ -89,7 +70,7 @@ class ETLController:

    def connect_to_db(self, engine: Engine):
        """Connects the ETLController to a SQLAlchemy engine.
-        
+
        Parameters
        ----------
        engine : Engine
@@ -101,11 +82,59 @@ class ETLController:
        self.session = sessionmaker()
        self.session.configure(bind=engine)

-    @logger.catch(reraise = True)
+    def insert_or_select(self, obj, session, hydrate: bool = True):
+        """Inserts an object into the database or returns an existing object from the database.
+        Regardless, the resulting object has an `id` attribute that can be referenced later."""
+
+        instance = None
+
+        # This is using some adhoc unique constraints that might be worth formalizing at some point
+        if type(obj) == Channel:
+            instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
+            
+        elif type(obj) == Post:
+            instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
+
+        elif issubclass(type(obj), Media):
+            instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first()
+            if instance:
+                logger.info(f"Found matching DB entry for {obj}: {instance}")
+                return instance
+
+            instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first()
+            
+            # For Media objects we want to duplicate the entry to preserve the relationship with the post.
+            # However, we also want to avoid rehydration, hence the code below:
+            if instance:
+                logger.info(f"Found matching media record, duplicating and inserting for new post")
+
+                session.expunge(instance) 
+                make_transient(instance) 
+                instance.id = None 
+                instance.post = obj.post
+                instance.raw_id = obj.raw_id
+
+                session.add(instance)
+                session.flush()
+                return instance
+
+        if instance:
+            logger.info(f"Found matching DB entry for {obj}: {instance}")
+            return instance
+
+        if hydrate:
+            obj.hydrate()
+
+        logger.info(f"Inserting new object {obj}")
+        session.add(obj)
+        session.flush()
+        return obj
+
+    @logger.catch(reraise=True)
    def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
-        """Transforms raw ScraperResults objects into TransformedResult objects and
+        """Transforms raw ScraperResults objects into Post objects and
        Media objects. Then, adds them to the database.
-        
+
        Parameters
        ----------
        results : List[ScraperResult]
@@ -126,34 +155,18 @@ class ETLController:
                    handled = True
                    session = self.session()

-                    transformed = transformer.transform(result)
-
-                    session.add(transformed)
-                    session.flush()
-
-                    media = transformer.transform_media(result, transformed)
-
-                    count = 0
-                    for obj in media:
-                        if hydrate:
-                            logger.info(f"Hydrating {obj}")
-                            obj.hydrate()
-
-                        session.add(obj)
-                        count += 1
-
+                    transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate))
                    session.commit()
-                    logger.info(f"{transformer} generated {count} media objects")
                    break

                if handled == False:
                    logger.warning(f"No Transformer could handle {result}")

-    @logger.catch(reraise = True)
+    @logger.catch(reraise=True)
    def transform_all_untransformed(self, hydrate: bool = True):
        """Transform all ScraperResult objects in the database that do not have an
-        equivalent TransformedResult object stored.
-        
+        equivalent Post object stored.
+
        Parameters
        ----------
        hydrate : bool
@@ -165,7 +178,12 @@ class ETLController:
            return

        session = self.session()
-        untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all()
+        untransformed = (
+            session.query(ScraperResult)
+            .join(Post, isouter=True)
+            .where(Post.raw_id == None)
+            .all()
+        )
        logger.info(f"Found {len(untransformed)} items to ETL")

-        self.transform_results(untransformed, hydrate=hydrate)
+        self.transform_results(untransformed, hydrate=hydrate)
--- a/cisticola/transformer/bitchute.py
+++ b/cisticola/transformer/bitchute.py
@@ -5,7 +5,7 @@ from typing import Generator
 from bs4 import BeautifulSoup 

 from cisticola.transformer.base import Transformer 
-from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
+from cisticola.base import ScraperResult, Post, Image, Video, Media

 class BitchuteTransformer(Transformer):
    """A Bitchute specific ScraperResult, with a method ETL/transforming"""
@@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer):

        return False        

-    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
+    def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
        raw = json.loads(data.raw_data)

        orig = raw['video_url']
@@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer):

        yield m

-    def transform(self, data: ScraperResult) -> TransformedResult:
+    def transform(self, data: ScraperResult) -> Post:
        raw = json.loads(data.raw_data)

        soup = BeautifulSoup(raw['body'], features = 'html.parser')
        content = soup.find_all('p')[-1].text

-        transformed = TransformedResult(
+        transformed = Post(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
--- a/cisticola/transformer/twitter.py
+++ b/cisticola/transformer/twitter.py
@@ -1,9 +1,10 @@
 import json
 from loguru import logger
-from typing import Generator
+from typing import Generator, Union, Callable
+import dateutil.parser

 from cisticola.transformer.base import Transformer 
-from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
+from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel

 class TwitterTransformer(Transformer):
    """A Twitter specific ScraperResult, with a method ETL/transforming"""
@@ -17,11 +18,9 @@ class TwitterTransformer(Transformer):

        return False        

-    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
-        raw = json.loads(data.raw_data)
-
-        if raw['media']:
-            for media in raw['media']:
+    def process_media(self, tweet, post_id, data):
+        if tweet['media']:
+            for media in tweet['media']:
                orig = None

                if media["_type"] == "snscrape.modules.twitter.Photo":
@@ -40,26 +39,77 @@ class TwitterTransformer(Transformer):
                    new = data.archived_urls[orig]

                    if media["_type"] == "snscrape.modules.twitter.Photo":
-                        m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
+                        m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
                    else:
-                        m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
+                        m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig)

                    yield m

-    def transform(self, data: ScraperResult) -> TransformedResult:
+
+    def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
        raw = json.loads(data.raw_data)

-        transformed = TransformedResult(
+        transformed = Post(
            raw_id=data.id,
+            platform_id=raw['id'],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
-            date=data.date,
+            date=dateutil.parser.parse(raw['date']),
            date_archived=data.date_archived,
            url=raw['url'],
            content=raw['content'],
            author_id=raw['user']['id'],
            author_username=raw['user']['username'])

-        return transformed
+        def subtweet(tweet):
+            channel = Channel(
+                name=tweet['user']['displayname'],
+                platform_id=tweet['user']['id'],
+                platform=data.platform,
+                url=tweet['user']['url'],
+                screenname=tweet['user']['username'],
+                category='forwarded',
+                source=self.__version__
+                )
+
+            channel = insert(channel)
+
+            original = Post(
+                raw_id=data.id,
+                platform_id=tweet['id'],
+                scraper=data.scraper,
+                transformer=self.__version__,
+                platform=data.platform,
+                channel=channel.id,
+                date=dateutil.parser.parse(tweet['date']),
+                date_archived=data.date_archived,
+                url=tweet['url'],
+                content=tweet['content'],
+                author_id=tweet['user']['id'],
+                author_username=tweet['user']['username']
+            )
+
+            original = insert(original)
+            transformed.forwarded_from = channel.id
+            transformed.reply_to = original.id
+
+            media = self.process_media(tweet, original.id, data)
+            for m in media:
+                insert(m)
+
+        if raw['retweetedTweet'] is not None:
+            subtweet(raw['retweetedTweet'])
+
+        if raw['quotedTweet'] is not None:
+            subtweet(raw['quotedTweet'])
+
+        insert(transformed)
+
+        media = self.process_media(raw, transformed.id, data)
+        for m in media:
+            insert(m)
+
+
+        
--- a/test.py
+++ b/test.py
@@ -1,7 +1,10 @@
 from sqlalchemy import create_engine
 from loguru import logger
+import gspread
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker

-from cisticola.base import Channel, TransformedResult, ScraperResult
+from cisticola.base import Channel, Post, ScraperResult, mapper_registry
 from cisticola.scraper import (
    ScraperController,
    BitchuteScraper,
@@ -14,26 +17,9 @@ from cisticola.scraper import (
    TwitterScraper)
 from cisticola.transformer import ETLController
 from cisticola.transformer.twitter import TwitterTransformer
-from sqlalchemy.orm import sessionmaker

 logger.add("../test.log")

-test_channels = [
-    Channel(
-        id=0, 
-        name="L Weber (test)", 
-        platform_id=1424979017749442595,
-        category="test", 
-        followers=None, 
-        platform="Twitter",
-        url="https://twitter.com/LWeber33662141", 
-        screenname="LWeber33662141", 
-        country="US",
-        influencer=None, 
-        public=True, 
-        chat=False,
-        notes="")]
-
 controller = ScraperController()

 scrapers = [
@@ -49,9 +35,35 @@ scrapers = [
 controller.register_scrapers(scrapers)

 engine = create_engine('sqlite:///test.db')
-controller.connect_to_db(engine)
+mapper_registry.metadata.create_all(bind=engine)
+session_generator = sessionmaker()
+session_generator.configure(bind=engine)
+session = session_generator()

-controller.scrape_channels(test_channels, archive_media = True)
+gc = gspread.service_account(filename='service_account.json')
+
+# Open a sheet from a spreadsheet in one go
+wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0")
+channels = wks.worksheet("channels").get_all_records()
+
+for c in channels:
+    del c['followers']
+
+    for k in c.keys():
+        if c[k] == 'TRUE': c[k] = True
+        if c[k] == 'FALSE': c[k] = False
+
+    # check to see if this already exists, 
+    channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
+    
+    if not channel:
+        channel = Channel(**c, source='researcher')
+        session.add(channel)
+
+session.commit()
+
+controller.connect_to_db(engine)
+controller.scrape_all_channels(archive_media = True)

 transformer = TwitterTransformer()

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8,11 +8,9 @@ from cisticola.transformer import ETLController
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 BITCHUTE_CHANNEL_KWARGS = {
-    'id': 0,
    'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
    'platform_id': 'bestonlinejewelrystoresusagmailcom',
    'category': 'test',
-    'followers': None,
    'platform': 'Bitchute',
    'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
    'screenname': None,
@@ -23,11 +21,9 @@ BITCHUTE_CHANNEL_KWARGS = {
    'notes': ''}

 GAB_CHANNEL_KWARGS = {
-    'id': 1,
    'name': 'Capt. Marc Simon (test)',
    'platform_id': 'marc_capt',
    'category': 'test',
-    'followers': None,
    'platform': 'Gab',
    'url': 'https://gab.com/marc_capt',
    'screenname': 'marc_capt',
@@ -38,11 +34,9 @@ GAB_CHANNEL_KWARGS = {
    'notes': ''}

 GETTR_CHANNEL_KWARGS = {
-    'id': 2,
    'name': 'LizardRepublic (test)',
    'platform_id': 'lizardrepublic',
    'category': 'test',
-    'followers': None,
    'platform': 'Gettr',
    'url': 'https://www.gettr.com/user/lizardrepublic',
    'screenname': 'lizardrepublic',
@@ -53,11 +47,9 @@ GETTR_CHANNEL_KWARGS = {
    'notes': ''}

 INSTAGRAM_CHANNEL_KWARGS = {
-    'id': 3,
    'name': 'borland.88 (test)',
    'platform_id': 'borland.88',
    'category': 'test',
-    'followers': None,
    'platform': 'Instagram',
    'url': 'https://www.instagram.com/borland.88/',
    'screenname': 'borland.88',
@@ -68,11 +60,9 @@ INSTAGRAM_CHANNEL_KWARGS = {
    'notes': ''}

 ODYSEE_CHANNEL_KWARGS = {
-    'id': 4,
    'name': "Mak1n' Bacon (test)",
    'platform_id': 'Mak1nBacon',
    'category': 'test',
-    'followers': None,
    'platform': 'Odysee',
    'url': 'https://odysee.com/@Mak1nBacon',
    'screenname': 'Mak1nBacon',
@@ -83,11 +73,9 @@ ODYSEE_CHANNEL_KWARGS = {
    'notes': ''}

 RUMBLE_CHANNEL_KWARGS = {
-    'id': 5,
    'name': 'we are uploading videos wow products',
    'platform_id': 'c-916305',
    'category': 'test',
-    'followers': None,
    'platform': 'Rumble',
    'url': 'https://rumble.com/c/c-916305',
    'screenname': 'we are uploading',
@@ -98,11 +86,9 @@ RUMBLE_CHANNEL_KWARGS = {
    'notes': ''}

 TELEGRAM_CHANNEL_KWARGS = {
-    'id': 6,
    'name': 'South West Ohio Proud Boys (test)',
    'platform_id': -1001276612436,
    'category': 'test',
-    'followers': None,
    'platform': 'Telegram',
    'url': 'https://t.me/SouthwestOhioPB',
    'screenname': 'SouthwestOhioPB',
@@ -113,11 +99,9 @@ TELEGRAM_CHANNEL_KWARGS = {
    'notes': ''}

 TWITTER_CHANNEL_KWARGS = {
-    'id': 7,
    'name': 'L Weber (test)',
    'platform_id': 1424979017749442595,
    'category': 'test',
-    'followers': None,
    'platform': 'Twitter',
    'url': 'https://twitter.com/LWeber33662141',
    'screenname': 'LWeber33662141',
@@ -128,11 +112,9 @@ TWITTER_CHANNEL_KWARGS = {
    'notes': ''}

 VKONTAKTE_CHANNEL_KWARGS = {
-    'id': 8,
    'name': 'Wwg1wgA (test)',
    'platform_id': 'club201278078',
    'category': 'test',
-    'followers': None,
    'platform': 'Vkontakte',
    'url': 'https://vk.com/club201278078',
    'screenname': 'Wwg1wgA',
@@ -143,11 +125,9 @@ VKONTAKTE_CHANNEL_KWARGS = {
    'notes': ''}

 YOUTUBE_CHANNEL_KWARGS = {
-    'id': 9,
    'name': 'AnEs87 (test)',
    'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
    'category': 'test',
-    'followers': None,
    'platform': 'Youtube',
    'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
    'screenname': 'AnEs87',
--- a/tests/transformer/twitter.py
+++ b/tests/transformer/twitter.py
@@ -4,7 +4,7 @@ import json
 from cisticola.base import Channel
 from cisticola.scraper import TwitterScraper
 from cisticola.transformer import TwitterTransformer
-from cisticola.base import TransformedResult, Media
+from cisticola.base import Post, Media

 def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    controller.reset_db()
@@ -20,11 +20,11 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    sessionfactory.configure(bind=engine)
    session = sessionfactory()

-    posts = session.query(TransformedResult).all()
+    posts = session.query(Post).all()
    media = session.query(Media).all()

-    assert len(posts) == 3
-    assert len(media) == 2
+    assert len(posts) == 10
+    assert len(media) == 7

-    assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
+    assert posts[-1].content == "BARN"
    assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"