Use new RawChannelInfo class

2026-06-12 13:28:34 +03:00 · 2022-03-31 15:17:25 +02:00
parent 61c99d33f6
commit 2dc9213d64
17 changed files with 1698 additions and 79 deletions
--- a/4
+++ b/4
@@ -23,6 +23,8 @@ gspread = "*"
 cryptg = "*"
 gabber = {git = "https://github.com/stanfordio/gabber.git"}
 psycopg2-binary = "*"
 tqdm = "*"
 ratelimit = "*"
 [dev-packages]
 pytest = "*"
@@ -34,7 +36,7 @@ sphinx = "*"
 sphinx_rtd_theme = "*"
 [requires]
-python_version = "3.9"
+python_version = "3.8"
 [pipenv]
 allow_prereleases = true
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/app.py
+++ b/app.py
@@ -3,6 +3,8 @@ from loguru import logger
 import gspread
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 import os
 import time
 from cisticola.base import Channel, mapper_registry
 from cisticola.scraper import (
@@ -19,7 +21,7 @@ from cisticola.scraper import (
 def sync_channels(args):
    logger.info("Synchronizing channels")
-    session = get_db_session(args)
+    session = get_db_session()
    gc = gspread.service_account(filename='service_account.json')
@@ -29,6 +31,7 @@ def sync_channels(args):
    row = 2
    for c in channels:
        logger.info(c)
        del c['id']
        del c['followers']
@@ -43,20 +46,29 @@ def sync_channels(args):
        # check to see if this channel already exists in the database
-        channel = session.query(Channel).filter_by(platform_id=None if c['platform_id'] == '' else c['platform_id'], platform=c['platform'], url=c['url']).first()
+        platform_id = None
-        
+        if c['platform_id'] != '':
            platform_id = c['platform_id']
        channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
        logger.info(channel)
        if not channel:
            channel = Channel(**c, source='researcher')
            logger.debug(f"{channel} does not exist, adding")
            session.add(channel)
            session.flush()
            session.commit()
            wks.update_cell(row, 1, channel.id)
            time.sleep(1)
        row += 1
    session.commit()
-def get_db_session(args):
+def get_db_session():
-    engine = create_engine(args.db)
+    engine = create_engine(os.environ['DB'])
    session_generator = sessionmaker()
    session_generator.configure(bind=engine)
@@ -64,8 +76,8 @@ def get_db_session(args):
    return session
-def get_scraper_controller(args):
+def get_scraper_controller():
-    engine = create_engine(args.db)
+    engine = create_engine(os.environ['DB'])
    controller = ScraperController()
    controller.connect_to_db(engine)
@@ -90,8 +102,8 @@ def archive_media(args):
-    controller = get_scraper_controller(args)
+    controller = get_scraper_controller()
    controller.archive_unarchived_media()
-def init_db(args):
+def init_db():
-    engine = create_engine(args.db)
+    engine = create_engine(os.environ['DB'])
    mapper_registry.metadata.create_all(bind=engine)
 if __name__ == '__main__':
@@ -99,14 +111,13 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
    parser.add_argument('command',  type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"')
    parser.add_argument('--db', type=str, help='[*] Sqlalchemy database string, eg, "sqlite:///cisticola.db"')
    parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
    parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media')
    args = parser.parse_args()
    if args.command == 'init-db':
-        init_db(args)
+        init_db()
    elif args.command == 'sync-channels':
        sync_channels(args)
    elif args.command == 'scrape-channels':
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -34,7 +34,7 @@ class ScraperResult:
    date: datetime
    #: JSON dump of dict that contains all data scraped for the post.
-    raw_data: str
+    raw_posts: str
    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime
@@ -44,7 +44,7 @@ class ScraperResult:
    #: Has the media in this post been archived?
    media_archived: bool
-      
+
@dataclass
 class Channel:
    """Information about a specific channel to be scraped.
@@ -89,11 +89,31 @@ class Channel:
    def hydrate(self):
        pass
@dataclass
 class RawChannelInfo:
    """A minimally processed result from a scraper
    """
    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str
    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str
    #: Foreign key of channel ID that this was scraped from
    channel: int
    #: JSON dump of dict that contains all profile data scraped for the channel.
    raw_data: str
    #: Datetime (relative to UTC) that the channel info was archived at.
    date_archived: datetime
@dataclass
 class Post:
    """An object with fields for columns in the analysis table"""
-    #: ID number of the scraped post in the ``raw_data`` table
+    #: ID number of the scraped post in the ``raw_posts`` table
    raw_id: int
    #: Platform specific post ID
@@ -144,7 +164,7 @@ class Media:
    """Base class for organizing information about a media file.
    """
-    #: ID number of the media's corresponding scraped post in the ``raw_data`` table.
+    #: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
    raw_id: int
    #: ID number of the media's corresponding scraped post in the ``analysis`` table.
@@ -221,7 +241,7 @@ class Video(Media):
 mapper_registry = registry()
-raw_data_table = Table('raw_data', mapper_registry.metadata,
+raw_posts_table = Table('raw_posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('scraper', String),
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
                       Column('channel', Integer, ForeignKey('channels.id')),
                       Column('platform_id', String),
                       Column('date', DateTime),
-                       Column('raw_data', String),
+                       Column('raw_posts', String),
                       Column('date_archived', DateTime),
                       Column('archived_urls', JSON),
                       Column('media_archived', Boolean))
 raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True),
                    Column('scraper', String),
                    Column('platform', String),
                    Column('channel', Integer, ForeignKey('channels.id')),
                    Column('raw_data', String),
                    Column('date_archived', DateTime))
 channel_table = Table('channels', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True, autoincrement=True),
                    Column('name', String),
-                    Column('platform_id', Integer),
+                    Column('platform_id', String),
                    Column('category', String),
                    Column('platform', String),
                    Column('url', String),
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
 post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
-                       Column('raw_id', Integer, ForeignKey('raw_data.id')),
+                       Column('raw_id', Integer, ForeignKey('raw_posts.id')),
                       Column('platform_id', Integer),
                       Column('scraper', String),
                       Column('transformer', String),
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('type', String),
-                       Column('raw_id', Integer, ForeignKey('raw_data.id')),
+                       Column('raw_id', Integer, ForeignKey('raw_posts.id')),
                       Column('post', Integer, ForeignKey('posts.id')),
                       Column('url', String),
                       Column('original_url', String),
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
 mapper_registry.map_imperatively(Post, post_table)
 mapper_registry.map_imperatively(Channel, channel_table)
-mapper_registry.map_imperatively(ScraperResult, raw_data_table)
+mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
 mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
 mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
 mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
 mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -9,7 +9,7 @@ from typing import Generator
 import requests
 from bs4 import BeautifulSoup
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
                platform_id=post['id'],
                date=datetime.fromtimestamp(post['timestamp']),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
        if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
            return True
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        base_url = channel.url
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
            'subscribers': counts['subscriber_count'],
            'views': int(counts['about_view_count'].split(' ')[0])}
-        return profile
+        
-
+        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 def strip_tags(html, convert_newlines=True):
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -5,7 +5,7 @@ import os
 from gabber.client import Client, GAB_API_BASE_URL
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
                platform_id=post['id'],
                date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
        if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
            return True
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        client = Client(
            username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
            profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
 from gogettr import PublicClient
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
                platform_id=post['_id'],
                date=datetime.fromtimestamp(post['cdate']/1000.),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
+                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
        key = urlparse(url).path.split('/')[-2] + ext
        return key 
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        client = client = PublicClient()
        username = self.get_username_from_url(channel.url)
        profile = client.user_info(username)
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/instagram.py
+++ b/cisticola/scraper/instagram.py
@@ -8,7 +8,7 @@ from pathlib import Path
 from loguru import logger
 import instaloader 
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
                platform_id=post.mediaid,
                date=post.date_utc,
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post._asdict(), default=str),
+                raw_posts=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
                    platform_id=post.mediaid,
                    date=comment.created_at_utc,
                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(comment_dict, default=str),
+                    raw_posts=json.dumps(comment_dict, default=str),
                    archived_urls={},
                    media_archived=archive_media)
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
        if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
            return True
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        username = self.get_username_from_url(channel.url)
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
        profile['followers'] = user_profile.followers
        profile['followees'] = user_profile.followees
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
                        platform=channel.platform,
                        channel=channel.id,
                        raw_data=json.dumps(profile),
                        date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -8,7 +8,7 @@ from loguru import logger
 from polyphemus.base import OdyseeChannel
 from polyphemus.api import get_auth_token
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
                platform_id=video.info['claim_id'],
                date=datetime.fromtimestamp(video.info['created']),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(video.info),
+                raw_posts=json.dumps(video.info),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
                    platform_id=comment.info['claim_id'],
                    date=datetime.fromtimestamp(comment.info['created']),
                    date_archived=datetime.now(),
-                    raw_data=json.dumps(comment.info),
+                    raw_posts=json.dumps(comment.info),
                    archived_urls={},
                    media_archived=True)
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
        return f'{key}.{ext}'
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        username = self.get_username_from_url(channel.url)
        odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
        profile = odysee_channel.info
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
                        platform=channel.platform,
                        channel=channel.id,
                        raw_data=json.dumps(profile),
                        date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
 from bs4 import BeautifulSoup
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper import Scraper, make_request
 BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
                platform_id=post['media_url'].split('/')[-2],
                date=post['datetime'].replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post, default = str),
+                raw_posts=json.dumps(post, default = str),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
        if channel.platform == "Rumble" and channel.url is not None:
            return True
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        profile = get_channel_profile(url = channel.url)
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
                        platform=channel.platform,
                        channel=channel.id,
                        raw_data=json.dumps(profile),
                        date_archived=datetime.now(timezone.utc))
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -128,6 +132,7 @@ def get_channel_profile(url):
        'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
        'cover':  cover_soup.get('src') if cover_soup else None,
        'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
    return profile
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -1,10 +1,10 @@
 from typing import Generator
 from datetime import datetime, timezone
-
+import json
 import snscrape.modules
 from loguru import logger
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
                platform_id=post.url,
                date=post.date,
                date_archived=datetime.now(timezone.utc),
-                raw_data=post.json(),
+                raw_posts=post.json(),
                archived_urls=archived_urls,
                media_archived=archive_media
            )
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)
        profile = scr._get_entity().__dict__
-        return profile
+        
        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
 from telethon.tl.functions.channels import GetFullChannelRequest
 from telethon.tl import types
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
            key = list(result.archived_urls.keys())[0]
            if result.archived_urls[key] is None:
-                raw = json.loads(result.raw_data)
+                raw = json.loads(result.raw_posts)
                message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
                    platform_id=post_url,
                    date=post.date.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(post.to_dict(), default=str),
+                    raw_posts=json.dumps(post.to_dict(), default=str),
                    archived_urls=archived_urls,
                    media_archived=archive_media)
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        username = self.get_username_from_url(channel.url)
@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
            full_channel = client(GetFullChannelRequest(channel = username))
        profile = full_channel.__dict__
-        return profile
+        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -1,11 +1,11 @@
 from datetime import datetime, timezone
 from typing import Generator
 from urllib.parse import urlparse, parse_qs
 from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
 from loguru import logger
 import json
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
 class TwitterScraper(Scraper):
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
                platform_id=tweet.id,
                date=tweet.date,
                date_archived=datetime.now(timezone.utc),
-                raw_data=tweet.json(),
+                raw_posts=tweet.json(),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
        key = parsed_url.path.split('/')[-1] + ext
        return key 
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        scraper = TwitterUserScraper(channel.screenname)
        entity = scraper._get_entity()
@@ -99,4 +99,8 @@ class TwitterScraper(Scraper):
        if entity is None:
            raise ChannelDoesNotExistError(channel.url)
        else:   
-            return entity.__dict__
+            return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(entity.__dict__),
            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -1,11 +1,11 @@
 from datetime import datetime, timezone
+import json
 from typing import Generator
 from urllib.parse import urlparse
 from snscrape.modules.vkontakte import VKontakteUserScraper
 from loguru import logger
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
 class VkontakteScraper(Scraper):
 class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
                platform_id=post.url.split('/')[-1],
                date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
-                raw_data=post.json(),
+                raw_posts=post.json(),
                archived_urls=archived_urls,
                media_archived=archive_media)
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
        return key
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)
        profile = scraper._get_entity().__dict__
-        return profile
+
        return RawChannelInfo(scraper=self.__version__,
                    platform=channel.platform,
                    channel=channel.id,
                    raw_data=json.dumps(profile),
                    date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/youtube.py
+++ b/cisticola/scraper/youtube.py
@@ -2,10 +2,9 @@ from datetime import datetime, timezone
 import json
 from typing import Generator
 import tempfile
 import yt_dlp
-from cisticola.base import Channel, ScraperResult
+from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper import Scraper
 class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
                        platform_id=video_id,
                        date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
                        date_archived=datetime.now(timezone.utc),
-                        raw_data=json.dumps(video, default = str),
+                        raw_posts=json.dumps(video, default = str),
                        archived_urls=archived_urls,
                        media_archived=archive_media)
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
        if channel.platform == "Youtube" and channel.url:
            return True
-    def get_profile(self, channel: Channel) -> dict:
+    def get_profile(self, channel: Channel) -> RawChannelInfo:
        ydl_opts = {}
        ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
            meta = ydl.extract_info(
                channel.url,
                process=False)
            return RawChannelInfo(scraper=self.__version__,
                    platform=channel.platform,
                    channel=channel.id,
                    raw_data=json.dumps(meta),
                    date_archived=datetime.now(timezone.utc))
        except yt_dlp.utils.DownloadError as e:
            raise e
        return meta
--- a/cisticola/transformer/bitchute.py
+++ b/cisticola/transformer/bitchute.py
@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
        return False        
    def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
-        raw = json.loads(data.raw_data)
+        raw = json.loads(data.raw_posts)
        orig = raw['video_url']
        new = data.archived_urls[orig]
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
        yield m
    def transform(self, data: ScraperResult) -> Post:
-        raw = json.loads(data.raw_data)
+        raw = json.loads(data.raw_posts)
        soup = BeautifulSoup(raw['body'], features = 'html.parser')
        content = soup.find_all('p')[-1].text
--- a/cisticola/transformer/twitter.py
+++ b/cisticola/transformer/twitter.py
@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
    def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
-        raw = json.loads(data.raw_data)
+        raw = json.loads(data.raw_posts)
        transformed = Post(
            raw_id=data.id,