Use new RawChannelInfo class

This commit is contained in:
Logan Williams
2022-03-31 15:17:25 +02:00
parent 61c99d33f6
commit 2dc9213d64
17 changed files with 1698 additions and 79 deletions

View File

@@ -23,6 +23,8 @@ gspread = "*"
cryptg = "*"
gabber = {git = "https://github.com/stanfordio/gabber.git"}
psycopg2-binary = "*"
tqdm = "*"
ratelimit = "*"
[dev-packages]
pytest = "*"
@@ -34,7 +36,7 @@ sphinx = "*"
sphinx_rtd_theme = "*"
[requires]
python_version = "3.9"
python_version = "3.8"
[pipenv]
allow_prereleases = true

1532
Pipfile.lock generated Normal file

File diff suppressed because it is too large Load Diff

33
app.py
View File

@@ -3,6 +3,8 @@ from loguru import logger
import gspread
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os
import time
from cisticola.base import Channel, mapper_registry
from cisticola.scraper import (
@@ -19,7 +21,7 @@ from cisticola.scraper import (
def sync_channels(args):
logger.info("Synchronizing channels")
session = get_db_session(args)
session = get_db_session()
gc = gspread.service_account(filename='service_account.json')
@@ -29,6 +31,7 @@ def sync_channels(args):
row = 2
for c in channels:
logger.info(c)
del c['id']
del c['followers']
@@ -43,20 +46,29 @@ def sync_channels(args):
# check to see if this already exists,
channel = session.query(Channel).filter_by(platform_id=None if c['platform_id'] == '' else c['platform_id'], platform=c['platform'], url=c['url']).first()
platform_id = None
if c['platform_id'] != '':
platform_id = c['platform_id']
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
logger.info(channel)
if not channel:
channel = Channel(**c, source='researcher')
logger.debug(f"{channel} does not exist, adding")
session.add(channel)
session.flush()
session.commit()
wks.update_cell(row, 1, channel.id)
time.sleep(1)
row += 1
session.commit()
def get_db_session(args):
engine = create_engine(args.db)
def get_db_session():
engine = create_engine(os.environ['DB'])
session_generator = sessionmaker()
session_generator.configure(bind=engine)
@@ -64,8 +76,8 @@ def get_db_session(args):
return session
def get_scraper_controller(args):
engine = create_engine(args.db)
def get_scraper_controller():
engine = create_engine(os.environ['DB'])
controller = ScraperController()
controller.connect_to_db(engine)
@@ -90,8 +102,8 @@ def archive_media(args):
# get_scraper_controller() takes no parameters any more (it reads the
# database URL from the ``DB`` environment variable itself), so passing
# ``args`` here would raise TypeError.
controller = get_scraper_controller()
controller.archive_unarchived_media()
def init_db():
    """Create every table registered on ``mapper_registry`` in the database.

    The SQLAlchemy connection string is read from the ``DB`` environment
    variable; a ``KeyError`` is raised if it is not set.
    """
    db_engine = create_engine(os.environ['DB'])
    metadata = mapper_registry.metadata
    metadata.create_all(bind=db_engine)
if __name__ == '__main__':
@@ -99,14 +111,13 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"')
parser.add_argument('--db', type=str, help='[*] Sqlalchemy database string, eg, "sqlite:///cisticola.db"')
parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media')
args = parser.parse_args()
if args.command == 'init-db':
init_db(args)
init_db()
elif args.command == 'sync-channels':
sync_channels(args)
elif args.command == 'scrape-channels':

View File

@@ -34,7 +34,7 @@ class ScraperResult:
date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str
raw_posts: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
@@ -44,7 +44,7 @@ class ScraperResult:
#: Has the media in this post been archived?
media_archived: bool
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
@@ -89,11 +89,31 @@ class Channel:
def hydrate(self):
pass
@dataclass
class RawChannelInfo:
    """A minimally processed channel profile returned by a scraper.
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of channel ID that this was scraped from
    channel: int

    #: JSON dump of dict that contains all profile data scraped for the channel.
    raw_data: str

    #: Datetime (relative to UTC) that the channel info was archived at.
    date_archived: datetime
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
#: ID number of the scraped post in the ``raw_posts`` table
raw_id: int
#: Platform specific post ID
@@ -144,7 +164,7 @@ class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
@@ -221,7 +241,7 @@ class Video(Media):
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('raw_posts', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON),
Column('media_archived', Boolean))
# Table holding raw channel-profile snapshots; each row references the
# ``channels`` table through the ``channel`` foreign key.
raw_channel_info_table = Table(
    'raw_channel_info',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True),
    Column('scraper', String),
    Column('platform', String),
    Column('channel', Integer, ForeignKey('channels.id')),
    Column('raw_data', String),
    Column('date_archived', DateTime),
)
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', Integer),
Column('platform_id', String),
Column('category', String),
Column('platform', String),
Column('url', String),
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -9,7 +9,7 @@ from typing import Generator
import requests
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):

View File

@@ -5,7 +5,7 @@ import os
from gabber.client import Client, GAB_API_BASE_URL
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
platform_id=post['id'],
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = Client(
username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext
return key
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Gettr profile for ``channel`` and wrap it as raw channel info.

    The username is derived from ``channel.url`` and looked up through a
    ``PublicClient``; the returned profile is stored as a JSON string in
    ``raw_data``.
    """
    client = PublicClient()
    username = self.get_username_from_url(channel.url)
    profile = client.user_info(username)

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from loguru import logger
import instaloader
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=post.date_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
raw_posts=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str),
raw_posts=json.dumps(comment_dict, default=str),
archived_urls={},
media_archived=archive_media)
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
profile['followers'] = user_profile.followers
profile['followees'] = user_profile.followees
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from loguru import logger
from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
raw_posts=json.dumps(video.info),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
platform_id=comment.info['claim_id'],
date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
raw_posts=json.dumps(comment.info),
archived_urls={},
media_archived=True)
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
return f'{key}.{ext}'
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Odysee profile for ``channel`` and wrap it as raw channel info.

    The username is derived from ``channel.url``; the ``info`` attribute of
    the resulting ``OdyseeChannel`` is stored as a JSON string in
    ``raw_data``.
    """
    username = self.get_username_from_url(channel.url)
    odysee_channel = OdyseeChannel(channel_name=username,
                                   auth_token=self.auth_token)
    profile = odysee_channel.info

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
platform_id=post['media_url'].split('/')[-2],
date=post['datetime'].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str),
raw_posts=json.dumps(post, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
if channel.platform == "Rumble" and channel.url is not None:
return True
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Rumble profile for ``channel`` and wrap it as raw channel info.

    The profile dict comes from ``get_channel_profile`` called with
    ``channel.url`` and is stored as a JSON string in ``raw_data``.
    """
    profile = get_channel_profile(url=channel.url)

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -128,6 +132,7 @@ def get_channel_profile(url):
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,10 +1,10 @@
from typing import Generator
from datetime import datetime, timezone
import json
import snscrape.modules
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
platform_id=post.url,
date=post.date,
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media
)
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Telegram profile for ``channel`` and wrap it as raw channel info.

    The entity returned by snscrape's ``TelegramChannelScraper`` for
    ``channel.screenname`` is converted to a dict via ``__dict__`` and stored
    as a JSON string in ``raw_data``.
    """
    scr = snscrape.modules.telegram.TelegramChannelScraper(
        channel.screenname)
    # NOTE(review): no guard for _get_entity() returning None - confirm
    # whether a missing channel can reach this point.
    profile = scr._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))

View File

@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None:
raw = json.loads(result.raw_data)
raw = json.loads(result.raw_posts)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
platform_id=post_url,
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
raw_posts=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
full_channel = client(GetFullChannelRequest(channel = username))
profile = full_channel.__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,11 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
from loguru import logger
import json
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper):
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
raw_posts=tweet.json(),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
key = parsed_url.path.split('/')[-1] + ext
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
scraper = TwitterUserScraper(channel.screenname)
entity = scraper._get_entity()
@@ -99,4 +99,8 @@ class TwitterScraper(Scraper):
if entity is None:
raise ChannelDoesNotExistError(channel.url)
else:
return entity.__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(emtity.__dict__),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,10 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
return key
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the VKontakte profile for ``channel`` and wrap it as raw channel info.

    The username is derived from ``channel.url``; the entity returned by
    snscrape's ``VKontakteUserScraper`` is converted to a dict via
    ``__dict__`` and stored as a JSON string in ``raw_data``.
    """
    # Imported locally: this module's top-level imports do not include json,
    # so the json.dumps call below would otherwise raise NameError.
    import json

    username = self.get_username_from_url(channel.url)
    scraper = VKontakteUserScraper(username)
    profile = scraper._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile),
                          date_archived=datetime.now(timezone.utc))

View File

@@ -2,10 +2,9 @@ from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
import yt_dlp
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
platform_id=video_id,
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
raw_posts=json.dumps(video, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info(
channel.url,
process=False)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e:
raise e
return meta

View File

@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
orig = raw['video_url']
new = data.archived_urls[orig]
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
yield m
def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text

View File

@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
transformed = Post(
raw_id=data.id,