Merge pull request #17 from bellingcat/channel-db

Add Channel object to ORM, store in DB
This commit is contained in:
Tristan Lee
2022-03-24 13:07:03 -05:00
committed by GitHub
11 changed files with 314 additions and 167 deletions

4
.gitignore vendored
View File

@@ -9,10 +9,12 @@ docs/source/_*
*.db
.env
*.session
service_account.json
.vscode/
# Unit test / coverage reports
reports
.coverage
.cache
.pytest_cache/
cover/
cover/

View File

@@ -20,14 +20,19 @@ telethon = "*"
pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
instaloader = "*"
gspread = "*"
[dev-packages]
pytest = "*"
pytest-cov = "*"
pytest-html = "*"
pytest-metadata = "*"
black = "*"
sphinx = "*"
sphinx_rtd_theme = "*"
[requires]
python_version = "3.9"
[pipenv]
allow_prereleases = true

View File

@@ -6,7 +6,7 @@ import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract
import PIL
import exiftool
@@ -24,8 +24,7 @@ class ScraperResult:
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#TODO there is probably a way of making this a Channel object foreign key
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
#: Foreign key of channel ID that this was scraped from
channel: int
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
@@ -42,27 +41,33 @@ class ScraperResult:
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
# Schema of the ``raw_data`` table: one row per scraped item, keyed by an
# auto-incrementing integer ``id``. The columns mirror the fields declared on
# ``ScraperResult`` (presumably mapped to it elsewhere in this file -- confirm).
# ``channel`` is a foreign key to ``channels.id``; ``raw_data`` holds the
# scraped payload as a string and ``archived_urls`` is stored as JSON.
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
"""
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
id: int
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"🕊Редакция Президент Гордон🕊"``.
name: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
platform_id: str
#: User-specified category for the channel, e.g. ``"qanon-adjacent"``.
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
category: str
#: Number of followers the channel has on the given platform, e.g. ``1465``.
followers: int
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
platform: str
@@ -71,28 +76,55 @@ class Channel:
#: Screen name/username of channel.
screenname: str
#: Two-letter country code for the country of origin for the channel, e.g. ``"RU"``.
country: str
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str
country: str = None
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str = None
#: Whether or not the channel is publicly-accessible.
public: bool
public: bool = None
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool
chat: bool = None
#: Any other additional notes about the channel.
notes: str
notes: str = ""
#: Did the channel come from a researcher or a scraping process?
source: str = None
def hydrate(self):
pass
# Schema of the ``channels`` table that ``Channel`` objects are stored in.
#
# ``platform_id`` is a String column: ``Channel.platform_id`` is declared as
# ``str`` (e.g. ``"-1001101170442"``), and ``raw_data.platform_id`` is a String
# column as well. Declaring it as Integer would not match non-numeric platform
# identifiers.
channel_table = Table(
    'channels',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('name', String),
    Column('platform_id', String),
    Column('category', String),
    Column('platform', String),
    Column('url', String),
    Column('screenname', String),
    Column('country', String),
    Column('influencer', String),
    Column('public', Boolean),
    Column('chat', Boolean),
    Column('notes', String),
    # Provenance of the channel record, e.g. 'researcher' or a transformer version.
    Column('source', String),
)
mapper_registry.map_imperatively(Channel, channel_table)
@dataclass
class TransformedResult:
class Post:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
raw_id: int
#: Platform specific post ID
platform_id: str
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
@@ -111,19 +143,49 @@ class TransformedResult:
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: URL of the original post
url: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
#: Text of the original post
content: str
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None
#: The ID of the Post that this Post is a reply to or reblog of
reply_to: int = None
def hydrate(self):
pass
# Schema of the ``posts`` table that ``Post`` objects are stored in.
#
# ``platform_id`` is a String column: ``Post.platform_id`` is declared as
# ``str``, and the same identifier is stored as a String in
# ``raw_data.platform_id``. Declaring it as Integer would not match
# non-numeric platform identifiers.
post_table = Table(
    'posts',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    # Row in ``raw_data`` that this post was derived from.
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    Column('platform_id', String),
    Column('scraper', String),
    Column('transformer', String),
    Column('platform', String),
    Column('channel', Integer, ForeignKey('channels.id')),
    Column('date', DateTime),
    Column('date_archived', DateTime),
    Column('url', String),
    Column('author_id', String),
    Column('author_username', String),
    Column('content', String),
    # Channel the post was forwarded or quoted from, if any.
    Column('forwarded_from', Integer, ForeignKey('channels.id')),
    # Self-reference: the post this one replies to or reblogs, if any.
    Column('reply_to', Integer, ForeignKey('posts.id')),
)
mapper_registry.map_imperatively(Post, post_table)
@dataclass
class Media:
"""Base class for organizing information about a media file.
@@ -239,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('post', Integer, ForeignKey('analysis.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
Column('exif', String),

View File

@@ -291,6 +291,17 @@ class ScraperController:
"""Register a list of Scraper instances to the controller.
"""
self.scrapers.extend(scraper)
def scrape_all_channels(self, archive_media: bool = True):
    """Scrape every channel in the database whose ``source`` is ``'researcher'``.

    Parameters
    ----------
    archive_media : bool
        Passed through unchanged to ``scrape_channels``.

    Returns
    -------
    Whatever ``scrape_channels`` returns, or ``None`` (after logging an error)
    when no database session factory has been configured.
    """
    session_factory = self.session
    if session_factory is None:
        logger.error("No DB session")
        return

    db = session_factory()
    researcher_channels = (
        db.query(Channel)
        .where(Channel.source == 'researcher')
        .all()
    )
    return self.scrape_channels(researcher_channels, archive_media=archive_media)
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
@@ -314,7 +325,6 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.can_handle(channel):
session = self.session()
handled = True
added = 0

View File

@@ -29,25 +29,33 @@ class TwitterScraper(Scraper):
archived_urls = {}
if archive_media:
media_list = []
if tweet.media:
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
media_list += tweet.media
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
if tweet.retweetedTweet and tweet.retweetedTweet.media:
media_list += tweet.retweetedTweet.media
if tweet.quotedTweet and tweet.quotedTweet.media:
media_list += tweet.quotedTweet.media
for media in media_list:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None and url not in archived_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,

View File

@@ -1,9 +1,11 @@
from typing import List, Generator
from typing import List, Generator, Union, Callable
from loguru import logger
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import sessionmaker, make_transient
from sqlalchemy.engine.base import Engine
from collections import defaultdict
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
class Transformer:
"""Interface class for transformers."""
@@ -16,12 +18,12 @@ class Transformer:
def can_handle(data: ScraperResult) -> bool:
"""Specifies whether or not a Transformer is capable of handling a particular
piece of scraped data.
Parameters
----------
data : ScraperResult
The ScraperResult object to check for ability to handle.
Returns
-------
bool
@@ -30,39 +32,18 @@ class Transformer:
pass
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
"""Yields Media objects from each piece of media present in a raw ScraperResult.
Parameters
----------
data : ScraperResult
The ScraperResult object to process
transformed : TransformedResult
The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`)
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
objects and provide all of these to be inserted into the database.
Yields
------
Media
A media object generated from the ScraperResult. One ScraperResult can have multiple pieces
of media contained within it, so this can generate an arbitrary number of Media objects
(or their subclasses.) These Media objects are not fully hydrated.
"""
pass
def transform(data: ScraperResult) -> TransformedResult:
"""Transform a ScraperResult into a TransformedResult object. This extracts additional attributes
that can be used directly for analysis.
Parameters
----------
data : ScraperResult
The ScraperResult object to process.
Returns
-------
TransformedResult
A TransformedResult representation of the `data` object.
insert : Callable
A function that either inserts the object into a database or finds an object with the
relevant unique constraints if applicable.
"""
pass
@@ -78,7 +59,7 @@ class ETLController:
def register_transformer(self, transformer: Transformer):
"""Adds a Transformer to the list of available Transformers.
Parameters
----------
transformer : Transformer
@@ -89,7 +70,7 @@ class ETLController:
def connect_to_db(self, engine: Engine):
"""Connects the ETLController to a SQLAlchemy engine.
Parameters
----------
engine : Engine
@@ -101,11 +82,59 @@ class ETLController:
self.session = sessionmaker()
self.session.configure(bind=engine)
@logger.catch(reraise = True)
def insert_or_select(self, obj, session, hydrate: bool = True):
    """Return a persisted counterpart of ``obj``, inserting ``obj`` if none exists.

    The lookup rules are ad hoc uniqueness checks, not database constraints:

    * ``Channel``: matched on ``url``, ``platform_id`` and ``platform``.
    * ``Post``: matched on ``platform`` and ``platform_id``.
    * ``Media`` subclasses: matched on ``original_url`` and ``post``. If only
      ``original_url`` matches, the stored row is copied for the new post so
      the file does not have to be hydrated again.

    Parameters
    ----------
    obj
        The object to look up or insert.
    session
        SQLAlchemy session used for the queries and the insert.
    hydrate : bool
        Whether to call ``obj.hydrate()`` before inserting a new object.

    Returns
    -------
    The matching or newly inserted object; after the flush it carries an
    ``id`` that later objects can reference.
    """
    kind = type(obj)
    match = None

    if kind is Channel:
        match = session.query(Channel).filter_by(
            url=obj.url, platform_id=obj.platform_id, platform=obj.platform
        ).first()
    elif kind is Post:
        match = session.query(Post).filter_by(
            platform=obj.platform, platform_id=obj.platform_id
        ).first()
    elif issubclass(kind, Media):
        media_query = session.query(kind)

        # Exact hit: same original URL attached to the same post.
        same_post = media_query.filter_by(original_url=obj.original_url, post=obj.post).first()
        if same_post:
            logger.info(f"Found matching DB entry for {obj}: {same_post}")
            return same_post

        # Same file already stored for another post: detach the stored row,
        # clear its primary key and re-insert it pointing at the new post.
        # This keeps the post/media relationship without rehydrating.
        match = session.query(kind).filter_by(original_url=obj.original_url).first()
        if match:
            logger.info(f"Found matching media record, duplicating and inserting for new post")
            session.expunge(match)
            make_transient(match)
            match.id = None
            match.post = obj.post
            match.raw_id = obj.raw_id
            session.add(match)
            session.flush()
            return match

    if match:
        logger.info(f"Found matching DB entry for {obj}: {match}")
        return match

    # Nothing stored yet: optionally hydrate, then insert and flush so that
    # the object receives its id.
    if hydrate:
        obj.hydrate()

    logger.info(f"Inserting new object {obj}")
    session.add(obj)
    session.flush()
    return obj
@logger.catch(reraise=True)
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
"""Transforms raw ScraperResults objects into TransformedResult objects and
"""Transforms raw ScraperResults objects into Post objects and
Media objects. Then, adds them to the database.
Parameters
----------
results : List[ScraperResult]
@@ -126,34 +155,18 @@ class ETLController:
handled = True
session = self.session()
transformed = transformer.transform(result)
session.add(transformed)
session.flush()
media = transformer.transform_media(result, transformed)
count = 0
for obj in media:
if hydrate:
logger.info(f"Hydrating {obj}")
obj.hydrate()
session.add(obj)
count += 1
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate))
session.commit()
logger.info(f"{transformer} generated {count} media objects")
break
if handled == False:
logger.warning(f"No Transformer could handle {result}")
@logger.catch(reraise = True)
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):
"""Transform all ScraperResult objects in the database that do not have an
equivalent TransformedResult object stored.
equivalent Post object stored.
Parameters
----------
hydrate : bool
@@ -165,7 +178,12 @@ class ETLController:
return
session = self.session()
untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all()
untransformed = (
session.query(ScraperResult)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.all()
)
logger.info(f"Found {len(untransformed)} items to ETL")
self.transform_results(untransformed, hydrate=hydrate)
self.transform_results(untransformed, hydrate=hydrate)

View File

@@ -5,7 +5,7 @@ from typing import Generator
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
from cisticola.base import ScraperResult, Post, Image, Video, Media
class BitchuteTransformer(Transformer):
"""A Bitchute-specific Transformer, with methods for ETL/transforming ScraperResult objects"""
@@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
@@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer):
yield m
def transform(self, data: ScraperResult) -> TransformedResult:
def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text
transformed = TransformedResult(
transformed = Post(
raw_id=data.id,
scraper=data.scraper,
transformer=self.__version__,

View File

@@ -1,9 +1,10 @@
import json
from loguru import logger
from typing import Generator
from typing import Generator, Union, Callable
import dateutil.parser
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
class TwitterTransformer(Transformer):
"""A Twitter-specific Transformer, with methods for ETL/transforming ScraperResult objects"""
@@ -17,11 +18,9 @@ class TwitterTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
if raw['media']:
for media in raw['media']:
def process_media(self, tweet, post_id, data):
if tweet['media']:
for media in tweet['media']:
orig = None
if media["_type"] == "snscrape.modules.twitter.Photo":
@@ -40,26 +39,77 @@ class TwitterTransformer(Transformer):
new = data.archived_urls[orig]
if media["_type"] == "snscrape.modules.twitter.Photo":
m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
else:
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig)
yield m
def transform(self, data: ScraperResult) -> TransformedResult:
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = TransformedResult(
transformed = Post(
raw_id=data.id,
platform_id=raw['id'],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
channel=data.channel,
date=data.date,
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
url=raw['url'],
content=raw['content'],
author_id=raw['user']['id'],
author_username=raw['user']['username'])
return transformed
def subtweet(tweet):
    """Store the author and post of an embedded tweet and link them to the outer post.

    ``tweet`` is the dict for a retweeted or quoted tweet. Its author is
    inserted (or looked up) as a ``Channel`` with category ``'forwarded'``,
    and the tweet itself as a ``Post`` on that channel. The outer
    ``transformed`` post is then pointed at both through ``forwarded_from``
    and ``reply_to``, and the embedded tweet's media are inserted against the
    embedded post.
    """
    author = tweet['user']

    source_channel = insert(Channel(
        name=author['displayname'],
        platform_id=author['id'],
        platform=data.platform,
        url=author['url'],
        screenname=author['username'],
        category='forwarded',
        source=self.__version__
    ))

    embedded_post = insert(Post(
        raw_id=data.id,
        platform_id=tweet['id'],
        scraper=data.scraper,
        transformer=self.__version__,
        platform=data.platform,
        channel=source_channel.id,
        date=dateutil.parser.parse(tweet['date']),
        date_archived=data.date_archived,
        url=tweet['url'],
        content=tweet['content'],
        author_id=author['id'],
        author_username=author['username']
    ))

    # Link the outer post to where its content came from.
    transformed.forwarded_from = source_channel.id
    transformed.reply_to = embedded_post.id

    for item in self.process_media(tweet, embedded_post.id, data):
        insert(item)
if raw['retweetedTweet'] is not None:
subtweet(raw['retweetedTweet'])
if raw['quotedTweet'] is not None:
subtweet(raw['quotedTweet'])
insert(transformed)
media = self.process_media(raw, transformed.id, data)
for m in media:
insert(m)

52
test.py
View File

@@ -1,7 +1,10 @@
from sqlalchemy import create_engine
from loguru import logger
import gspread
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, TransformedResult, ScraperResult
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
@@ -14,26 +17,9 @@ from cisticola.scraper import (
TwitterScraper)
from cisticola.transformer import ETLController
from cisticola.transformer.twitter import TwitterTransformer
from sqlalchemy.orm import sessionmaker
logger.add("../test.log")
test_channels = [
Channel(
id=0,
name="L Weber (test)",
platform_id=1424979017749442595,
category="test",
followers=None,
platform="Twitter",
url="https://twitter.com/LWeber33662141",
screenname="LWeber33662141",
country="US",
influencer=None,
public=True,
chat=False,
notes="")]
controller = ScraperController()
scrapers = [
@@ -49,9 +35,35 @@ scrapers = [
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test.db')
controller.connect_to_db(engine)
mapper_registry.metadata.create_all(bind=engine)
session_generator = sessionmaker()
session_generator.configure(bind=engine)
session = session_generator()
controller.scrape_channels(test_channels, archive_media = True)
gc = gspread.service_account(filename='service_account.json')
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0")
channels = wks.worksheet("channels").get_all_records()
for c in channels:
del c['followers']
for k in c.keys():
if c[k] == 'TRUE': c[k] = True
if c[k] == 'FALSE': c[k] = False
# check to see if this already exists,
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
if not channel:
channel = Channel(**c, source='researcher')
session.add(channel)
session.commit()
controller.connect_to_db(engine)
controller.scrape_all_channels(archive_media = True)
transformer = TwitterTransformer()

View File

@@ -8,11 +8,9 @@ from cisticola.transformer import ETLController
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
BITCHUTE_CHANNEL_KWARGS = {
'id': 0,
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
'platform_id': 'bestonlinejewelrystoresusagmailcom',
'category': 'test',
'followers': None,
'platform': 'Bitchute',
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
'screenname': None,
@@ -23,11 +21,9 @@ BITCHUTE_CHANNEL_KWARGS = {
'notes': ''}
GAB_CHANNEL_KWARGS = {
'id': 1,
'name': 'Capt. Marc Simon (test)',
'platform_id': 'marc_capt',
'category': 'test',
'followers': None,
'platform': 'Gab',
'url': 'https://gab.com/marc_capt',
'screenname': 'marc_capt',
@@ -38,11 +34,9 @@ GAB_CHANNEL_KWARGS = {
'notes': ''}
GETTR_CHANNEL_KWARGS = {
'id': 2,
'name': 'LizardRepublic (test)',
'platform_id': 'lizardrepublic',
'category': 'test',
'followers': None,
'platform': 'Gettr',
'url': 'https://www.gettr.com/user/lizardrepublic',
'screenname': 'lizardrepublic',
@@ -53,11 +47,9 @@ GETTR_CHANNEL_KWARGS = {
'notes': ''}
INSTAGRAM_CHANNEL_KWARGS = {
'id': 3,
'name': 'borland.88 (test)',
'platform_id': 'borland.88',
'category': 'test',
'followers': None,
'platform': 'Instagram',
'url': 'https://www.instagram.com/borland.88/',
'screenname': 'borland.88',
@@ -68,11 +60,9 @@ INSTAGRAM_CHANNEL_KWARGS = {
'notes': ''}
ODYSEE_CHANNEL_KWARGS = {
'id': 4,
'name': "Mak1n' Bacon (test)",
'platform_id': 'Mak1nBacon',
'category': 'test',
'followers': None,
'platform': 'Odysee',
'url': 'https://odysee.com/@Mak1nBacon',
'screenname': 'Mak1nBacon',
@@ -83,11 +73,9 @@ ODYSEE_CHANNEL_KWARGS = {
'notes': ''}
RUMBLE_CHANNEL_KWARGS = {
'id': 5,
'name': 'we are uploading videos wow products',
'platform_id': 'c-916305',
'category': 'test',
'followers': None,
'platform': 'Rumble',
'url': 'https://rumble.com/c/c-916305',
'screenname': 'we are uploading',
@@ -98,11 +86,9 @@ RUMBLE_CHANNEL_KWARGS = {
'notes': ''}
TELEGRAM_CHANNEL_KWARGS = {
'id': 6,
'name': 'South West Ohio Proud Boys (test)',
'platform_id': -1001276612436,
'category': 'test',
'followers': None,
'platform': 'Telegram',
'url': 'https://t.me/SouthwestOhioPB',
'screenname': 'SouthwestOhioPB',
@@ -113,11 +99,9 @@ TELEGRAM_CHANNEL_KWARGS = {
'notes': ''}
TWITTER_CHANNEL_KWARGS = {
'id': 7,
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
'category': 'test',
'followers': None,
'platform': 'Twitter',
'url': 'https://twitter.com/LWeber33662141',
'screenname': 'LWeber33662141',
@@ -128,11 +112,9 @@ TWITTER_CHANNEL_KWARGS = {
'notes': ''}
VKONTAKTE_CHANNEL_KWARGS = {
'id': 8,
'name': 'Wwg1wgA (test)',
'platform_id': 'club201278078',
'category': 'test',
'followers': None,
'platform': 'Vkontakte',
'url': 'https://vk.com/club201278078',
'screenname': 'Wwg1wgA',
@@ -143,11 +125,9 @@ VKONTAKTE_CHANNEL_KWARGS = {
'notes': ''}
YOUTUBE_CHANNEL_KWARGS = {
'id': 9,
'name': 'AnEs87 (test)',
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
'category': 'test',
'followers': None,
'platform': 'Youtube',
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
'screenname': 'AnEs87',

View File

@@ -4,7 +4,7 @@ import json
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
from cisticola.transformer import TwitterTransformer
from cisticola.base import TransformedResult, Media
from cisticola.base import Post, Media
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
@@ -20,11 +20,11 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(TransformedResult).all()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 3
assert len(media) == 2
assert len(posts) == 10
assert len(media) == 7
assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
assert posts[-1].content == "BARN"
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"