diff --git a/.gitignore b/.gitignore index 747d85e..632ac68 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,12 @@ docs/source/_* *.db .env *.session +service_account.json +.vscode/ # Unit test / coverage reports reports .coverage .cache .pytest_cache/ -cover/ \ No newline at end of file +cover/ diff --git a/Pipfile b/Pipfile index aed16c3..47ccb65 100644 --- a/Pipfile +++ b/Pipfile @@ -20,14 +20,19 @@ telethon = "*" pytesseract = "*" pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} instaloader = "*" +gspread = "*" [dev-packages] pytest = "*" pytest-cov = "*" pytest-html = "*" pytest-metadata = "*" +black = "*" sphinx = "*" sphinx_rtd_theme = "*" [requires] python_version = "3.9" + +[pipenv] +allow_prereleases = true diff --git a/cisticola/base.py b/cisticola/base.py index dfaaee8..d5b10e8 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -6,7 +6,7 @@ import json import io from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean import pytesseract import PIL import exiftool @@ -24,8 +24,7 @@ class ScraperResult: #: Name of platform from which result was scraped, e.g. ``"Twitter"``. platform: str - #TODO there is probably a way of making this a Channel object foreign key - #: User-specified integer that uniquely identifies a channel, e.g. ``15``. + #: Foreign key of channel ID that this was scraped from channel: int #: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"`` @@ -42,27 +41,33 @@ class ScraperResult: #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. archived_urls: dict + +raw_data_table = Table('raw_data', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('scraper', String), + Column('platform', String), + Column('channel', Integer, ForeignKey('channels.id')), + Column('platform_id', String), + Column('date', DateTime), + Column('raw_data', String), + Column('date_archived', DateTime), + Column('archived_urls', JSON)) @dataclass class Channel: """Information about a specific channel to be scraped. """ - #: User-specified integer that uniquely identifies a channel, e.g. ``15``. - id: int - #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``. name: str #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. platform_id: str - #: User-specified category for the channel, e.g. ``"qanon-adjacent"``. + #: User-specified category for the channel, e.g. ``"explicit_qanon"``. category: str - #: Number of followers the channel has on the given platform, e.e. ``"1465"``. - followers: int - #: Name of platform the given channel is on, e.g. ``"Telegram"``. platform: str @@ -71,28 +76,55 @@ class Channel: #: Screen name/username of channel. screenname: str - + #: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``. - country: str - - #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms. - influencer: str - + country: str = None + + #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms. + influencer: str = None + #: Whether or not the channel is publicly-accessible. - public: bool - + public: bool = None + #: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message) - chat: bool - + chat: bool = None + #: Any other additional notes about the channel. - notes: str + notes: str = "" + + #: Did the channel come from a researcher or a scraping process? + source: str = None + + def hydrate(self): + pass + +channel_table = Table('channels', mapper_registry.metadata, + Column('id', Integer, primary_key=True, autoincrement=True), + Column('name', String), + Column('platform_id', Integer), + Column('category', String), + Column('platform', String), + Column('url', String), + Column('screenname', String), + Column('country', String), + Column('influencer', String), + Column('public', Boolean), + Column('chat', Boolean), + Column('notes', String), + Column('source', String) + ) + +mapper_registry.map_imperatively(Channel, channel_table) @dataclass -class TransformedResult: +class Post: """An object with fields for columns in the analysis table""" #: ID number of the scraped post in the ``raw_data`` table raw_id: int + + #: Platform specific post ID + platform_id: str #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. scraper: str @@ -111,19 +143,49 @@ class TransformedResult: #: Datetime (relative to UTC) that the scraped post was archived at. date_archived: datetime - + #: URL of the original post url: str #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. author_id: str - + #: Username of author who made post. author_username: str - + #: Text of the original post content: str + #: The ID of the Channel that the post was forwarded or quoted from + forwarded_from: int = None + + #: The ID of the Post that this Post is a reply to or reblog of + reply_to: int = None + + def hydrate(self): + pass + +post_table = Table('posts', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('platform_id', Integer), + Column('scraper', String), + Column('transformer', String), + Column('platform', String), + Column('channel', Integer, ForeignKey('channels.id')), + Column('date', DateTime), + Column('date_archived', DateTime), + Column('url', String), + Column('author_id', String), + Column('author_username', String), + Column('content', String), + Column('forwarded_from', Integer, ForeignKey('channels.id')), + Column('reply_to', Integer, ForeignKey('posts.id')) + ) + +mapper_registry.map_imperatively(Post, post_table) + @dataclass class Media: """Base class for organizing information about a media file. @@ -239,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata, autoincrement=True), Column('type', String), Column('raw_id', Integer, ForeignKey('raw_data.id')), - Column('post', Integer, ForeignKey('analysis.id')), + Column('post', Integer, ForeignKey('posts.id')), Column('url', String), Column('original_url', String), Column('exif', String), diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 28dbe76..ddc5510 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -291,6 +291,17 @@ class ScraperController: """Register a list of Scraper instances to the controller. """ self.scrapers.extend(scraper) + + def scrape_all_channels(self, archive_media: bool = True): + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + + channels = session.query(Channel).where(Channel.source=='researcher').all() + + return self.scrape_channels(channels, archive_media=archive_media) @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): @@ -314,7 +325,6 @@ class ScraperController: for scraper in self.scrapers: if scraper.can_handle(channel): - session = self.session() handled = True added = 0 diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index c019e27..8209282 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -29,25 +29,33 @@ class TwitterScraper(Scraper): archived_urls = {} if archive_media: - + media_list = [] if tweet.media: - for media in tweet.media: - if type(media) == Video: - variant = max( - [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) - url = variant.url - elif type(media) == Gif: - url = media.variants[0].url - elif type(media) == Photo: - url = media.fullUrl - else: - logger.warning(f"Could not get media URL of {media}") - url = None + media_list += tweet.media - if url is not None: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url + if tweet.retweetedTweet and tweet.retweetedTweet.media: + media_list += tweet.retweetedTweet.media + + if tweet.quotedTweet and tweet.quotedTweet.media: + media_list += tweet.quotedTweet.media + + for media in media_list: + if type(media) == Video: + variant = max( + [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) + url = variant.url + elif type(media) == Gif: + url = media.variants[0].url + elif type(media) == Photo: + url = media.fullUrl + else: + logger.warning(f"Could not get media URL of {media}") + url = None + + if url is not None and url not in archived_urls: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index ce22f03..38da38a 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,9 +1,11 @@ -from typing import List, Generator +from typing import List, Generator, Union, Callable from loguru import logger -from sqlalchemy.orm import sessionmaker +from sqlalchemy.orm import sessionmaker, make_transient from sqlalchemy.engine.base import Engine +from collections import defaultdict + +from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry -from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry class Transformer: """Interface class for transformers.""" @@ -16,12 +18,12 @@ class Transformer: def can_handle(data: ScraperResult) -> bool: """Specifies whether or not a Transformer is capable of handling a particular piece of scraped data. - + Parameters ---------- data : ScraperResult The ScraperResult object to check for ability to handle. - + Returns ------- bool @@ -30,39 +32,18 @@ class Transformer: pass - def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: - """Yields Media objects from each piece of media present in a raw ScraperResult. - - Parameters - ---------- - data : ScraperResult - The ScraperResult object to process - transformed : TransformedResult - The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`) + def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: + """Transform a ScraperResult into objects with additional parameters for analysis. This function can + yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel + objects and provide all of these to be inserted into the database. - Yields - ------ - Media - A media object generated from the ScraperResult. One ScraperResult can have multiple pieces - of media contained within it, so this can generate an arbitrary number of Media objects - (or their subclasses.) These Media objects are not fully hydrated. - """ - - pass - - def transform(data: ScraperResult) -> TransformedResult: - """Transform a ScraperResult into a TransformedResult object. This extracts additional attributes - that can be used directly for analysis. - Parameters ---------- data : ScraperResult The ScraperResult object to process. - - Returns - ------- - TransformedResult - A TransformedResult representation of the `data` object. + insert : Callable + A function that either inserts the object into a database or finds an object with the + relevant unique constraints if applicable. """ pass @@ -78,7 +59,7 @@ class ETLController: def register_transformer(self, transformer: Transformer): """Adds a Transformer to the list of available Transformers. - + Parameters ---------- transformer : Transformer @@ -89,7 +70,7 @@ class ETLController: def connect_to_db(self, engine: Engine): """Connects the ETLController to a SQLAlchemy engine. - + Parameters ---------- engine : Engine @@ -101,11 +82,59 @@ class ETLController: self.session = sessionmaker() self.session.configure(bind=engine) - @logger.catch(reraise = True) + def insert_or_select(self, obj, session, hydrate: bool = True): + """Inserts an object into the database or returns an existing object from the database. + Regardless, the resulting object has an `id` attribute that can be referenced later.""" + + instance = None + + # This is using some adhoc unique constraints that might be worth formalizing at some point + if type(obj) == Channel: + instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first() + + elif type(obj) == Post: + instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() + + elif issubclass(type(obj), Media): + instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first() + if instance: + logger.info(f"Found matching DB entry for {obj}: {instance}") + return instance + + instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first() + + # For Media objects we want to duplicate the entry to preserve the relationship with the post. + # However, we also want to avoid rehydration, hence the code below: + if instance: + logger.info(f"Found matching media record, duplicating and inserting for new post") + + session.expunge(instance) + make_transient(instance) + instance.id = None + instance.post = obj.post + instance.raw_id = obj.raw_id + + session.add(instance) + session.flush() + return instance + + if instance: + logger.info(f"Found matching DB entry for {obj}: {instance}") + return instance + + if hydrate: + obj.hydrate() + + logger.info(f"Inserting new object {obj}") + session.add(obj) + session.flush() + return obj + + @logger.catch(reraise=True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): - """Transforms raw ScraperResults objects into TransformedResult objects and + """Transforms raw ScraperResults objects into Post objects and Media objects. Then, adds them to the database. - + Parameters ---------- results : List[ScraperResult] @@ -126,34 +155,18 @@ class ETLController: handled = True session = self.session() - transformed = transformer.transform(result) - - session.add(transformed) - session.flush() - - media = transformer.transform_media(result, transformed) - - count = 0 - for obj in media: - if hydrate: - logger.info(f"Hydrating {obj}") - obj.hydrate() - - session.add(obj) - count += 1 - + transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate)) session.commit() - logger.info(f"{transformer} generated {count} media objects") break if handled == False: logger.warning(f"No Transformer could handle {result}") - @logger.catch(reraise = True) + @logger.catch(reraise=True) def transform_all_untransformed(self, hydrate: bool = True): """Transform all ScraperResult objects in the database that do not have an - equivalent TransformedResult object stored. - + equivalent Post object stored. + Parameters ---------- hydrate : bool @@ -165,7 +178,12 @@ class ETLController: return session = self.session() - untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all() + untransformed = ( + session.query(ScraperResult) + .join(Post, isouter=True) + .where(Post.raw_id == None) + .all() + ) logger.info(f"Found {len(untransformed)} items to ETL") - self.transform_results(untransformed, hydrate=hydrate) \ No newline at end of file + self.transform_results(untransformed, hydrate=hydrate) diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index de6f0a7..d0c5fe0 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -5,7 +5,7 @@ from typing import Generator from bs4 import BeautifulSoup from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media +from cisticola.base import ScraperResult, Post, Image, Video, Media class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" @@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer): return False - def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: raw = json.loads(data.raw_data) orig = raw['video_url'] @@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer): yield m - def transform(self, data: ScraperResult) -> TransformedResult: + def transform(self, data: ScraperResult) -> Post: raw = json.loads(data.raw_data) soup = BeautifulSoup(raw['body'], features = 'html.parser') content = soup.find_all('p')[-1].text - transformed = TransformedResult( + transformed = Post( raw_id=data.id, scraper=data.scraper, transformer=self.__version__, diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 6c0838c..85ada05 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -1,9 +1,10 @@ import json from loguru import logger -from typing import Generator +from typing import Generator, Union, Callable +import dateutil.parser from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media +from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" @@ -17,11 +18,9 @@ class TwitterTransformer(Transformer): return False - def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: - raw = json.loads(data.raw_data) - - if raw['media']: - for media in raw['media']: + def process_media(self, tweet, post_id, data): + if tweet['media']: + for media in tweet['media']: orig = None if media["_type"] == "snscrape.modules.twitter.Photo": @@ -40,26 +39,77 @@ class TwitterTransformer(Transformer): new = data.archived_urls[orig] if media["_type"] == "snscrape.modules.twitter.Photo": - m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig) else: - m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig) yield m - def transform(self, data: ScraperResult) -> TransformedResult: + + def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) - transformed = TransformedResult( + transformed = Post( raw_id=data.id, + platform_id=raw['id'], scraper=data.scraper, transformer=self.__version__, platform=data.platform, channel=data.channel, - date=data.date, + date=dateutil.parser.parse(raw['date']), date_archived=data.date_archived, url=raw['url'], content=raw['content'], author_id=raw['user']['id'], author_username=raw['user']['username']) - return transformed + def subtweet(tweet): + channel = Channel( + name=tweet['user']['displayname'], + platform_id=tweet['user']['id'], + platform=data.platform, + url=tweet['user']['url'], + screenname=tweet['user']['username'], + category='forwarded', + source=self.__version__ + ) + + channel = insert(channel) + + original = Post( + raw_id=data.id, + platform_id=tweet['id'], + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=channel.id, + date=dateutil.parser.parse(tweet['date']), + date_archived=data.date_archived, + url=tweet['url'], + content=tweet['content'], + author_id=tweet['user']['id'], + author_username=tweet['user']['username'] + ) + + original = insert(original) + transformed.forwarded_from = channel.id + transformed.reply_to = original.id + + media = self.process_media(tweet, original.id, data) + for m in media: + insert(m) + + if raw['retweetedTweet'] is not None: + subtweet(raw['retweetedTweet']) + + if raw['quotedTweet'] is not None: + subtweet(raw['quotedTweet']) + + insert(transformed) + + media = self.process_media(raw, transformed.id, data) + for m in media: + insert(m) + + + diff --git a/test.py b/test.py index ade611c..131fd71 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,10 @@ from sqlalchemy import create_engine from loguru import logger +import gspread +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker -from cisticola.base import Channel, TransformedResult, ScraperResult +from cisticola.base import Channel, Post, ScraperResult, mapper_registry from cisticola.scraper import ( ScraperController, BitchuteScraper, @@ -14,26 +17,9 @@ from cisticola.scraper import ( TwitterScraper) from cisticola.transformer import ETLController from cisticola.transformer.twitter import TwitterTransformer -from sqlalchemy.orm import sessionmaker logger.add("../test.log") -test_channels = [ - Channel( - id=0, - name="L Weber (test)", - platform_id=1424979017749442595, - category="test", - followers=None, - platform="Twitter", - url="https://twitter.com/LWeber33662141", - screenname="LWeber33662141", - country="US", - influencer=None, - public=True, - chat=False, - notes="")] - controller = ScraperController() scrapers = [ @@ -49,9 +35,35 @@ scrapers = [ controller.register_scrapers(scrapers) engine = create_engine('sqlite:///test.db') -controller.connect_to_db(engine) +mapper_registry.metadata.create_all(bind=engine) +session_generator = sessionmaker() +session_generator.configure(bind=engine) +session = session_generator() -controller.scrape_channels(test_channels, archive_media = True) +gc = gspread.service_account(filename='service_account.json') + +# Open a sheet from a spreadsheet in one go +wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0") +channels = wks.worksheet("channels").get_all_records() + +for c in channels: + del c['followers'] + + for k in c.keys(): + if c[k] == 'TRUE': c[k] = True + if c[k] == 'FALSE': c[k] = False + + # check to see if this already exists, + channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first() + + if not channel: + channel = Channel(**c, source='researcher') + session.add(channel) + +session.commit() + +controller.connect_to_db(engine) +controller.scrape_all_channels(archive_media = True) transformer = TwitterTransformer() diff --git a/tests/conftest.py b/tests/conftest.py index 7703639..962fbed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,11 +8,9 @@ from cisticola.transformer import ETLController #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# BITCHUTE_CHANNEL_KWARGS = { - 'id': 0, 'name': 'bestonlinejewelrystoresusa@gmail.com (test)', 'platform_id': 'bestonlinejewelrystoresusagmailcom', 'category': 'test', - 'followers': None, 'platform': 'Bitchute', 'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/', 'screenname': None, @@ -23,11 +21,9 @@ BITCHUTE_CHANNEL_KWARGS = { 'notes': ''} GAB_CHANNEL_KWARGS = { - 'id': 1, 'name': 'Capt. Marc Simon (test)', 'platform_id': 'marc_capt', 'category': 'test', - 'followers': None, 'platform': 'Gab', 'url': 'https://gab.com/marc_capt', 'screenname': 'marc_capt', @@ -38,11 +34,9 @@ GAB_CHANNEL_KWARGS = { 'notes': ''} GETTR_CHANNEL_KWARGS = { - 'id': 2, 'name': 'LizardRepublic (test)', 'platform_id': 'lizardrepublic', 'category': 'test', - 'followers': None, 'platform': 'Gettr', 'url': 'https://www.gettr.com/user/lizardrepublic', 'screenname': 'lizardrepublic', @@ -53,11 +47,9 @@ GETTR_CHANNEL_KWARGS = { 'notes': ''} INSTAGRAM_CHANNEL_KWARGS = { - 'id': 3, 'name': 'borland.88 (test)', 'platform_id': 'borland.88', 'category': 'test', - 'followers': None, 'platform': 'Instagram', 'url': 'https://www.instagram.com/borland.88/', 'screenname': 'borland.88', @@ -68,11 +60,9 @@ INSTAGRAM_CHANNEL_KWARGS = { 'notes': ''} ODYSEE_CHANNEL_KWARGS = { - 'id': 4, 'name': "Mak1n' Bacon (test)", 'platform_id': 'Mak1nBacon', 'category': 'test', - 'followers': None, 'platform': 'Odysee', 'url': 'https://odysee.com/@Mak1nBacon', 'screenname': 'Mak1nBacon', @@ -83,11 +73,9 @@ ODYSEE_CHANNEL_KWARGS = { 'notes': ''} RUMBLE_CHANNEL_KWARGS = { - 'id': 5, 'name': 'we are uploading videos wow products', 'platform_id': 'c-916305', 'category': 'test', - 'followers': None, 'platform': 'Rumble', 'url': 'https://rumble.com/c/c-916305', 'screenname': 'we are uploading', @@ -98,11 +86,9 @@ RUMBLE_CHANNEL_KWARGS = { 'notes': ''} TELEGRAM_CHANNEL_KWARGS = { - 'id': 6, 'name': 'South West Ohio Proud Boys (test)', 'platform_id': -1001276612436, 'category': 'test', - 'followers': None, 'platform': 'Telegram', 'url': 'https://t.me/SouthwestOhioPB', 'screenname': 'SouthwestOhioPB', @@ -113,11 +99,9 @@ TELEGRAM_CHANNEL_KWARGS = { 'notes': ''} TWITTER_CHANNEL_KWARGS = { - 'id': 7, 'name': 'L Weber (test)', 'platform_id': 1424979017749442595, 'category': 'test', - 'followers': None, 'platform': 'Twitter', 'url': 'https://twitter.com/LWeber33662141', 'screenname': 'LWeber33662141', @@ -128,11 +112,9 @@ TWITTER_CHANNEL_KWARGS = { 'notes': ''} VKONTAKTE_CHANNEL_KWARGS = { - 'id': 8, 'name': 'Wwg1wgA (test)', 'platform_id': 'club201278078', 'category': 'test', - 'followers': None, 'platform': 'Vkontakte', 'url': 'https://vk.com/club201278078', 'screenname': 'Wwg1wgA', @@ -143,11 +125,9 @@ VKONTAKTE_CHANNEL_KWARGS = { 'notes': ''} YOUTUBE_CHANNEL_KWARGS = { - 'id': 9, 'name': 'AnEs87 (test)', 'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA', 'category': 'test', - 'followers': None, 'platform': 'Youtube', 'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA', 'screenname': 'AnEs87', diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index 4ddeb68..fd95bbe 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -4,7 +4,7 @@ import json from cisticola.base import Channel from cisticola.scraper import TwitterScraper from cisticola.transformer import TwitterTransformer -from cisticola.base import TransformedResult, Media +from cisticola.base import Post, Media def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): controller.reset_db() @@ -20,11 +20,11 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): sessionfactory.configure(bind=engine) session = sessionfactory() - posts = session.query(TransformedResult).all() + posts = session.query(Post).all() media = session.query(Media).all() - assert len(posts) == 3 - assert len(media) == 2 + assert len(posts) == 10 + assert len(media) == 7 - assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6" + assert posts[-1].content == "BARN" assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file