From fa516da763e39db700bed3e5d447284e72910f78 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 22 Mar 2022 11:41:55 +0100 Subject: [PATCH] Rename TransformedResult to the clearer Post --- cisticola/base.py | 10 +++++----- cisticola/transformer/base.py | 16 ++++++++-------- cisticola/transformer/bitchute.py | 8 ++++---- cisticola/transformer/twitter.py | 8 ++++---- test.py | 2 +- tests/transformer/twitter.py | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index ffcbfa4..b58926b 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -80,7 +80,7 @@ channel_table = Table('channels', mapper_registry.metadata, mapper_registry.map_imperatively(Channel, channel_table) @dataclass -class TransformedResult: +class Post: """An object with fields for columns in the analysis table""" raw_id: int platform_id: str @@ -102,7 +102,7 @@ class TransformedResult: -analysis_table = Table('analysis', mapper_registry.metadata, +post_table = Table('posts', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), Column('raw_id', Integer, ForeignKey('raw_data.id')), @@ -118,10 +118,10 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('author_username', String), Column('content', String), Column('forwarded_from', Integer, ForeignKey('channels.id')), - Column('reply_to', Integer, ForeignKey('analysis.id')) + Column('reply_to', Integer, ForeignKey('posts.id')) ) -mapper_registry.map_imperatively(TransformedResult, analysis_table) +mapper_registry.map_imperatively(Post, post_table) @dataclass class Media: @@ -177,7 +177,7 @@ media_table = Table('media', mapper_registry.metadata, autoincrement=True), Column('type', String), Column('raw_id', Integer, ForeignKey('raw_data.id')), - Column('post', Integer, ForeignKey('analysis.id')), + Column('post', Integer, ForeignKey('posts.id')), Column('url', String), Column('original_url', String), Column('exif', String), diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 2916f01..38da38a 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -4,7 +4,7 @@ from sqlalchemy.orm import sessionmaker, make_transient from sqlalchemy.engine.base import Engine from collections import defaultdict -from cisticola.base import ScraperResult, TransformedResult, Media, Channel, mapper_registry +from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry class Transformer: @@ -32,7 +32,7 @@ class Transformer: pass - def transform(data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]: + def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: """Transform a ScraperResult into objects with additional parameters for analysis. This function can yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel objects and provide all of these to be inserted into the database. @@ -92,8 +92,8 @@ class ETLController: if type(obj) == Channel: instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first() - elif type(obj) == TransformedResult: - instance = session.query(TransformedResult).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() + elif type(obj) == Post: + instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() elif issubclass(type(obj), Media): instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first() @@ -132,7 +132,7 @@ class ETLController: @logger.catch(reraise=True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): - """Transforms raw ScraperResults objects into TransformedResult objects and + """Transforms raw ScraperResults objects into Post objects and Media objects. Then, adds them to the database. Parameters @@ -165,7 +165,7 @@ class ETLController: @logger.catch(reraise=True) def transform_all_untransformed(self, hydrate: bool = True): """Transform all ScraperResult objects in the database that do not have an - equivalent TransformedResult object stored. + equivalent Post object stored. Parameters ---------- @@ -180,8 +180,8 @@ class ETLController: session = self.session() untransformed = ( session.query(ScraperResult) - .join(TransformedResult, isouter=True) - .where(TransformedResult.raw_id == None) + .join(Post, isouter=True) + .where(Post.raw_id == None) .all() ) logger.info(f"Found {len(untransformed)} items to ETL") diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index de6f0a7..d0c5fe0 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -5,7 +5,7 @@ from typing import Generator from bs4 import BeautifulSoup from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media +from cisticola.base import ScraperResult, Post, Image, Video, Media class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" @@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer): return False - def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: raw = json.loads(data.raw_data) orig = raw['video_url'] @@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer): yield m - def transform(self, data: ScraperResult) -> TransformedResult: + def transform(self, data: ScraperResult) -> Post: raw = json.loads(data.raw_data) soup = BeautifulSoup(raw['body'], features = 'html.parser') content = soup.find_all('p')[-1].text - transformed = TransformedResult( + transformed = Post( raw_id=data.id, scraper=data.scraper, transformer=self.__version__, diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index b51afb2..85ada05 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -4,7 +4,7 @@ from typing import Generator, Union, Callable import dateutil.parser from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media, Channel +from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" @@ -46,10 +46,10 @@ class TwitterTransformer(Transformer): yield m - def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) - transformed = TransformedResult( + transformed = Post( raw_id=data.id, platform_id=raw['id'], scraper=data.scraper, @@ -76,7 +76,7 @@ class TwitterTransformer(Transformer): channel = insert(channel) - original = TransformedResult( + original = Post( raw_id=data.id, platform_id=tweet['id'], scraper=data.scraper, diff --git a/test.py b/test.py index 454ad43..131fd71 100644 --- a/test.py +++ b/test.py @@ -4,7 +4,7 @@ import gspread from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from cisticola.base import Channel, TransformedResult, ScraperResult, mapper_registry +from cisticola.base import Channel, Post, ScraperResult, mapper_registry from cisticola.scraper import ( ScraperController, BitchuteScraper, diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index 50f51a6..fd95bbe 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -4,7 +4,7 @@ import json from cisticola.base import Channel from cisticola.scraper import TwitterScraper from cisticola.transformer import TwitterTransformer -from cisticola.base import TransformedResult, Media +from cisticola.base import Post, Media def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): controller.reset_db() @@ -20,7 +20,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): sessionfactory.configure(bind=engine) session = sessionfactory() - posts = session.query(TransformedResult).all() + posts = session.query(Post).all() media = session.query(Media).all() assert len(posts) == 10