Rename TransformedResult to the clearer Post

This commit is contained in:
Logan Williams
2022-03-22 11:41:55 +01:00
parent c0a094eefa
commit fa516da763
6 changed files with 24 additions and 24 deletions

View File

@@ -80,7 +80,7 @@ channel_table = Table('channels', mapper_registry.metadata,
mapper_registry.map_imperatively(Channel, channel_table)
@dataclass
class TransformedResult:
class Post:
"""An object with fields for columns in the posts table"""
raw_id: int
platform_id: str
@@ -102,7 +102,7 @@ class TransformedResult:
analysis_table = Table('analysis', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
@@ -118,10 +118,10 @@ analysis_table = Table('analysis', mapper_registry.metadata,
Column('author_username', String),
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id')),
Column('reply_to', Integer, ForeignKey('analysis.id'))
Column('reply_to', Integer, ForeignKey('posts.id'))
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(Post, post_table)
@dataclass
class Media:
@@ -177,7 +177,7 @@ media_table = Table('media', mapper_registry.metadata,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('post', Integer, ForeignKey('analysis.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
Column('exif', String),

View File

@@ -4,7 +4,7 @@ from sqlalchemy.orm import sessionmaker, make_transient
from sqlalchemy.engine.base import Engine
from collections import defaultdict
from cisticola.base import ScraperResult, TransformedResult, Media, Channel, mapper_registry
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
class Transformer:
@@ -32,7 +32,7 @@ class Transformer:
pass
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]:
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
objects and provide all of these to be inserted into the database.
@@ -92,8 +92,8 @@ class ETLController:
if type(obj) == Channel:
instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
elif type(obj) == TransformedResult:
instance = session.query(TransformedResult).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
elif type(obj) == Post:
instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
elif issubclass(type(obj), Media):
instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first()
@@ -132,7 +132,7 @@ class ETLController:
@logger.catch(reraise=True)
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
"""Transforms raw ScraperResults objects into TransformedResult objects and
"""Transforms raw ScraperResult objects into Post objects and
Media objects. Then, adds them to the database.
Parameters
@@ -165,7 +165,7 @@ class ETLController:
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):
"""Transform all ScraperResult objects in the database that do not have an
equivalent TransformedResult object stored.
equivalent Post object stored.
Parameters
----------
@@ -180,8 +180,8 @@ class ETLController:
session = self.session()
untransformed = (
session.query(ScraperResult)
.join(TransformedResult, isouter=True)
.where(TransformedResult.raw_id == None)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.all()
)
logger.info(f"Found {len(untransformed)} items to ETL")

View File

@@ -5,7 +5,7 @@ from typing import Generator
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
from cisticola.base import ScraperResult, Post, Image, Video, Media
class BitchuteTransformer(Transformer):
"""A Bitchute-specific Transformer, with a method for ETL/transforming ScraperResult objects"""
@@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
@@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer):
yield m
def transform(self, data: ScraperResult) -> TransformedResult:
def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text
transformed = TransformedResult(
transformed = Post(
raw_id=data.id,
scraper=data.scraper,
transformer=self.__version__,

View File

@@ -4,7 +4,7 @@ from typing import Generator, Union, Callable
import dateutil.parser
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media, Channel
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
class TwitterTransformer(Transformer):
"""A Twitter-specific Transformer, with a method for ETL/transforming ScraperResult objects"""
@@ -46,10 +46,10 @@ class TwitterTransformer(Transformer):
yield m
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]:
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = TransformedResult(
transformed = Post(
raw_id=data.id,
platform_id=raw['id'],
scraper=data.scraper,
@@ -76,7 +76,7 @@ class TwitterTransformer(Transformer):
channel = insert(channel)
original = TransformedResult(
original = Post(
raw_id=data.id,
platform_id=tweet['id'],
scraper=data.scraper,

View File

@@ -4,7 +4,7 @@ import gspread
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, TransformedResult, ScraperResult, mapper_registry
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
from cisticola.scraper import (
ScraperController,
BitchuteScraper,

View File

@@ -4,7 +4,7 @@ import json
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
from cisticola.transformer import TwitterTransformer
from cisticola.base import TransformedResult, Media
from cisticola.base import Post, Media
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
@@ -20,7 +20,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(TransformedResult).all()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 10