Reorganize transformer definition location

This commit is contained in:
Logan Williams
2022-02-18 14:57:10 +01:00
parent c5d49ef521
commit b824b98a95
4 changed files with 78 additions and 34 deletions

View File

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, DateTime
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
mapper_registry = registry()
@@ -11,7 +11,7 @@ class ScraperResult:
"""A minimally processed result from a scraper"""
scraper: str
platform: str
channel: str
channel: int
platform_id: str
date: datetime
raw_data: str
@@ -19,10 +19,10 @@ class ScraperResult:
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('id', Integer, primary_key=True, autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', String),
Column('channel', Integer),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
@@ -45,3 +45,37 @@ class Channel:
public: bool
chat: bool
notes: str
@dataclass
class TransformedResult:
    """Record holding one value per column of the analysis table.

    The field order below is part of the interface: the generated
    ``__init__`` accepts the values positionally in this order.
    """

    # Provenance of the record.
    raw_id: int  # stored in the analysis column that references raw_data.id
    scraper: str
    transformer: str
    platform: str
    # NOTE(review): ScraperResult.channel is annotated as int while this
    # field is str -- confirm which type the analysis table should carry.
    channel: str

    # Timestamps.
    date: datetime
    date_archived: datetime

    # Content and authorship.
    url: str
    content: str
    author_id: str
    author_username: str
# Table definition for 'analysis'; its columns mirror the fields of
# TransformedResult, plus a surrogate primary key.
analysis_table = Table(
    'analysis',
    mapper_registry.metadata,
    # Surrogate key, then a reference to the raw_data row.
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    # Provenance columns.
    Column('scraper', String),
    Column('transformer', String),
    Column('platform', String),
    Column('channel', String),
    # Timestamps.
    Column('date', DateTime),
    Column('date_archived', DateTime),
    # Content and authorship.
    Column('url', String),
    Column('content', String),
    Column('author_id', String),
    Column('author_username', String),
)

# Imperative mapping: bind the plain dataclass to the table defined above.
mapper_registry.map_imperatively(TransformedResult, analysis_table)

View File

@@ -1,34 +1,16 @@
@dataclass
class TransformedResult:
    """Record holding one value per column of the analysis table.

    The field order below is part of the interface: the generated
    ``__init__`` accepts the values positionally in this order.
    """

    # Provenance of the record.
    scraper: str
    transformer: str
    platform: str
    channel: str

    # Timestamps.
    date: datetime
    date_archived: datetime

    # Content and authorship.
    url: str
    content: str
    author_id: str
    author_username: str
import cisticola.base
class Transformer:
"""Interface class for transformers"""
class TwitterResult(ScraperResult):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
__version__ = "Transformer 0.0.0"
def transform(self) -> TransformedResult:
data = json.loads(self.raw_data)
def __init__(self):
pass
transformed = TransformedResult(
scraper=self.scraper,
transformer=self.__version__,
platform=self.platform,
channel=self.channel,
date=self.date,
date_archived=self.date_archived,
url=data['url'],
content=data['content'],
author_id=data['user']['id'],
author_username=data['user']['username'])
def can_handle(data: cisticola.base.ScraperResult) -> bool:
pass
def transform(data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
pass
return transformed

View File

@@ -0,0 +1,27 @@
import cisticola.transformer
import cisticola.base
import json
class TwitterTransformer(cisticola.transformer.Transformer):
    """Transformer that turns a Twitter ScraperResult into a TransformedResult."""

    # Written into the ``transformer`` field of every result this class emits.
    __version__ = "TwitterTransformer 0.0.1"

    def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
        """Build a TransformedResult from one scraped record.

        ``data.raw_data`` is decoded as JSON and must provide the keys
        ``url``, ``content`` and ``user`` (with ``id`` and ``username``);
        every other field is copied from ``data`` itself.

        Raises:
            json.JSONDecodeError: if ``data.raw_data`` is not valid JSON.
            KeyError: if one of the required keys is absent from the payload.
        """
        payload = json.loads(data.raw_data)

        # NOTE(review): `date_archived` and `id` are not among the
        # ScraperResult fields visible alongside this class -- confirm the
        # mapped object actually provides them.
        return cisticola.base.TransformedResult(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            url=payload['url'],
            content=payload['content'],
            author_id=payload['user']['id'],
            author_username=payload['user']['username'],
        )

View File

@@ -23,7 +23,8 @@ controller = cisticola.ScraperController()
scraper = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(scraper)
engine = create_engine('sqlite:///test2.db')
engine = create_engine('sqlite:///test.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)