diff --git a/cisticola/base.py b/cisticola/base.py
index 7d03abe..9aaf5a6 100644
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from datetime import datetime
 from sqlalchemy.orm import registry
-from sqlalchemy import Table, Column, Integer, String, DateTime
+from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
 
 mapper_registry = registry()
 
@@ -11,7 +11,7 @@ class ScraperResult:
     """A minimally processed result from a scraper"""
     scraper: str
     platform: str
-    channel: str
+    channel: int
     platform_id: str
     date: datetime
     raw_data: str
@@ -19,10 +19,10 @@ class ScraperResult:
 
 
 raw_data_table = Table('raw_data', mapper_registry.metadata,
-                       Column('id', Integer, primary_key=True),
+                       Column('id', Integer, primary_key=True, autoincrement=True),
                        Column('scraper', String),
                        Column('platform', String),
-                       Column('channel', String),
+                       Column('channel', Integer),
                        Column('platform_id', String),
                        Column('date', DateTime),
                        Column('raw_data', String),
@@ -45,3 +45,37 @@ class Channel:
     public: bool
     chat: bool
     notes: str
+
+
+@dataclass
+class TransformedResult:
+    """An object with one field per column of the analysis table (``id`` excepted)"""
+    raw_id: int
+    scraper: str
+    transformer: str
+    platform: str
+    channel: int
+    date: datetime
+    date_archived: datetime
+    url: str
+    content: str
+    author_id: str
+    author_username: str
+
+
+analysis_table = Table('analysis', mapper_registry.metadata,
+                       Column('id', Integer, primary_key=True, autoincrement=True),
+                       Column('raw_id', Integer, ForeignKey('raw_data.id')),
+                       Column('scraper', String),
+                       Column('transformer', String),
+                       Column('platform', String),
+                       Column('channel', Integer),
+                       Column('date', DateTime),
+                       Column('date_archived', DateTime),
+                       Column('url', String),
+                       Column('content', String),
+                       Column('author_id', String),
+                       Column('author_username', String)
+                       )
+
+mapper_registry.map_imperatively(TransformedResult, analysis_table)
diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py
index fcda068..1512950 100644
--- a/cisticola/transformer/__init__.py
+++ b/cisticola/transformer/__init__.py
@@ -1,34 +1,16 @@
-@dataclass
-class TransformedResult:
-    """An object with fields for columns in the analysis table"""
-    scraper: str
-    transformer: str
-    platform: str
-    channel: str
-    date: datetime
-    date_archived: datetime
-    url: str
-    content: str
-    author_id: str
-    author_username: str
+import cisticola.base
 
+class Transformer:
+    """Interface class for transformers; subclasses override the stub methods below"""
 
-class TwitterResult(ScraperResult):
-    """A Twitter specific ScraperResult, with a method ETL/transforming"""
+    __version__ = "Transformer 0.0.0"
 
-    def transform(self) -> TransformedResult:
-        data = json.loads(self.raw_data)
+    def __init__(self):
+        pass
 
-        transformed = TransformedResult(
-            scraper=self.scraper,
-            transformer=self.__version__,
-            platform=self.platform,
-            channel=self.channel,
-            date=self.date,
-            date_archived=self.date_archived,
-            url=data['url'],
-            content=data['content'],
-            author_id=data['user']['id'],
-            author_username=data['user']['username'])
+    def can_handle(self, data: cisticola.base.ScraperResult) -> bool:
+        """Report whether this transformer can process ``data`` (base stub: returns None)"""
+
+    def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
+        """Convert ``data`` into a TransformedResult (base stub: returns None)"""
 
-        return transformed
diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py
new file mode 100644
index 0000000..c79e01f
--- /dev/null
+++ b/cisticola/transformer/twitter.py
@@ -0,0 +1,27 @@
+import cisticola.transformer
+import cisticola.base
+import json
+
+
+class TwitterTransformer(cisticola.transformer.Transformer):
+    """A Twitter specific Transformer that builds TransformedResult objects from ScraperResults"""
+
+    __version__ = "TwitterTransformer 0.0.1"
+
+    def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
+        raw = json.loads(data.raw_data)
+
+        transformed = cisticola.base.TransformedResult(
+            raw_id=data.id,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            date=data.date,
+            date_archived=data.date_archived,
+            url=raw['url'],
+            content=raw['content'],
+            author_id=raw['user']['id'],
+            author_username=raw['user']['username'])
+
+        return transformed
diff --git a/test.py b/test.py
index 678bcdb..a31a081 100644
--- a/test.py
+++ b/test.py
@@ -23,7 +23,8 @@ controller = cisticola.ScraperController()
 scraper = cisticola.scraper.twitter.TwitterScraper()
 controller.register_scraper(scraper)
 
-engine = create_engine('sqlite:///test2.db')
+engine = create_engine('sqlite:///test.db')
 controller.connect_to_db(engine)
 
 controller.scrape_channels(test_channels)
+