mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Reorganize transformer definition location
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, DateTime
|
||||
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@@ -11,7 +11,7 @@ class ScraperResult:
|
||||
"""A minimally processed result from a scraper"""
|
||||
scraper: str
|
||||
platform: str
|
||||
channel: str
|
||||
channel: int
|
||||
platform_id: str
|
||||
date: datetime
|
||||
raw_data: str
|
||||
@@ -19,10 +19,10 @@ class ScraperResult:
|
||||
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True),
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', String),
|
||||
Column('channel', Integer),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
@@ -45,3 +45,37 @@ class Channel:
|
||||
public: bool
|
||||
chat: bool
|
||||
notes: str
|
||||
|
||||
|
||||
@dataclass
class TransformedResult:
    """A transformed record; each field matches a column of the ``analysis`` table.

    The table's autoincrementing ``id`` column has no field here.
    """

    # --- provenance -------------------------------------------------------
    raw_id: int  # ``raw_data.id`` of the scraped row this record was built from
    scraper: str
    transformer: str  # TwitterTransformer stores its ``__version__`` string here

    # --- origin of the post -----------------------------------------------
    platform: str
    # NOTE(review): ScraperResult.channel is annotated ``int`` while this field
    # (and the ``analysis.channel`` column) is a string -- confirm which is intended.
    channel: str
    date: datetime
    date_archived: datetime

    # --- extracted content ------------------------------------------------
    url: str
    content: str
    author_id: str
    author_username: str
|
||||
|
||||
|
||||
# Schema of the ``analysis`` table. Column order: the autoincrementing primary
# key, the foreign key back to ``raw_data.id``, then one column per remaining
# TransformedResult field in declaration order.
analysis_table = Table(
    'analysis',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    *(Column(name, String)
      for name in ('scraper', 'transformer', 'platform', 'channel')),
    *(Column(name, DateTime)
      for name in ('date', 'date_archived')),
    *(Column(name, String)
      for name in ('url', 'content', 'author_id', 'author_username')),
)

# Map the plain dataclass onto the table imperatively.
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
|
||||
@@ -1,34 +1,16 @@
|
||||
@dataclass
|
||||
class TransformedResult:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
scraper: str
|
||||
transformer: str
|
||||
platform: str
|
||||
channel: str
|
||||
date: datetime
|
||||
date_archived: datetime
|
||||
url: str
|
||||
content: str
|
||||
author_id: str
|
||||
author_username: str
|
||||
import cisticola.base
|
||||
|
||||
class Transformer:
|
||||
"""Interface class for transformers"""
|
||||
|
||||
class TwitterResult(ScraperResult):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
__version__ = "Transformer 0.0.0"
|
||||
|
||||
def transform(self) -> TransformedResult:
|
||||
data = json.loads(self.raw_data)
|
||||
def __init__(self) -> None:
    """Initialize the transformer; this base initializer does nothing."""
|
||||
|
||||
transformed = TransformedResult(
|
||||
scraper=self.scraper,
|
||||
transformer=self.__version__,
|
||||
platform=self.platform,
|
||||
channel=self.channel,
|
||||
date=self.date,
|
||||
date_archived=self.date_archived,
|
||||
url=data['url'],
|
||||
content=data['content'],
|
||||
author_id=data['user']['id'],
|
||||
author_username=data['user']['username'])
|
||||
def can_handle(self, data: cisticola.base.ScraperResult) -> bool:
    """Report whether this transformer can process ``data``.

    Interface stub: this base implementation does nothing and returns
    ``None``.

    Takes ``self`` so it can be called on transformer instances, in line
    with ``TwitterTransformer.transform(self, data)``.

    Args:
        data: The scraper result to inspect.
    """
|
||||
|
||||
def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
    """Convert ``data`` into a ``TransformedResult``.

    Interface stub: this base implementation does nothing and returns
    ``None``.

    Takes ``self`` so the signature matches the instance-method override
    ``TwitterTransformer.transform(self, data)``.

    Args:
        data: The scraper result to transform.
    """
|
||||
|
||||
return transformed
|
||||
|
||||
27
cisticola/transformer/twitter.py
Normal file
27
cisticola/transformer/twitter.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import cisticola.transformer
|
||||
import cisticola.base
|
||||
import json
|
||||
|
||||
|
||||
class TwitterTransformer(cisticola.transformer.Transformer):
    """Transformer that turns a Twitter ``ScraperResult`` into a ``TransformedResult``."""

    __version__ = "TwitterTransformer 0.0.1"

    def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
        """Build a ``TransformedResult`` from one scraped Twitter record.

        ``data.raw_data`` is parsed as JSON. ``url``, ``content`` and the
        ``id`` / ``username`` entries of its ``user`` object are read from
        the parsed payload; the remaining fields are copied from ``data``,
        and ``transformer`` is set to this class's ``__version__``.

        Args:
            data: The scraper result to convert.

        Returns:
            The populated ``TransformedResult``.
        """
        tweet = json.loads(data.raw_data)
        result_cls = cisticola.base.TransformedResult

        # Values are looked up in the same order as the result's fields.
        values = {
            'raw_id': data.id,
            'scraper': data.scraper,
            'transformer': self.__version__,
            'platform': data.platform,
            'channel': data.channel,
            'date': data.date,
            'date_archived': data.date_archived,
            'url': tweet['url'],
            'content': tweet['content'],
            'author_id': tweet['user']['id'],
            'author_username': tweet['user']['username'],
        }
        return result_cls(**values)
|
||||
3
test.py
3
test.py
@@ -23,7 +23,8 @@ controller = cisticola.ScraperController()
|
||||
scraper = cisticola.scraper.twitter.TwitterScraper()
|
||||
controller.register_scraper(scraper)
|
||||
|
||||
engine = create_engine('sqlite:///test2.db')
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user