Mirror of https://github.com/bellingcat/cisticola.git (synced 2026-06-08 03:18:34 +03:00)
Rename TransformedResult to the clearer Post
This commit is contained in:
@@ -80,7 +80,7 @@ channel_table = Table('channels', mapper_registry.metadata,
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
|
||||
@dataclass
|
||||
class TransformedResult:
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
raw_id: int
|
||||
platform_id: str
|
||||
@@ -102,7 +102,7 @@ class TransformedResult:
|
||||
|
||||
|
||||
|
||||
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
@@ -118,10 +118,10 @@ analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('author_username', String),
|
||||
Column('content', String),
|
||||
Column('forwarded_from', Integer, ForeignKey('channels.id')),
|
||||
Column('reply_to', Integer, ForeignKey('analysis.id'))
|
||||
Column('reply_to', Integer, ForeignKey('posts.id'))
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(Post, post_table)
|
||||
|
||||
@dataclass
|
||||
class Media:
|
||||
@@ -177,7 +177,7 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('post', Integer, ForeignKey('analysis.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
Column('exif', String),
|
||||
|
||||
@@ -4,7 +4,7 @@ from sqlalchemy.orm import sessionmaker, make_transient
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from collections import defaultdict
|
||||
|
||||
from cisticola.base import ScraperResult, TransformedResult, Media, Channel, mapper_registry
|
||||
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
|
||||
|
||||
|
||||
class Transformer:
|
||||
@@ -32,7 +32,7 @@ class Transformer:
|
||||
|
||||
pass
|
||||
|
||||
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]:
|
||||
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
||||
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
|
||||
objects and provide all of these to be inserted into the database.
|
||||
@@ -92,8 +92,8 @@ class ETLController:
|
||||
if type(obj) == Channel:
|
||||
instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
|
||||
|
||||
elif type(obj) == TransformedResult:
|
||||
instance = session.query(TransformedResult).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
||||
elif type(obj) == Post:
|
||||
instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
||||
|
||||
elif issubclass(type(obj), Media):
|
||||
instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first()
|
||||
@@ -132,7 +132,7 @@ class ETLController:
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
|
||||
"""Transforms raw ScraperResults objects into TransformedResult objects and
|
||||
"""Transforms raw ScraperResults objects into Post objects and
|
||||
Media objects. Then, adds them to the database.
|
||||
|
||||
Parameters
|
||||
@@ -165,7 +165,7 @@ class ETLController:
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
"""Transform all ScraperResult objects in the database that do not have an
|
||||
equivalent TransformedResult object stored.
|
||||
equivalent Post object stored.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -180,8 +180,8 @@ class ETLController:
|
||||
session = self.session()
|
||||
untransformed = (
|
||||
session.query(ScraperResult)
|
||||
.join(TransformedResult, isouter=True)
|
||||
.where(TransformedResult.raw_id == None)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.all()
|
||||
)
|
||||
logger.info(f"Found {len(untransformed)} items to ETL")
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Generator
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media
|
||||
|
||||
class BitchuteTransformer(Transformer):
|
||||
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -19,7 +19,7 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
@@ -29,13 +29,13 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
|
||||
transformed = TransformedResult(
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
|
||||
@@ -4,7 +4,7 @@ from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media, Channel
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -46,10 +46,10 @@ class TwitterTransformer(Transformer):
|
||||
yield m
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[TransformedResult, Channel, Media], None, None]:
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = TransformedResult(
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=raw['id'],
|
||||
scraper=data.scraper,
|
||||
@@ -76,7 +76,7 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
channel = insert(channel)
|
||||
|
||||
original = TransformedResult(
|
||||
original = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=tweet['id'],
|
||||
scraper=data.scraper,
|
||||
|
||||
2
test.py
@@ -4,7 +4,7 @@ import gspread
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, TransformedResult, ScraperResult, mapper_registry
|
||||
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
|
||||
@@ -4,7 +4,7 @@ import json
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
from cisticola.transformer import TwitterTransformer
|
||||
from cisticola.base import TransformedResult, Media
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
controller.reset_db()
|
||||
@@ -20,7 +20,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
sessionfactory.configure(bind=engine)
|
||||
session = sessionfactory()
|
||||
|
||||
posts = session.query(TransformedResult).all()
|
||||
posts = session.query(Post).all()
|
||||
media = session.query(Media).all()
|
||||
|
||||
assert len(posts) == 10
|
||||
|
||||
Reference in New Issue
Block a user