Add Transformer and ETLController docstrings

This commit is contained in:
Logan Williams
2022-03-14 14:02:57 +01:00
parent fd4b617743
commit fe0d762df0

View File

@@ -1,11 +1,12 @@
from typing import List, Generator
from loguru import logger
from sqlalchemy.orm import sessionmaker
from sqlalchemy.engine.base import Engine
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
class Transformer:
"""Interface class for transformers"""
"""Interface class for transformers."""
__version__ = "Transformer 0.0.0"
@@ -13,25 +14,87 @@ class Transformer:
pass
def can_handle(self, data: ScraperResult) -> bool:
    """Report whether this Transformer is able to process a piece of scraped data.

    This base implementation is a placeholder: its body is only ``pass``, so it
    returns None. Concrete transformers are presumably expected to override
    it -- confirm against the subclasses.

    Parameters
    ----------
    data : ScraperResult
        The ScraperResult object to check for ability to handle.

    Returns
    -------
    bool
        True if it can be handled by this Transformer, False otherwise.
    """
    # `self` is required so the method can be invoked on a Transformer
    # instance, in line with `transform_media`.
    pass
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Produce a Media object for every piece of media found in a raw ScraperResult.

    The base implementation has no body of its own and therefore returns None
    rather than a generator.

    Parameters
    ----------
    data : ScraperResult
        The raw ScraperResult object to process.
    transformed : TransformedResult
        The TransformedResult counterpart of `data` (e.g. as generated by
        `Transformer.transform()`).

    Yields
    ------
    Media
        A media object generated from the ScraperResult. A single
        ScraperResult can contain several pieces of media, so an arbitrary
        number of Media objects (or their subclasses) can be generated.
        These Media objects are not fully hydrated.
    """
def transform(self, data: ScraperResult) -> TransformedResult:
    """Transform a ScraperResult into a TransformedResult object.

    The transformation extracts additional attributes that can be used
    directly for analysis. This base implementation is a placeholder: its body
    is only ``pass``, so it returns None.

    Parameters
    ----------
    data : ScraperResult
        The ScraperResult object to process.

    Returns
    -------
    TransformedResult
        A TransformedResult representation of the `data` object.
    """
    # `self` is required so the method can be invoked on a Transformer
    # instance, in line with `transform_media`.
    pass
class ETLController:
"""This class will transform the raw_data tables into a format more conducive to analysis."""
"""An ETLController will transform raw scraped data (ScraperResult objects) into a more detailed format
for analysis by using Transformer objects that have been registered with the controller.
"""
def __init__(self):
self.transformers = []
def register_transformer(self, transformer: Transformer):
"""Adds a Transformer to the list of available Transformers.
Parameters
----------
transformer : Transformer
The Transformer to register
"""
self.transformers.append(transformer)
def connect_to_db(self, engine):
def connect_to_db(self, engine: Engine):
"""Connects the ETLController to a SQLAlchemy engine.
Parameters
----------
engine : Engine
SQLAlchemy Engine object
"""
# create tables
mapper_registry.metadata.create_all(bind=engine)
@@ -40,6 +103,16 @@ class ETLController:
@logger.catch
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
"""Transforms raw ScraperResult objects into TransformedResult objects and
Media objects. Then, adds them to the database.
Parameters
----------
results : List[ScraperResult]
A list of ScraperResult objects to be transformed
hydrate : bool
Whether or not to fully hydrate transformed media. Default True.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -78,6 +151,15 @@ class ETLController:
@logger.catch
def transform_all_untransformed(self, hydrate: bool = True):
"""Transform all ScraperResult objects in the database that do not have an
equivalent TransformedResult object stored.
Parameters
----------
hydrate : bool
Whether or not to fully hydrate transformed media. Default True.
"""
if self.session is None:
logger.error("No DB session")
return