From 3aec25f74cb004a8c49d04a321a27908eb46a773 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 7 Aug 2023 10:08:13 +0200 Subject: [PATCH 1/2] Simplify transform method signature --- cisticola/transformer/base.py | 5 +---- cisticola/transformer/bitchute.py | 5 ++--- cisticola/transformer/gettr.py | 5 ++--- cisticola/transformer/rumble.py | 5 ++--- cisticola/transformer/telegram_telethon.py | 9 ++++----- 5 files changed, 11 insertions(+), 18 deletions(-) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 937f533..fb57110 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -205,7 +205,6 @@ class ETLController: def insert_or_select(self, obj, session, hydrate: bool = True): """Insert an object into the database or return an existing object from the database. - Regardless, the resulting object has an `id` attribute that can be referenced later. Parameters ---------- @@ -253,6 +252,7 @@ class ETLController: ) elif type(obj) == Post: + # attempt to add to current batch return self.insert_post(obj, session, hydrate) # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() @@ -351,9 +351,6 @@ class ETLController: result, lambda obj: self.insert_or_select(obj, session, hydrate), session, - lambda obj: self.insert_post( - obj, session, hydrate, flush=False - ), lambda: self.flush_posts(session), ) diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index fba56b4..efe76cb 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -82,7 +82,7 @@ class BitchuteTransformer(Transformer): transformed = insert(transformed) def transform( - self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts + self, data: ScraperResult, insert: Callable, session, flush_posts ) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) @@ -146,8 +146,7 @@ class BitchuteTransformer(Transformer): video_duration=_parse_duration_str(raw["length"]), ) - # insert_post - transformed = insert_post(transformed) + transformed = insert(transformed) def parse_created(created: str, date_archived: datetime) -> datetime: diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index e1c89cb..1dceb20 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -100,7 +100,7 @@ class GettrTransformer(Transformer): return channel.id def transform( - self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts + self, data: ScraperResult, insert: Callable, session, flush_posts ) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) @@ -147,8 +147,7 @@ class GettrTransformer(Transformer): views=raw.get("vfpst"), ) - # insert_post - insert_post(transformed) + insert(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index cf4b428..687208e 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -78,7 +78,7 @@ class RumbleTransformer(Transformer): transformed = insert(transformed) def transform( - self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts + self, data: ScraperResult, insert: Callable, session, flush_posts ) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) @@ -102,8 +102,7 @@ class RumbleTransformer(Transformer): video_duration=_parse_duration_str(raw["duration"]), ) - # insert_post - insert_post(transformed) + insert(transformed) def _process_number(s): diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index 2a6c194..08ad17e 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -206,9 +206,8 @@ class TelegramTelethonTransformer(Transformer): insert(new_chat) - # TODO this method API is chaotic and could be cleaned up def transform( - self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts + self, data: ScraperResult, insert: Callable, session, flush_posts ) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) @@ -283,7 +282,8 @@ class TelegramTelethonTransformer(Transformer): # use cache to find post ID instead of a DB request, if possible if (data.channel, reply_to_id) not in self.posts_cache: session.commit() - flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session + # this is necessary because the post we are looking for could be batched but not yet committed to the DB + flush_posts() post = ( session.query(Post) .filter_by(channel=data.channel, platform_id=reply_to_id) @@ -385,8 +385,7 @@ class TelegramTelethonTransformer(Transformer): views=raw.get("views"), ) - # insert_post - insert_post(transformed) + insert(transformed) def stripped(s): From 1e2b62be57ba7448f6d32eb3b1094af8799418d1 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 7 Aug 2023 11:03:04 +0200 Subject: [PATCH 2/2] Add link to documentation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 66cfb8a..56ff450 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database. -For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation. +For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html). ![Cisticola, the bird](docs/images/cisticola.jpeg)