diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f080d4..f875af3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3.9 - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: - id: isort name: isort (python) - args: ["--profile", "black"] - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black - language_version: python3.9 \ No newline at end of file + args: ["--profile", "black"] \ No newline at end of file diff --git a/README.md b/README.md index 66cfb8a..56ff450 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database. -For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation. +For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html). ![Cisticola, the bird](docs/images/cisticola.jpeg) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index e1edc02..b58189b 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -50,7 +50,6 @@ class Transformer: data: ScraperResult, insert: Callable, session: Session, - insert_post: Callable, flush_posts: Callable, ): """Transform a ScraperResult into objects with additional parameters for analysis. This function can @@ -209,7 +208,6 @@ class ETLController: def insert_or_select(self, obj, session, hydrate: bool = True): """Insert an object into the database or return an existing object from the database. - Regardless, the resulting object has an `id` attribute that can be referenced later. Parameters ---------- @@ -257,6 +255,7 @@ class ETLController: ) elif type(obj) == Post: + # attempt to add to current batch return self.insert_post(obj, session, hydrate) # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() @@ -355,9 +354,6 @@ class ETLController: result, lambda obj: self.insert_or_select(obj, session, hydrate), session, - lambda obj: self.insert_post( - obj, session, hydrate, flush=False - ), lambda: self.flush_posts(session), ) diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index 889e7cb..11e3cda 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer): data: ScraperResult, insert: Callable, session: Session, - insert_post: Callable, flush_posts: Callable, ): raw = json.loads(data.raw_data) @@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer): video_duration=_parse_duration_str(raw["length"]), ) - # insert_post - transformed = insert_post(transformed) + transformed = insert(transformed) def parse_created(created: str, date_archived: datetime) -> datetime: diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index dc2fb1c..7e78124 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -95,7 +95,6 @@ class GettrTransformer(Transformer): data: ScraperResult, insert: Callable, session: Session, - insert_post: Callable, flush_posts: Callable, ): raw = json.loads(data.raw_data) @@ -143,8 +142,7 @@ class GettrTransformer(Transformer): views=raw.get("vfpst"), ) - # insert_post - insert_post(transformed) + insert(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index 4847f48..05dea73 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -74,7 +74,6 @@ class RumbleTransformer(Transformer): data: ScraperResult, insert: Callable, session: Session, - insert_post: Callable, flush_posts: Callable, ): raw = json.loads(data.raw_data) @@ -99,8 +98,7 @@ class RumbleTransformer(Transformer): video_duration=_parse_duration_str(raw["duration"]), ) - # insert_post - insert_post(transformed) + insert(transformed) def _process_number(s: str) -> int: diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index 8585cc3..c6ffd3e 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer): insert(new_chat) - # TODO this method API is chaotic and could be cleaned up def transform( self, data: ScraperResult, insert: Callable, session: Session, - insert_post: Callable, flush_posts: Callable, ): raw = json.loads(data.raw_data) @@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer): # use cache to find post ID instead of a DB request, if possible if (data.channel, reply_to_id) not in self.posts_cache: session.commit() - flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session + # this is necessary because the post we are looking for could be batched but not yet committed to the DB + flush_posts() post = ( session.query(Post) .filter_by(channel=data.channel, platform_id=reply_to_id) @@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer): views=raw.get("views"), ) - # insert_post - insert_post(transformed) + insert(transformed) def stripped(s):