Merge recent changes from main

2026-06-07 19:08:35 +03:00 · 2023-08-07 20:42:02 -05:00
parent 8a10451a72 1e2b62be57
commit d27ea4d3e5
7 changed files with 14 additions and 26 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3.9
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
-        args: ["--profile", "black"]
+        args: ["--profile", "black"]
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3.9
--- a/README.md
+++ b/README.md
@@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz
 It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
-For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation. 
+For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html). 
 ![Cisticola, the bird](docs/images/cisticola.jpeg)
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -50,7 +50,6 @@ class Transformer:
        data: ScraperResult,
        insert: Callable,
        session: Session,
        insert_post: Callable,
        flush_posts: Callable,
    ):
        """Transform a ScraperResult into objects with additional parameters for analysis. This function can
@@ -209,7 +208,6 @@ class ETLController:
    def insert_or_select(self, obj, session, hydrate: bool = True):
        """Insert an object into the database or return an existing object from the database.
        Regardless, the resulting object has an `id` attribute that can be referenced later.
        Parameters
        ----------
@@ -257,6 +255,7 @@ class ETLController:
            )
        elif type(obj) == Post:
            # attempt to add to current batch
            return self.insert_post(obj, session, hydrate)
            # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
@@ -355,9 +354,6 @@ class ETLController:
                            result,
                            lambda obj: self.insert_or_select(obj, session, hydrate),
                            session,
                            lambda obj: self.insert_post(
                                obj, session, hydrate, flush=False
                            ),
                            lambda: self.flush_posts(session),
                        )
--- a/cisticola/transformer/bitchute.py
+++ b/cisticola/transformer/bitchute.py
@@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer):
        data: ScraperResult,
        insert: Callable,
        session: Session,
        insert_post: Callable,
        flush_posts: Callable,
    ):
        raw = json.loads(data.raw_data)
@@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer):
            video_duration=_parse_duration_str(raw["length"]),
        )
-        # insert_post
+        transformed = insert(transformed)
        transformed = insert_post(transformed)
 def parse_created(created: str, date_archived: datetime) -> datetime:
--- a/cisticola/transformer/gettr.py
+++ b/cisticola/transformer/gettr.py
@@ -95,7 +95,6 @@ class GettrTransformer(Transformer):
        data: ScraperResult,
        insert: Callable,
        session: Session,
        insert_post: Callable,
        flush_posts: Callable,
    ):
        raw = json.loads(data.raw_data)
@@ -143,8 +142,7 @@ class GettrTransformer(Transformer):
            views=raw.get("vfpst"),
        )
-        # insert_post
+        insert(transformed)
        insert_post(transformed)
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
--- a/cisticola/transformer/rumble.py
+++ b/cisticola/transformer/rumble.py
@@ -74,7 +74,6 @@ class RumbleTransformer(Transformer):
        data: ScraperResult,
        insert: Callable,
        session: Session,
        insert_post: Callable,
        flush_posts: Callable,
    ):
        raw = json.loads(data.raw_data)
@@ -99,8 +98,7 @@ class RumbleTransformer(Transformer):
            video_duration=_parse_duration_str(raw["duration"]),
        )
-        # insert_post
+        insert(transformed)
        insert_post(transformed)
 def _process_number(s: str) -> int:
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer):
                insert(new_chat)
    # TODO this method API is chaotic and could be cleaned up
    def transform(
        self,
        data: ScraperResult,
        insert: Callable,
        session: Session,
        insert_post: Callable,
        flush_posts: Callable,
    ):
        raw = json.loads(data.raw_data)
@@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer):
            # use cache to find post ID instead of a DB request, if possible
            if (data.channel, reply_to_id) not in self.posts_cache:
                session.commit()
-                flush_posts()  # TODO this is necessary because the post we are looking for might have been added in the same session
+                # flush_posts() is necessary here because the post we are looking for could still be batched and not yet committed to the DB
                flush_posts()
                post = (
                    session.query(Post)
                    .filter_by(channel=data.channel, platform_id=reply_to_id)
@@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer):
            views=raw.get("views"),
        )
-        # insert_post
+        insert(transformed)
        insert_post(transformed)
 def stripped(s):