merged recent changes in main

This commit is contained in:
Tristan Lee
2023-08-07 20:42:02 -05:00
7 changed files with 14 additions and 26 deletions

View File

@@ -1,12 +1,12 @@
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
name: isort (python)
args: ["--profile", "black"]
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9
args: ["--profile", "black"]

View File

@@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html).
![Cisticola, the bird](docs/images/cisticola.jpeg)

View File

@@ -50,7 +50,6 @@ class Transformer:
data: ScraperResult,
insert: Callable,
session: Session,
insert_post: Callable,
flush_posts: Callable,
):
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
@@ -209,7 +208,6 @@ class ETLController:
def insert_or_select(self, obj, session, hydrate: bool = True):
"""Insert an object into the database or return an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later.
Parameters
----------
@@ -257,6 +255,7 @@ class ETLController:
)
elif type(obj) == Post:
# attempt to add to current batch
return self.insert_post(obj, session, hydrate)
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
@@ -355,9 +354,6 @@ class ETLController:
result,
lambda obj: self.insert_or_select(obj, session, hydrate),
session,
lambda obj: self.insert_post(
obj, session, hydrate, flush=False
),
lambda: self.flush_posts(session),
)

View File

@@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer):
data: ScraperResult,
insert: Callable,
session: Session,
insert_post: Callable,
flush_posts: Callable,
):
raw = json.loads(data.raw_data)
@@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer):
video_duration=_parse_duration_str(raw["length"]),
)
# insert the transformed post
transformed = insert_post(transformed)
transformed = insert(transformed)
def parse_created(created: str, date_archived: datetime) -> datetime:

View File

@@ -95,7 +95,6 @@ class GettrTransformer(Transformer):
data: ScraperResult,
insert: Callable,
session: Session,
insert_post: Callable,
flush_posts: Callable,
):
raw = json.loads(data.raw_data)
@@ -143,8 +142,7 @@ class GettrTransformer(Transformer):
views=raw.get("vfpst"),
)
# insert the transformed post
insert_post(transformed)
insert(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:

View File

@@ -74,7 +74,6 @@ class RumbleTransformer(Transformer):
data: ScraperResult,
insert: Callable,
session: Session,
insert_post: Callable,
flush_posts: Callable,
):
raw = json.loads(data.raw_data)
@@ -99,8 +98,7 @@ class RumbleTransformer(Transformer):
video_duration=_parse_duration_str(raw["duration"]),
)
# insert the transformed post
insert_post(transformed)
insert(transformed)
def _process_number(s: str) -> int:

View File

@@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer):
insert(new_chat)
# TODO this method API is chaotic and could be cleaned up
def transform(
self,
data: ScraperResult,
insert: Callable,
session: Session,
insert_post: Callable,
flush_posts: Callable,
):
raw = json.loads(data.raw_data)
@@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer):
# use cache to find post ID instead of a DB request, if possible
if (data.channel, reply_to_id) not in self.posts_cache:
session.commit()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
# this is necessary because the post we are looking for could be batched but not yet committed to the DB
flush_posts()
post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id)
@@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer):
views=raw.get("views"),
)
# insert the transformed post
insert_post(transformed)
insert(transformed)
def stripped(s):