Merged recent changes from main

This commit is contained in:
Tristan Lee
2023-08-07 20:42:02 -05:00
7 changed files with 14 additions and 26 deletions

View File

@@ -1,12 +1,12 @@
repos: repos:
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9
- repo: https://github.com/pycqa/isort - repo: https://github.com/pycqa/isort
rev: 5.12.0 rev: 5.12.0
hooks: hooks:
- id: isort - id: isort
name: isort (python) name: isort (python)
args: ["--profile", "black"] args: ["--profile", "black"]
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9

View File

@@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database. It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation. For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html).
![Cisticola, the bird](docs/images/cisticola.jpeg) ![Cisticola, the bird](docs/images/cisticola.jpeg)

View File

@@ -50,7 +50,6 @@ class Transformer:
data: ScraperResult, data: ScraperResult,
insert: Callable, insert: Callable,
session: Session, session: Session,
insert_post: Callable,
flush_posts: Callable, flush_posts: Callable,
): ):
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can """Transform a ScraperResult into objects with additional parameters for analysis. This function can
@@ -209,7 +208,6 @@ class ETLController:
def insert_or_select(self, obj, session, hydrate: bool = True): def insert_or_select(self, obj, session, hydrate: bool = True):
"""Insert an object into the database or return an existing object from the database. """Insert an object into the database or return an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later.
Parameters Parameters
---------- ----------
@@ -257,6 +255,7 @@ class ETLController:
) )
elif type(obj) == Post: elif type(obj) == Post:
# attempt to add to current batch
return self.insert_post(obj, session, hydrate) return self.insert_post(obj, session, hydrate)
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
@@ -355,9 +354,6 @@ class ETLController:
result, result,
lambda obj: self.insert_or_select(obj, session, hydrate), lambda obj: self.insert_or_select(obj, session, hydrate),
session, session,
lambda obj: self.insert_post(
obj, session, hydrate, flush=False
),
lambda: self.flush_posts(session), lambda: self.flush_posts(session),
) )

View File

@@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer):
data: ScraperResult, data: ScraperResult,
insert: Callable, insert: Callable,
session: Session, session: Session,
insert_post: Callable,
flush_posts: Callable, flush_posts: Callable,
): ):
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
@@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer):
video_duration=_parse_duration_str(raw["length"]), video_duration=_parse_duration_str(raw["length"]),
) )
# insert_post transformed = insert(transformed)
transformed = insert_post(transformed)
def parse_created(created: str, date_archived: datetime) -> datetime: def parse_created(created: str, date_archived: datetime) -> datetime:

View File

@@ -95,7 +95,6 @@ class GettrTransformer(Transformer):
data: ScraperResult, data: ScraperResult,
insert: Callable, insert: Callable,
session: Session, session: Session,
insert_post: Callable,
flush_posts: Callable, flush_posts: Callable,
): ):
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
@@ -143,8 +142,7 @@ class GettrTransformer(Transformer):
views=raw.get("vfpst"), views=raw.get("vfpst"),
) )
# insert_post insert(transformed)
insert_post(transformed)
# media = self.process_media(raw, transformed.id, data) # media = self.process_media(raw, transformed.id, data)
# for m in media: # for m in media:

View File

@@ -74,7 +74,6 @@ class RumbleTransformer(Transformer):
data: ScraperResult, data: ScraperResult,
insert: Callable, insert: Callable,
session: Session, session: Session,
insert_post: Callable,
flush_posts: Callable, flush_posts: Callable,
): ):
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
@@ -99,8 +98,7 @@ class RumbleTransformer(Transformer):
video_duration=_parse_duration_str(raw["duration"]), video_duration=_parse_duration_str(raw["duration"]),
) )
# insert_post insert(transformed)
insert_post(transformed)
def _process_number(s: str) -> int: def _process_number(s: str) -> int:

View File

@@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer):
insert(new_chat) insert(new_chat)
# TODO this method API is chaotic and could be cleaned up
def transform( def transform(
self, self,
data: ScraperResult, data: ScraperResult,
insert: Callable, insert: Callable,
session: Session, session: Session,
insert_post: Callable,
flush_posts: Callable, flush_posts: Callable,
): ):
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
@@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer):
# use cache to find post ID instead of a DB request, if possible # use cache to find post ID instead of a DB request, if possible
if (data.channel, reply_to_id) not in self.posts_cache: if (data.channel, reply_to_id) not in self.posts_cache:
session.commit() session.commit()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session # this is necessary because the post we are looking for could be batched but not yet committed to the DB
flush_posts()
post = ( post = (
session.query(Post) session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id) .filter_by(channel=data.channel, platform_id=reply_to_id)
@@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer):
views=raw.get("views"), views=raw.get("views"),
) )
# insert_post insert(transformed)
insert_post(transformed)
def stripped(s): def stripped(s):