mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-07 19:08:35 +03:00
merged recent changes in main
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
repos:
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 22.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.9
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort (python)
|
||||
args: ["--profile", "black"]
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 22.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.9
|
||||
@@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz
|
||||
|
||||
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
|
||||
|
||||
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
|
||||
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html).
|
||||
|
||||

|
||||
|
||||
@@ -50,7 +50,6 @@ class Transformer:
|
||||
data: ScraperResult,
|
||||
insert: Callable,
|
||||
session: Session,
|
||||
insert_post: Callable,
|
||||
flush_posts: Callable,
|
||||
):
|
||||
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
||||
@@ -209,7 +208,6 @@ class ETLController:
|
||||
|
||||
def insert_or_select(self, obj, session, hydrate: bool = True):
|
||||
"""Insert an object into the database or return an existing object from the database.
|
||||
Either way, the resulting object has an `id` attribute that can be referenced later.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -257,6 +255,7 @@ class ETLController:
|
||||
)
|
||||
|
||||
elif type(obj) == Post:
|
||||
# attempt to add to current batch
|
||||
return self.insert_post(obj, session, hydrate)
|
||||
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
||||
|
||||
@@ -355,9 +354,6 @@ class ETLController:
|
||||
result,
|
||||
lambda obj: self.insert_or_select(obj, session, hydrate),
|
||||
session,
|
||||
lambda obj: self.insert_post(
|
||||
obj, session, hydrate, flush=False
|
||||
),
|
||||
lambda: self.flush_posts(session),
|
||||
)
|
||||
|
||||
|
||||
@@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer):
|
||||
data: ScraperResult,
|
||||
insert: Callable,
|
||||
session: Session,
|
||||
insert_post: Callable,
|
||||
flush_posts: Callable,
|
||||
):
|
||||
raw = json.loads(data.raw_data)
|
||||
@@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer):
|
||||
video_duration=_parse_duration_str(raw["length"]),
|
||||
)
|
||||
|
||||
# insert_post
|
||||
transformed = insert_post(transformed)
|
||||
transformed = insert(transformed)
|
||||
|
||||
|
||||
def parse_created(created: str, date_archived: datetime) -> datetime:
|
||||
|
||||
@@ -95,7 +95,6 @@ class GettrTransformer(Transformer):
|
||||
data: ScraperResult,
|
||||
insert: Callable,
|
||||
session: Session,
|
||||
insert_post: Callable,
|
||||
flush_posts: Callable,
|
||||
):
|
||||
raw = json.loads(data.raw_data)
|
||||
@@ -143,8 +142,7 @@ class GettrTransformer(Transformer):
|
||||
views=raw.get("vfpst"),
|
||||
)
|
||||
|
||||
# insert_post
|
||||
insert_post(transformed)
|
||||
insert(transformed)
|
||||
|
||||
# media = self.process_media(raw, transformed.id, data)
|
||||
# for m in media:
|
||||
|
||||
@@ -74,7 +74,6 @@ class RumbleTransformer(Transformer):
|
||||
data: ScraperResult,
|
||||
insert: Callable,
|
||||
session: Session,
|
||||
insert_post: Callable,
|
||||
flush_posts: Callable,
|
||||
):
|
||||
raw = json.loads(data.raw_data)
|
||||
@@ -99,8 +98,7 @@ class RumbleTransformer(Transformer):
|
||||
video_duration=_parse_duration_str(raw["duration"]),
|
||||
)
|
||||
|
||||
# insert_post
|
||||
insert_post(transformed)
|
||||
insert(transformed)
|
||||
|
||||
|
||||
def _process_number(s: str) -> int:
|
||||
|
||||
@@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
insert(new_chat)
|
||||
|
||||
# TODO: this method's API is chaotic and should be cleaned up
|
||||
def transform(
|
||||
self,
|
||||
data: ScraperResult,
|
||||
insert: Callable,
|
||||
session: Session,
|
||||
insert_post: Callable,
|
||||
flush_posts: Callable,
|
||||
):
|
||||
raw = json.loads(data.raw_data)
|
||||
@@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer):
|
||||
# use cache to find post ID instead of a DB request, if possible
|
||||
if (data.channel, reply_to_id) not in self.posts_cache:
|
||||
session.commit()
|
||||
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
|
||||
# this is necessary because the post we are looking for could be batched but not yet committed to the DB
|
||||
flush_posts()
|
||||
post = (
|
||||
session.query(Post)
|
||||
.filter_by(channel=data.channel, platform_id=reply_to_id)
|
||||
@@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
views=raw.get("views"),
|
||||
)
|
||||
|
||||
# insert_post
|
||||
insert_post(transformed)
|
||||
insert(transformed)
|
||||
|
||||
|
||||
def stripped(s):
|
||||
|
||||
Reference in New Issue
Block a user