mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-07 19:08:35 +03:00
merged recent changes in main
This commit is contained in:
@@ -1,12 +1,12 @@
|
|||||||
repos:
|
repos:
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 22.3.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
language_version: python3.9
|
||||||
- repo: https://github.com/pycqa/isort
|
- repo: https://github.com/pycqa/isort
|
||||||
rev: 5.12.0
|
rev: 5.12.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: isort
|
||||||
name: isort (python)
|
name: isort (python)
|
||||||
args: ["--profile", "black"]
|
args: ["--profile", "black"]
|
||||||
- repo: https://github.com/psf/black
|
|
||||||
rev: 22.3.0
|
|
||||||
hooks:
|
|
||||||
- id: black
|
|
||||||
language_version: python3.9
|
|
||||||
@@ -5,6 +5,6 @@ The *cisticola* application enables users to easily collect, process, and analyz
|
|||||||
|
|
||||||
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
|
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
|
||||||
|
|
||||||
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
|
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the [documentation](https://cisticola.readthedocs.io/en/latest/index.html).
|
||||||
|
|
||||||

|

|
||||||
|
|||||||
@@ -50,7 +50,6 @@ class Transformer:
|
|||||||
data: ScraperResult,
|
data: ScraperResult,
|
||||||
insert: Callable,
|
insert: Callable,
|
||||||
session: Session,
|
session: Session,
|
||||||
insert_post: Callable,
|
|
||||||
flush_posts: Callable,
|
flush_posts: Callable,
|
||||||
):
|
):
|
||||||
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
||||||
@@ -209,7 +208,6 @@ class ETLController:
|
|||||||
|
|
||||||
def insert_or_select(self, obj, session, hydrate: bool = True):
|
def insert_or_select(self, obj, session, hydrate: bool = True):
|
||||||
"""Insert an object into the database or return an existing object from the database.
|
"""Insert an object into the database or return an existing object from the database.
|
||||||
Regardless, the resulting object has an `id` attribute that can be referenced later.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -257,6 +255,7 @@ class ETLController:
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif type(obj) == Post:
|
elif type(obj) == Post:
|
||||||
|
# attempt to add to current batch
|
||||||
return self.insert_post(obj, session, hydrate)
|
return self.insert_post(obj, session, hydrate)
|
||||||
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
|
||||||
|
|
||||||
@@ -355,9 +354,6 @@ class ETLController:
|
|||||||
result,
|
result,
|
||||||
lambda obj: self.insert_or_select(obj, session, hydrate),
|
lambda obj: self.insert_or_select(obj, session, hydrate),
|
||||||
session,
|
session,
|
||||||
lambda obj: self.insert_post(
|
|
||||||
obj, session, hydrate, flush=False
|
|
||||||
),
|
|
||||||
lambda: self.flush_posts(session),
|
lambda: self.flush_posts(session),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -75,7 +75,6 @@ class BitchuteTransformer(Transformer):
|
|||||||
data: ScraperResult,
|
data: ScraperResult,
|
||||||
insert: Callable,
|
insert: Callable,
|
||||||
session: Session,
|
session: Session,
|
||||||
insert_post: Callable,
|
|
||||||
flush_posts: Callable,
|
flush_posts: Callable,
|
||||||
):
|
):
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
@@ -140,8 +139,7 @@ class BitchuteTransformer(Transformer):
|
|||||||
video_duration=_parse_duration_str(raw["length"]),
|
video_duration=_parse_duration_str(raw["length"]),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
transformed = insert(transformed)
|
||||||
transformed = insert_post(transformed)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_created(created: str, date_archived: datetime) -> datetime:
|
def parse_created(created: str, date_archived: datetime) -> datetime:
|
||||||
|
|||||||
@@ -95,7 +95,6 @@ class GettrTransformer(Transformer):
|
|||||||
data: ScraperResult,
|
data: ScraperResult,
|
||||||
insert: Callable,
|
insert: Callable,
|
||||||
session: Session,
|
session: Session,
|
||||||
insert_post: Callable,
|
|
||||||
flush_posts: Callable,
|
flush_posts: Callable,
|
||||||
):
|
):
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
@@ -143,8 +142,7 @@ class GettrTransformer(Transformer):
|
|||||||
views=raw.get("vfpst"),
|
views=raw.get("vfpst"),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
insert(transformed)
|
||||||
insert_post(transformed)
|
|
||||||
|
|
||||||
# media = self.process_media(raw, transformed.id, data)
|
# media = self.process_media(raw, transformed.id, data)
|
||||||
# for m in media:
|
# for m in media:
|
||||||
|
|||||||
@@ -74,7 +74,6 @@ class RumbleTransformer(Transformer):
|
|||||||
data: ScraperResult,
|
data: ScraperResult,
|
||||||
insert: Callable,
|
insert: Callable,
|
||||||
session: Session,
|
session: Session,
|
||||||
insert_post: Callable,
|
|
||||||
flush_posts: Callable,
|
flush_posts: Callable,
|
||||||
):
|
):
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
@@ -99,8 +98,7 @@ class RumbleTransformer(Transformer):
|
|||||||
video_duration=_parse_duration_str(raw["duration"]),
|
video_duration=_parse_duration_str(raw["duration"]),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
insert(transformed)
|
||||||
insert_post(transformed)
|
|
||||||
|
|
||||||
|
|
||||||
def _process_number(s: str) -> int:
|
def _process_number(s: str) -> int:
|
||||||
|
|||||||
@@ -195,13 +195,11 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
|
|
||||||
insert(new_chat)
|
insert(new_chat)
|
||||||
|
|
||||||
# TODO this method API is chaotic and could be cleaned up
|
|
||||||
def transform(
|
def transform(
|
||||||
self,
|
self,
|
||||||
data: ScraperResult,
|
data: ScraperResult,
|
||||||
insert: Callable,
|
insert: Callable,
|
||||||
session: Session,
|
session: Session,
|
||||||
insert_post: Callable,
|
|
||||||
flush_posts: Callable,
|
flush_posts: Callable,
|
||||||
):
|
):
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
@@ -277,7 +275,8 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
# use cache to find post ID instead of a DB request, if possible
|
# use cache to find post ID instead of a DB request, if possible
|
||||||
if (data.channel, reply_to_id) not in self.posts_cache:
|
if (data.channel, reply_to_id) not in self.posts_cache:
|
||||||
session.commit()
|
session.commit()
|
||||||
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
|
# this is necessary because the post we are looking for could be batched but not yet committed to the DB
|
||||||
|
flush_posts()
|
||||||
post = (
|
post = (
|
||||||
session.query(Post)
|
session.query(Post)
|
||||||
.filter_by(channel=data.channel, platform_id=reply_to_id)
|
.filter_by(channel=data.channel, platform_id=reply_to_id)
|
||||||
@@ -379,8 +378,7 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
views=raw.get("views"),
|
views=raw.get("views"),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
insert(transformed)
|
||||||
insert_post(transformed)
|
|
||||||
|
|
||||||
|
|
||||||
def stripped(s):
|
def stripped(s):
|
||||||
|
|||||||
Reference in New Issue
Block a user