diff --git a/app.py b/app.py index 21e29c9..afa3b6c 100644 --- a/app.py +++ b/app.py @@ -117,7 +117,7 @@ def transform(args): if args.min_date: min_date = datetime.datetime.fromisoformat(args.min_date) else: - min_date = 0 + min_date = datetime.datetime(1970, 1, 1) controller.transform_all_untransformed(min_date=min_date) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index b69fde5..b9a8e27 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -234,13 +234,14 @@ class ETLController: logger.trace(f"{transformer} is handling result {result.id} ({result.date})") handled = True - transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session) + transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session, lambda obj: self.insert_post(obj, session, hydrate, flush=False), lambda: self.flush_posts(session)) break if handled == False: logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") + self.flush_posts(session) session.commit() diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index 571ee7b..5619b1d 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -56,7 +56,7 @@ class BitchuteTransformer(Transformer): transformed = insert(transformed) - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) if raw['category'] == 'comment': @@ -64,6 +64,7 @@ class BitchuteTransformer(Transformer): reply_to_id = raw['thread_id'] else: reply_to_id = raw['parent_id'] + flush_posts() post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() if post is None: if raw['parent_id'] is not None: @@ -108,7 +109,7 @@ class BitchuteTransformer(Transformer): video_duration = _parse_duration_str(raw['length'])) # insert_post - transformed = insert(transformed) + transformed = insert_post(transformed) def parse_created(created: str, date_archived: datetime) -> datetime: """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index db17eae..8eaf5f8 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -81,7 +81,7 @@ class GettrTransformer(Transformer): return channel.id - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) if raw["activity"]["action"] == "shares_pst": @@ -119,7 +119,8 @@ class GettrTransformer(Transformer): views = raw.get('vfpst') ) - insert(transformed) + # insert_post + insert_post(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index b8810b6..6225d2a 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -57,7 +57,7 @@ class RumbleTransformer(Transformer): transformed = insert(transformed) - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) transformed = Post( @@ -80,7 +80,7 @@ class RumbleTransformer(Transformer): video_duration=_parse_duration_str(raw['duration'])) # insert_post - insert(transformed) + insert_post(transformed) def _process_number(s): diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index b24957e..d8fd005 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -159,7 +159,7 @@ class TelegramTelethonTransformer(Transformer): insert(new_chat) - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) if raw['_'] != 'Message': @@ -204,6 +204,7 @@ class TelegramTelethonTransformer(Transformer): if raw['reply_to']: reply_to_id = str(raw['reply_to']['reply_to_msg_id']) session.commit() + flush_posts() post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() if post is None: reply_to = -1 @@ -286,7 +287,7 @@ class TelegramTelethonTransformer(Transformer): ) # insert_post - insert(transformed) + insert_post(transformed) def stripped(s): """https://stackoverflow.com/a/29933716""" diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 5f9a81a..52ece05 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -72,7 +72,7 @@ class TwitterTransformer(Transformer): transformed = insert(transformed) - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) transformed = Post( @@ -134,4 +134,4 @@ class TwitterTransformer(Transformer): subtweet(raw['quotedTweet']) #insert_post - insert(transformed) \ No newline at end of file + insert_post(transformed) \ No newline at end of file diff --git a/cisticola/transformer/vkontakte.py b/cisticola/transformer/vkontakte.py index 550941e..ad50f94 100644 --- a/cisticola/transformer/vkontakte.py +++ b/cisticola/transformer/vkontakte.py @@ -46,7 +46,7 @@ class VkontakteTransformer(Transformer): transformed = insert(transformed) - def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) transformed = Post( @@ -67,7 +67,7 @@ class VkontakteTransformer(Transformer): ) # insert_post - insert(transformed) + insert_post(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: