diff --git a/app.py b/app.py
index daaedeb..9d848f8 100644
--- a/app.py
+++ b/app.py
@@ -19,7 +19,8 @@ from cisticola.transformer import (
     TelegramTelethonTransformer,
     GettrTransformer,
     RumbleTransformer,
-    BitchuteTransformer)
+    BitchuteTransformer,
+    VkontakteTransformer)
 
 from sync_with_gsheet import sync_channels
 
@@ -55,9 +56,10 @@ def get_transformer_controller():
     controller = ETLController()
     controller.connect_to_db(engine)
 
-    transformers = [TelegramTelethonTransformer(),
-                    BitchuteTransformer(),
-                    GettrTransformer(),
+    transformers = [VkontakteTransformer(),
+                    TelegramTelethonTransformer(),
+                    GettrTransformer(),
+                    BitchuteTransformer(),
                     RumbleTransformer()]
     controller.register_transformers(transformers)
 
diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py
index 37df764..48f96bd 100644
--- a/cisticola/transformer/__init__.py
+++ b/cisticola/transformer/__init__.py
@@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer
 from .telegram_telethon import TelegramTelethonTransformer
 from .rumble import RumbleTransformer
 from .gettr import GettrTransformer
+from .vkontakte import VkontakteTransformer
diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py
index eb48d06..b5326e2 100644
--- a/cisticola/transformer/telegram_telethon.py
+++ b/cisticola/transformer/telegram_telethon.py
@@ -160,7 +160,7 @@ class TelegramTelethonTransformer(Transformer):
 
         reply_to = None
         if raw['reply_to']:
-            reply_to_id = raw['reply_to']['reply_to_msg_id']
+            reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
             post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
             if post is None:
                 reply_to = -1
@@ -197,7 +197,7 @@ class TelegramTelethonTransformer(Transformer):
 
         channel = session.query(Channel).filter_by(id=int(data.channel)).first()
 
-        if channel is not None:
+        if channel is not None and channel.url:
             url = channel.url.strip('/') + f"/{raw['id']}"
             author_username = channel.screenname
         else:
diff --git a/cisticola/transformer/vkontakte.py b/cisticola/transformer/vkontakte.py
new file mode 100644
index 0000000..2351972
--- /dev/null
+++ b/cisticola/transformer/vkontakte.py
@@ -0,0 +1,85 @@
+import json
+from loguru import logger
+from typing import Generator, Union, Callable
+import dateutil.parser
+from datetime import datetime, timezone
+from sqlalchemy import func
+
+from cisticola.transformer.base import Transformer
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
+
+
+class VkontakteTransformer(Transformer):
+    """Transform raw VkontakteScraper output into ChannelInfo and Post objects."""
+
+    __version__ = "VkontakteTransformer 0.0.1"
+
+    def can_handle(self, data: ScraperResult) -> bool:
+        """Return True when ``data.scraper`` names the VkontakteScraper."""
+        # Only the first space-separated token of ``data.scraper`` is compared.
+        return data.scraper.split(' ')[0] == "VkontakteScraper"
+
+    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a ChannelInfo from ``data.raw_data`` (JSON) and pass it to ``insert``.
+
+        Nothing is yielded or returned; ``session`` is not used here.
+        """
+        raw = json.loads(data.raw_data)
+
+        # Compare against None/'' rather than truthiness so that a channel
+        # with 0 followers is stored as 0 instead of None.
+        followers = raw.get('followers')
+
+        transformed = ChannelInfo(
+            raw_channel_info_id=data.id,
+            channel=data.channel,
+            platform_id=raw['username'],
+            platform=data.platform,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            screenname=raw['username'],
+            name=raw['name'],
+            description=raw.get('description'),
+            description_url=raw.get('websites'),
+            description_location=None,
+            followers=None if followers in (None, '') else int(followers),
+            following=-1,
+            verified=raw['verified'],
+            date_archived=data.date_archived,
+            date_created=None,
+            date_transformed=datetime.now(timezone.utc)
+        )
+
+        insert(transformed)
+
+    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a Post from ``data.raw_data`` (JSON) and pass it to ``insert``.
+
+        Nothing is yielded or returned; ``session`` is not used here.
+        """
+        raw = json.loads(data.raw_data)
+
+        transformed = Post(
+            raw_id=data.id,
+            platform_id=data.platform_id,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            date=data.date,
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc),
+            url=raw['url'],
+            content=raw['content'] or '',
+            author_id=None,
+            author_username=None,
+            # Drop empty entries; a null or empty ``outlinks`` value becomes [].
+            outlinks=[link for link in (raw['outlinks'] or []) if link],
+        )
+
+        insert(transformed)
+
+        # TODO: media is not transformed yet; the calls below are disabled.
+        # media = self.process_media(raw, transformed.id, data)
+        # for m in media:
+        #     insert(m)
diff --git a/tests/transformer/vkontakte.py b/tests/transformer/vkontakte.py
new file mode 100644
index 0000000..ea26b62
--- /dev/null
+++ b/tests/transformer/vkontakte.py
@@ -0,0 +1,39 @@
+from sqlalchemy.orm import sessionmaker
+import json
+
+import pytest
+
+from cisticola.base import Channel
+from cisticola.scraper import VkontakteScraper
+from cisticola.transformer import VkontakteTransformer
+from cisticola.base import Post, Media
+
+
+@pytest.mark.media
+def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
+    """Scrape a Vkontakte channel, run the transformer, and check the resulting posts."""
+    controller.reset_db()
+
+    channels = [Channel(**channel_kwargs['vkontakte'])]
+    controller.register_scraper(scraper = VkontakteScraper())
+    controller.scrape_channels(channels = channels, archive_media = True)
+    controller.scrape_all_channel_info()
+
+    etl_controller.register_transformer(VkontakteTransformer())
+    etl_controller.transform_all_untransformed()
+    etl_controller.transform_all_untransformed_info()
+
+    sessionfactory = sessionmaker()
+    sessionfactory.configure(bind=engine)
+    session = sessionfactory()
+
+    posts = session.query(Post).all()
+    media = session.query(Media).all()
+
+    assert len(posts) == 23
+    # assert len(media) == 0
+
+    # The query has no ORDER BY, so row order is not guaranteed; look for the
+    # expected text in any post instead of relying on posts[-1].
+    assert any('Nigerian gender studies' in post.content for post in posts)
+    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"