Tested Telegram transformers and implemented VK transformers

This commit is contained in:
Tristan Lee
2022-06-23 15:06:10 -05:00
parent bb2e2806e6
commit 289a47d7b1
5 changed files with 117 additions and 6 deletions

10
app.py
View File

@@ -19,7 +19,8 @@ from cisticola.transformer import (
TelegramTelethonTransformer,
GettrTransformer,
RumbleTransformer,
BitchuteTransformer)
BitchuteTransformer,
VkontakteTransformer)
from sync_with_gsheet import sync_channels
@@ -55,9 +56,10 @@ def get_transformer_controller():
controller = ETLController()
controller.connect_to_db(engine)
transformers = [TelegramTelethonTransformer(),
BitchuteTransformer(),
GettrTransformer(),
transformers = [VkontakteTransformer(),
TelegramTelethonTransformer(),
GettrTransformer(),
BitchuteTransformer(),
RumbleTransformer()]
controller.register_transformers(transformers)

View File

@@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer
from .vkontakte import VkontakteTransformer

View File

@@ -160,7 +160,7 @@ class TelegramTelethonTransformer(Transformer):
reply_to = None
if raw['reply_to']:
reply_to_id = raw['reply_to']['reply_to_msg_id']
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
if post is None:
reply_to = -1
@@ -197,7 +197,7 @@ class TelegramTelethonTransformer(Transformer):
channel = session.query(Channel).filter_by(id=int(data.channel)).first()
if channel is not None:
if channel is not None and channel.url:
url = channel.url.strip('/') + f"/{raw['id']}"
author_username = channel.screenname
else:

View File

@@ -0,0 +1,73 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class VkontakteTransformer(Transformer):
    """Transformer for raw results produced by the Vkontakte scraper.

    Turns `RawChannelInfo` rows into `ChannelInfo` objects and
    `ScraperResult` rows into `Post` objects, handing each one to the
    supplied `insert` callable.
    """

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when `data` was produced by a `VkontakteScraper`.

        Only the first space-separated token of `data.scraper` is compared,
        so anything following the scraper name is ignored.
        """
        return data.scraper.split(' ')[0] == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a `ChannelInfo` from raw channel info and pass it to `insert`.

        Args:
            data: Raw channel info whose `raw_data` is a JSON document with
                at least the keys `username`, `name`, `followers` and
                `verified`; `description` and `websites` are optional.
            insert: Callable that receives the resulting `ChannelInfo`.
            session: Unused here; kept for signature parity with `transform`.

        Raises:
            KeyError: If a required key is missing from the raw JSON.

        NOTE(review): despite the annotation, nothing is yielded; the only
        output is the call to `insert`.
        """
        raw = json.loads(data.raw_data)

        # A numeric follower count of 0 is falsy but still a real count; the
        # string '0' was already converted to 0, so keep the two consistent.
        # Every other falsy value (None, '') still maps to None.
        raw_followers = raw['followers']
        if raw_followers or raw_followers == 0:
            followers = int(raw_followers)
        else:
            followers = None

        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            # The username is used both as the platform id and the screenname.
            platform_id=raw['username'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['name'],
            description=raw.get('description'),
            description_url=raw.get('websites'),
            description_location=None,
            followers=followers,
            # presumably -1 marks "not available" for this platform; confirm
            following=-1,
            verified=raw['verified'],
            date_archived=data.date_archived,
            date_created=None,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a `Post` from a raw scraper result and pass it to `insert`.

        Args:
            data: Scraper result whose `raw_data` is a JSON document with
                the keys `url`, `content` and `outlinks`.
            insert: Callable that receives the resulting `Post`.
            session: Unused here.

        Raises:
            KeyError: If `url`, `content` or `outlinks` is missing.

        NOTE(review): despite the annotation, nothing is yielded; the only
        output is the call to `insert`.
        """
        raw = json.loads(data.raw_data)

        post = Post(
            raw_id=data.id,
            platform_id=data.platform_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=raw['url'],
            # Missing/empty content is stored as an empty string.
            content=raw['content'] or '',
            author_id=None,
            author_username=None,
            # Drop empty entries; a missing/empty list becomes [].
            outlinks=list(filter(None, raw['outlinks'] or [])),
        )

        insert(post)

        # TODO: media handling is disabled; a `process_media` pass that
        # inserted each resulting media item was commented out here.

View File

@@ -0,0 +1,35 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
from cisticola.transformer import VkontakteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
    """Scrape a Vkontakte channel, run the ETL step, and check the posts."""
    # Start from an empty database so the post count below is deterministic.
    controller.reset_db()

    vk_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vk_channel], archive_media=True)
    controller.scrape_all_channel_info()

    etl_controller.register_transformer(VkontakteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Open a session bound to the test engine to inspect the results.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 23
    assert 'Nigerian gender studies' in posts[-1].content

    # Media assertions are currently disabled:
    # assert len(media) == 0
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"