mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 05:18:33 +03:00
tested telegram transformers and implemented vk transformers
This commit is contained in:
10
app.py
10
app.py
@@ -19,7 +19,8 @@ from cisticola.transformer import (
|
||||
TelegramTelethonTransformer,
|
||||
GettrTransformer,
|
||||
RumbleTransformer,
|
||||
BitchuteTransformer)
|
||||
BitchuteTransformer,
|
||||
VkontakteTransformer)
|
||||
|
||||
from sync_with_gsheet import sync_channels
|
||||
|
||||
@@ -55,9 +56,10 @@ def get_transformer_controller():
|
||||
controller = ETLController()
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
transformers = [TelegramTelethonTransformer(),
|
||||
BitchuteTransformer(),
|
||||
GettrTransformer(),
|
||||
transformers = [VkontakteTransformer(),
|
||||
TelegramTelethonTransformer(),
|
||||
GettrTransformer(),
|
||||
BitchuteTransformer(),
|
||||
RumbleTransformer()]
|
||||
|
||||
controller.register_transformers(transformers)
|
||||
|
||||
@@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .rumble import RumbleTransformer
|
||||
from .gettr import GettrTransformer
|
||||
from .vkontakte import VkontakteTransformer
|
||||
|
||||
@@ -160,7 +160,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
reply_to = None
|
||||
if raw['reply_to']:
|
||||
reply_to_id = raw['reply_to']['reply_to_msg_id']
|
||||
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
|
||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
|
||||
if post is None:
|
||||
reply_to = -1
|
||||
@@ -197,7 +197,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
channel = session.query(Channel).filter_by(id=int(data.channel)).first()
|
||||
|
||||
if channel is not None:
|
||||
if channel is not None and channel.url:
|
||||
url = channel.url.strip('/') + f"/{raw['id']}"
|
||||
author_username = channel.screenname
|
||||
else:
|
||||
|
||||
73
cisticola/transformer/vkontakte.py
Normal file
73
cisticola/transformer/vkontakte.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
from sqlalchemy import func
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class VkontakteTransformer(Transformer):
    """Transformer for raw records produced by the Vkontakte scraper.

    Raw channel info is converted into ``ChannelInfo`` objects and raw
    scraper results into ``Post`` objects. Every transformed object is
    handed to the ``insert`` callback supplied by the caller.
    """

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was scraped by a ``VkontakteScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so anything following the scraper name is ignored.
        """
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw channel info and pass it to ``insert``.

        Parameters
        ----------
        data:
            Raw channel info whose ``raw_data`` is a JSON document. The keys
            ``username``, ``name``, ``followers`` and ``verified`` are
            required; ``description`` and ``websites`` are optional.
        insert:
            Callable invoked once with the transformed ``ChannelInfo``.
        session:
            Not used by this transformer.

        NOTE(review): the return annotation says ``Generator`` but the method
        contains no ``yield`` and returns ``None``; the annotation is kept
        unchanged, presumably to mirror the base class signature.
        """
        raw = json.loads(data.raw_data)

        # A plain truthiness test would turn a genuine follower count of 0
        # into None, so 0 is accepted explicitly; other empty values
        # (None, '') still map to None.
        raw_followers = raw['followers']
        if raw_followers or raw_followers == 0:
            followers = int(raw_followers)
        else:
            followers = None

        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['username'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['name'],
            description=raw.get('description'),
            description_url=raw.get('websites'),
            description_location=None,
            followers=followers,
            following=-1,
            verified=raw['verified'],
            date_archived=data.date_archived,
            date_created=None,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw scraper result and pass it to ``insert``.

        Parameters
        ----------
        data:
            Scraper result whose ``raw_data`` is a JSON document with the
            keys ``url``, ``content`` and ``outlinks``.
        insert:
            Callable invoked once with the transformed ``Post``.
        session:
            Not used by this transformer.

        NOTE(review): as with ``transform_info``, the method is annotated as
        a ``Generator`` but never yields and returns ``None``.
        """
        raw = json.loads(data.raw_data)

        # Drop empty entries from the outlinks; a missing/empty list becomes [].
        outlinks = list(filter(None, raw['outlinks'])) if raw['outlinks'] else []

        post = Post(
            raw_id=data.id,
            platform_id=data.platform_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=raw['url'],
            content=raw['content'] or '',
            author_id=None,
            author_username=None,
            outlinks=outlinks,
        )

        insert(post)

        # TODO: media handling is disabled; the call below is kept for reference.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)
|
||||
35
tests/transformer/vkontakte.py
Normal file
35
tests/transformer/vkontakte.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import VkontakteScraper
|
||||
from cisticola.transformer import VkontakteTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
    """Scrape the configured Vkontakte channel, transform it and check the posts."""
    controller.reset_db()

    # Scraping stage: posts (with media archiving enabled) and channel info.
    vk_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vk_channel], archive_media=True)
    controller.scrape_all_channel_info()

    # ETL stage: transform both the scraped posts and the channel info.
    etl_controller.register_transformer(VkontakteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Open a session on the test engine to inspect what was stored.
    session = sessionmaker(bind=engine)()

    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 23
    # assert len(media) == 0

    assert 'Nigerian gender studies' in posts[-1].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user