mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
import json
|
|
from loguru import logger
|
|
from typing import Generator, Union, Callable
|
|
import dateutil.parser
|
|
from datetime import datetime, timezone
|
|
from sqlalchemy import func
|
|
|
|
from cisticola.transformer.base import Transformer
|
|
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
|
|
|
class VkontakteTransformer(Transformer):
    """Transformer for raw results archived by the Vkontakte scraper.

    Turns raw channel metadata into ``ChannelInfo`` objects and raw posts
    into ``Post`` objects, handing each one to the ``insert`` callable
    supplied by the caller.
    """

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return whether ``data`` came from a ``VkontakteScraper``.

        Only the first space-separated token of ``data.scraper`` is
        compared; anything after the first space (presumably a version
        number -- confirm against the scraper) is ignored.
        """
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw channel metadata and insert it.

        ``data.raw_data`` is decoded as JSON. The ``username``, ``name``,
        ``followers`` and ``verified`` entries are read by subscript, so they
        must be present; ``description`` and ``websites`` are read with
        ``.get`` and may be absent.

        Args:
            data: Raw channel info record to convert.
            insert: Callable invoked once with the new ``ChannelInfo``.
            session: Not used by this method.

        Note:
            Despite the return annotation, this method contains no ``yield``
            and returns ``None``.
        """
        payload = json.loads(data.raw_data)

        # The mapping is built in the same order in which the constructor
        # arguments were originally evaluated.
        fields = {
            'raw_channel_info_id': data.id,
            'channel': data.channel,
            # The username serves as both the platform id and the screen name.
            'platform_id': payload['username'],
            'platform': data.platform,
            'scraper': data.scraper,
            'transformer': self.__version__,
            'screenname': payload['username'],
            'name': payload['name'],
            'description': payload.get('description'),
            'description_url': payload.get('websites'),
            'description_location': None,
            # Any falsy follower value (None, '', 0, ...) is stored as None.
            'followers': None if not payload['followers'] else int(payload['followers']),
            # Always -1 here; presumably a "not available" marker -- confirm.
            'following': -1,
            'verified': payload['verified'],
            'date_archived': data.date_archived,
            'date_created': None,
            'date_transformed': datetime.now(timezone.utc),
        }

        insert(ChannelInfo(**fields))

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw scraper result and insert it.

        ``data.raw_data`` is decoded as JSON; its ``url``, ``content`` and
        ``outlinks`` entries are read by subscript and must be present.

        Args:
            data: Raw scraper result to convert.
            insert: Callable invoked once with the new ``Post``.
            session: Not used by this method.

        Note:
            Despite the return annotation, this method contains no ``yield``
            and returns ``None``.
        """
        payload = json.loads(data.raw_data)

        # The mapping is built in the same order in which the constructor
        # arguments were originally evaluated.
        fields = {
            'raw_id': data.id,
            'platform_id': data.platform_id,
            'scraper': data.scraper,
            'transformer': self.__version__,
            'platform': data.platform,
            'channel': data.channel,
            'date': data.date,
            'date_archived': data.date_archived,
            'date_transformed': datetime.now(timezone.utc),
            'url': payload['url'],
            # Falsy content (None, '') is normalised to an empty string.
            'content': payload['content'] or '',
            'author_id': None,
            'author_username': None,
            # Drop falsy entries; a falsy "outlinks" value yields an empty list.
            'outlinks': [link for link in (payload['outlinks'] or []) if link],
        }

        insert(Post(**fields))

        # NOTE: media processing is currently disabled.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)