# Mirror of https://github.com/bellingcat/cisticola.git
# (synced 2026-06-08 03:18:34 +03:00; 126 lines, 4.7 KiB, Python)
import json
from datetime import datetime, timezone
from typing import Generator, Union, Callable

import dateutil.parser
from gogettr import PublicClient
from gogettr.api import GettrApiError
from loguru import logger
from sqlalchemy import func

from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel


class GettrTransformer(Transformer):
    """Transformer that turns raw Gettr scraper output into ``ChannelInfo``
    and ``Post`` records.

    Results are not returned to the caller; every record that is built is
    handed to the ``insert`` callable supplied to each method.
    """

    __version__ = "GettrTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``GettrScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so any text following the scraper name (e.g. a version) is ignored.
        """
        return data.scraper.split(' ')[0] == "GettrScraper"

    def transform_info(
        self, data: RawChannelInfo, insert: Callable, session
    ) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw Gettr profile JSON and insert it.

        Args:
            data: Raw channel info whose ``raw_data`` is a JSON string holding
                the Gettr profile.
            insert: Callable that receives the constructed ``ChannelInfo``.
            session: Database session (unused by this method).

        Raises:
            KeyError: If a required profile field (``_id``, ``username``,
                ``nickname``, ``flg``, ``flw``, ``cdate``) is missing.

        Note:
            Despite the return annotation nothing is yielded; the result is
            delivered through ``insert`` only.
        """
        raw = json.loads(data.raw_data)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['_id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
            description=raw.get('dsc'),
            description_url=raw.get('website'),
            description_location=raw.get('location'),
            followers=int(raw['flg']),
            following=int(raw['flw']),
            verified=bool(raw.get('infl')),
            # `cdate` is divided by 1000 before conversion (treated as epoch
            # milliseconds). Convert explicitly in UTC so the value does not
            # depend on the host's local timezone and matches the UTC-aware
            # `date_transformed` below.
            date_created=datetime.fromtimestamp(
                int(raw['cdate']) / 1000.0, tz=timezone.utc
            ),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(transformed)

    def _get_channel_id(self, username: str, category: str, insert: Callable, session):
        """Return the id of the Gettr ``Channel`` matching ``username``.

        An existing channel is looked up case-insensitively by screenname,
        restricted to platform ``'Gettr'``. When none exists, the profile is
        requested from the public Gettr API and a new ``Channel`` is inserted;
        if the API raises ``GettrApiError`` a placeholder channel carrying
        ``notes='GettrApiError'`` is inserted instead.

        Args:
            username: Gettr user name to resolve.
            category: Value stored in the ``category`` field of a newly
                created channel.
            insert: Callable that persists the new ``Channel`` and returns it.
            session: Database session used for the lookup.

        Returns:
            The ``id`` attribute of the existing or newly inserted channel.
        """
        # `Query.filter()` accepts SQL expressions only; the platform
        # restriction must be a column comparison, not a keyword argument.
        channel = (
            session.query(Channel)
            .filter(
                func.lower(Channel.screenname) == func.lower(username),
                Channel.platform == 'Gettr',
            )
            .first()
        )
        if channel is not None:
            return channel.id

        try:
            profile = PublicClient().user_info(username.lower())
        except GettrApiError:
            # Profile could not be fetched: record a placeholder so the
            # reference is still stored and the failure is visible in `notes`.
            channel = Channel(
                name=None,
                platform_id=None,
                platform='Gettr',
                url=None,
                screenname=username,
                category=category,
                source=self.__version__,
                notes='GettrApiError',
            )
        else:
            screenname = profile.get('_id')
            channel = Channel(
                name=profile.get('nickname'),
                platform_id=screenname,
                platform='Gettr',
                url="https://gettr.com/user/" + screenname,
                screenname=screenname,
                category=category,
                source=self.__version__,
            )

        channel = insert(channel)
        return channel.id

    def transform(
        self, data: ScraperResult, insert: Callable, session
    ) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from one raw Gettr scraper result and insert it.

        Channels referenced by the post (the ``activity.uid`` of a
        ``shares_pst`` activity, and every entry of ``utgs``) are resolved to
        channel ids through ``_get_channel_id``, which may insert new
        ``Channel`` rows as a side effect.

        Args:
            data: Scraper result whose ``raw_data`` is a JSON string holding
                the Gettr post.
            insert: Callable that receives the constructed ``Post`` (and any
                newly created ``Channel``).
            session: Database session used for channel lookups.

        Raises:
            KeyError: If a required field (``activity``, ``_id``,
                ``receiver_id``, ``uid``) is missing from the raw post.

        Note:
            Despite the return annotation nothing is yielded; the result is
            delivered through ``insert`` only.
        """
        raw = json.loads(data.raw_data)
        activity = raw["activity"]

        if activity["action"] == "shares_pst":
            forwarded_from = self._get_channel_id(
                username=str(activity["uid"]),
                category='forwarded',
                insert=insert,
                session=session,
            )
        else:
            forwarded_from = None

        # `or []` also covers a `utgs` key that is present but null.
        mentions = [
            self._get_channel_id(
                username=mentioned_user,
                category='mentioned',
                insert=insert,
                session=session,
            )
            for mentioned_user in (raw.get("utgs") or [])
        ]

        transformed = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            # Converted in UTC so the stored date does not depend on the
            # host's local timezone.
            date=datetime.fromtimestamp(activity["cdate"] / 1000.0, tz=timezone.utc),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url="https://www.gettr.com/post/" + raw["_id"],
            content=raw.get("txt", ""),
            author_id=raw["receiver_id"],
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            # `prevsrc` is the only outlink source; drop it when absent/empty.
            outlinks=list(filter(None, [raw.get("prevsrc")])),
            forwarded_from=forwarded_from,
            mentions=mentions,
            likes=raw.get('lkbpst'),
            forwards=raw.get("shbpst"),
            views=raw.get('vfpst'),
        )

        insert(transformed)

        # TODO: media handling is currently disabled.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)