got post transformers and channel_info transformers working for Rumble, Bitchute, Gettr

This commit is contained in:
Tristan Lee
2022-06-21 19:05:41 -05:00
parent 619fe42a31
commit bb2e2806e6
6 changed files with 97 additions and 39 deletions

View File

@@ -256,6 +256,7 @@ class Post:
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks))
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"

View File

@@ -69,7 +69,10 @@ class BitchuteTransformer(Transformer):
if raw['parent_id'] is not None:
# this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
reply_to = post.id
if post is None:
reply_to = -1
else:
reply_to = post.id
else:
reply_to = -1
else:
@@ -102,7 +105,7 @@ class BitchuteTransformer(Transformer):
likes = raw['likes'],
views = int(raw['views']) if raw.get('views') else None,
video_title = raw['subject'],
video_duration = parse_duration_str(raw['length']))
video_duration = _parse_duration_str(raw['length']))
transformed = insert(transformed)
session.flush()
@@ -123,7 +126,9 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
return date_archived - relativedelta(**kwargs)
def parse_duration_str(duration_str: str) -> int:
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str:
return None
else:

View File

@@ -3,6 +3,7 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func
from gogettr import PublicClient
from gogettr.api import GettrApiError
@@ -46,42 +47,55 @@ class GettrTransformer(Transformer):
transformed = insert(transformed)
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
    """Return the database id of the Gettr channel matching ``username``.

    An existing ``Channel`` row on the Gettr platform is looked up by
    case-insensitive screenname. When none exists, the profile is requested
    through ``PublicClient.user_info`` and a new ``Channel`` is inserted; if
    that request raises ``GettrApiError``, a placeholder ``Channel`` carrying
    only the screenname is inserted instead, with ``notes='GettrApiError'``.

    Args:
        username: Gettr username to resolve.
        category: Value stored in ``Channel.category`` for newly created rows.
        insert: Callable that persists a ``Channel`` and returns the
            persisted object.
        session: Database session used for the lookup.

    Returns:
        The ``id`` attribute of the existing or newly inserted ``Channel``.
    """
    # Query.filter() accepts only positional SQL expressions; keyword
    # arguments belong to filter_by(). The platform restriction therefore has
    # to be expressed as a column comparison.
    channel = (
        session.query(Channel)
        .filter(
            func.lower(Channel.screenname) == func.lower(username),
            Channel.platform == 'Gettr',
        )
        .first()
    )

    if channel is None:
        try:
            client = PublicClient()
            profile = client.user_info(username.lower())
            screenname = profile.get('_id')
            channel = Channel(
                name=profile.get('nickname'),
                platform_id=screenname,
                platform='Gettr',
                url="https://gettr.com/user/" + screenname,
                screenname=screenname,
                category=category,
                source=self.__version__,
            )
        except GettrApiError:
            # The profile could not be fetched: record what is known so that
            # the same username is not looked up through the API again.
            channel = Channel(
                name=None,
                platform_id=None,
                platform='Gettr',
                url=None,
                screenname=username,
                category=category,
                source=self.__version__,
                notes='GettrApiError',
            )
        channel = insert(channel)

    return channel.id
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst":
fwd_from = str(raw["activity"]["uid"])
channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first()
if channel is None:
try:
client = PublicClient()
profile = client.user_info(fwd_from.lower())
screenname = profile.get('_id')
channel = Channel(
name=profile.get('nickname'),
platform_id=screenname,
platform=data.platform,
url="https://gettr.com/user/" + screenname,
screenname=screenname,
category='forwarded',
source=self.__version__,
)
except GettrApiError:
channel = Channel(
name=None,
platform_id=fwd_from,
platform=data.platform,
url="https://gettr.com/user/" + fwd_from,
screenname=fwd_from,
category='forwarded',
source=self.__version__,
)
channel = insert(channel)
forwarded_from = channel.id
forwarded_from = self._get_channel_id(
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
else:
forwarded_from = None
mentions = []
for mentioned_user in raw.get("utgs", []):
mentioned_id = self._get_channel_id(
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
mentions.append(mentioned_id)
transformed = Post(
raw_id=data.id,
platform_id=raw["_id"],
@@ -99,6 +113,7 @@ class GettrTransformer(Transformer):
hashtags=raw.get("htgs", []),
outlinks = list(filter(None, [raw.get("prevsrc")])),
forwarded_from = forwarded_from,
mentions = mentions,
likes = raw.get('lkbpst'),
forwards = raw.get("shbpst"),
views = raw.get('vfpst')

View File

@@ -3,6 +3,7 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func, JSON, String, cast, text
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -22,19 +23,30 @@ class RumbleTransformer(Transformer):
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if 'id' not in raw:
# The first version of the Rumble ChannelInfo scraper didn't return
# the platform_id, so this is a workaround.
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
if channel is None:
platform_id = None
else:
platform_id = json.loads(channel.raw_data)['id']
else:
platform_id = raw['id']
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['id'],
platform_id=platform_id,
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['id'],
screenname=platform_id,
name=raw['name'],
description='', # does not exist for Rumble
description_url='', # does not exist for Rumble
description_location='', # does not exist for Rumble
followers=raw['subscribers'],
followers=_process_number(raw['subscribers']),
following=-1, # does not exist for Rumble
verified=raw['verified'],
date_created=None, # does not exist for Rumble
@@ -63,7 +75,9 @@ class RumbleTransformer(Transformer):
author_id=raw['author_id'],
author_username=raw['author_name'],
views = _process_number(raw.get('views')),
likes = _process_number(raw.get('rumbles')))
likes = _process_number(raw.get('rumbles')),
video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration']))
insert(transformed)
@@ -76,4 +90,18 @@ def _process_number(s):
if s is None:
return None
else:
return int(s.replace(',', ''))
s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6)
elif s.endswith('K'):
return int(float(s[:-1]) * 1000)
return int(s)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])

View File

@@ -131,7 +131,7 @@ class TelegramTelethonTransformer(Transformer):
fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
if channel is None:
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])