Got post transformers and channel_info transformers working for Rumble, Bitchute, and Gettr

This commit is contained in:
Tristan Lee
2022-06-21 19:05:41 -05:00
parent 619fe42a31
commit bb2e2806e6
6 changed files with 97 additions and 39 deletions

13
app.py
View File

@@ -14,7 +14,13 @@ from cisticola.scraper import (
BitchuteScraper, BitchuteScraper,
RumbleScraper, RumbleScraper,
) )
from cisticola.transformer import (ETLController, TelegramTelethonTransformer) from cisticola.transformer import (
ETLController,
TelegramTelethonTransformer,
GettrTransformer,
RumbleTransformer,
BitchuteTransformer)
from sync_with_gsheet import sync_channels from sync_with_gsheet import sync_channels
def get_db_session(): def get_db_session():
@@ -49,7 +55,10 @@ def get_transformer_controller():
controller = ETLController() controller = ETLController()
controller.connect_to_db(engine) controller.connect_to_db(engine)
transformers = [TelegramTelethonTransformer()] transformers = [TelegramTelethonTransformer(),
BitchuteTransformer(),
GettrTransformer(),
RumbleTransformer()]
controller.register_transformers(transformers) controller.register_transformers(transformers)

View File

@@ -256,6 +256,7 @@ class Post:
# replace is here in order to prevent catastrophic backtracking # replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", "")) urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks += urls self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks))
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)" HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"

View File

@@ -69,7 +69,10 @@ class BitchuteTransformer(Transformer):
if raw['parent_id'] is not None: if raw['parent_id'] is not None:
# this block is for comments whose parent_ids correspond to deleted comments # this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first() post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
reply_to = post.id if post is None:
reply_to = -1
else:
reply_to = post.id
else: else:
reply_to = -1 reply_to = -1
else: else:
@@ -102,7 +105,7 @@ class BitchuteTransformer(Transformer):
likes = raw['likes'], likes = raw['likes'],
views = int(raw['views']) if raw.get('views') else None, views = int(raw['views']) if raw.get('views') else None,
video_title = raw['subject'], video_title = raw['subject'],
video_duration = parse_duration_str(raw['length'])) video_duration = _parse_duration_str(raw['length']))
transformed = insert(transformed) transformed = insert(transformed)
session.flush() session.flush()
@@ -123,7 +126,9 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
return date_archived - relativedelta(**kwargs) return date_archived - relativedelta(**kwargs)
def parse_duration_str(duration_str: str) -> int: def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str: if not duration_str:
return None return None
else: else:

View File

@@ -3,6 +3,7 @@ from loguru import logger
from typing import Generator, Union, Callable from typing import Generator, Union, Callable
import dateutil.parser import dateutil.parser
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import func
from gogettr import PublicClient from gogettr import PublicClient
from gogettr.api import GettrApiError from gogettr.api import GettrApiError
@@ -46,42 +47,55 @@ class GettrTransformer(Transformer):
transformed = insert(transformed) transformed = insert(transformed)
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
    """Return the database id of the Gettr channel matching ``username``.

    The channel table is searched for a Gettr row whose screenname equals
    ``username`` ignoring case. When no row exists, the Gettr API is asked
    for the user's profile and a new ``Channel`` is built from it; if the API
    call raises ``GettrApiError`` a placeholder ``Channel`` carrying only the
    screenname is built instead. The new channel is passed through ``insert``.

    Args:
        username: Gettr username to resolve.
        category: Value stored in the ``category`` field of a newly created
            channel (callers in this file pass 'forwarded' or 'mentioned').
        insert: Callable that receives the new ``Channel`` and returns the
            object whose ``id`` is reported back.
        session: Database session used for the lookup query.

    Returns:
        The ``id`` attribute of the existing or newly inserted channel.
    """
    # Query.filter() only takes positional SQL expressions; a keyword such as
    # ``platform='Gettr'`` raises TypeError, so the platform test is written
    # as a column comparison.
    channel = (
        session.query(Channel)
        .filter(
            func.lower(Channel.screenname) == func.lower(username),
            Channel.platform == 'Gettr',
        )
        .first()
    )

    if channel is None:
        try:
            client = PublicClient()
            profile = client.user_info(username.lower())
        except GettrApiError:
            # The profile could not be fetched: keep a stub row so the
            # username still resolves to a channel id, and record why.
            channel = Channel(
                name=None,
                platform_id=None,
                platform='Gettr',
                url=None,
                screenname=username,
                category=category,
                source=self.__version__,
                notes='GettrApiError',
            )
        else:
            screenname = profile.get('_id')
            channel = Channel(
                name=profile.get('nickname'),
                platform_id=screenname,
                platform='Gettr',
                url="https://gettr.com/user/" + screenname,
                screenname=screenname,
                category=category,
                source=self.__version__,
            )

        channel = insert(channel)

    return channel.id
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst": if raw["activity"]["action"] == "shares_pst":
fwd_from = str(raw["activity"]["uid"]) forwarded_from = self._get_channel_id(
channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first() username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
if channel is None:
try:
client = PublicClient()
profile = client.user_info(fwd_from.lower())
screenname = profile.get('_id')
channel = Channel(
name=profile.get('nickname'),
platform_id=screenname,
platform=data.platform,
url="https://gettr.com/user/" + screenname,
screenname=screenname,
category='forwarded',
source=self.__version__,
)
except GettrApiError:
channel = Channel(
name=None,
platform_id=fwd_from,
platform=data.platform,
url="https://gettr.com/user/" + fwd_from,
screenname=fwd_from,
category='forwarded',
source=self.__version__,
)
channel = insert(channel)
forwarded_from = channel.id
else: else:
forwarded_from = None forwarded_from = None
mentions = []
for mentioned_user in raw.get("utgs", []):
mentioned_id = self._get_channel_id(
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
mentions.append(mentioned_id)
transformed = Post( transformed = Post(
raw_id=data.id, raw_id=data.id,
platform_id=raw["_id"], platform_id=raw["_id"],
@@ -99,6 +113,7 @@ class GettrTransformer(Transformer):
hashtags=raw.get("htgs", []), hashtags=raw.get("htgs", []),
outlinks = list(filter(None, [raw.get("prevsrc")])), outlinks = list(filter(None, [raw.get("prevsrc")])),
forwarded_from = forwarded_from, forwarded_from = forwarded_from,
mentions = mentions,
likes = raw.get('lkbpst'), likes = raw.get('lkbpst'),
forwards = raw.get("shbpst"), forwards = raw.get("shbpst"),
views = raw.get('vfpst') views = raw.get('vfpst')

View File

@@ -3,6 +3,7 @@ from loguru import logger
from typing import Generator, Union, Callable from typing import Generator, Union, Callable
import dateutil.parser import dateutil.parser
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import func, JSON, String, cast, text
from cisticola.transformer.base import Transformer from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -22,19 +23,30 @@ class RumbleTransformer(Transformer):
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if 'id' not in raw:
# The first version of the Rumble ChannelInfo scraper didn't return
# the platform_id, so this is a workaround.
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
if channel is None:
platform_id = None
else:
platform_id = json.loads(channel.raw_data)['id']
else:
platform_id = raw['id']
transformed = ChannelInfo( transformed = ChannelInfo(
raw_channel_info_id=data.id, raw_channel_info_id=data.id,
channel=data.channel, channel=data.channel,
platform_id=raw['id'], platform_id=platform_id,
platform=data.platform, platform=data.platform,
scraper=data.scraper, scraper=data.scraper,
transformer=self.__version__, transformer=self.__version__,
screenname=raw['id'], screenname=platform_id,
name=raw['name'], name=raw['name'],
description='', # does not exist for Rumble description='', # does not exist for Rumble
description_url='', # does not exist for Rumble description_url='', # does not exist for Rumble
description_location='', # does not exist for Rumble description_location='', # does not exist for Rumble
followers=raw['subscribers'], followers=_process_number(raw['subscribers']),
following=-1, # does not exist for Rumble following=-1, # does not exist for Rumble
verified=raw['verified'], verified=raw['verified'],
date_created=None, # does not exist for Rumble date_created=None, # does not exist for Rumble
@@ -63,7 +75,9 @@ class RumbleTransformer(Transformer):
author_id=raw['author_id'], author_id=raw['author_id'],
author_username=raw['author_name'], author_username=raw['author_name'],
views = _process_number(raw.get('views')), views = _process_number(raw.get('views')),
likes = _process_number(raw.get('rumbles'))) likes = _process_number(raw.get('rumbles')),
video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration']))
insert(transformed) insert(transformed)
@@ -76,4 +90,18 @@ def _process_number(s):
if s is None: if s is None:
return None return None
else: else:
return int(s.replace(',', '')) s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6)
elif s.endswith('K'):
return int(float(s[:-1]) * 1000)
return int(s)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])

View File

@@ -131,7 +131,7 @@ class TelegramTelethonTransformer(Transformer):
fwd_from = None fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first() channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
if channel is None: if channel is None:
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])