got transformers for Bitchute, Rumble, and Gettr working for all raw_posts.

This commit is contained in:
Tristan Lee
2022-06-20 21:45:41 -05:00
parent a2a7882f1c
commit 619fe42a31
6 changed files with 114 additions and 14 deletions

View File

@@ -229,12 +229,27 @@ class Post:
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None
#: The ID of the Post that this Post is a reply to or reblog of
#: The ID of the Post that this Post is a reply to
reply_to: int = None
#: Other users mentioned in the post
mentions: list = field(default_factory=list)
#: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.)
likes: int = None
#: Number of times the post was forwarded/retweeted/shared
forwards: int = None
#: Number of times the post was viewed
views: int = None
#: Video title, if post is a video
video_title: str = None
#: Video duration in seconds, if post is a video
video_duration: int = None
def hydrate(self):
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
@@ -246,6 +261,7 @@ class Post:
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags += hashtags
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
@@ -436,7 +452,7 @@ post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('platform_id', Integer, index=True),
Column('platform_id', String, index=True),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
@@ -455,6 +471,11 @@ post_table = Table('posts', mapper_registry.metadata,
Column('hashtags', JSON),
Column('outlinks', JSON),
Column('mentions', JSON),
Column('likes', Integer),
Column('forwards', Integer),
Column('views', Integer),
Column('video_title', String),
Column('video_duration', Integer),
Column('detected_language', String),
Column('normalized_content', String)
)

View File

@@ -171,8 +171,8 @@ class ETLController:
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):

View File

@@ -59,8 +59,29 @@ class BitchuteTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text
if raw['category'] == 'comment':
if raw['parent_id'] is None:
reply_to_id = raw['thread_id']
else:
reply_to_id = raw['parent_id']
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
if post is None:
if raw['parent_id'] is not None:
# this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
reply_to = post.id
else:
reply_to = -1
else:
reply_to = post.id
content = raw['body'].strip()
else:
reply_to = -1
soup = BeautifulSoup(raw['body'], features = 'html.parser')
soup.find('div', {'class': 'teaser'}).decompose()
soup.find('span', {'class': 'more'}).decompose()
soup.find('span', {'class': 'less hidden'}).decompose()
content = soup.text.strip()
transformed = Post(
raw_id=data.id,
@@ -72,12 +93,19 @@ class BitchuteTransformer(Transformer):
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
url=raw['url'] if raw['url'] else None,
content=content,
author_id=raw['author_id'],
author_username=raw['author'])
author_username=raw['author'],
reply_to=reply_to,
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
likes = raw['likes'],
views = int(raw['views']) if raw.get('views') else None,
video_title = raw['subject'],
video_duration = parse_duration_str(raw['length']))
transformed = insert(transformed)
session.flush()
def parse_created(created: str, date_archived: datetime) -> datetime:
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
@@ -93,4 +121,11 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()}
return date_archived - relativedelta(**kwargs)
return date_archived - relativedelta(**kwargs)
def parse_duration_str(duration_str: str) -> Union[int, None]:
    """Convert a colon-separated duration string to a number of seconds.

    Fields are read from right to left as seconds, minutes, hours and days,
    so ``"45"`` is 45, ``"2:05"`` is 125 and ``"1:02:03"`` is 3723.

    Parameters
    ----------
    duration_str: str
        Duration such as ``"MM:SS"`` or ``"HH:MM:SS"``. May be empty or ``None``.

    Returns
    -------
    int or None
        Total number of seconds, or ``None`` if ``duration_str`` is empty or ``None``.

    Raises
    ------
    ValueError
        If a field is not an integer, or if there are more than four fields.
    """
    if not duration_str:
        return None

    # Seconds per unit, ordered to match the fields read right to left:
    # seconds, minutes, hours, days.
    unit_seconds = (1, 60, 3600, 86400)
    parts = duration_str.split(':')

    # zip() stops at the shorter input, so extra leading fields would be
    # dropped silently and yield a wrong total; reject them explicitly.
    if len(parts) > len(unit_seconds):
        raise ValueError(f"Unrecognized duration string: {duration_str!r}")

    return sum(unit * int(part) for unit, part in zip(unit_seconds, reversed(parts)))

View File

@@ -3,6 +3,8 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from gogettr import PublicClient
from gogettr.api import GettrApiError
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -49,7 +51,34 @@ class GettrTransformer(Transformer):
raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst":
forwarded_from = raw["activity"]["uid"]
fwd_from = str(raw["activity"]["uid"])
channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first()
if channel is None:
try:
client = PublicClient()
profile = client.user_info(fwd_from.lower())
screenname = profile.get('_id')
channel = Channel(
name=profile.get('nickname'),
platform_id=screenname,
platform=data.platform,
url="https://gettr.com/user/" + screenname,
screenname=screenname,
category='forwarded',
source=self.__version__,
)
except GettrApiError:
channel = Channel(
name=None,
platform_id=fwd_from,
platform=data.platform,
url="https://gettr.com/user/" + fwd_from,
screenname=fwd_from,
category='forwarded',
source=self.__version__,
)
channel = insert(channel)
forwarded_from = channel.id
else:
forwarded_from = None
@@ -69,7 +98,11 @@ class GettrTransformer(Transformer):
author_username=raw["uid"],
hashtags=raw.get("htgs", []),
outlinks = list(filter(None, [raw.get("prevsrc")])),
forwarded_from = forwarded_from)
forwarded_from = forwarded_from,
likes = raw.get('lkbpst'),
forwards = raw.get("shbpst"),
views = raw.get('vfpst')
)
insert(transformed)

View File

@@ -61,10 +61,19 @@ class RumbleTransformer(Transformer):
url=raw['link'],
content=raw['content'],
author_id=raw['author_id'],
author_username=raw['author_name'])
author_username=raw['author_name'],
views = _process_number(raw.get('views')),
likes = _process_number(raw.get('rumbles')))
insert(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:
# insert(m)
# insert(m)
def _process_number(s):
if s is None:
return None
else:
return int(s.replace(',', ''))

View File

@@ -220,7 +220,9 @@ class TelegramTelethonTransformer(Transformer):
author_username=author_username,
forwarded_from=fwd_from,
reply_to=reply_to,
mentions = mentions
mentions = mentions,
forwards = raw.get('forwards'),
views = raw.get('views')
)
transformed = insert(transformed)