mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
got transformers for Bitchute, Rumble, and Gettr working for all raw_posts.
This commit is contained in:
@@ -229,12 +229,27 @@ class Post:
|
||||
#: The ID of the Channel that the post was forwarded or quoted from
|
||||
forwarded_from: int = None
|
||||
|
||||
#: The ID of the Post that this Post is a reply to or reblog of
|
||||
#: The ID of the Post that this Post is a reply to
|
||||
reply_to: int = None
|
||||
|
||||
#: Other users mentioned in the post
|
||||
mentions: list = field(default_factory=list)
|
||||
|
||||
#: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.)
|
||||
likes: int = None
|
||||
|
||||
#: Number of times the post was forwarded/retweeted/shared
|
||||
forwards: int = None
|
||||
|
||||
#: Number of times the post was viewed
|
||||
views: int = None
|
||||
|
||||
#: Video title, if post is a video
|
||||
video_title: str = None
|
||||
|
||||
#: Video duration in seconds, if post is a video
|
||||
video_duration: int = None
|
||||
|
||||
def hydrate(self):
|
||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
|
||||
|
||||
@@ -246,6 +261,7 @@ class Post:
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags += hashtags
|
||||
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
||||
@@ -436,7 +452,7 @@ post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
|
||||
Column('platform_id', Integer, index=True),
|
||||
Column('platform_id', String, index=True),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
@@ -455,6 +471,11 @@ post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('hashtags', JSON),
|
||||
Column('outlinks', JSON),
|
||||
Column('mentions', JSON),
|
||||
Column('likes', Integer),
|
||||
Column('forwards', Integer),
|
||||
Column('views', Integer),
|
||||
Column('video_title', String),
|
||||
Column('video_duration', Integer),
|
||||
Column('detected_language', String),
|
||||
Column('normalized_content', String)
|
||||
)
|
||||
|
||||
@@ -171,8 +171,8 @@ class ETLController:
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
|
||||
@@ -59,8 +59,29 @@ class BitchuteTransformer(Transformer):
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
if raw['category'] == 'comment':
|
||||
if raw['parent_id'] is None:
|
||||
reply_to_id = raw['thread_id']
|
||||
else:
|
||||
reply_to_id = raw['parent_id']
|
||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
|
||||
if post is None:
|
||||
if raw['parent_id'] is not None:
|
||||
# this block is for comments whose parent_ids correspond to deleted comments
|
||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
|
||||
reply_to = post.id
|
||||
else:
|
||||
reply_to = -1
|
||||
else:
|
||||
reply_to = post.id
|
||||
content = raw['body'].strip()
|
||||
else:
|
||||
reply_to = -1
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
soup.find('div', {'class': 'teaser'}).decompose()
|
||||
soup.find('span', {'class': 'more'}).decompose()
|
||||
soup.find('span', {'class': 'less hidden'}).decompose()
|
||||
content = soup.text.strip()
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
@@ -72,12 +93,19 @@ class BitchuteTransformer(Transformer):
|
||||
date=data.date,
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=raw['url'],
|
||||
url=raw['url'] if raw['url'] else None,
|
||||
content=content,
|
||||
author_id=raw['author_id'],
|
||||
author_username=raw['author'])
|
||||
author_username=raw['author'],
|
||||
reply_to=reply_to,
|
||||
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
|
||||
likes = raw['likes'],
|
||||
views = int(raw['views']) if raw.get('views') else None,
|
||||
video_title = raw['subject'],
|
||||
video_duration = parse_duration_str(raw['length']))
|
||||
|
||||
transformed = insert(transformed)
|
||||
session.flush()
|
||||
|
||||
def parse_created(created: str, date_archived: datetime) -> datetime:
|
||||
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
|
||||
@@ -93,4 +121,11 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
|
||||
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
|
||||
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()}
|
||||
|
||||
return date_archived - relativedelta(**kwargs)
|
||||
return date_archived - relativedelta(**kwargs)
|
||||
|
||||
def parse_duration_str(duration_str: str) -> Union[int, None]:
    """Convert a colon-separated duration string to a number of seconds.

    The rightmost component is read as seconds, the one before it as minutes
    and the one before that as hours, so ``"1:02:03"`` gives ``3723`` and
    ``"02:03"`` gives ``123``.

    Args:
        duration_str: Duration such as ``"12:34"`` or ``"1:02:03"``. May be
            empty or ``None``.

    Returns:
        Total number of seconds, or ``None`` when ``duration_str`` is empty
        or ``None``.

    Raises:
        ValueError: If a component cannot be parsed as an integer.
    """
    if not duration_str:
        return None

    # Pair components right-to-left with their multipliers. zip() stops at
    # the shorter sequence, so any component beyond hours is ignored.
    components = reversed(duration_str.split(':'))
    return sum(int(part) * factor for factor, part in zip((1, 60, 3600), components))
|
||||
@@ -3,6 +3,8 @@ from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
from gogettr import PublicClient
|
||||
from gogettr.api import GettrApiError
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
@@ -49,7 +51,34 @@ class GettrTransformer(Transformer):
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
if raw["activity"]["action"] == "shares_pst":
|
||||
forwarded_from = raw["activity"]["uid"]
|
||||
fwd_from = str(raw["activity"]["uid"])
|
||||
channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first()
|
||||
if channel is None:
|
||||
try:
|
||||
client = PublicClient()
|
||||
profile = client.user_info(fwd_from.lower())
|
||||
screenname = profile.get('_id')
|
||||
channel = Channel(
|
||||
name=profile.get('nickname'),
|
||||
platform_id=screenname,
|
||||
platform=data.platform,
|
||||
url="https://gettr.com/user/" + screenname,
|
||||
screenname=screenname,
|
||||
category='forwarded',
|
||||
source=self.__version__,
|
||||
)
|
||||
except GettrApiError:
|
||||
channel = Channel(
|
||||
name=None,
|
||||
platform_id=fwd_from,
|
||||
platform=data.platform,
|
||||
url="https://gettr.com/user/" + fwd_from,
|
||||
screenname=fwd_from,
|
||||
category='forwarded',
|
||||
source=self.__version__,
|
||||
)
|
||||
channel = insert(channel)
|
||||
forwarded_from = channel.id
|
||||
else:
|
||||
forwarded_from = None
|
||||
|
||||
@@ -69,7 +98,11 @@ class GettrTransformer(Transformer):
|
||||
author_username=raw["uid"],
|
||||
hashtags=raw.get("htgs", []),
|
||||
outlinks = list(filter(None, [raw.get("prevsrc")])),
|
||||
forwarded_from = forwarded_from)
|
||||
forwarded_from = forwarded_from,
|
||||
likes = raw.get('lkbpst'),
|
||||
forwards = raw.get("shbpst"),
|
||||
views = raw.get('vfpst')
|
||||
)
|
||||
|
||||
insert(transformed)
|
||||
|
||||
|
||||
@@ -61,10 +61,19 @@ class RumbleTransformer(Transformer):
|
||||
url=raw['link'],
|
||||
content=raw['content'],
|
||||
author_id=raw['author_id'],
|
||||
author_username=raw['author_name'])
|
||||
author_username=raw['author_name'],
|
||||
views = _process_number(raw.get('views')),
|
||||
likes = _process_number(raw.get('rumbles')))
|
||||
|
||||
insert(transformed)
|
||||
|
||||
# media = self.process_media(raw, transformed.id, data)
|
||||
# for m in media:
|
||||
# insert(m)
|
||||
# insert(m)
|
||||
|
||||
def _process_number(s):
|
||||
|
||||
if s is None:
|
||||
return None
|
||||
else:
|
||||
return int(s.replace(',', ''))
|
||||
@@ -220,7 +220,9 @@ class TelegramTelethonTransformer(Transformer):
|
||||
author_username=author_username,
|
||||
forwarded_from=fwd_from,
|
||||
reply_to=reply_to,
|
||||
mentions = mentions
|
||||
mentions = mentions,
|
||||
forwards = raw.get('forwards'),
|
||||
views = raw.get('views')
|
||||
)
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
Reference in New Issue
Block a user