mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
got post transformers and channel_info transformers working for Rumble, Bitchute, Gettr
This commit is contained in:
13
app.py
13
app.py
@@ -14,7 +14,13 @@ from cisticola.scraper import (
|
|||||||
BitchuteScraper,
|
BitchuteScraper,
|
||||||
RumbleScraper,
|
RumbleScraper,
|
||||||
)
|
)
|
||||||
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
|
from cisticola.transformer import (
|
||||||
|
ETLController,
|
||||||
|
TelegramTelethonTransformer,
|
||||||
|
GettrTransformer,
|
||||||
|
RumbleTransformer,
|
||||||
|
BitchuteTransformer)
|
||||||
|
|
||||||
from sync_with_gsheet import sync_channels
|
from sync_with_gsheet import sync_channels
|
||||||
|
|
||||||
def get_db_session():
|
def get_db_session():
|
||||||
@@ -49,7 +55,10 @@ def get_transformer_controller():
|
|||||||
controller = ETLController()
|
controller = ETLController()
|
||||||
controller.connect_to_db(engine)
|
controller.connect_to_db(engine)
|
||||||
|
|
||||||
transformers = [TelegramTelethonTransformer()]
|
transformers = [TelegramTelethonTransformer(),
|
||||||
|
BitchuteTransformer(),
|
||||||
|
GettrTransformer(),
|
||||||
|
RumbleTransformer()]
|
||||||
|
|
||||||
controller.register_transformers(transformers)
|
controller.register_transformers(transformers)
|
||||||
|
|
||||||
|
|||||||
@@ -256,6 +256,7 @@ class Post:
|
|||||||
# replace is here in order to prevent catastrophic backtracking
|
# replace is here in order to prevent catastrophic backtracking
|
||||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
||||||
self.outlinks += urls
|
self.outlinks += urls
|
||||||
|
self.outlinks = list(set(outlink for outlink in self.outlinks))
|
||||||
|
|
||||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||||
|
|
||||||
|
|||||||
@@ -69,7 +69,10 @@ class BitchuteTransformer(Transformer):
|
|||||||
if raw['parent_id'] is not None:
|
if raw['parent_id'] is not None:
|
||||||
# this block is for comments whose parent_ids correspond to deleted comments
|
# this block is for comments whose parent_ids correspond to deleted comments
|
||||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
|
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
|
||||||
reply_to = post.id
|
if post is None:
|
||||||
|
reply_to = -1
|
||||||
|
else:
|
||||||
|
reply_to = post.id
|
||||||
else:
|
else:
|
||||||
reply_to = -1
|
reply_to = -1
|
||||||
else:
|
else:
|
||||||
@@ -102,7 +105,7 @@ class BitchuteTransformer(Transformer):
|
|||||||
likes = raw['likes'],
|
likes = raw['likes'],
|
||||||
views = int(raw['views']) if raw.get('views') else None,
|
views = int(raw['views']) if raw.get('views') else None,
|
||||||
video_title = raw['subject'],
|
video_title = raw['subject'],
|
||||||
video_duration = parse_duration_str(raw['length']))
|
video_duration = _parse_duration_str(raw['length']))
|
||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
session.flush()
|
session.flush()
|
||||||
@@ -123,7 +126,9 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
|
|||||||
|
|
||||||
return date_archived - relativedelta(**kwargs)
|
return date_archived - relativedelta(**kwargs)
|
||||||
|
|
||||||
def parse_duration_str(duration_str: str) -> int:
|
def _parse_duration_str(duration_str: str) -> int:
|
||||||
|
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
|
||||||
|
"""
|
||||||
if not duration_str:
|
if not duration_str:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from loguru import logger
|
|||||||
from typing import Generator, Union, Callable
|
from typing import Generator, Union, Callable
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
from sqlalchemy import func
|
||||||
from gogettr import PublicClient
|
from gogettr import PublicClient
|
||||||
from gogettr.api import GettrApiError
|
from gogettr.api import GettrApiError
|
||||||
|
|
||||||
@@ -46,42 +47,55 @@ class GettrTransformer(Transformer):
|
|||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
|
|
||||||
|
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
    """Return the database id of the Gettr channel for ``username``, creating it if needed.

    The channel is looked up case-insensitively by screenname among channels
    whose platform is ``'Gettr'``. When no such channel exists, the profile is
    requested through ``PublicClient.user_info`` and a new ``Channel`` is
    handed to ``insert``. If that request raises ``GettrApiError``, a
    placeholder ``Channel`` carrying only the screenname (and
    ``notes='GettrApiError'``) is inserted instead.

    Parameters
    ----------
    username : str
        Gettr username to resolve.
    category : str
        Value stored in the ``category`` field of a newly created channel.
    insert : Callable
        Called with the new ``Channel``; the object it returns supplies the id.
    session
        Database session used for the lookup query.

    Returns
    -------
    The ``id`` attribute of the existing or newly inserted channel.
    """
    # Query.filter() only accepts SQL expressions as positional arguments;
    # a keyword such as ``platform='Gettr'`` raises TypeError, so the platform
    # restriction is written as a column comparison.
    channel = (
        session.query(Channel)
        .filter(
            func.lower(Channel.screenname) == func.lower(username),
            Channel.platform == 'Gettr',
        )
        .first()
    )
    if channel is not None:
        return channel.id

    try:
        client = PublicClient()
        profile = client.user_info(username.lower())
        screenname = profile.get('_id')
        channel = Channel(
            name=profile.get('nickname'),
            platform_id=screenname,
            platform='Gettr',
            url="https://gettr.com/user/" + screenname,
            screenname=screenname,
            category=category,
            source=self.__version__,
        )
    except GettrApiError:
        # The profile could not be fetched: record what is known so later
        # lookups for the same username find this row.
        channel = Channel(
            name=None,
            platform_id=None,
            platform='Gettr',
            url=None,
            screenname=username,
            category=category,
            source=self.__version__,
            notes='GettrApiError',
        )

    channel = insert(channel)
    return channel.id
|
||||||
|
|
||||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
if raw["activity"]["action"] == "shares_pst":
|
if raw["activity"]["action"] == "shares_pst":
|
||||||
fwd_from = str(raw["activity"]["uid"])
|
forwarded_from = self._get_channel_id(
|
||||||
channel = session.query(Channel).filter_by(platform_id=str(fwd_from)).first()
|
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
|
||||||
if channel is None:
|
|
||||||
try:
|
|
||||||
client = PublicClient()
|
|
||||||
profile = client.user_info(fwd_from.lower())
|
|
||||||
screenname = profile.get('_id')
|
|
||||||
channel = Channel(
|
|
||||||
name=profile.get('nickname'),
|
|
||||||
platform_id=screenname,
|
|
||||||
platform=data.platform,
|
|
||||||
url="https://gettr.com/user/" + screenname,
|
|
||||||
screenname=screenname,
|
|
||||||
category='forwarded',
|
|
||||||
source=self.__version__,
|
|
||||||
)
|
|
||||||
except GettrApiError:
|
|
||||||
channel = Channel(
|
|
||||||
name=None,
|
|
||||||
platform_id=fwd_from,
|
|
||||||
platform=data.platform,
|
|
||||||
url="https://gettr.com/user/" + fwd_from,
|
|
||||||
screenname=fwd_from,
|
|
||||||
category='forwarded',
|
|
||||||
source=self.__version__,
|
|
||||||
)
|
|
||||||
channel = insert(channel)
|
|
||||||
forwarded_from = channel.id
|
|
||||||
else:
|
else:
|
||||||
forwarded_from = None
|
forwarded_from = None
|
||||||
|
|
||||||
|
mentions = []
|
||||||
|
for mentioned_user in raw.get("utgs", []):
|
||||||
|
mentioned_id = self._get_channel_id(
|
||||||
|
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
|
||||||
|
mentions.append(mentioned_id)
|
||||||
|
|
||||||
transformed = Post(
|
transformed = Post(
|
||||||
raw_id=data.id,
|
raw_id=data.id,
|
||||||
platform_id=raw["_id"],
|
platform_id=raw["_id"],
|
||||||
@@ -99,6 +113,7 @@ class GettrTransformer(Transformer):
|
|||||||
hashtags=raw.get("htgs", []),
|
hashtags=raw.get("htgs", []),
|
||||||
outlinks = list(filter(None, [raw.get("prevsrc")])),
|
outlinks = list(filter(None, [raw.get("prevsrc")])),
|
||||||
forwarded_from = forwarded_from,
|
forwarded_from = forwarded_from,
|
||||||
|
mentions = mentions,
|
||||||
likes = raw.get('lkbpst'),
|
likes = raw.get('lkbpst'),
|
||||||
forwards = raw.get("shbpst"),
|
forwards = raw.get("shbpst"),
|
||||||
views = raw.get('vfpst')
|
views = raw.get('vfpst')
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from loguru import logger
|
|||||||
from typing import Generator, Union, Callable
|
from typing import Generator, Union, Callable
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
from sqlalchemy import func, JSON, String, cast, text
|
||||||
|
|
||||||
from cisticola.transformer.base import Transformer
|
from cisticola.transformer.base import Transformer
|
||||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||||
@@ -22,19 +23,30 @@ class RumbleTransformer(Transformer):
|
|||||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
|
if 'id' not in raw:
|
||||||
|
# The first version of the Rumble ChannelInfo scraper didn't return
|
||||||
|
# the platform_id, so this is a workaround.
|
||||||
|
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
|
||||||
|
if channel is None:
|
||||||
|
platform_id = None
|
||||||
|
else:
|
||||||
|
platform_id = json.loads(channel.raw_data)['id']
|
||||||
|
else:
|
||||||
|
platform_id = raw['id']
|
||||||
|
|
||||||
transformed = ChannelInfo(
|
transformed = ChannelInfo(
|
||||||
raw_channel_info_id=data.id,
|
raw_channel_info_id=data.id,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
platform_id=raw['id'],
|
platform_id=platform_id,
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
screenname=raw['id'],
|
screenname=platform_id,
|
||||||
name=raw['name'],
|
name=raw['name'],
|
||||||
description='', # does not exist for Rumble
|
description='', # does not exist for Rumble
|
||||||
description_url='', # does not exist for Rumble
|
description_url='', # does not exist for Rumble
|
||||||
description_location='', # does not exist for Rumble
|
description_location='', # does not exist for Rumble
|
||||||
followers=raw['subscribers'],
|
followers=_process_number(raw['subscribers']),
|
||||||
following=-1, # does not exist for Rumble
|
following=-1, # does not exist for Rumble
|
||||||
verified=raw['verified'],
|
verified=raw['verified'],
|
||||||
date_created=None, # does not exist for Rumble
|
date_created=None, # does not exist for Rumble
|
||||||
@@ -63,7 +75,9 @@ class RumbleTransformer(Transformer):
|
|||||||
author_id=raw['author_id'],
|
author_id=raw['author_id'],
|
||||||
author_username=raw['author_name'],
|
author_username=raw['author_name'],
|
||||||
views = _process_number(raw.get('views')),
|
views = _process_number(raw.get('views')),
|
||||||
likes = _process_number(raw.get('rumbles')))
|
likes = _process_number(raw.get('rumbles')),
|
||||||
|
video_title = raw['title'],
|
||||||
|
video_duration=_parse_duration_str(raw['duration']))
|
||||||
|
|
||||||
insert(transformed)
|
insert(transformed)
|
||||||
|
|
||||||
@@ -76,4 +90,18 @@ def _process_number(s):
|
|||||||
if s is None:
|
if s is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return int(s.replace(',', ''))
|
s = s.replace(' ', '')
|
||||||
|
if s.endswith('M'):
|
||||||
|
return int(float(s[:-1]) * 1e6)
|
||||||
|
elif s.endswith('K'):
|
||||||
|
return int(float(s[:-1]) * 1000)
|
||||||
|
return int(s)
|
||||||
|
|
||||||
|
def _parse_duration_str(duration_str: str) -> int:
|
||||||
|
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
|
||||||
|
"""
|
||||||
|
if not duration_str:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
duration_list = duration_str.split(':')
|
||||||
|
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
|
||||||
@@ -131,7 +131,7 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
fwd_from = None
|
fwd_from = None
|
||||||
|
|
||||||
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
|
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
|
||||||
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
|
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
|
||||||
|
|
||||||
if channel is None:
|
if channel is None:
|
||||||
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
|
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
|
||||||
|
|||||||
Reference in New Issue
Block a user