mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added Rumble transformers and test
This commit is contained in:
@@ -105,20 +105,38 @@ def process_video(video):
|
||||
views = None
|
||||
else:
|
||||
views = view_span.get('data-value')
|
||||
|
||||
|
||||
author_a = video.find('a', {'rel': 'author'})
|
||||
if author_a is None:
|
||||
author_id = None
|
||||
author_name = None
|
||||
else:
|
||||
author_id = author_a['href'].split('/')[-1]
|
||||
author_name = author_a.text
|
||||
|
||||
video_link = BASE_URL + video.find('a', href = True)['href']
|
||||
r = make_request(url = video_link)
|
||||
soup = BeautifulSoup(r.content, features = 'html.parser')
|
||||
|
||||
content_div = soup.find('div', {'class': 'container content media-description'})
|
||||
|
||||
info = {
|
||||
'title' : video.find('h3').text,
|
||||
'thumbnail' : video.find('img')['src'],
|
||||
'link' : BASE_URL + video.find('a', href = True)['href'],
|
||||
'link' : video_link,
|
||||
'views' : views,
|
||||
'rumbles' : rumbles,
|
||||
'content': '' if content_div is None else content_div.get_text('\n'),
|
||||
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
|
||||
'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
|
||||
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
|
||||
'author_id': author_id,
|
||||
'author_name': author_name}
|
||||
|
||||
info['media_url'] = get_media_url(info['link'])
|
||||
|
||||
return info
|
||||
|
||||
|
||||
def get_channel_videos(url):
|
||||
|
||||
page = 1
|
||||
@@ -150,8 +168,15 @@ def get_channel_profile(url):
|
||||
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
|
||||
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
|
||||
|
||||
author_a = soup.find('a', {'rel': 'author'})
|
||||
if author_a is None:
|
||||
author_id = None
|
||||
else:
|
||||
author_id = author_a['href'].split('/')[-1]
|
||||
|
||||
profile = {
|
||||
'name': soup.find('h1').text,
|
||||
'id': author_id,
|
||||
'verified': verified_svg is not None,
|
||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
||||
'cover': cover_soup.get('src') if cover_soup else None,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from .base import ETLController
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .rumble import RumbleTransformer
|
||||
|
||||
70
cisticola/transformer/rumble.py
Normal file
70
cisticola/transformer/rumble.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class RumbleTransformer(Transformer):
    """Transformer for raw Rumble scrape output.

    Converts the JSON payload stored on raw Rumble records into
    ``ChannelInfo`` and ``Post`` objects and hands them to the supplied
    ``insert`` callable.
    """

    __version__ = "RumbleTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a Rumble scraper.

        Only the first space-separated token of ``data.scraper`` is compared,
        so whatever follows the scraper name is ignored.
        """
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "RumbleScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw channel data and pass it to ``insert``.

        ``data.raw_data`` is parsed as JSON and must provide the keys ``id``,
        ``name``, ``subscribers`` and ``verified``. Fields with no Rumble
        counterpart are filled with placeholder values.

        NOTE(review): the return annotation declares a generator, but this
        method never yields; it returns None after calling ``insert``.
        ``session`` is accepted but not used here.
        """
        payload = json.loads(data.raw_data)

        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=payload['id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=payload['id'],
            name=payload['name'],
            description='',  # does not exist for Rumble
            description_url='',  # does not exist for Rumble
            description_location='',  # does not exist for Rumble
            followers=payload['subscribers'],
            following=-1,  # does not exist for Rumble
            verified=payload['verified'],
            date_created=None,  # does not exist for Rumble
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from one raw Rumble video record and pass it to ``insert``.

        ``data.raw_data`` is parsed as JSON and must provide the keys
        ``media_url``, ``datetime``, ``link``, ``content``, ``author_id`` and
        ``author_name``. The post's ``platform_id`` is the last path segment
        of ``media_url`` after surrounding slashes are stripped.

        NOTE(review): the return annotation declares a generator, but this
        method never yields; it returns None after calling ``insert``.
        ``session`` is accepted but not used here.
        """
        payload = json.loads(data.raw_data)

        # Last path component of the media URL serves as the platform identifier.
        media_id = payload['media_url'].strip('/').rsplit('/', 1)[-1]

        post = Post(
            raw_id=data.id,
            platform_id=media_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=dateutil.parser.parse(payload['datetime']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=payload['link'],
            content=payload['content'],
            author_id=payload['author_id'],
            author_username=payload['author_name'],
        )

        insert(post)

        # Media handling is currently disabled; only the Post is inserted.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)
|
||||
34
tests/transformer/rumble.py
Normal file
34
tests/transformer/rumble.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
from cisticola.transformer import RumbleTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
    """Scrape the configured Rumble channel, run the ETL step, and check the posts.

    Resets the database, scrapes the channel described by
    ``channel_kwargs['rumble']`` with media archiving enabled, transforms all
    untransformed results and channel info, then inspects the stored posts.
    """
    controller.reset_db()

    rumble_channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=rumble_channels, archive_media=True)

    etl_controller.register_transformer(RumbleTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Open a session bound to the test engine to read back what was stored.
    db_session = sessionmaker(bind=engine)()

    stored_posts = db_session.query(Post).all()
    stored_media = db_session.query(Media).all()

    assert len(stored_posts) == 7
    # Media assertions are disabled for now.
    # assert len(media) == 0

    assert '#whitegold #icedoutcuban' in stored_posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user