Added Rumble transformer and test

This commit is contained in:
Tristan Lee
2022-05-19 19:40:48 -05:00
parent e2094522c9
commit 591f1986e8
4 changed files with 134 additions and 4 deletions

View File

@@ -105,20 +105,38 @@ def process_video(video):
views = None
else:
views = view_span.get('data-value')
author_a = video.find('a', {'rel': 'author'})
if author_a is None:
author_id = None
author_name = None
else:
author_id = author_a['href'].split('/')[-1]
author_name = author_a.text
video_link = BASE_URL + video.find('a', href = True)['href']
r = make_request(url = video_link)
soup = BeautifulSoup(r.content, features = 'html.parser')
content_div = soup.find('div', {'class': 'container content media-description'})
info = {
'title' : video.find('h3').text,
'thumbnail' : video.find('img')['src'],
'link' : BASE_URL + video.find('a', href = True)['href'],
'link' : video_link,
'views' : views,
'rumbles' : rumbles,
'content': '' if content_div is None else content_div.get_text('\n'),
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
'author_id': author_id,
'author_name': author_name}
info['media_url'] = get_media_url(info['link'])
return info
def get_channel_videos(url):
page = 1
@@ -150,8 +168,15 @@ def get_channel_profile(url):
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
author_a = soup.find('a', {'rel': 'author'})
if author_a is None:
author_id = None
else:
author_id = author_a['href'].split('/')[-1]
profile = {
'name': soup.find('h1').text,
'id': author_id,
'verified': verified_svg is not None,
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,

View File

@@ -1,4 +1,5 @@
from .base import ETLController
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer

View File

@@ -0,0 +1,70 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class RumbleTransformer(Transformer):
    """Transformer for data produced by the Rumble scraper.

    Decodes the JSON held in ``raw_data`` and passes the resulting
    ``ChannelInfo`` / ``Post`` objects to the supplied ``insert`` callable.
    """

    __version__ = "RumbleTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Report whether ``data`` was produced by the Rumble scraper.

        Only the first space-separated token of ``data.scraper`` is compared,
        so whatever follows the scraper name is ignored.
        """
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "RumbleScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from a raw Rumble channel record and insert it.

        Args:
            data: Raw channel info whose ``raw_data`` is a JSON document. The
                keys ``id``, ``name``, ``subscribers`` and ``verified`` are read.
            insert: Callable that receives the constructed ``ChannelInfo``.
            session: Not used by this method.

        NOTE(review): the signature is annotated as a generator, but nothing
        is yielded here; the method simply returns ``None``.
        """
        profile = json.loads(data.raw_data)

        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=profile['id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=profile['id'],
            name=profile['name'],
            description='',  # does not exist for Rumble
            description_url='',  # does not exist for Rumble
            description_location='',  # does not exist for Rumble
            followers=profile['subscribers'],
            following=-1,  # does not exist for Rumble
            verified=profile['verified'],
            date_created=None,  # does not exist for Rumble
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Rumble video record and insert it.

        Args:
            data: Scraper result whose ``raw_data`` is a JSON document. The
                keys ``media_url``, ``datetime``, ``link``, ``content``,
                ``author_id`` and ``author_name`` are read.
            insert: Callable that receives the constructed ``Post``.
            session: Not used by this method.

        NOTE(review): the signature is annotated as a generator, but nothing
        is yielded here; the method simply returns ``None``.
        """
        video = json.loads(data.raw_data)

        post = Post(
            raw_id=data.id,
            # The platform id is the last path segment of the media URL,
            # ignoring any leading/trailing slashes.
            platform_id=video['media_url'].strip('/').rsplit('/', 1)[-1],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=dateutil.parser.parse(video['datetime']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=video['link'],
            content=video['content'],
            author_id=video['author_id'],
            author_username=video['author_name'],
        )

        insert(post)

        # TODO: media handling is currently disabled; the intended shape was:
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
from cisticola.transformer import RumbleTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
    """Scrape the configured Rumble channel, run the ETL and inspect the posts."""
    controller.reset_db()

    # Scrape stage: a single Rumble channel, with media archiving switched on.
    rumble_channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=rumble_channels, archive_media=True)

    # ETL stage: transform both the scraped results and the channel info.
    etl_controller.register_transformer(RumbleTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the transformed rows back through a session bound to the test engine.
    session = sessionmaker(bind=engine)()
    stored_posts = session.query(Post).all()
    # Queried but only referenced by the disabled assertions below.
    stored_media = session.query(Media).all()

    assert len(stored_posts) == 7
    # assert len(media) == 0
    assert '#whitegold #icedoutcuban' in stored_posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"