From 591f1986e814538f6120f2b5e60e959036cc10eb Mon Sep 17 00:00:00 2001
From: Tristan Lee
Date: Thu, 19 May 2022 19:40:48 -0500
Subject: [PATCH] added Rumble transformers and test

---
Review notes (this section is ignored by `git am`):
- The line structure of the patch was restored; leading/trailing whitespace
  on context and removed lines is inferred and should be checked against
  the tree before applying.
- cisticola/transformer/rumble.py and tests/transformer/rumble.py were
  amended in review, so their `index` blob ids and the commit id on the
  first line no longer describe the content below.
- process_video() now performs one additional make_request() per video to
  read the description.
- NOTE(review): confirm that tests/transformer/rumble.py is collected by the
  project's pytest configuration.

 cisticola/scraper/rumble.py       | 31 +++++++++++--
 cisticola/transformer/__init__.py |  3 +-
 cisticola/transformer/rumble.py   | 75 +++++++++++++++++++++++++++++++
 tests/transformer/rumble.py       | 36 +++++++++++++++
 4 files changed, 141 insertions(+), 4 deletions(-)
 create mode 100644 cisticola/transformer/rumble.py
 create mode 100644 tests/transformer/rumble.py

diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py
index 0c7177f..ad87247 100644
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -105,20 +105,38 @@ def process_video(video):
         views = None
     else:
         views = view_span.get('data-value')
-    
+
+    author_a = video.find('a', {'rel': 'author'})
+    if author_a is None:
+        author_id = None
+        author_name = None
+    else:
+        author_id = author_a['href'].split('/')[-1]
+        author_name = author_a.text
+
+    video_link = BASE_URL + video.find('a', href = True)['href']
+    r = make_request(url = video_link)
+    soup = BeautifulSoup(r.content, features = 'html.parser')
+
+    content_div = soup.find('div', {'class': 'container content media-description'})
+
     info = {
         'title' : video.find('h3').text,
         'thumbnail' : video.find('img')['src'],
-        'link' : BASE_URL + video.find('a', href = True)['href'],
+        'link' : video_link,
         'views' : views,
         'rumbles' : rumbles,
+        'content': '' if content_div is None else content_div.get_text('\n'),
         'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
-        'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
+        'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
+        'author_id': author_id,
+        'author_name': author_name}
 
     info['media_url'] = get_media_url(info['link'])
 
     return info
 
+
 def get_channel_videos(url):
     page = 1
 
@@ -150,8 +168,15 @@ def get_channel_profile(url):
     thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
     cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
 
+    author_a = soup.find('a', {'rel': 'author'})
+    if author_a is None:
+        author_id = None
+    else:
+        author_id = author_a['href'].split('/')[-1]
+
     profile = {
         'name': soup.find('h1').text,
+        'id': author_id,
         'verified': verified_svg is not None,
         'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
         'cover': cover_soup.get('src') if cover_soup else None,
diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py
index b4e0968..77f9f0b 100644
--- a/cisticola/transformer/__init__.py
+++ b/cisticola/transformer/__init__.py
@@ -1,4 +1,5 @@
 from .base import ETLController
 from .twitter import TwitterTransformer
 from .bitchute import BitchuteTransformer
-from .telegram_telethon import TelegramTelethonTransformer
\ No newline at end of file
+from .telegram_telethon import TelegramTelethonTransformer
+from .rumble import RumbleTransformer
diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py
new file mode 100644
index 0000000..91ef244
--- /dev/null
+++ b/cisticola/transformer/rumble.py
@@ -0,0 +1,75 @@
+import json
+from loguru import logger
+from typing import Generator, Union, Callable
+import dateutil.parser
+from datetime import datetime, timezone
+
+from cisticola.transformer.base import Transformer
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
+
+class RumbleTransformer(Transformer):
+    """Transform raw RumbleScraper output into ChannelInfo and Post objects."""
+
+    __version__ = "RumbleTransformer 0.0.1"
+
+    def can_handle(self, data: ScraperResult) -> bool:
+        """Return True when the first token of `data.scraper` is "RumbleScraper"."""
+        # Only the first space-separated token is compared; whatever follows
+        # it in the scraper string is ignored.
+        return data.scraper.split(' ')[0] == "RumbleScraper"
+
+    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a ChannelInfo from a raw Rumble profile and pass it to `insert`.
+
+        Nothing is yielded or returned; the object is only handed to `insert`.
+        """
+        raw = json.loads(data.raw_data)
+
+        transformed = ChannelInfo(
+            raw_channel_info_id=data.id,
+            channel=data.channel,
+            platform_id=raw['id'],
+            platform=data.platform,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            screenname=raw['id'],
+            name=raw['name'],
+            description='', # does not exist for Rumble
+            description_url='', # does not exist for Rumble
+            description_location='', # does not exist for Rumble
+            followers=raw['subscribers'],
+            following=-1, # does not exist for Rumble
+            verified=raw['verified'],
+            date_created=None, # does not exist for Rumble
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc)
+        )
+
+        insert(transformed)
+
+    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a Post from one raw Rumble video record and pass it to `insert`.
+
+        'content', 'author_id' and 'author_name' are read with defaults because
+        records scraped before those keys were added do not contain them.
+        """
+        raw = json.loads(data.raw_data)
+
+        transformed = Post(
+            raw_id=data.id,
+            platform_id=raw['media_url'].strip('/').split('/')[-1],
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            date=dateutil.parser.parse(raw['datetime']),
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc),
+            url=raw['link'],
+            content=raw.get('content', ''),
+            author_id=raw.get('author_id'),
+            author_username=raw.get('author_name'))
+
+        insert(transformed)
+
+        # TODO: re-enable media handling (self.process_media) for Rumble posts.
diff --git a/tests/transformer/rumble.py b/tests/transformer/rumble.py
new file mode 100644
index 0000000..95450ed
--- /dev/null
+++ b/tests/transformer/rumble.py
@@ -0,0 +1,36 @@
+from sqlalchemy.orm import sessionmaker
+import json
+
+import pytest
+
+from cisticola.base import Channel
+from cisticola.scraper import RumbleScraper
+from cisticola.transformer import RumbleTransformer
+from cisticola.base import Post, Media
+
+@pytest.mark.media
+def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
+    """Scrape the configured Rumble channel, transform it, and check the posts."""
+    controller.reset_db()
+
+    channels = [Channel(**channel_kwargs['rumble'])]
+    controller.register_scraper(scraper = RumbleScraper())
+    controller.scrape_channels(channels = channels, archive_media = True)
+
+    etl_controller.register_transformer(RumbleTransformer())
+    etl_controller.transform_all_untransformed()
+    etl_controller.transform_all_untransformed_info()
+
+    sessionfactory = sessionmaker()
+    sessionfactory.configure(bind=engine)
+    session = sessionfactory()
+
+    posts = session.query(Post).all()
+    media = session.query(Media).all()
+
+    assert len(posts) == 7
+
+    # The query has no ORDER BY, so do not rely on which post comes first.
+    assert any('#whitegold #icedoutcuban' in post.content for post in posts)
+
+    # TODO: assert on `media` once the transformer emits Media objects.