Files
cisticola/cisticola/scraper/rumble.py
Tristan Lee 8ecb904249 merged main
2022-04-01 02:05:25 -05:00

150 lines
5.0 KiB
Python

from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
from loguru import logger
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
# Root of the Rumble site; prepended to the relative video hrefs scraped from
# channel listings to build absolute video page links.
BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper):
    """An implementation of a Scraper for Rumble, using custom functions"""

    __version__ = "RumbleScraper 0.0.1"

    @staticmethod
    def _as_utc(moment: datetime) -> datetime:
        """Return ``moment`` as a timezone-aware datetime in UTC.

        Naive values are taken to already be expressed in UTC and are only
        labelled as such. Aware values are *converted*, so the instant they
        denote is preserved; overwriting the offset with ``replace`` would
        silently shift the timestamp by the original UTC offset.
        """
        if moment.tzinfo is None or moment.utcoffset() is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @logger.catch
    def get_posts(
        self,
        channel: Channel,
        since: ScraperResult = None,
        archive_media: bool = True,
    ) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each video listed for ``channel``.

        Videos are consumed in the order ``get_channel_videos`` produces them.
        Iteration stops at the first video whose timestamp is not later than
        ``since.date`` (when ``since`` is given).

        Args:
            channel: Channel whose ``url`` is scraped and whose ``id`` is
                recorded on every result.
            since: Optional previous result; videos at or before its ``date``
                are not yielded.
            archive_media: When true, the media behind each video's embed URL
                is downloaded and archived before the result is yielded.

        Yields:
            One ``ScraperResult`` per video, with ``archived_urls`` mapping
            the embed URL to its archived location (or ``None`` when
            ``archive_media`` is false).
        """
        # NOTE(review): results stored by earlier runs were stamped with the
        # old offset-overwriting logic, so the first run after this fix may
        # re-yield the most recent video once.
        since_date = None if since is None else self._as_utc(since.date)

        for post in get_channel_videos(channel.url):
            posted = self._as_utc(post['datetime'])
            if since_date is not None and posted <= since_date:
                break

            url = post['media_url']
            archived_urls = {url: None}

            if archive_media:
                # ytdlp_url_to_blob / archive_blob are not defined in this
                # class; presumably inherited from Scraper.
                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                archived_urls[url] = self.archive_blob(media_blob, content_type, key)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Rumble",
                channel=channel.id,
                # Second-to-last '/'-separated piece of the embed URL.
                platform_id=url.split('/')[-2],
                date=posted,
                date_archived=datetime.now(timezone.utc),
                # default=str serialises the datetime stored in the post dict.
                raw_posts=json.dumps(post, default=str),
                archived_urls=archived_urls,
                media_archived=archive_media)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build an archive key from a media URL and its content type.

        The key is the second-to-last path segment of ``url`` followed by a
        dot and the part of ``content_type`` after its last ``/``.
        """
        ext = '.' + content_type.split('/')[-1]
        key = urlparse(url).path.split('/')[-2] + ext
        return key

    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every URL in ``result.archived_urls`` that is still ``None``.

        The mapping is updated in place, ``result.media_archived`` is set to
        ``True``, and the same ``result`` object is returned.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is None:
                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                archived_url = self.archive_blob(media_blob, content_type, key)
                result.archived_urls[url] = archived_url

        result.media_archived = True
        return result

    def can_handle(self, channel):
        """Return ``True`` only for Rumble channels that have a URL set."""
        # Always return a real bool rather than falling through to None.
        return channel.platform == "Rumble" and channel.url is not None

    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Scrape the channel page and wrap the profile dict as ``RawChannelInfo``.

        The profile returned by ``get_channel_profile`` is stored JSON-encoded
        in ``raw_data``; ``date_archived`` is the current UTC time.
        """
        profile = get_channel_profile(url=channel.url)

        return RawChannelInfo(scraper=self.__version__,
                              platform=channel.platform,
                              channel=channel.id,
                              raw_data=json.dumps(profile),
                              date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_media_url(url):
    """Fetch a video page and return the embed URL from its JSON-LD metadata.

    The page is requested with ``make_request`` and parsed with the
    ``html.parser`` backend. The text of the first
    ``<script type="application/ld+json">`` element is decoded as JSON, and
    the ``embedUrl`` of the decoded document's first element is returned.
    """
    response = make_request(url=url)
    page = BeautifulSoup(response.content, features='html.parser')

    ld_json_tag = page.find('script', {'type': 'application/ld+json'})
    metadata = json.loads(ld_json_tag.text)

    return metadata[0]['embedUrl']
def process_video(video):
    """Turn one channel-listing entry into a dict describing the video.

    ``video`` is searched with ``find`` for the title, thumbnail, link,
    counters, duration and timestamp. The result holds the keys ``title``,
    ``thumbnail``, ``link``, ``views``, ``rumbles``, ``duration``,
    ``datetime`` (parsed with ``datetime.fromisoformat``) and ``media_url``
    (resolved by requesting the video page via ``get_media_url``).
    """
    def span_value(css_class):
        # Counters live in the 'data-value' attribute of a classed <span>.
        return video.find('span', {'class': css_class})['data-value']

    # The rumbles span can be absent; report '0' in that case.
    rumbles_tag = video.find('span', {'class': 'video-item--rumbles'})
    rumble_count = '0' if rumbles_tag is None else rumbles_tag['data-value']

    details = {}
    details['title'] = video.find('h3').text
    details['thumbnail'] = video.find('img')['src']
    # The first anchor's href is site-relative, so prefix the site root.
    details['link'] = BASE_URL + video.find('a', href=True)['href']
    details['views'] = span_value('video-item--views')
    details['rumbles'] = rumble_count
    details['duration'] = span_value('video-item--duration')
    details['datetime'] = datetime.fromisoformat(video.find('time')['datetime'])

    details['media_url'] = get_media_url(details['link'])
    return details
def get_channel_videos(url):
    """Yield a processed info dict for every video on a channel, page by page.

    Pages are requested as ``{url}?page=N`` starting at ``N = 1``. Each
    ``<li class="video-listing-entry">`` on a page is passed to
    ``process_video`` and the resulting dict is yielded.

    Pagination ends when a page responds with HTTP 404, or when a page
    contains no video entries at all. The second condition guards against
    paging forever if the site answers out-of-range pages with a normal
    (non-404) response.
    """
    page = 1

    while True:
        page_url = f'{url}?page={page}'

        # 404 is passed as a break code and then checked on the response
        # below; presumably this makes make_request hand the 404 back instead
        # of retrying -- confirm against make_request.
        r = make_request(url=page_url, break_codes=[404])
        if r.status_code == 404:
            break

        soup = BeautifulSoup(r.content, features='lxml')
        video_list = soup.find_all('li', {'class': 'video-listing-entry'})

        # An empty listing means there is nothing further to scrape.
        if not video_list:
            break

        for video in video_list:
            yield process_video(video)

        page += 1
def get_channel_profile(url):
    """Scrape a channel page and return its profile as a plain dict.

    Returns a dict with the keys:
        name: text of the page's first ``<h1>``.
        verified: ``True`` when that ``<h1>`` contains an ``<svg>`` with class
            ``listing-header--verified``.
        thumbnail: ``src`` of the ``listing-header--thumb`` image, or ``None``
            when the image is missing.
        cover: ``src`` of the ``listing-header--backsplash-img`` image, or
            ``None`` when the image is missing.
        subscribers: text of the ``subscribe-button-count`` span, or ``None``
            when the span is missing.

    A page without any ``<h1>`` still raises ``AttributeError``, as before.
    """
    r = make_request(url=f'{url}')
    soup = BeautifulSoup(r.content, features='lxml')

    # Look the heading up once; it supplies both the name and the badge.
    header = soup.find('h1')
    verified_svg = header.find('svg', {'class': 'listing-header--verified'})

    thumbnail_soup = soup.find('img', {'class': 'listing-header--thumb'})
    cover_soup = soup.find('img', {'class': 'listing-header--backsplash-img'})
    subscribers_soup = soup.find('span', {'class': 'subscribe-button-count'})

    profile = {
        'name': header.text,
        'verified': verified_svg is not None,
        'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
        'cover': cover_soup.get('src') if cover_soup else None,
        # Guarded like thumbnail/cover so a missing counter does not abort
        # the whole profile scrape.
        'subscribers': subscribers_soup.text if subscribers_soup is not None else None,
    }

    return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#