Files
cisticola/cisticola/scraper/odysee.py

110 lines
4.1 KiB
Python

from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
import requests
from loguru import logger
from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
from polyphemus.api import get_auth_token, get_all_comments
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
    """An implementation of a Scraper for Odysee, using polyphemus library"""

    __version__ = "OdyseeScraper 0.0.0"

    # Seconds to wait for the HEAD probe in ``archive_files``; without a
    # timeout a stalled server would block the archiver indefinitely.
    _HEAD_TIMEOUT = 30

    def __init__(self):
        super().__init__()
        # Fetched once and reused for every polyphemus channel scraper.
        self.auth_token = get_auth_token()

    def get_username_from_url(self, url):
        """Extract the channel name from an Odysee channel URL.

        Takes the text after ``odysee.com/``, strips ``@`` characters from
        its ends, and cuts it at the first ``:`` (claim-id suffix) or ``/``
        (trailing slash or sub-path).

        Args:
            url: Channel URL, e.g. ``https://odysee.com/@name:1``.

        Returns:
            The channel name as a string; empty if the URL has nothing
            after ``odysee.com/``.
        """
        handle = url.split('odysee.com/')[-1].strip('@')
        return handle.split(':')[0].split('/')[0]

    @logger.catch
    def get_posts(
        self, channel: Channel, since: ScraperResult = None
    ) -> Generator[ScraperResult, None, None]:
        """Yield a ScraperResult for each video of a channel and each of its comments.

        For every video, the video's own result is yielded first, followed by
        one result per comment returned by ``get_all_comments`` for that video.

        Args:
            channel: Channel whose ``url`` identifies the Odysee channel.
            since: Optional previous result; iteration stops at the first
                video whose creation date is not later than ``since.date``.

        Yields:
            ScraperResult objects for videos and comments.
        """
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)

        for video in scraper.get_all_videos():
            # ``created`` is stamped as UTC so it can be compared with ``since.date``.
            video_date = video.created.replace(tzinfo=timezone.utc)

            # NOTE(review): ``break`` (rather than ``continue``) presumably
            # relies on videos arriving newest-first -- confirm against polyphemus.
            if since is not None and video_date <= since.date:
                break

            # A ``None`` value marks a URL that still has to be archived.
            url = video.streaming_url
            archived_urls = {} if url is None else {url: None}

            raw_comment_info_list = get_all_comments(video_id=video.claim_id)
            all_comments = (
                process_raw_comment_info(raw_comment_info)
                for raw_comment_info in raw_comment_info_list
            )

            yield ScraperResult(
                scraper=self.__version__,
                platform="Odysee",
                channel=channel.id,
                platform_id=video.claim_id,
                date=video_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(video.__dict__, default=str),
                archived_urls=archived_urls,
                media_archived=None)

            for comment in all_comments:
                # Comments have no URLs to archive, so ``media_archived`` is
                # set right away instead of being left as ``None``.
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Odysee",
                    channel=channel.id,
                    platform_id=comment.claim_id,
                    date=comment.created.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment.__dict__, default=str),
                    archived_urls={},
                    media_archived=datetime.now(timezone.utc))

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every not-yet-archived URL of ``result``.

        Each URL whose entry in ``result.archived_urls`` is ``None`` is probed
        with a HEAD request, fetched as a blob, stored via ``archive_blob``,
        and its entry replaced with the archived URL.

        Args:
            result: The result whose ``archived_urls`` should be filled in.

        Returns:
            The same ``result`` object, with ``archived_urls`` updated and
            ``media_archived`` set to the current UTC time.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue  # already archived

            r = requests.head(url, timeout=self._HEAD_TIMEOUT)

            # Responses with exactly this Content-Type go through the m3u8
            # downloader; anything else (including a missing header) is
            # fetched as a plain file.
            if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
                media_blob, content_type, key = self.m3u8_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            result.archived_urls[url] = self.archive_blob(media_blob, content_type, key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    def can_handle(self, channel):
        """Return True if ``channel`` is an Odysee channel with a usable URL.

        Args:
            channel: Object exposing ``platform`` and ``url`` attributes.

        Returns:
            ``True`` when the platform is ``"Odysee"`` and a non-empty
            username can be extracted from the URL, otherwise ``False``.
        """
        if channel.platform != "Odysee":
            return False
        return bool(self.get_username_from_url(channel.url))

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build a storage key of the form ``<segment>.<ext>`` for a media URL.

        Args:
            url: Media URL; the second-to-last segment of its path is used
                as the key stem.
            content_type: MIME type; its subtype becomes the extension. Any
                parameters after ``;`` (e.g. ``charset``) are ignored.

        Returns:
            The key string.
        """
        key = urlparse(url).path.split('/')[-2]
        ext = content_type.split(';')[0].split('/')[-1].strip()
        return f'{key}.{ext}'

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the channel's profile and wrap it in a RawChannelInfo.

        Args:
            channel: Channel whose ``url`` identifies the Odysee channel.

        Returns:
            RawChannelInfo whose ``raw_data`` is the JSON dump of the
            ``__dict__`` of the entity returned by polyphemus.
        """
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)
        profile = scraper.get_entity().__dict__

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile, default=str),
            date_archived=datetime.now(timezone.utc))