From dfc5b77726be867fe1ee404241f1f9ab151a9361 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Apr 2022 23:23:21 -0500 Subject: [PATCH 1/2] incorporated polyphemus refactoring changes --- cisticola/scraper/odysee.py | 55 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 0823e8e..67ef1b1 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -6,8 +6,8 @@ from urllib.parse import urlparse import requests from loguru import logger -from polyphemus.base import OdyseeChannel -from polyphemus.api import get_auth_token +from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info +from polyphemus.api import get_auth_token, get_all_comments from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper @@ -29,40 +29,43 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) + scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token) - all_videos = odysee_channel.get_all_videos() + all_videos = scraper.get_all_videos() for video in all_videos: - if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date: + if since is not None and datetime.fromtimestamp(video.created) <= since.date: break - url = video.info['streaming_url'] + url = video.streaming_url + if url is None: + archived_urls = {} + else: + archived_urls = {url: None} - archived_urls = {url: None} + if archive_media: - if archive_media: + # Check if file is a video file or an m3u8 file + r = requests.head(url) + if r.headers['Content-Type'] == 'text/html; charset=utf-8': + media_blob, content_type, key = self.m3u8_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) - # Check if file is a video file or an m3u8 file - r = requests.head(url) - if r.headers['Content-Type'] == 'text/html; charset=utf-8': - media_blob, content_type, key = self.m3u8_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - - all_comments = video.get_all_comments() + raw_comment_info_list = get_all_comments(video_id=video.claim_id) + all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list) yield ScraperResult( scraper=self.__version__, platform="Odysee", channel=channel.id, - platform_id=video.info['claim_id'], - date=datetime.fromtimestamp(video.info['created']), + platform_id=video.claim_id, + date=datetime.fromtimestamp(video.created), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video.info), + raw_data=json.dumps(video.__dict__), archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) @@ -72,10 +75,10 @@ class OdyseeScraper(Scraper): scraper=self.__version__, platform="Odysee", channel=channel.id, - platform_id=comment.info['claim_id'], - date=datetime.fromtimestamp(comment.info['created']), + platform_id=comment.claim_id, + date=datetime.fromtimestamp(comment.created), date_archived=datetime.now(), - raw_data=json.dumps(comment.info), + raw_data=json.dumps(comment.__dict__), archived_urls={}, media_archived=datetime.now(timezone.utc)) @@ -108,8 +111,8 @@ class OdyseeScraper(Scraper): def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) - profile = odysee_channel.info + scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token) + profile = scraper.get_entity().__dict__ return RawChannelInfo(scraper=self.__version__, platform=channel.platform, From 27b51267a7916a7734730018374bd78c89f7f7c0 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 13 Apr 2022 00:02:12 -0500 Subject: [PATCH 2/2] fixed bugs from incorporating polyphemus refactoring changes --- cisticola/scraper/odysee.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 67ef1b1..b4d14d6 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -34,7 +34,7 @@ class OdyseeScraper(Scraper): all_videos = scraper.get_all_videos() for video in all_videos: - if since is not None and datetime.fromtimestamp(video.created) <= since.date: + if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date: break url = video.streaming_url @@ -63,9 +63,9 @@ class OdyseeScraper(Scraper): platform="Odysee", channel=channel.id, platform_id=video.claim_id, - date=datetime.fromtimestamp(video.created), + date=video.created.replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video.__dict__), + raw_data=json.dumps(video.__dict__, default = str), archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) @@ -76,9 +76,9 @@ class OdyseeScraper(Scraper): platform="Odysee", channel=channel.id, platform_id=comment.claim_id, - date=datetime.fromtimestamp(comment.created), + date=comment.created.replace(tzinfo=timezone.utc), date_archived=datetime.now(), - raw_data=json.dumps(comment.__dict__), + raw_data=json.dumps(comment.__dict__, default = str), archived_urls={}, media_archived=datetime.now(timezone.utc)) @@ -117,5 +117,5 @@ class OdyseeScraper(Scraper): return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, - raw_data=json.dumps(profile), + raw_data=json.dumps(profile, default = str), date_archived=datetime.now(timezone.utc)) \ No newline at end of file