From 44a673f889b7c9fe81ef9643371837fb9bed5c7e Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Apr 2022 10:27:12 -0500 Subject: [PATCH] refactored base classes to have structure more similar to snscrape, made scraper 'get' methods return dataclasses or list of dataclasses rather than dicts --- polyphemus/api.py | 61 +++++---- polyphemus/base.py | 319 +++++++++++++++++++++++++++------------------ tests/api.py | 12 +- tests/base.py | 35 +++-- tests/conftest.py | 2 +- 5 files changed, 252 insertions(+), 177 deletions(-) diff --git a/polyphemus/api.py b/polyphemus/api.py index dc7430e..ae97b55 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -7,6 +7,8 @@ import json from urllib.parse import quote +from typing import Tuple, Optional, List +import time import requests @@ -23,7 +25,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new' #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def make_request(request, kwargs): +def make_request(request: str, kwargs: dict) -> requests.Response: """Wrapper for retrying request multiple times. """ @@ -32,12 +34,24 @@ def make_request(request, kwargs): msg = f'`request` argument must be either `requests.get` or `requests.post`, not {type(request)}' raise ValueError(msg) - n_retries = 0 - response = request(**kwargs) + if 'timeout' not in kwargs: + kwargs['timeout'] = 15 - while response.status_code != 200 and n_retries < 5: - n_retries += 1 - response = request(**kwargs) + n_retries = 0 + + response = requests.Response() + response.status_code = 418 + + while n_retries < 5: + time.sleep(2 ** n_retries - 1) + try: + response = request(**kwargs) + if response.status_code == 200: + return response + else: + n_retries += 1 + except Exception: + n_retries += 1 if response.status_code != 200: msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}' @@ -47,9 +61,12 @@ def make_request(request, kwargs): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_auth_token(): +def get_auth_token() -> str: - """Get a fresh authorization token, to use for API calls that require it. + """Get a fresh authorization token, to use for API calls that require it. + + Note: calling this function many times in quick succession may result in a + 503 error. """ response = make_request( @@ -63,7 +80,7 @@ def get_auth_token(): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_channel_info(channel_name): +def get_channel_info(channel_name: str) -> dict: """Get the channel information and ID from the channel name. """ @@ -99,7 +116,7 @@ def get_channel_info(channel_name): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_subscribers(channel_id, auth_token = None): +def get_subscribers(channel_id: str, auth_token: str = None) -> int: """Get the number of subscribers for a channel. """ @@ -124,19 +141,19 @@ def get_subscribers(channel_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_all_videos(channel_id): +def get_raw_video_info_list(channel_id: str) -> dict: """Get a list of all videos posted by a specified channel name. Returns ------- - all_videos: list + raw_video_info_list: list List of dictionaries, with each dict corresponding to a JSON response containing data about a single video. """ - all_videos = [] + raw_video_info_list = [] page = 1 @@ -164,14 +181,14 @@ def get_all_videos(channel_id): if not videos: break else: - all_videos.extend(videos) + raw_video_info_list.extend(videos) page += 1 - return all_videos + return raw_video_info_list #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_views(video_id, auth_token = None): +def get_views(video_id: str, auth_token: str = None) -> int: """Get the number of views for a given video. """ @@ -195,7 +212,7 @@ def get_views(video_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_video_reactions(video_id, auth_token = None): +def get_video_reactions(video_id: str, auth_token: str = None) -> Tuple[Optional[int], Optional[int]]: """Get all reactions for a given video. """ @@ -223,7 +240,7 @@ def get_video_reactions(video_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_all_comments(video_id): +def get_all_comments(video_id: str) -> List[dict]: """Get a list of all comments for a single video. @@ -277,7 +294,7 @@ def get_all_comments(video_id): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def append_comment_reactions(comment_info_list): +def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]: """Get reaction data for each comment and insert ``'reactions'`` key into dict for each comment. @@ -325,7 +342,7 @@ def append_comment_reactions(comment_info_list): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_recommended(video_title, video_id): +def get_recommended(video_title: str, video_id: str) -> List[dict]: name = quote(video_title) @@ -350,7 +367,7 @@ def get_recommended(video_title, video_id): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def normalized_name_to_video_info(normalized_name): +def normalized_name_to_video_info(normalized_name: str) -> dict: video_url = f"lbry://{normalized_name}" @@ -372,7 +389,7 @@ def normalized_name_to_video_info(normalized_name): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_streaming_url(canonical_url): +def get_streaming_url(canonical_url: str) -> str: json_data = { "jsonrpc":"2.0", diff --git a/polyphemus/base.py b/polyphemus/base.py index 4625236..92601d7 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -7,48 +7,111 @@ import json from urllib.parse import unquote +from dataclasses import dataclass +import typing +from datetime import datetime from polyphemus import api #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeChannel: +@dataclass +class Channel: + channel_id: str + created: datetime + subscribers: int + raw : str + title : typing.Optional[str] = None + description: typing.Optional[str] = None + cover_image: typing.Optional[str] = None + thumbnail_image: typing.Optional[str] = None + +@dataclass +class Video: + canonical_url: str + streaming_url: str + type: str + claim_id: str + created: datetime + title: str + views: int + raw: str + text: typing.Optional[str] = None + thumbnail : typing.Optional[str] = None + channel_id: typing.Optional[str] = None + channel_name: typing.Optional[str] = None + duration: typing.Optional[int] = None + languages : typing.Optional[typing.List[str]] = None + tags: typing.Optional[typing.List[str]] = None + likes: typing.Optional[int] = None + dislikes: typing.Optional[int] = None + is_comment: bool = False + +@dataclass +class Comment: + text: str + created: datetime + claim_id : str + video_claim_id : str + channel_id: str + channel_name : str + replies: int + likes: int + dislikes: int + raw : str + is_comment: bool = True + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +class OdyseeChannelScraper: #-------------------------------------------------------------------------# - def __init__(self, channel_name, auth_token = None): + def __init__(self, channel_name: str, auth_token: str = None): self._channel_name = unquote(channel_name) - info = api.get_channel_info(channel_name = self._channel_name) - - self.info = info - self._channel_id = self.info['channel_id'] - if auth_token is None: self.auth_token = api.get_auth_token() else: self.auth_token = auth_token - self.info['subscribers'] = api.get_subscribers( - channel_id = self.info['channel_id'], - auth_token = self.auth_token) + self._raw_channel_info = api.get_channel_info(channel_name = self._channel_name) + self._channel_id = self._raw_channel_info['channel_id'] #-------------------------------------------------------------------------# - def get_all_videos(self): + def get_entity(self) -> Channel: - """Return list of OdyseeVideo objects for all videos posted by the channel + subscribers = api.get_subscribers( + channel_id = self._channel_id, + auth_token = self.auth_token) + + return Channel( + channel_id=self._raw_channel_info['channel_id'], + title=self._raw_channel_info['title'], + created=datetime.fromtimestamp(self._raw_channel_info['created']), + description=self._raw_channel_info['description'], + cover_image=self._raw_channel_info['cover_image'], + thumbnail_image=self._raw_channel_info['thumbnail_image'], + raw=self._raw_channel_info['raw'], + subscribers=subscribers) + + #-------------------------------------------------------------------------# + + def get_all_videos(self) -> typing.Generator[Video, None, None]: + + """Return list of Video objects for all videos posted by the channel """ - all_video_info = api.get_all_videos(channel_id=self.info['channel_id']) - self.all_videos = (OdyseeVideo(video, self.auth_token) for video in all_video_info) + raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) + videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list) - return self.all_videos + return videos #-------------------------------------------------------------------------# - def get_all_videos_and_comments(self): + def get_all_videos_and_comments(self) -> typing.Tuple[typing.List['Video'], typing.List['Comment']]: """Return list of OdyseeVideo and OdyseeComment objects for all videos posted by the channel and all comments posted to those videos @@ -56,133 +119,131 @@ class OdyseeChannel: all_videos = list(self.get_all_videos()) - all_comments = [] + raw_comment_info_list = [] for video in all_videos: - all_comments.extend(video.get_all_comments()) + raw_comment_info_list.extend(api.get_all_comments(video_id=video.claim_id)) + + all_comments = [process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list] return all_videos, all_comments #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeVideo: +def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: - #-------------------------------------------------------------------------# + if auth_token is None: + auth_token = api.get_auth_token() + else: + auth_token = auth_token + + # Handle edge cases + #.....................................................................# + + if 'video' in raw_video_info['value']: + video_type = 'video' + duration = raw_video_info['value']['video'].get('duration') + elif 'audio' in raw_video_info['value']: + video_type = 'audio' + duration = raw_video_info['value']['audio'].get('duration') + elif 'claim_hash' in raw_video_info['value']: + video_type = 'repost' + duration = None + raw_video_info['value'] = raw_video_info['reposted_claim']['value'] + raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + elif 'image' in raw_video_info['value']: + video_type = 'image' + duration = None + else: + video_type = 'other' + duration = None + + if 'signing_channel' in raw_video_info: + channel_name = raw_video_info['signing_channel'].get('name') + if 'claim_id' in raw_video_info['signing_channel']: + channel_id = raw_video_info['signing_channel']['claim_id'] + else: + channel_id = raw_video_info['signing_channel']['channel_id'] + else: + channel_name = None + channel_id = None + + if 'release_time' in raw_video_info['value']: + created = raw_video_info['value']['release_time'] + else: + created = raw_video_info['meta']['creation_timestamp'] + + if 'thumbnail' in raw_video_info['value']: + thumbnail = raw_video_info['value']['thumbnail'].get('url', None) + else: + thumbnail = None - def __init__(self, full_video_info, auth_token = None): - - if auth_token is None: - self.auth_token = api.get_auth_token() - else: - self.auth_token = auth_token - - # Handle edge cases - #.....................................................................# - - if 'video' in full_video_info['value']: - video_type = 'video' - duration = full_video_info['value']['video'].get('duration') - elif 'audio' in full_video_info['value']: - video_type = 'audio' - duration = full_video_info['value']['audio'].get('duration') - elif 'claim_hash' in full_video_info['value']: - video_type = 'repost' - duration = None - full_video_info['value'] = full_video_info['reposted_claim']['value'] - full_video_info['canonical_url'] = full_video_info['reposted_claim']['canonical_url'] - elif 'image' in full_video_info['value']: - video_type = 'image' - duration = None - else: - video_type = 'other' - duration = None - - if 'signing_channel' in full_video_info: - channel_name = full_video_info['signing_channel'].get('name') - if 'claim_id' in full_video_info['signing_channel']: - channel_id = full_video_info['signing_channel']['claim_id'] - else: - channel_id = full_video_info['signing_channel']['channel_id'] - else: - channel_name = None - channel_id = None - - if 'release_time' in full_video_info['value']: - created = full_video_info['value']['release_time'] - else: - created = full_video_info['meta']['creation_timestamp'] - - if 'thumbnail' in full_video_info['value']: - thumbnail = full_video_info['value']['thumbnail'].get('url', None) - else: - thumbnail = None - - # Store relevant information in flat dict - #.....................................................................# - - self.info = { - 'canonical_url' : full_video_info['canonical_url'], - 'type' : video_type, - 'channel_id' : channel_id, - 'channel_name' : channel_name, - 'claim_id' : full_video_info['claim_id'], - 'created' : int(created), - 'text' : full_video_info['value'].get('description'), - 'languages' : full_video_info['value'].get('languages'), - 'tags' : full_video_info['value'].get('tags',[]), - 'title' : full_video_info['value']['title'], - 'duration' : duration, - 'thumbnail' : thumbnail, - 'is_comment' : False, - 'raw' : json.dumps(full_video_info)} - - self.claim_id = self.info['claim_id'] - - self.info['views'] = api.get_views(video_id=self.claim_id, auth_token = self.auth_token) - - self.info['likes'], self.info['dislikes'] = api.get_video_reactions( - video_id = self.claim_id, - auth_token = self.auth_token) - - self.info['streaming_url'] = api.get_streaming_url(self.info['canonical_url']) - - #-------------------------------------------------------------------------# - - def get_all_comments(self): - - all_comment_info = api.get_all_comments(video_id=self.claim_id) - self.all_comments = (OdyseeComment(comment) for comment in all_comment_info) - - return self.all_comments - - #-------------------------------------------------------------------------# + # Retrieve additional fields + #.....................................................................# - def get_recommended(self): - - recommended_video_info = api.get_recommended( - video_title=self.info['title'], video_id=self.claim_id) - recommended_videos = [OdyseeVideo(video_info, self.auth_token) for video_info in recommended_video_info] + claim_id = raw_video_info['claim_id'] - return recommended_videos + views = api.get_views(video_id=claim_id, auth_token = auth_token) + + likes, dislikes = api.get_video_reactions( + video_id = claim_id, + auth_token = auth_token) + + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + + # Return Video object + #.....................................................................# + + return Video( + canonical_url = raw_video_info['canonical_url'], + type = video_type, + channel_id = channel_id, + channel_name = channel_name, + claim_id = raw_video_info['claim_id'], + created = datetime.fromtimestamp(int(created)), + text = raw_video_info['value'].get('description'), + languages = raw_video_info['value'].get('languages'), + tags = raw_video_info['value'].get('tags',[]), + title = raw_video_info['value']['title'], + duration = duration, + thumbnail = thumbnail, + is_comment = False, + raw = json.dumps(raw_video_info), + views = views, + likes = likes, + dislikes = dislikes, + streaming_url = streaming_url) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeComment: +def process_raw_comment_info(raw_comment_info: dict) -> Comment: - def __init__(self, full_comment_info): - - # Store relevant information in flat dict - self.info = { - 'text' : full_comment_info['comment'], - 'created' : full_comment_info['timestamp'], - 'claim_id' : full_comment_info.get('comment_id'), - 'video_claim_id' : full_comment_info['claim_id'], - 'channel_id' : full_comment_info['channel_id'], - 'channel_name' : full_comment_info['channel_name'], - 'replies' : full_comment_info.get('replies', 0), - 'likes' : full_comment_info['likes'], - 'dislikes' : full_comment_info['dislikes'], - 'is_comment' : True, - 'raw' : json.dumps(full_comment_info)} + return Comment( + text = raw_comment_info['comment'], + created = raw_comment_info['timestamp'], + claim_id = raw_comment_info.get('comment_id'), + video_claim_id = raw_comment_info['claim_id'], + channel_id = raw_comment_info['channel_id'], + channel_name = raw_comment_info['channel_name'], + replies = raw_comment_info.get('replies', 0), + likes = raw_comment_info['likes'], + dislikes = raw_comment_info['dislikes'], + is_comment = True, + raw = json.dumps(raw_comment_info)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video']: + + if auth_token is None: + auth_token = api.get_auth_token() + else: + auth_token = auth_token + + recommended_video_info_list = api.get_recommended( + video_title=video.title, video_id=video.claim_id) + recommended_videos = [process_raw_video_info(raw_video_info, auth_token) for raw_video_info in recommended_video_info_list] + + return recommended_videos + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/api.py b/tests/api.py index ff9e60f..5fe7c47 100644 --- a/tests/api.py +++ b/tests/api.py @@ -23,7 +23,7 @@ KWARGS_LIST = [ ('get_auth_token', []), ('get_channel_info', ['channel_name']), ('get_subscribers', ['channel_id', 'auth_token']), - ('get_all_videos', ['channel_id']), + ('get_raw_video_info_list', ['channel_id']), ('get_views', ['video_id', 'auth_token']), ('get_video_reactions', ['video_id', 'auth_token']), ('get_all_comments', ['video_id']), @@ -34,12 +34,12 @@ KWARGS_LIST = [ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.mark.parametrize( 'function_str,kwargs', KWARGS_LIST ) -def test_minimal_init( resources, function_str, kwargs ): +@pytest.mark.parametrize('function_str,kwargs', KWARGS_LIST) +def test_minimal_init(resources, function_str, kwargs): - function = eval( f'api.{function_str}') - function_kwargs = { kwarg : resources[ kwarg ] for kwarg in kwargs } + function = eval(f'api.{function_str}') + function_kwargs = {kwarg: resources[kwarg] for kwarg in kwargs} - function( **function_kwargs ) + function(**function_kwargs) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/base.py b/tests/base.py index 6da7031..2a0387e 100644 --- a/tests/base.py +++ b/tests/base.py @@ -19,38 +19,35 @@ from polyphemus import base #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeChannel: +class TestOdyseeChannelScraper: @pytest.fixture(autouse=True) def test_simple_init(self, resources): - self.channel = base.OdyseeChannel(channel_name = resources['channel_name']) + self.scraper = base.OdyseeChannelScraper(channel_name = resources['channel_name']) + + def test_get_entity(self): + self.scraper.get_entity() def test_get_all_videos(self): - self.channel.get_all_videos() + self.scraper.get_all_videos() def test_get_all_videos_and_comments(self): - self.channel.get_all_videos_and_comments() + self.scraper.get_all_videos_and_comments() #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeVideo: +def test_process_raw_video_info(resources): + video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token']) - @pytest.fixture(autouse=True) - def test_simple_init(self, resources): - self.video = base.OdyseeVideo(full_video_info = resources['full_video_info']) - - def test_get_all_comments(self): - self.video.get_all_comments() - - def test_get_recommended(self): - self.video.get_recommended() - #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeComment: +def test_get_recommended(resources): + video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token']) + base.get_recommended(video = video) - @pytest.fixture(autouse=True) - def test_simple_init(self, resources): - self.comment = base.OdyseeComment(full_comment_info = resources['full_comment_info']) +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def test_process_raw_comment_info(resources): + base.process_raw_comment_info(raw_comment_info = resources['full_comment_info']) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 2b528fe..8aa3b46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -91,7 +91,7 @@ def resources(): normalized_name = NORMALIZED_NAME, canonical_url = CANONICAL_URL, full_video_info = FULL_VIDEO_INFO, - full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes' : 8, 'dislikes' : 0}}, + full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}}, comment_info_list = COMMENT_INFO_LIST, auth_token = get_auth_token())