From 44a673f889b7c9fe81ef9643371837fb9bed5c7e Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Apr 2022 10:27:12 -0500 Subject: [PATCH 1/5] refactored base classes to have structure more similar to snscrape, made scraper 'get' methods return dataclasses or list of dataclasses rather than dicts --- polyphemus/api.py | 61 +++++---- polyphemus/base.py | 319 +++++++++++++++++++++++++++------------------ tests/api.py | 12 +- tests/base.py | 35 +++-- tests/conftest.py | 2 +- 5 files changed, 252 insertions(+), 177 deletions(-) diff --git a/polyphemus/api.py b/polyphemus/api.py index dc7430e..ae97b55 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -7,6 +7,8 @@ import json from urllib.parse import quote +from typing import Tuple, Optional, List +import time import requests @@ -23,7 +25,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new' #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def make_request(request, kwargs): +def make_request(request: str, kwargs: dict) -> requests.Response: """Wrapper for retrying request multiple times. 
""" @@ -32,12 +34,24 @@ def make_request(request, kwargs): msg = f'`request` argument must be either `requests.get` or `requests.post`, not {type(request)}' raise ValueError(msg) - n_retries = 0 - response = request(**kwargs) + if 'timeout' not in kwargs: + kwargs['timeout'] = 15 - while response.status_code != 200 and n_retries < 5: - n_retries += 1 - response = request(**kwargs) + n_retries = 0 + + response = requests.Response() + response.status_code = 418 + + while n_retries < 5: + time.sleep(2 ** n_retries - 1) + try: + response = request(**kwargs) + if response.status_code == 200: + return response + else: + n_retries += 1 + except Exception: + n_retries += 1 if response.status_code != 200: msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}' @@ -47,9 +61,12 @@ def make_request(request, kwargs): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_auth_token(): +def get_auth_token() -> str: - """Get a fresh authorization token, to use for API calls that require it. + """Get a fresh authorization token, to use for API calls that require it. + + Note: calling this function many times in quick succession may result in a + 503 error. """ response = make_request( @@ -63,7 +80,7 @@ def get_auth_token(): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_channel_info(channel_name): +def get_channel_info(channel_name: str) -> dict: """Get the channel information and ID from the channel name. """ @@ -99,7 +116,7 @@ def get_channel_info(channel_name): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_subscribers(channel_id, auth_token = None): +def get_subscribers(channel_id: str, auth_token: str = None) -> int: """Get the number of subscribers for a channel. 
""" @@ -124,19 +141,19 @@ def get_subscribers(channel_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_all_videos(channel_id): +def get_raw_video_info_list(channel_id: str) -> dict: """Get a list of all videos posted by a specified channel name. Returns ------- - all_videos: list + raw_video_info_list: list List of dictionaries, with each dict corresponding to a JSON response containing data about a single video. """ - all_videos = [] + raw_video_info_list = [] page = 1 @@ -164,14 +181,14 @@ def get_all_videos(channel_id): if not videos: break else: - all_videos.extend(videos) + raw_video_info_list.extend(videos) page += 1 - return all_videos + return raw_video_info_list #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_views(video_id, auth_token = None): +def get_views(video_id: str, auth_token: str = None) -> int: """Get the number of views for a given video. """ @@ -195,7 +212,7 @@ def get_views(video_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_video_reactions(video_id, auth_token = None): +def get_video_reactions(video_id: str, auth_token: str = None) -> Tuple[Optional[int], Optional[int]]: """Get all reactions for a given video. """ @@ -223,7 +240,7 @@ def get_video_reactions(video_id, auth_token = None): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_all_comments(video_id): +def get_all_comments(video_id: str) -> List[dict]: """Get a list of all comments for a single video. @@ -277,7 +294,7 @@ def get_all_comments(video_id): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def append_comment_reactions(comment_info_list): +def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]: """Get reaction data for each comment and insert ``'reactions'`` key into dict for each comment. 
@@ -325,7 +342,7 @@ def append_comment_reactions(comment_info_list): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_recommended(video_title, video_id): +def get_recommended(video_title: str, video_id: str) -> List[dict]: name = quote(video_title) @@ -350,7 +367,7 @@ def get_recommended(video_title, video_id): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def normalized_name_to_video_info(normalized_name): +def normalized_name_to_video_info(normalized_name: str) -> dict: video_url = f"lbry://{normalized_name}" @@ -372,7 +389,7 @@ def normalized_name_to_video_info(normalized_name): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_streaming_url(canonical_url): +def get_streaming_url(canonical_url: str) -> str: json_data = { "jsonrpc":"2.0", diff --git a/polyphemus/base.py b/polyphemus/base.py index 4625236..92601d7 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -7,48 +7,111 @@ import json from urllib.parse import unquote +from dataclasses import dataclass +import typing +from datetime import datetime from polyphemus import api #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeChannel: +@dataclass +class Channel: + channel_id: str + created: datetime + subscribers: int + raw : str + title : typing.Optional[str] = None + description: typing.Optional[str] = None + cover_image: typing.Optional[str] = None + thumbnail_image: typing.Optional[str] = None + +@dataclass +class Video: + canonical_url: str + streaming_url: str + type: str + claim_id: str + created: datetime + title: str + views: int + raw: str + text: typing.Optional[str] = None + thumbnail : typing.Optional[str] = None + channel_id: typing.Optional[str] = None + channel_name: typing.Optional[str] = None + duration: typing.Optional[int] = None + languages : typing.Optional[typing.List[str]] = None + tags: typing.Optional[typing.List[str]] = 
None + likes: typing.Optional[int] = None + dislikes: typing.Optional[int] = None + is_comment: bool = False + +@dataclass +class Comment: + text: str + created: datetime + claim_id : str + video_claim_id : str + channel_id: str + channel_name : str + replies: int + likes: int + dislikes: int + raw : str + is_comment: bool = True + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +class OdyseeChannelScraper: #-------------------------------------------------------------------------# - def __init__(self, channel_name, auth_token = None): + def __init__(self, channel_name: str, auth_token: str = None): self._channel_name = unquote(channel_name) - info = api.get_channel_info(channel_name = self._channel_name) - - self.info = info - self._channel_id = self.info['channel_id'] - if auth_token is None: self.auth_token = api.get_auth_token() else: self.auth_token = auth_token - self.info['subscribers'] = api.get_subscribers( - channel_id = self.info['channel_id'], - auth_token = self.auth_token) + self._raw_channel_info = api.get_channel_info(channel_name = self._channel_name) + self._channel_id = self._raw_channel_info['channel_id'] #-------------------------------------------------------------------------# - def get_all_videos(self): + def get_entity(self) -> Channel: - """Return list of OdyseeVideo objects for all videos posted by the channel + subscribers = api.get_subscribers( + channel_id = self._channel_id, + auth_token = self.auth_token) + + return Channel( + channel_id=self._raw_channel_info['channel_id'], + title=self._raw_channel_info['title'], + created=datetime.fromtimestamp(self._raw_channel_info['created']), + description=self._raw_channel_info['description'], + cover_image=self._raw_channel_info['cover_image'], + thumbnail_image=self._raw_channel_info['thumbnail_image'], + raw=self._raw_channel_info['raw'], + subscribers=subscribers) + + #-------------------------------------------------------------------------# + + def 
get_all_videos(self) -> typing.Generator[Video, None, None]: + + """Return list of Video objects for all videos posted by the channel """ - all_video_info = api.get_all_videos(channel_id=self.info['channel_id']) - self.all_videos = (OdyseeVideo(video, self.auth_token) for video in all_video_info) + raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) + videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list) - return self.all_videos + return videos #-------------------------------------------------------------------------# - def get_all_videos_and_comments(self): + def get_all_videos_and_comments(self) -> typing.Tuple[typing.List['Video'], typing.List['Comment']]: """Return list of OdyseeVideo and OdyseeComment objects for all videos posted by the channel and all comments posted to those videos @@ -56,133 +119,131 @@ class OdyseeChannel: all_videos = list(self.get_all_videos()) - all_comments = [] + raw_comment_info_list = [] for video in all_videos: - all_comments.extend(video.get_all_comments()) + raw_comment_info_list.extend(api.get_all_comments(video_id=video.claim_id)) + + all_comments = [process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list] return all_videos, all_comments #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeVideo: +def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: - #-------------------------------------------------------------------------# + if auth_token is None: + auth_token = api.get_auth_token() + else: + auth_token = auth_token + + # Handle edge cases + #.....................................................................# + + if 'video' in raw_video_info['value']: + video_type = 'video' + duration = raw_video_info['value']['video'].get('duration') + elif 'audio' in raw_video_info['value']: + video_type = 'audio' + duration = 
raw_video_info['value']['audio'].get('duration') + elif 'claim_hash' in raw_video_info['value']: + video_type = 'repost' + duration = None + raw_video_info['value'] = raw_video_info['reposted_claim']['value'] + raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + elif 'image' in raw_video_info['value']: + video_type = 'image' + duration = None + else: + video_type = 'other' + duration = None + + if 'signing_channel' in raw_video_info: + channel_name = raw_video_info['signing_channel'].get('name') + if 'claim_id' in raw_video_info['signing_channel']: + channel_id = raw_video_info['signing_channel']['claim_id'] + else: + channel_id = raw_video_info['signing_channel']['channel_id'] + else: + channel_name = None + channel_id = None + + if 'release_time' in raw_video_info['value']: + created = raw_video_info['value']['release_time'] + else: + created = raw_video_info['meta']['creation_timestamp'] + + if 'thumbnail' in raw_video_info['value']: + thumbnail = raw_video_info['value']['thumbnail'].get('url', None) + else: + thumbnail = None - def __init__(self, full_video_info, auth_token = None): - - if auth_token is None: - self.auth_token = api.get_auth_token() - else: - self.auth_token = auth_token - - # Handle edge cases - #.....................................................................# - - if 'video' in full_video_info['value']: - video_type = 'video' - duration = full_video_info['value']['video'].get('duration') - elif 'audio' in full_video_info['value']: - video_type = 'audio' - duration = full_video_info['value']['audio'].get('duration') - elif 'claim_hash' in full_video_info['value']: - video_type = 'repost' - duration = None - full_video_info['value'] = full_video_info['reposted_claim']['value'] - full_video_info['canonical_url'] = full_video_info['reposted_claim']['canonical_url'] - elif 'image' in full_video_info['value']: - video_type = 'image' - duration = None - else: - video_type = 'other' - duration = None - - if 
'signing_channel' in full_video_info: - channel_name = full_video_info['signing_channel'].get('name') - if 'claim_id' in full_video_info['signing_channel']: - channel_id = full_video_info['signing_channel']['claim_id'] - else: - channel_id = full_video_info['signing_channel']['channel_id'] - else: - channel_name = None - channel_id = None - - if 'release_time' in full_video_info['value']: - created = full_video_info['value']['release_time'] - else: - created = full_video_info['meta']['creation_timestamp'] - - if 'thumbnail' in full_video_info['value']: - thumbnail = full_video_info['value']['thumbnail'].get('url', None) - else: - thumbnail = None - - # Store relevant information in flat dict - #.....................................................................# - - self.info = { - 'canonical_url' : full_video_info['canonical_url'], - 'type' : video_type, - 'channel_id' : channel_id, - 'channel_name' : channel_name, - 'claim_id' : full_video_info['claim_id'], - 'created' : int(created), - 'text' : full_video_info['value'].get('description'), - 'languages' : full_video_info['value'].get('languages'), - 'tags' : full_video_info['value'].get('tags',[]), - 'title' : full_video_info['value']['title'], - 'duration' : duration, - 'thumbnail' : thumbnail, - 'is_comment' : False, - 'raw' : json.dumps(full_video_info)} - - self.claim_id = self.info['claim_id'] - - self.info['views'] = api.get_views(video_id=self.claim_id, auth_token = self.auth_token) - - self.info['likes'], self.info['dislikes'] = api.get_video_reactions( - video_id = self.claim_id, - auth_token = self.auth_token) - - self.info['streaming_url'] = api.get_streaming_url(self.info['canonical_url']) - - #-------------------------------------------------------------------------# - - def get_all_comments(self): - - all_comment_info = api.get_all_comments(video_id=self.claim_id) - self.all_comments = (OdyseeComment(comment) for comment in all_comment_info) - - return self.all_comments - - 
#-------------------------------------------------------------------------# + # Retrieve additional fields + #.....................................................................# - def get_recommended(self): - - recommended_video_info = api.get_recommended( - video_title=self.info['title'], video_id=self.claim_id) - recommended_videos = [OdyseeVideo(video_info, self.auth_token) for video_info in recommended_video_info] + claim_id = raw_video_info['claim_id'] - return recommended_videos + views = api.get_views(video_id=claim_id, auth_token = auth_token) + + likes, dislikes = api.get_video_reactions( + video_id = claim_id, + auth_token = auth_token) + + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + + # Return Video object + #.....................................................................# + + return Video( + canonical_url = raw_video_info['canonical_url'], + type = video_type, + channel_id = channel_id, + channel_name = channel_name, + claim_id = raw_video_info['claim_id'], + created = datetime.fromtimestamp(int(created)), + text = raw_video_info['value'].get('description'), + languages = raw_video_info['value'].get('languages'), + tags = raw_video_info['value'].get('tags',[]), + title = raw_video_info['value']['title'], + duration = duration, + thumbnail = thumbnail, + is_comment = False, + raw = json.dumps(raw_video_info), + views = views, + likes = likes, + dislikes = dislikes, + streaming_url = streaming_url) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class OdyseeComment: +def process_raw_comment_info(raw_comment_info: dict) -> Comment: - def __init__(self, full_comment_info): - - # Store relevant information in flat dict - self.info = { - 'text' : full_comment_info['comment'], - 'created' : full_comment_info['timestamp'], - 'claim_id' : full_comment_info.get('comment_id'), - 'video_claim_id' : full_comment_info['claim_id'], - 'channel_id' : full_comment_info['channel_id'], - 'channel_name' 
: full_comment_info['channel_name'], - 'replies' : full_comment_info.get('replies', 0), - 'likes' : full_comment_info['likes'], - 'dislikes' : full_comment_info['dislikes'], - 'is_comment' : True, - 'raw' : json.dumps(full_comment_info)} + return Comment( + text = raw_comment_info['comment'], + created = raw_comment_info['timestamp'], + claim_id = raw_comment_info.get('comment_id'), + video_claim_id = raw_comment_info['claim_id'], + channel_id = raw_comment_info['channel_id'], + channel_name = raw_comment_info['channel_name'], + replies = raw_comment_info.get('replies', 0), + likes = raw_comment_info['likes'], + dislikes = raw_comment_info['dislikes'], + is_comment = True, + raw = json.dumps(raw_comment_info)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video']: + + if auth_token is None: + auth_token = api.get_auth_token() + else: + auth_token = auth_token + + recommended_video_info_list = api.get_recommended( + video_title=video.title, video_id=video.claim_id) + recommended_videos = [process_raw_video_info(raw_video_info, auth_token) for raw_video_info in recommended_video_info_list] + + return recommended_videos + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/api.py b/tests/api.py index ff9e60f..5fe7c47 100644 --- a/tests/api.py +++ b/tests/api.py @@ -23,7 +23,7 @@ KWARGS_LIST = [ ('get_auth_token', []), ('get_channel_info', ['channel_name']), ('get_subscribers', ['channel_id', 'auth_token']), - ('get_all_videos', ['channel_id']), + ('get_raw_video_info_list', ['channel_id']), ('get_views', ['video_id', 'auth_token']), ('get_video_reactions', ['video_id', 'auth_token']), ('get_all_comments', ['video_id']), @@ -34,12 +34,12 @@ KWARGS_LIST = [ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.mark.parametrize( 'function_str,kwargs', 
KWARGS_LIST ) -def test_minimal_init( resources, function_str, kwargs ): +@pytest.mark.parametrize('function_str,kwargs', KWARGS_LIST) +def test_minimal_init(resources, function_str, kwargs): - function = eval( f'api.{function_str}') - function_kwargs = { kwarg : resources[ kwarg ] for kwarg in kwargs } + function = eval(f'api.{function_str}') + function_kwargs = {kwarg: resources[kwarg] for kwarg in kwargs} - function( **function_kwargs ) + function(**function_kwargs) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/base.py b/tests/base.py index 6da7031..2a0387e 100644 --- a/tests/base.py +++ b/tests/base.py @@ -19,38 +19,35 @@ from polyphemus import base #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeChannel: +class TestOdyseeChannelScraper: @pytest.fixture(autouse=True) def test_simple_init(self, resources): - self.channel = base.OdyseeChannel(channel_name = resources['channel_name']) + self.scraper = base.OdyseeChannelScraper(channel_name = resources['channel_name']) + + def test_get_entity(self): + self.scraper.get_entity() def test_get_all_videos(self): - self.channel.get_all_videos() + self.scraper.get_all_videos() def test_get_all_videos_and_comments(self): - self.channel.get_all_videos_and_comments() + self.scraper.get_all_videos_and_comments() #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeVideo: +def test_process_raw_video_info(resources): + video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token']) - @pytest.fixture(autouse=True) - def test_simple_init(self, resources): - self.video = base.OdyseeVideo(full_video_info = resources['full_video_info']) - - def test_get_all_comments(self): - self.video.get_all_comments() - - def test_get_recommended(self): - self.video.get_recommended() - 
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -class TestOdyseeComment: +def test_get_recommended(resources): + video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token']) + base.get_recommended(video = video) - @pytest.fixture(autouse=True) - def test_simple_init(self, resources): - self.comment = base.OdyseeComment(full_comment_info = resources['full_comment_info']) +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def test_process_raw_comment_info(resources): + base.process_raw_comment_info(raw_comment_info = resources['full_comment_info']) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 2b528fe..8aa3b46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -91,7 +91,7 @@ def resources(): normalized_name = NORMALIZED_NAME, canonical_url = CANONICAL_URL, full_video_info = FULL_VIDEO_INFO, - full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes' : 8, 'dislikes' : 0}}, + full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}}, comment_info_list = COMMENT_INFO_LIST, auth_token = get_auth_token()) From 0aac7493a4721df58bdc934ccd28102fbfa4f88b Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Apr 2022 23:28:44 -0500 Subject: [PATCH 2/5] updated examples with refactored scraper, increased speed of recommendation engine fetchibng by implementing normalized_names_to_video_info routine, that allows requesting multiple videos at a time --- examples/generate_network.py | 25 ++++++++++++-------- examples/scrape.py | 10 ++++---- polyphemus/api.py | 44 ++++++++++++++++++++++++++++-------- polyphemus/base.py | 30 +++++++++++++++--------- tests/api.py | 1 + tests/conftest.py | 1 + 6 files changed, 75 insertions(+), 36 deletions(-) diff --git a/examples/generate_network.py b/examples/generate_network.py 
index 86f3a42..8d58d0a 100644 --- a/examples/generate_network.py +++ b/examples/generate_network.py @@ -20,21 +20,23 @@ OUTPUT_DIR = '../../data' if __name__ == '__main__': - odysee_channel = polyphemus.base.OdyseeChannel(channel_name = CHANNEL_NAME) + auth_token = polyphemus.api.get_auth_token() + + scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token) edge_list = list() already_done = list() - new_videos = odysee_channel.get_all_videos() - master_video_dict = dict(zip([v.info['claim_id'] for v in new_videos], new_videos)) + new_videos = list(scraper.get_all_videos()) + master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos)) for iteration in range(ITERATIONS): print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n') for i, video in enumerate(new_videos): - claim_id = video.info['claim_id'] - title = video.info['title'] + claim_id = video.claim_id + title = video.title print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n') @@ -47,20 +49,23 @@ if __name__ == '__main__': edge_list.append((claim_id, rec_claim_id)) if rec_video_info['claim_id'] not in master_video_dict: - master_video_dict[rec_claim_id] = polyphemus.base.OdyseeVideo(rec_video_info) + master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info( + raw_video_info = rec_video_info, + auth_token = auth_token, + additional_fields = False) already_done.append(claim_id) - new_videos = [video for video in master_video_dict.values() if video.info['claim_id'] not in already_done] + new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done] #-------------------------------------------------------------------------# os.makedirs(OUTPUT_DIR, exist_ok = True) - with open(Path(OUTPUT_DIR, 'master_video_dict.pkl'), 'wb') as f: + with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f: pickle.dump(master_video_dict, f) - with open(Path(OUTPUT_DIR, 'edge_list.pkl'), 
'wb') as f: - pickle.dump(edge_list) + with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f: + pickle.dump(edge_list, f) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/examples/scrape.py b/examples/scrape.py index 30db635..7f32d02 100644 --- a/examples/scrape.py +++ b/examples/scrape.py @@ -11,7 +11,7 @@ import os import pandas as pd -from polyphemus.base import OdyseeChannel +from polyphemus.base import OdyseeChannelScraper #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -22,13 +22,13 @@ OUTPUT_DIR = Path('.').resolve().parents[1]/'data' if __name__ == '__main__': - odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME) + odysee_channel = OdyseeChannelScraper(channel_name = CHANNEL_NAME) video_list, comment_list = odysee_channel.get_all_videos_and_comments() - channel_df = pd.DataFrame([odysee_channel.info]) - video_df = pd.DataFrame([v.info for v in video_list]) - comment_df = pd.DataFrame([c.info for c in comment_list]) + channel_df = pd.DataFrame([odysee_channel.get_entity().__dict__]) + video_df = pd.DataFrame([v.__dict__ for v in video_list]) + comment_df = pd.DataFrame([c.__dict__ for c in comment_list]) output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME) os.makedirs(output_subdir, exist_ok = True) diff --git a/polyphemus/api.py b/polyphemus/api.py index ae97b55..955b71f 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -7,7 +7,8 @@ import json from urllib.parse import quote -from typing import Tuple, Optional, List +from typing import Tuple, Optional, List, Callable + import time import requests @@ -25,7 +26,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new' #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def make_request(request: str, kwargs: dict) -> requests.Response: +def make_request(request: Callable, kwargs: dict) -> requests.Response: """Wrapper for retrying request 
multiple times. """ @@ -42,6 +43,9 @@ def make_request(request: str, kwargs: dict) -> requests.Response: response = requests.Response() response.status_code = 418 + exceptions = [] + status_codes = [] + while n_retries < 5: time.sleep(2 ** n_retries - 1) try: @@ -49,15 +53,14 @@ def make_request(request: str, kwargs: dict) -> requests.Response: if response.status_code == 200: return response else: + status_codes.append(response.status_code) n_retries += 1 - except Exception: + except Exception as exception: + exceptions.append(exception) n_retries += 1 - if response.status_code != 200: - msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}' - raise ValueError(msg) - - return response + msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Status codes: {status_codes}; exceptions: {exceptions}' + raise ValueError(msg) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -359,8 +362,7 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]: 'params': params}) result = json.loads(response.text) - - recommended_video_info = [ normalized_name_to_video_info(r['name']) for r in result] + recommended_video_info = normalized_names_to_video_info([r['name'] for r in result]) recommended_video_info = [vi for vi in recommended_video_info if ((vi.get('value_type') == 'stream') & any(key in vi.get('value', []) for key in ('video', 'audio')))] return recommended_video_info @@ -389,6 +391,28 @@ def normalized_name_to_video_info(normalized_name: str) -> dict: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# +def normalized_names_to_video_info(normalized_names: List[str]) -> dict: + + video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names] + + json_data = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":video_urls}} + + response = make_request( + request = 
requests.post, + kwargs = { + 'url' : BACKEND_API_URL, + 'json': json_data}) + + result = json.loads(response.text) + + return [result['result'][video_url] for video_url in video_urls] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + def get_streaming_url(canonical_url: str) -> str: json_data = { diff --git a/polyphemus/base.py b/polyphemus/base.py index 92601d7..d26e58c 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -29,13 +29,13 @@ class Channel: @dataclass class Video: canonical_url: str - streaming_url: str type: str claim_id: str created: datetime title: str - views: int raw: str + views: typing.Optional[int] = None + streaming_url: typing.Optional[str] = None text: typing.Optional[str] = None thumbnail : typing.Optional[str] = None channel_id: typing.Optional[str] = None @@ -83,6 +83,9 @@ class OdyseeChannelScraper: def get_entity(self) -> Channel: + """Return Channel object containing information about the specified channel. + """ + subscribers = api.get_subscribers( channel_id = self._channel_id, auth_token = self.auth_token) @@ -101,7 +104,7 @@ class OdyseeChannelScraper: def get_all_videos(self) -> typing.Generator[Video, None, None]: - """Return list of Video objects for all videos posted by the channel + """Return list of Video objects for all videos posted by the specified channel """ raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) @@ -130,7 +133,7 @@ class OdyseeChannelScraper: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: +def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additional_fields: bool = True) -> Video: if auth_token is None: auth_token = api.get_auth_token() @@ -180,16 +183,21 @@ def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: # Retrieve additional fields 
#.....................................................................# - + claim_id = raw_video_info['claim_id'] - views = api.get_views(video_id=claim_id, auth_token = auth_token) + if additional_fields: + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + views = api.get_views(video_id=claim_id, auth_token = auth_token) + likes, dislikes = api.get_video_reactions( + video_id = claim_id, + auth_token = auth_token) - likes, dislikes = api.get_video_reactions( - video_id = claim_id, - auth_token = auth_token) - - streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + else: + streaming_url = None + views = None + likes = None + dislikes = None # Return Video object #.....................................................................# diff --git a/tests/api.py b/tests/api.py index 5fe7c47..3a2fd0f 100644 --- a/tests/api.py +++ b/tests/api.py @@ -29,6 +29,7 @@ KWARGS_LIST = [ ('get_all_comments', ['video_id']), ('append_comment_reactions', ['comment_info_list']), ('normalized_name_to_video_info', ['normalized_name']), + ('normalized_names_to_video_info', ['normalized_names']), ('get_streaming_url', ['canonical_url']), ('get_recommended', ['video_title', 'video_id']),] diff --git a/tests/conftest.py b/tests/conftest.py index 8aa3b46..ec4ef57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,6 +89,7 @@ def resources(): video_id = VIDEO_ID, video_title = VIDEO_TITLE, normalized_name = NORMALIZED_NAME, + normalized_names = [NORMALIZED_NAME], canonical_url = CANONICAL_URL, full_video_info = FULL_VIDEO_INFO, full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}}, From 71eecf7c9ebd52ef25735745a6b2ec255d118817 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Apr 2022 02:45:01 -0500 Subject: [PATCH 3/5] added recommendation engine and updated example, handled additional edge cases --- examples/generate_network.py | 55 ++++++--------------- polyphemus/api.py | 2 +- polyphemus/base.py | 92 
++++++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 51 deletions(-) diff --git a/examples/generate_network.py b/examples/generate_network.py index 8d58d0a..37dc800 100644 --- a/examples/generate_network.py +++ b/examples/generate_network.py @@ -6,66 +6,39 @@ from pathlib import Path import pickle import os +import networkx as nx + import polyphemus #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# CHANNEL_NAME = 'PatriotFront' -ITERATIONS = 3 +ITERATIONS = 2 -OUTPUT_DIR = '../../data' +OUTPUT_DIR = Path('../../data', f'{CHANNEL_NAME}_recommendation_iterations={ITERATIONS}') #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# if __name__ == '__main__': - auth_token = polyphemus.api.get_auth_token() + engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME]) - scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token) + weighted_edge_list, claim_id_to_video = engine.generate(iterations = 1) - edge_list = list() - already_done = list() - - new_videos = list(scraper.get_all_videos()) - master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos)) - - for iteration in range(ITERATIONS): - - print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n') - - for i, video in enumerate(new_videos): - claim_id = video.claim_id - title = video.title - - print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n') - - recommended_video_info = polyphemus.api.get_recommended(title, claim_id) - - for rec_video_info in recommended_video_info: - rec_claim_id = rec_video_info['claim_id'] - print(f'REC_CLAIM_ID: {rec_claim_id}') - - edge_list.append((claim_id, rec_claim_id)) - - if rec_video_info['claim_id'] not in master_video_dict: - master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info( - raw_video_info = rec_video_info, - auth_token = auth_token, - additional_fields = False) - - already_done.append(claim_id) - - 
new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done] + G = nx.DiGraph() + G.add_weighted_edges_from(weighted_edge_list) #-------------------------------------------------------------------------# os.makedirs(OUTPUT_DIR, exist_ok = True) - with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f: - pickle.dump(master_video_dict, f) + nx.write_gexf(G = G, path = Path(OUTPUT_DIR, 'network.gexf')) - with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f: - pickle.dump(edge_list, f) + with open(Path(OUTPUT_DIR, f'weighted_edge_list.pkl'), 'wb') as f: + pickle.dump(weighted_edge_list, f) + + with open(Path(OUTPUT_DIR, f'claim_id_to_video.pkl'), 'wb') as f: + pickle.dump(claim_id_to_video, f) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/api.py b/polyphemus/api.py index 955b71f..13c460a 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -427,7 +427,7 @@ def get_streaming_url(canonical_url: str) -> str: 'url' : BACKEND_API_URL, 'json': json_data}) - video_url = json.loads(response.text)['result'].get('streaming_url') + video_url = json.loads(response.text).get('result', {}).get('streaming_url') return video_url diff --git a/polyphemus/base.py b/polyphemus/base.py index d26e58c..4be9e48 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -10,6 +10,7 @@ from urllib.parse import unquote from dataclasses import dataclass import typing from datetime import datetime +from collections import Counter from polyphemus import api @@ -102,13 +103,13 @@ class OdyseeChannelScraper: #-------------------------------------------------------------------------# - def get_all_videos(self) -> typing.Generator[Video, None, None]: + def get_all_videos(self, additional_fields: bool = True) -> typing.Generator[Video, None, None]: """Return list of Video objects for all videos posted by the 
specified channel """ raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) - videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list) + videos = (process_raw_video_info(raw_video_info = raw_video_info, auth_token = self.auth_token, additional_fields = additional_fields) for raw_video_info in raw_video_info_list) return videos @@ -140,6 +141,10 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio else: auth_token = auth_token + raw = json.dumps(raw_video_info) + + claim_id = raw_video_info['claim_id'] + # Handle edge cases #.....................................................................# @@ -152,8 +157,12 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio elif 'claim_hash' in raw_video_info['value']: video_type = 'repost' duration = None - raw_video_info['value'] = raw_video_info['reposted_claim']['value'] - raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + if 'reposted_claim' in raw_video_info: + raw_video_info['value'] = raw_video_info['reposted_claim']['value'] + raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + claim_id = raw_video_info['reposted_claim']['claim_id'] + else: + raw_video_info['value'] = {} elif 'image' in raw_video_info['value']: video_type = 'image' duration = None @@ -184,10 +193,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio # Retrieve additional fields #.....................................................................# - claim_id = raw_video_info['claim_id'] - if additional_fields: - streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + if raw_video_info['name'] == 'live': + streaming_url = None + else: + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) views = api.get_views(video_id=claim_id, auth_token = auth_token) likes, dislikes = 
api.get_video_reactions( video_id = claim_id, @@ -212,11 +222,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio text = raw_video_info['value'].get('description'), languages = raw_video_info['value'].get('languages'), tags = raw_video_info['value'].get('tags',[]), - title = raw_video_info['value']['title'], + title = raw_video_info['value'].get('title'), duration = duration, thumbnail = thumbnail, is_comment = False, - raw = json.dumps(raw_video_info), + raw = raw, views = views, likes = likes, dislikes = dislikes, @@ -254,4 +264,68 @@ def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video' return recommended_videos +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +class RecommendationEngine: + + #-------------------------------------------------------------------------# + + def __init__(self, channel_list): + + self.channel_list = channel_list + self.auth_token = api.get_auth_token() + + self.edge_list = [] + self.new_videos = [] + + self.already_done_claim_ids = [] + self.claim_id_to_video = {} + + #-------------------------------------------------------------------------# + + def generate(self, iterations = 1): + + for channel_name in self.channel_list: + print(channel_name) + scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token) + + self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False))) + + self.claim_id_to_video = dict(zip([v.claim_id for v in self.new_videos], self.new_videos)) + + for iteration in range(int(iterations)): + + for i, video in enumerate(self.new_videos): + claim_id = video.claim_id + title = video.title + + print(f'ITERATION: {iteration} | VIDEO: {i} / {len(self.new_videos)} | CLAIM_ID: {claim_id}') + + recommended_video_info = api.get_recommended(video_title = title, video_id = claim_id) + + for rec_video_info in recommended_video_info: + rec_claim_id = rec_video_info['claim_id'] + + 
self.edge_list.append((claim_id, rec_claim_id)) + + if rec_video_info['claim_id'] not in self.claim_id_to_video: + + self.claim_id_to_video[rec_claim_id] = process_raw_video_info( + raw_video_info = rec_video_info, + auth_token = self.auth_token, + additional_fields = False) + + self.already_done_claim_ids.append(claim_id) + + self.new_videos = [video for video in self.claim_id_to_video.values() if video.claim_id not in self.already_done_claim_ids] + + claim_id_to_channel = {claim_id : video.channel_name for claim_id, video in self.claim_id_to_video.items()} + _channel_edge_list = [(claim_id_to_channel[target], claim_id_to_channel[source]) for target, source in self.edge_list] + channel_edge_list = [(source, target) for source, target in _channel_edge_list if all(item is not None for item in (source, target))] + + c = Counter(channel_edge_list) + self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()] + + return self.weighted_edge_list, self.claim_id_to_video + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file From bcb68a17fb2859a4be1a1eb27851192372890660 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Apr 2022 22:46:51 -0500 Subject: [PATCH 4/5] implemented method for retrieving ALL videos from a channel, not just the first 1000, increased robustness of make_requests wrapper, added missing unit tests --- polyphemus/api.py | 129 +++++++++++++++++++++++++++++++++------------- tests/api.py | 5 +- tests/base.py | 11 ++++ 3 files changed, 106 insertions(+), 39 deletions(-) diff --git a/polyphemus/api.py b/polyphemus/api.py index 13c460a..e0e2464 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -24,11 +24,33 @@ COMMENT_API_URL = 'https://comments.odysee.com/api/v2' RECOMMENDATION_API_URL = 'https://recsys.odysee.com/search' NEW_USER_API_URL = 'https://api.odysee.com/user/new' +# Allow responses to `get_streaming_url` that contain no `streaming_url` 
field +ALLOWED_ERROR_CODES = [-32603] + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def make_request(request: Callable, kwargs: dict) -> requests.Response: - """Wrapper for retrying request multiple times. + """Wrapper for retrying request multiple times and handling errors. + + This function handles Python exceptions (e.g. HTTPConnectionPool), + unsuccessful HTTP error codes (e.g. 429, 403), and errors in the + JSON response. If after 5 retries (using exponential backoff) the request + is unsuccessful, an exception is raised. + + Parameters + ---------- + request: function + The requests function to be called. + One of {requests.get and requests.post} + kwargs: dict + Keyword arguments for the ``request`` function. Must include ``url`` key. + e.g. ``{'url': 'https://api.odysee.com/user/new'}`` + Uses a default timeout of 15 seconds. + + Returns + ------- + response: requests.Response """ if request not in [requests.get, requests.post]: @@ -43,23 +65,33 @@ def make_request(request: Callable, kwargs: dict) -> requests.Response: response = requests.Response() response.status_code = 418 - exceptions = [] - status_codes = [] + retry_reasons = [] + # TODO this looks a bit gross, try to refactor while n_retries < 5: time.sleep(2 ** n_retries - 1) try: response = request(**kwargs) if response.status_code == 200: - return response + parsed_response = json.loads(response.text) + if isinstance(parsed_response, list): + return response + if parsed_response.get('error') is not None: + if parsed_response['error'].get('code', None) not in ALLOWED_ERROR_CODES: + retry_reasons.append(f'JSON response error: {parsed_response["error"]}') + n_retries += 1 + else: + return response + else: + return response else: - status_codes.append(response.status_code) + retry_reasons.append(f'HTTP status code: {response.status_code}') n_retries += 1 except Exception as exception: - exceptions.append(exception) + retry_reasons.append(f'Python exception: 
{exception}') n_retries += 1 - msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Status codes: {status_codes}; exceptions: {exceptions}' + msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Retry reasons: {retry_reasons}' raise ValueError(msg) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -148,6 +180,19 @@ def get_raw_video_info_list(channel_id: str) -> dict: """Get a list of all videos posted by a specified channel name. + Odysee's ``claim_search`` API (which is used on the browser and LBRY + desktop app) only allows up to 1000 videos to be fetched for a single value + of the ``release_time`` parameter. You can check this by going to an Odysee + channel with a lot of videos (e.g. @etresouverain) and holding the + "Page Down" button until you reach the bottom, there will only be 1000 + videos. + + This function loops over all pages for a single ``release_time`` and + fetches the raw video info for all videos until it reaches that 1000 video + limit, then uses the minimum of the ``creation_timestamp`` for all videos + as the new ``release_time``, and starts over looping over all pages for + that new ``release_time``. 
+ Returns ------- raw_video_info_list: list @@ -156,9 +201,10 @@ def get_raw_video_info_list(channel_id: str) -> dict: """ - raw_video_info_list = [] - + claim_id_to_raw_video_info = {} page = 1 + release_time = int(time.time()) + 86400 + hit_video_limit = False while True: @@ -169,7 +215,8 @@ def get_raw_video_info_list(channel_id: str) -> dict: "page_size":30, "page":page, "order_by":["release_time"], - "channel_ids":[channel_id]}} + "channel_ids":[channel_id], + "release_time": f"<{release_time}"}} response = make_request( request = requests.post, @@ -180,14 +227,30 @@ def get_raw_video_info_list(channel_id: str) -> dict: result = json.loads(response.text) videos = result['result']['items'] + new_videos = {video['claim_id'] : video for video in videos if video['claim_id'] not in claim_id_to_raw_video_info} - if not videos: - break + if len(new_videos) == 0: + # if there are no new videos that haven't already been scraped + if hit_video_limit: + # if Odysee's limit of 1000 videos for a given timestamp was + # reached (which updates the `release_time`) on the last + # request, this means we have scraped all videos on the channel, + # so we break the loop. 
+ break + else: + # we have hit Odysee's limit of 1000 videos for a given + # timestamp, so we update `release_time` and reset `page` + hit_video_limit = True + release_time = min([raw_video_info['meta']['creation_timestamp'] for raw_video_info in claim_id_to_raw_video_info.values()], default = 0) + page = 1 else: - raw_video_info_list.extend(videos) + # there were unscraped videos from the last request, so we keep + # going in the loop and increment the `page` variable + claim_id_to_raw_video_info.update(new_videos) page += 1 + hit_video_limit = False - return raw_video_info_list + return list(claim_id_to_raw_video_info.values()) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -346,6 +409,10 @@ def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def get_recommended(video_title: str, video_id: str) -> List[dict]: + + """Get list of raw video info dicts for a specified video title and video + claim_id. + """ name = quote(video_title) @@ -369,30 +436,17 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def normalized_name_to_video_info(normalized_name: str) -> dict: - - video_url = f"lbry://{normalized_name}" - - json_data = { - "jsonrpc":"2.0", - "method":"resolve", - "params":{ - "urls":[video_url]}} - - response = make_request( - request = requests.post, - kwargs = { - 'url' : BACKEND_API_URL, - 'json': json_data}) - - result = json.loads(response.text) - - return result['result'][video_url] - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - def normalized_names_to_video_info(normalized_names: List[str]) -> dict: + """Convert a list of normalized names of videos to a list of raw video dicts for those videos. 
Example of a "normalized name" is: + + ``'si-une-tude-montre-que-le-masque-permet'``, + + corresponding to the video: + + ``https://odysee.com/@filsdepangolin#e/si-une-tude-montre-que-le-masque-permet#e``. + """ + video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names] json_data = { @@ -414,6 +468,9 @@ def normalized_names_to_video_info(normalized_names: List[str]) -> dict: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def get_streaming_url(canonical_url: str) -> str: + + """Retrieve the `streaming_url` for a specified video. + """ json_data = { "jsonrpc":"2.0", diff --git a/tests/api.py b/tests/api.py index 3a2fd0f..a6b40e0 100644 --- a/tests/api.py +++ b/tests/api.py @@ -28,10 +28,9 @@ KWARGS_LIST = [ ('get_video_reactions', ['video_id', 'auth_token']), ('get_all_comments', ['video_id']), ('append_comment_reactions', ['comment_info_list']), - ('normalized_name_to_video_info', ['normalized_name']), + ('get_recommended', ['video_title', 'video_id']), ('normalized_names_to_video_info', ['normalized_names']), - ('get_streaming_url', ['canonical_url']), - ('get_recommended', ['video_title', 'video_id']),] + ('get_streaming_url', ['canonical_url']),] #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/base.py b/tests/base.py index 2a0387e..7d1287d 100644 --- a/tests/base.py +++ b/tests/base.py @@ -50,4 +50,15 @@ def test_get_recommended(resources): def test_process_raw_comment_info(resources): base.process_raw_comment_info(raw_comment_info = resources['full_comment_info']) +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +class TestRecommendationEngine: + + @pytest.fixture(autouse=True) + def test_simple_init(self, resources): + self.engine = base.RecommendationEngine(channel_list = [resources['channel_name']]) + + def test_generate(self): + self.engine.generate(iterations = 1) + 
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file From 1627b38ae4778f5bff93566807b1d5016d0f2610 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Apr 2022 23:06:42 -0500 Subject: [PATCH 5/5] deleted unused base.get_recommended function (deprecated by RecommendationEngine) --- polyphemus/base.py | 15 --------------- tests/base.py | 6 ------ 2 files changed, 21 deletions(-) diff --git a/polyphemus/base.py b/polyphemus/base.py index 4be9e48..533d44c 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -251,21 +251,6 @@ def process_raw_comment_info(raw_comment_info: dict) -> Comment: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video']: - - if auth_token is None: - auth_token = api.get_auth_token() - else: - auth_token = auth_token - - recommended_video_info_list = api.get_recommended( - video_title=video.title, video_id=video.claim_id) - recommended_videos = [process_raw_video_info(raw_video_info, auth_token) for raw_video_info in recommended_video_info_list] - - return recommended_videos - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - class RecommendationEngine: #-------------------------------------------------------------------------# diff --git a/tests/base.py b/tests/base.py index 7d1287d..aae7047 100644 --- a/tests/base.py +++ b/tests/base.py @@ -41,12 +41,6 @@ def test_process_raw_video_info(resources): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def test_get_recommended(resources): - video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token']) - base.get_recommended(video = video) - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - def test_process_raw_comment_info(resources):
base.process_raw_comment_info(raw_comment_info = resources['full_comment_info'])