From 0aac7493a4721df58bdc934ccd28102fbfa4f88b Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Apr 2022 23:28:44 -0500 Subject: [PATCH] updated examples with refactored scraper, increased speed of recommendation engine fetching by implementing normalized_names_to_video_info routine, which allows requesting multiple videos at a time --- examples/generate_network.py | 25 ++++++++++++-------- examples/scrape.py | 10 ++++---- polyphemus/api.py | 44 ++++++++++++++++++++++++++++-------- polyphemus/base.py | 30 +++++++++++++++--------- tests/api.py | 1 + tests/conftest.py | 1 + 6 files changed, 75 insertions(+), 36 deletions(-) diff --git a/examples/generate_network.py b/examples/generate_network.py index 86f3a42..8d58d0a 100644 --- a/examples/generate_network.py +++ b/examples/generate_network.py @@ -20,21 +20,23 @@ OUTPUT_DIR = '../../data' if __name__ == '__main__': - odysee_channel = polyphemus.base.OdyseeChannel(channel_name = CHANNEL_NAME) + auth_token = polyphemus.api.get_auth_token() + + scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token) edge_list = list() already_done = list() - new_videos = odysee_channel.get_all_videos() - master_video_dict = dict(zip([v.info['claim_id'] for v in new_videos], new_videos)) + new_videos = list(scraper.get_all_videos()) + master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos)) for iteration in range(ITERATIONS): print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n') for i, video in enumerate(new_videos): - claim_id = video.info['claim_id'] - title = video.info['title'] + claim_id = video.claim_id + title = video.title print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n') @@ -47,20 +49,23 @@ if __name__ == '__main__': edge_list.append((claim_id, rec_claim_id)) if rec_video_info['claim_id'] not in master_video_dict: - master_video_dict[rec_claim_id] = polyphemus.base.OdyseeVideo(rec_video_info) + master_video_dict[rec_claim_id] = 
polyphemus.base.process_raw_video_info( + raw_video_info = rec_video_info, + auth_token = auth_token, + additional_fields = False) already_done.append(claim_id) - new_videos = [video for video in master_video_dict.values() if video.info['claim_id'] not in already_done] + new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done] #-------------------------------------------------------------------------# os.makedirs(OUTPUT_DIR, exist_ok = True) - with open(Path(OUTPUT_DIR, 'master_video_dict.pkl'), 'wb') as f: + with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f: pickle.dump(master_video_dict, f) - with open(Path(OUTPUT_DIR, 'edge_list.pkl'), 'wb') as f: - pickle.dump(edge_list) + with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f: + pickle.dump(edge_list, f) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/examples/scrape.py b/examples/scrape.py index 30db635..7f32d02 100644 --- a/examples/scrape.py +++ b/examples/scrape.py @@ -11,7 +11,7 @@ import os import pandas as pd -from polyphemus.base import OdyseeChannel +from polyphemus.base import OdyseeChannelScraper #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -22,13 +22,13 @@ OUTPUT_DIR = Path('.').resolve().parents[1]/'data' if __name__ == '__main__': - odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME) + odysee_channel = OdyseeChannelScraper(channel_name = CHANNEL_NAME) video_list, comment_list = odysee_channel.get_all_videos_and_comments() - channel_df = pd.DataFrame([odysee_channel.info]) - video_df = pd.DataFrame([v.info for v in video_list]) - comment_df = pd.DataFrame([c.info for c in comment_list]) + channel_df = pd.DataFrame([odysee_channel.get_entity().__dict__]) + video_df = pd.DataFrame([v.__dict__ for v in video_list]) + comment_df = pd.DataFrame([c.__dict__ for c in comment_list]) 
output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME) os.makedirs(output_subdir, exist_ok = True) diff --git a/polyphemus/api.py b/polyphemus/api.py index ae97b55..955b71f 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -7,7 +7,8 @@ import json from urllib.parse import quote -from typing import Tuple, Optional, List +from typing import Tuple, Optional, List, Callable + import time import requests @@ -25,7 +26,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new' #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def make_request(request: str, kwargs: dict) -> requests.Response: +def make_request(request: Callable, kwargs: dict) -> requests.Response: """Wrapper for retrying request multiple times. """ @@ -42,6 +43,9 @@ def make_request(request: str, kwargs: dict) -> requests.Response: response = requests.Response() response.status_code = 418 + exceptions = [] + status_codes = [] + while n_retries < 5: time.sleep(2 ** n_retries - 1) try: @@ -49,15 +53,14 @@ def make_request(request: str, kwargs: dict) -> requests.Response: if response.status_code == 200: return response else: + status_codes.append(response.status_code) n_retries += 1 - except Exception: + except Exception as exception: + exceptions.append(exception) n_retries += 1 - if response.status_code != 200: - msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}' - raise ValueError(msg) - - return response + msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. 
Status codes: {status_codes}; exceptions: {exceptions}' + raise ValueError(msg) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -359,8 +362,7 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]: 'params': params}) result = json.loads(response.text) - - recommended_video_info = [ normalized_name_to_video_info(r['name']) for r in result] + recommended_video_info = normalized_names_to_video_info([r['name'] for r in result]) recommended_video_info = [vi for vi in recommended_video_info if ((vi.get('value_type') == 'stream') & any(key in vi.get('value', []) for key in ('video', 'audio')))] return recommended_video_info @@ -389,6 +391,28 @@ def normalized_name_to_video_info(normalized_name: str) -> dict: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# +def normalized_names_to_video_info(normalized_names: List[str]) -> dict: + + video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names] + + json_data = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":video_urls}} + + response = make_request( + request = requests.post, + kwargs = { + 'url' : BACKEND_API_URL, + 'json': json_data}) + + result = json.loads(response.text) + + return [result['result'][video_url] for video_url in video_urls] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + def get_streaming_url(canonical_url: str) -> str: json_data = { diff --git a/polyphemus/base.py b/polyphemus/base.py index 92601d7..d26e58c 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -29,13 +29,13 @@ class Channel: @dataclass class Video: canonical_url: str - streaming_url: str type: str claim_id: str created: datetime title: str - views: int raw: str + views: typing.Optional[int] = None + streaming_url: typing.Optional[str] = None text: typing.Optional[str] = None thumbnail : typing.Optional[str] = None channel_id: typing.Optional[str] = None @@ -83,6 +83,9 @@ class 
OdyseeChannelScraper: def get_entity(self) -> Channel: + """Return Channel object containing information about the specified channel. + """ + subscribers = api.get_subscribers( channel_id = self._channel_id, auth_token = self.auth_token) @@ -101,7 +104,7 @@ class OdyseeChannelScraper: def get_all_videos(self) -> typing.Generator[Video, None, None]: - """Return list of Video objects for all videos posted by the channel + """Return list of Video objects for all videos posted by the specified channel """ raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) @@ -130,7 +133,7 @@ class OdyseeChannelScraper: #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: +def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additional_fields: bool = True) -> Video: if auth_token is None: auth_token = api.get_auth_token() @@ -180,16 +183,21 @@ def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video: # Retrieve additional fields #.....................................................................# - + claim_id = raw_video_info['claim_id'] - views = api.get_views(video_id=claim_id, auth_token = auth_token) + if additional_fields: + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + views = api.get_views(video_id=claim_id, auth_token = auth_token) + likes, dislikes = api.get_video_reactions( + video_id = claim_id, + auth_token = auth_token) - likes, dislikes = api.get_video_reactions( - video_id = claim_id, - auth_token = auth_token) - - streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + else: + streaming_url = None + views = None + likes = None + dislikes = None # Return Video object #.....................................................................# diff --git a/tests/api.py b/tests/api.py index 5fe7c47..3a2fd0f 100644 --- a/tests/api.py +++ b/tests/api.py @@ -29,6 
+29,7 @@ KWARGS_LIST = [ ('get_all_comments', ['video_id']), ('append_comment_reactions', ['comment_info_list']), ('normalized_name_to_video_info', ['normalized_name']), + ('normalized_names_to_video_info', ['normalized_names']), ('get_streaming_url', ['canonical_url']), ('get_recommended', ['video_title', 'video_id']),] diff --git a/tests/conftest.py b/tests/conftest.py index 8aa3b46..ec4ef57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,6 +89,7 @@ def resources(): video_id = VIDEO_ID, video_title = VIDEO_TITLE, normalized_name = NORMALIZED_NAME, + normalized_names = [NORMALIZED_NAME], canonical_url = CANONICAL_URL, full_video_info = FULL_VIDEO_INFO, full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}},