diff --git a/README.md b/README.md index b724dc8..baac4e8 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,5 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/). ### TODO - Implement CLI -- Work on reverse-engineering auth_token instead of having it hard-coded \ No newline at end of file +- Add error handling/backoff waiting to requests +- Work on reverse-engineering auth_token instead of having it hard-coded diff --git a/examples/scrape.py b/examples/scrape.py index c0ed5d4..30db635 100644 --- a/examples/scrape.py +++ b/examples/scrape.py @@ -24,11 +24,11 @@ if __name__ == '__main__': odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME) - video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments() + video_list, comment_list = odysee_channel.get_all_videos_and_comments() channel_df = pd.DataFrame([odysee_channel.info]) - video_df = pd.DataFrame(video_info_list) - comment_df = pd.DataFrame(comment_info_list) + video_df = pd.DataFrame([v.info for v in video_list]) + comment_df = pd.DataFrame([c.info for c in comment_list]) output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME) os.makedirs(output_subdir, exist_ok = True) diff --git a/polyphemus/__init__.py b/polyphemus/__init__.py index b555347..93d73dd 100644 --- a/polyphemus/__init__.py +++ b/polyphemus/__init__.py @@ -2,7 +2,7 @@ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# +from . import api from . import base -from . import utils #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/api.py b/polyphemus/api.py new file mode 100644 index 0000000..9a25618 --- /dev/null +++ b/polyphemus/api.py @@ -0,0 +1,296 @@ +# -*- coding: UTF-8 -*- + +"""Functions to request and process information from Odysee APIs +""" + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +import json +from urllib.parse import quote + +import requests + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +#TODO Figure out how to reverse-engineer this +AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U' + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_channel_info(channel_name): + + """Get the channel information and ID from the channel name. + """ + + channel_url = f'lbry://@{channel_name}' + + api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + + post_json = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":[channel_url]}} + + response = requests.post( + url = api_url, + json = post_json) + + result = json.loads(response.text) + + info = result['result'][channel_url] + + info = { + 'channel_id' : info['claim_id'], + 'title' : info['value']['title'], + 'created': info['timestamp'], + 'description': info['value']['description'], + 'cover_image': info['value']['cover']['url'], + 'thumbnail_image': info['value']['thumbnail']['url'], + 'raw' : response.text} + + return info + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_subscribers(claim_id): + + """Get the number of subscribers for a channel. + """ + + api_url = 'https://api.odysee.com/subscription/sub_count' + + post_data = { + 'auth_token': AUTH_TOKEN, + 'claim_id': claim_id } + + response = requests.post(url = api_url, data = post_data) + result = json.loads(response.text) + subscribers = result['data'][0] + + return subscribers + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_all_videos(channel_id): + + """Get a list of all videos posted by a specified channel name. + + Returns + ------- + all_videos: list + List of dictionaries, with each dict corresponding to a JSON response + containing data about a single video. + + """ + + api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + + all_videos = [] + + page = 1 + + while True: + + post_data = { + "jsonrpc":"2.0", + "method":"claim_search", + "params":{ + "page_size":30, + "page":page, + "order_by":["release_time"], + "channel_ids":[channel_id]}} + + response = requests.post( + url = api_url, + json = post_data) + + result = json.loads(response.text) + + videos = result['result']['items'] + + if not videos: + break + else: + all_videos.extend(videos) + page += 1 + + return all_videos + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_views(claim_id): + + """Get the number of views for a given video. + """ + + api_url = 'https://api.odysee.com/file/view_count' + + params = { + 'auth_token': AUTH_TOKEN, + 'claim_id': claim_id } + + response = requests.get(api_url, params = params) + views = json.loads(response.text)['data'][0] + + return views + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_video_reactions(claim_id): + + """Get all reactions for a given video. + """ + + api_url = 'https://api.odysee.com/reaction/list' + + post_data = { + 'auth_token': AUTH_TOKEN, + 'claim_ids': claim_id } + + response = requests.post(url = api_url, data = post_data) + result = json.loads(response.text) + reactions = result['data']['others_reactions'][claim_id ] + + return reactions['like'], reactions['dislike'] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_all_comments(claim_id): + + """Get a list of all comments for a single video. + + Parameters + ---------- + claim_id: str + Claim ID for the video whose comments are to be scraped + e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'`` + + Returns + ------- + all_comments: list + List of dictionaries, with each dict corresponding to a JSON response + containing data about a single comment for the specified video. + """ + + api_url = 'https://comments.odysee.com/api/v2' + + all_comments = [] + + page = 1 + + while True: + + post_data = { + "jsonrpc":"2.0", + "id":1, + "method":"comment.List", + "params":{ + "page":page, + "claim_id":claim_id, + "page_size":10, + "top_level":False, + "sort_by":3}} + + response = requests.post( + url = api_url, + json = post_data) + + result = json.loads(response.text) + + if 'items' not in result['result']: + break + else: + _comments = result['result']['items'] + comments = append_comment_reactions(comments = _comments) + all_comments.extend(comments) + page += 1 + + return all_comments + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def append_comment_reactions(comments): + + """Get reaction data for each comment and insert ``'reactions'`` key into + dict for each comment. + + Parameters + ---------- + comments: list + List of dictionaries, with each dict corresponding to a JSON response + containing data about a single comment for the specified video. + + Returns + ------- + comments: list + List of dictionaries, with each dict corresponding to a JSON response + containing data about a single comment for the specified video, with + additional ``'reactions'`` field containing reaction information for + each comment. + + """ + + comment_ids = ','.join([c['comment_id'] for c in comments]) + + post_data = { + "jsonrpc":"2.0", + "id":1, + "method":"reaction.List", + "params":{ + "comment_ids":comment_ids}} + + api_url = 'https://comments.odysee.com/api/v2' + response = requests.post(url = api_url, json = post_data) + result = json.loads(response.text) + + reactions = result['result']['others_reactions'] + + for comment in comments: + comment['likes'] = reactions[comment['comment_id']]['like'] + comment['dislikes'] = reactions[comment['comment_id']]['dislike'] + + return comments + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_recommended(title, claim_id): + + api_url = 'https://recsys.odysee.com/search' + + name = quote(title) + + params = { + 's':name, + 'size':'20', + 'from':'0', + 'related_to':claim_id} + + response = requests.get(api_url, params = params) + result = json.loads(response.text) + + recommended_video_info = [ name_to_video_info(r['name']) for r in result] + recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream'] + + return recommended_video_info + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def name_to_video_info(name): + + url = f"lbry://{name}" + + post_data = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":[url]}} + + api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + + response = requests.post(url = api_url, json = post_data) + result = json.loads(response.text) + + return result['result'][url] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/base.py b/polyphemus/base.py index 019c277..80c46a7 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -8,7 +8,7 @@ import json from urllib.parse import quote -import requests +from polyphemus import api #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -24,136 +24,42 @@ class OdyseeChannel: def __init__(self, channel_name): self._channel_name = channel_name - self.get_channel_info() - - #-------------------------------------------------------------------------# - def get_channel_info(self): - - """Get the channel information and ID from the channel name. - """ - - channel_url = f'lbry://@{self._channel_name}' - - api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' - - post_json = { - "jsonrpc":"2.0", - "method":"resolve", - "params":{ - "urls":[channel_url]}} - - response = requests.post( - url = api_url, - json = post_json) - - result = json.loads(response.text) - - info = result['result'][channel_url] - - info = { - 'channel_id' : info['claim_id'], - 'title' : info['value']['title'], - 'created': info['timestamp'], - 'description': info['value']['description'], - 'cover_image': info['value']['cover']['url'], - 'thumbnail_image': info['value']['thumbnail']['url'], - 'raw' : response.text} + info = api.get_channel_info(channel_name = self._channel_name) self.info = info self._channel_id = self.info['channel_id'] - self.get_subscribers() + self.info['subscribers'] = api.get_subscribers(claim_id = self.info['channel_id']) #-------------------------------------------------------------------------# - def get_subscribers(self): - - """Get the number of subscribers for a channel. - """ - - api_url = 'https://api.odysee.com/subscription/sub_count' - - post_data = { - 'auth_token': AUTH_TOKEN, - 'claim_id': self.info['channel_id'] } - - response = requests.post(url = api_url, data = post_data) - result = json.loads(response.text) - subscribers = result['data'][0] - - self.info['subscribers'] = subscribers - - #-------------------------------------------------------------------------# - def get_all_videos(self): - """Get a list of all videos posted by a specified channel name. - - Returns - ------- - all_videos: list - List of dictionaries, with each dict corresponding to a JSON response - containing data about a single video. - + """Return list of OdyseeVideo objects for all videos posted by the channel """ - api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + all_video_info = api.get_all_videos(channel_id=self.info['channel_id']) + self.all_videos = [OdyseeVideo(video) for video in all_video_info] + + return self.all_videos - all_videos = [] - - page = 1 - - while True: - - post_data = { - "jsonrpc":"2.0", - "method":"claim_search", - "params":{ - "page_size":30, - "page":page, - "order_by":["release_time"], - "channel_ids":[self._channel_id]}} - - response = requests.post( - url = api_url, - json = post_data) - - result = json.loads(response.text) - - videos = result['result']['items'] - - if not videos: - break - else: - all_videos.extend(videos) - page += 1 - - self._all_videos = all_videos - #-------------------------------------------------------------------------# - def process_all_videos(self): - - self.get_all_videos() - all_videos_processed = [OdyseeVideo(video) for video in self._all_videos] - - return all_videos_processed - - #-------------------------------------------------------------------------# + def get_all_videos_and_comments(self): - def process_all_videos_and_comments(self): - - self.get_all_videos() - all_videos = [OdyseeVideo(video) for video in self._all_videos] - all_videos_processed = [video for video in all_videos] - - all_comments_processed = [] + """Return list of OdyseeVideo and OdyseeComment objects for all videos + posted by the channel and all comments posted to those videos + """ + + all_videos = self.get_all_videos() + + all_comments = [] for video in all_videos: - all_comments_processed.extend(video.process_all_comments()) + all_comments.extend(video.get_all_comments()) - return all_videos_processed, all_comments_processed + return all_videos, all_comments #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -178,129 +84,26 @@ class OdyseeVideo: self._claim_id = self.info ['claim_id'] - self.get_views() - self.get_video_reactions() + self.info['views'] = api.get_views(claim_id=self._claim_id) - #-------------------------------------------------------------------------# + self.info['likes'], self.info['dislikes']= api.get_video_reactions( + claim_id = self._claim_id) - def get_views(self): - - """Get the number of views for a given video. - """ - - api_url = 'https://api.odysee.com/file/view_count' - - params = { - 'auth_token': AUTH_TOKEN, - 'claim_id': self._claim_id } - - response = requests.get(api_url, params = params) - views = json.loads(response.text)['data'][0] - - self.info['views'] = views - - #-------------------------------------------------------------------------# - - def get_video_reactions(self): - - """Get all reactions for a given video. - """ - - api_url = 'https://api.odysee.com/reaction/list' - - post_data = { - 'auth_token': AUTH_TOKEN, - 'claim_ids': self._claim_id } - - response = requests.post(url = api_url, data = post_data) - result = json.loads(response.text) - reactions = result['data']['others_reactions'][self._claim_id ] - - self.info['likes'] = reactions['like'] - self.info['dislikes'] = reactions['dislike'] - #-------------------------------------------------------------------------# def get_all_comments(self): - - """Get a list of all comments for a single video. - - Parameters - ---------- - claim_id: str - Claim ID for the video whose comments are to be scraped - e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'`` - - Returns - ------- - all_comments: list - List of dictionaries, with each dict corresponding to a JSON response - containing data about a single comment for the specified video. - """ - - api_url = 'https://comments.odysee.com/api/v2' - - all_comments = [] - - page = 1 - - while True: - - post_data = { - "jsonrpc":"2.0", - "id":1, - "method":"comment.List", - "params":{ - "page":page, - "claim_id":self._claim_id, - "page_size":10, - "top_level":False, - "sort_by":3}} - - response = requests.post( - url = api_url, - json = post_data) - - result = json.loads(response.text) - - if 'items' not in result['result']: - break - else: - _comments = result['result']['items'] - comments = append_comment_reactions(comments = _comments) - all_comments.extend(comments) - page += 1 - - self._all_comments = all_comments - #-------------------------------------------------------------------------# - - def process_all_comments(self): + all_comment_info = api.get_all_comments(claim_id=self._claim_id) + self.all_comments = [OdyseeComment(comment) for comment in all_comment_info] - self.get_all_comments() - all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments] - - return all_comments_processed + return self.all_comments #-------------------------------------------------------------------------# def get_recommended(self): - api_url = 'https://recsys.odysee.com/search' - - name = quote(self.info['title']) - - params = { - 's':name, - 'size':'20', - 'from':'0', - 'related_to':self._claim_id} - - response = requests.get(api_url, params = params) - result = json.loads(response.text) - - recommended_video_info = [_name_to_video_info(r['name']) for r in result] - recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream'] + recommended_video_info = api.get_recommended( + title=self.info['title'], claim_id=self._claim_id) recommended_videos = [OdyseeVideo(video_info) for video_info in recommended_video_info] return recommended_videos @@ -322,67 +125,4 @@ class OdyseeComment: 'dislikes' : full_comment_info['dislikes'], 'raw' : json.dumps(full_comment_info)} -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def append_comment_reactions(comments): - - """Get reaction data for each comment and insert ``'reactions'`` key into - dict for each comment. - - Parameters - ---------- - comments: list - List of dictionaries, with each dict corresponding to a JSON response - containing data about a single comment for the specified video. - - Returns - ------- - comments: list - List of dictionaries, with each dict corresponding to a JSON response - containing data about a single comment for the specified video, with - additional ``'reactions'`` field containing reaction information for - each comment. - - """ - - comment_ids = ','.join([c['comment_id'] for c in comments]) - - post_data = { - "jsonrpc":"2.0", - "id":1, - "method":"reaction.List", - "params":{ - "comment_ids":comment_ids}} - - api_url = 'https://comments.odysee.com/api/v2' - response = requests.post(url = api_url, json = post_data) - result = json.loads(response.text) - - reactions = result['result']['others_reactions'] - - for comment in comments: - comment['likes'] = reactions[comment['comment_id']]['like'] - comment['dislikes'] = reactions[comment['comment_id']]['dislike'] - - return comments - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def name_to_video_info(name): - - url = f"lbry://{name}" - - post_data = { - "jsonrpc":"2.0", - "method":"resolve", - "params":{ - "urls":[url]}} - - api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' - - response = requests.post(url = api_url, json = post_data) - result = json.loads(response.text) - - return result['result'][url] - #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/utils.py b/polyphemus/utils.py deleted file mode 100644 index decf4b8..0000000 --- a/polyphemus/utils.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: UTF-8 -*- - -"""Utility functions for scraping video data from Odysee video platform. -""" - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -import json - -import requests - -from .base import OdyseeVideo - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -ODYSEE_DOMAIN = 'https://odysee.com/' - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def _name_to_video_info(name): - - url = f"lbry://{name}" - - post_data = { - "jsonrpc":"2.0", - "method":"resolve", - "params":{ - "urls":[url]}} - - api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' - - response = requests.post(url = api_url, json = post_data) - result = json.loads(response.text) - - return result['result'][url] - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def _url_to_video_info(url): - - if url.startswith(ODYSEE_DOMAIN): - name = url.split(ODYSEE_DOMAIN)[1] - url = f"lbry://{name}" - - post_data = { - "jsonrpc":"2.0", - "method":"resolve", - "params":{ - "urls":[url]}} - - api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' - - response = requests.post(url = api_url, json = post_data) - result = json.loads(response.text) - - return result['result'][url] - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def name_to_video(name): - - video_info = _name_to_video_info(name) - video = OdyseeVideo(video_info) - - return video - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - -def url_to_video(name): - - video_info = _url_to_video_info(name) - video = OdyseeVideo(video_info) - - return video - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -