# -*- coding: UTF-8 -*-
#
# Provenance: this module is ``polyphemus.py`` as introduced by commit
# 80264bbe1364e644ff1e74b97a72b469366bfcd5 (Tristan Lee, Thu, 10 Feb 2022
# 21:57:17 -0600, "initial commit: got basic functionality working").
# The same commit also created two other files, reproduced here verbatim:
#
#   .gitignore
#       *.csv
#       **/data/
#
#   README.md
#       # Polyphemus
#
#       Scraper for alt-tech video sharing platform
#       [Odysee](https://odysee.com/).
#
#       ### TODO
#       - Add number of subscribers to channel data
#       - Implement CLI
#       - Work on reverse-engineering auth_token instead of having it
#         hard-coded

"""Functions and classes for scraping video data from Odysee video platform.
"""

import csv
import json
import os
from pathlib import Path

import pandas as pd
import requests

# TODO Figure out how to reverse-engineer this
# NOTE(review): this is a credential committed to source control.  The
# ``ODYSEE_AUTH_TOKEN`` environment variable overrides it so the token can be
# rotated without editing the file; the literal is kept as the default so
# existing behavior is unchanged.
AUTH_TOKEN = os.environ.get(
    'ODYSEE_AUTH_TOKEN', 'BseGAiye641UqUsv4g31ZcUCRiLasv3U')

CHANNEL_NAME = 'PatriotFront'
# Resolved against the current working directory, not this file's location.
OUTPUT_DIR = Path('.').resolve().parent / 'data'

PROXY_API_URL = 'https://api.na-backend.odysee.com/api/v1/proxy'
COMMENTS_API_URL = 'https://comments.odysee.com/api/v2'
VIEW_COUNT_API_URL = 'https://api.odysee.com/file/view_count'
REACTIONS_API_URL = 'https://api.odysee.com/reaction/list'

# Seconds to wait for any single HTTP request.  Without a timeout
# ``requests`` blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def _request(method, url, **kwargs):
    """Send one HTTP request and return the response, failing loudly.

    Parameters
    ----------
    method: str
        HTTP verb, e.g. ``'get'`` or ``'post'``.
    url: str
        Endpoint to call.
    **kwargs
        Forwarded to ``requests.request`` (``json``, ``data``, ``params``...).
        ``timeout`` defaults to ``REQUEST_TIMEOUT`` when not supplied.

    Returns
    -------
    response: requests.Response

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    kwargs.setdefault('timeout', REQUEST_TIMEOUT)
    response = requests.request(method, url, **kwargs)
    response.raise_for_status()
    return response


def _rpc_result(payload):
    """Return the ``'result'`` member of a JSON-RPC style response.

    Raises
    ------
    KeyError
        If ``'result'`` is absent.  The message carries whatever the server
        put under ``'error'`` so the failure is diagnosable.
    """
    try:
        return payload['result']
    except KeyError:
        raise KeyError(
            "API response has no 'result' field; "
            f"error: {payload.get('error')!r}") from None


class OdyseeChannel:
    """A single Odysee channel, looked up by name on construction.

    Constructing an instance performs a network request (see
    ``get_channel_info``).
    """

    def __init__(self, channel_name):
        self._channel_name = channel_name
        self.get_channel_info()

    def get_channel_info(self):
        """Get the channel information and ID from the channel name.

        Stores the result in ``self._channel_info`` and the claim ID in
        ``self._channel_id``.

        Returns
        -------
        channel_info: dict
            Flat dictionary of channel metadata.  Optional fields that the
            API response does not contain are ``None``.

        Raises
        ------
        KeyError
            If the channel URL cannot be resolved to a claim.
        """
        channel_url = f'lbry://@{self._channel_name}'

        post_json = {
            "jsonrpc": "2.0",
            "method": "resolve",
            "params": {
                "urls": [channel_url]}}

        response = _request('post', PROXY_API_URL, json=post_json)
        info = _rpc_result(json.loads(response.text))[channel_url]

        if 'claim_id' not in info:
            raise KeyError(
                f'could not resolve {channel_url}; '
                f'error: {info.get("error")!r}')

        value = info.get('value', {})

        channel_info = {
            'channel_id': info['claim_id'],
            'title': value.get('title'),
            'created': info.get('timestamp'),
            'description': value.get('description'),
            'cover_image': value.get('cover', {}).get('url'),
            'thumbnail_image': value.get('thumbnail', {}).get('url'),
            'raw': response.text}

        self._channel_info = channel_info
        self._channel_id = channel_info['channel_id']

        return channel_info

    def get_all_videos(self):
        """Get a list of all videos posted by this channel.

        Pages through ``claim_search`` until a page comes back without
        items.  The list is also stored in ``self._all_videos``.

        Returns
        -------
        all_videos: list
            List of dictionaries, with each dict corresponding to a JSON
            response containing data about a single video.
        """
        all_videos = []
        page = 1

        while True:
            post_data = {
                "jsonrpc": "2.0",
                "method": "claim_search",
                "params": {
                    "page_size": 30,
                    "page": page,
                    "order_by": ["release_time"],
                    "channel_ids": [self._channel_id]}}

            response = _request('post', PROXY_API_URL, json=post_data)
            videos = _rpc_result(json.loads(response.text)).get('items')

            if not videos:
                break

            all_videos.extend(videos)
            page += 1

        self._all_videos = all_videos

        return all_videos

    def process_all_videos(self):
        """Fetch every video of the channel and flatten each one.

        Returns
        -------
        all_videos_processed: list
            One ``OdyseeVideo._video_info`` dict per video.
        """
        self.get_all_videos()

        return [OdyseeVideo(video)._video_info for video in self._all_videos]

    def process_all_videos_and_comments(self):
        """Fetch every video of the channel together with its comments.

        Returns
        -------
        all_videos_processed: list
            One ``OdyseeVideo._video_info`` dict per video.
        all_comments_processed: list
            One ``OdyseeComment._comment_info`` dict per comment, for all
            videos, in video order.
        """
        self.get_all_videos()
        all_videos = [OdyseeVideo(video) for video in self._all_videos]
        all_videos_processed = [video._video_info for video in all_videos]

        all_comments_processed = []
        for video in all_videos:
            all_comments_processed.extend(video.process_all_comments())

        return all_videos_processed, all_comments_processed


class OdyseeVideo:
    """A single claim returned by ``claim_search``, plus views/reactions.

    Constructing an instance performs two network requests (view count and
    reactions).
    """

    def __init__(self, full_video_info):
        # ``claim_search`` is not restricted to videos here, and most
        # metadata is optional, so only the two identifiers are required.
        value = full_video_info.get('value', {})

        self._video_info = {
            'canonical_url': full_video_info['canonical_url'],
            'claim_id': full_video_info['claim_id'],
            'created': value.get('release_time'),
            'description': value.get('description'),
            'languages': value.get('languages', []),
            'tags': value.get('tags', []),
            'title': value.get('title'),
            'duration': value.get('video', {}).get('duration'),
            'thumbnail': value.get('thumbnail', {}).get('url'),
            'raw': json.dumps(full_video_info)}

        self._claim_id = self._video_info['claim_id']

        self.get_views()
        self.get_video_reactions()

    def get_views(self):
        """Get the number of views for this video.

        Stores the count under ``self._video_info['views']``.

        Returns
        -------
        views
            First element of the ``'data'`` list in the API response.
        """
        params = {
            'auth_token': AUTH_TOKEN,
            'claim_id': self._claim_id}

        response = _request('get', VIEW_COUNT_API_URL, params=params)
        views = json.loads(response.text)['data'][0]

        self._video_info['views'] = views

        return views

    def get_video_reactions(self):
        """Get all reactions for this video.

        Stores the counts under ``self._video_info['likes']`` and
        ``self._video_info['dislikes']``.
        """
        # Sent form-encoded (``data=``), unlike the JSON-RPC endpoints.
        post_data = {
            'auth_token': AUTH_TOKEN,
            'claim_ids': self._claim_id}

        response = _request('post', REACTIONS_API_URL, data=post_data)
        result = json.loads(response.text)
        reactions = result['data']['others_reactions'][self._claim_id]

        self._video_info['likes'] = reactions['like']
        self._video_info['dislikes'] = reactions['dislike']

    def get_all_comments(self):
        """Get a list of all comments for this video.

        Pages through ``comment.List`` until a page has no items (missing
        or empty), and attaches reaction counts to every comment.  The list
        is also stored in ``self._all_comments``.

        Returns
        -------
        all_comments: list
            List of dictionaries, with each dict corresponding to a JSON
            response containing data about a single comment for this video.
        """
        all_comments = []
        page = 1

        while True:
            post_data = {
                "jsonrpc": "2.0",
                "id": 1,
                "method": "comment.List",
                "params": {
                    "page": page,
                    "claim_id": self._claim_id,
                    "page_size": 10,
                    "top_level": False,
                    "sort_by": 3}}

            response = _request('post', COMMENTS_API_URL, json=post_data)
            page_items = _rpc_result(json.loads(response.text)).get('items')

            # An empty page must end the loop too, otherwise ``page`` would
            # keep increasing forever.
            if not page_items:
                break

            all_comments.extend(
                append_comment_reactions(comments=page_items))
            page += 1

        self._all_comments = all_comments

        return all_comments

    def process_all_comments(self):
        """Fetch every comment of this video and flatten each one.

        Returns
        -------
        all_comments_processed: list
            One ``OdyseeComment._comment_info`` dict per comment.
        """
        self.get_all_comments()

        return [
            OdyseeComment(comment)._comment_info
            for comment in self._all_comments]


class OdyseeComment:
    """Flat view of one comment dict.

    The input is expected to carry ``'likes'`` and ``'dislikes'`` keys,
    which ``append_comment_reactions`` adds.
    """

    def __init__(self, full_comment_info):
        self._comment_info = {
            'comment': full_comment_info['comment'],
            'created': full_comment_info['timestamp'],
            'video_claim_id': full_comment_info['claim_id'],
            'channel_id': full_comment_info['channel_id'],
            'channel_name': full_comment_info['channel_name'],
            'replies': full_comment_info.get('replies', 0),
            'likes': full_comment_info['likes'],
            'dislikes': full_comment_info['dislikes'],
            'raw': json.dumps(full_comment_info)}


def append_comment_reactions(comments):
    """Get reaction data for each comment and insert ``'likes'`` and
    ``'dislikes'`` keys into the dict for each comment.

    The dictionaries are modified in place.

    Parameters
    ----------
    comments: list
        List of dictionaries, with each dict corresponding to a JSON
        response containing data about a single comment.  Each must have a
        ``'comment_id'`` key.

    Returns
    -------
    comments: list
        The same list, each dict now carrying ``'likes'`` and
        ``'dislikes'``.  A count is ``None`` when the reactions response
        has no entry for that comment.
    """
    # Nothing to look up; skip the request entirely.
    if not comments:
        return comments

    comment_ids = ','.join(c['comment_id'] for c in comments)

    post_data = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "reaction.List",
        "params": {
            "comment_ids": comment_ids}}

    response = _request('post', COMMENTS_API_URL, json=post_data)
    reactions = _rpc_result(json.loads(response.text))['others_reactions']

    for comment in comments:
        reaction = reactions.get(comment['comment_id'], {})
        comment['likes'] = reaction.get('like')
        comment['dislikes'] = reaction.get('dislike')

    return comments


def main():
    """Scrape ``CHANNEL_NAME`` and write channel/videos/comments CSVs.

    Files are written to ``OUTPUT_DIR/CHANNEL_NAME`` as
    ``<CHANNEL_NAME>_channel.csv``, ``<CHANNEL_NAME>_videos.csv`` and
    ``<CHANNEL_NAME>_comments.csv``.
    """
    odysee_channel = OdyseeChannel(channel_name=CHANNEL_NAME)

    video_info_list, comment_info_list = (
        odysee_channel.process_all_videos_and_comments())

    output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
    output_subdir.mkdir(parents=True, exist_ok=True)

    tables = {
        'channel': [odysee_channel._channel_info],
        'videos': video_info_list,
        'comments': comment_info_list}

    for suffix, rows in tables.items():
        pd.DataFrame(rows).to_csv(
            path_or_buf=Path(output_subdir, f'{CHANNEL_NAME}_{suffix}.csv'),
            index=False,
            quoting=csv.QUOTE_NONNUMERIC)


if __name__ == '__main__':
    main()