refactored code to avoid circular imports, isolated all requests to api.py module

This commit is contained in:
Tristan Lee
2022-02-17 12:45:40 -06:00
parent c4b6d023c5
commit a6d2527bc7
6 changed files with 328 additions and 368 deletions

View File

@@ -4,4 +4,5 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/).
### TODO
- Implement CLI
- Work on reverse-engineering auth_token instead of having it hard-coded
- Add error handling and retry-with-backoff to requests
- Work on reverse-engineering auth_token instead of having it hard-coded

View File

@@ -24,11 +24,11 @@ if __name__ == '__main__':
odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)
video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments()
video_list, comment_list = odysee_channel.get_all_videos_and_comments()
channel_df = pd.DataFrame([odysee_channel.info])
video_df = pd.DataFrame(video_info_list)
comment_df = pd.DataFrame(comment_info_list)
video_df = pd.DataFrame([v.info for v in video_list])
comment_df = pd.DataFrame([c.info for c in comment_list])
output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
os.makedirs(output_subdir, exist_ok = True)

View File

@@ -2,7 +2,7 @@
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
from . import api
from . import base
from . import utils
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

296
polyphemus/api.py Normal file
View File

@@ -0,0 +1,296 @@
# -*- coding: UTF-8 -*-
"""Functions to request and process information from Odysee APIs
"""
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
import json
from urllib.parse import quote
import requests
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#TODO Figure out how to reverse-engineer this token so it does not have to be hard-coded
AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_channel_info(channel_name):
    """Resolve a channel name and return a summary of the channel claim.

    Posts a JSON-RPC ``resolve`` request for ``lbry://@<channel_name>`` to the
    proxy API and extracts selected fields from the resolved claim.

    Parameters
    ----------
    channel_name: str
        Channel name without the leading ``@``.

    Returns
    -------
    info: dict
        Dictionary with the keys ``'channel_id'``, ``'title'``, ``'created'``,
        ``'description'``, ``'cover_image'``, ``'thumbnail_image'`` and
        ``'raw'`` (the unparsed response body). ``'title'``,
        ``'description'``, ``'cover_image'`` and ``'thumbnail_image'`` are
        ``None`` when the claim does not carry them.

    Raises
    ------
    KeyError
        If the response has no entry for the channel URL, or the entry has no
        ``'claim_id'`` / ``'timestamp'``.
    """
    channel_url = f'lbry://@{channel_name}'
    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
    post_json = {
        "jsonrpc": "2.0",
        "method": "resolve",
        "params": {
            "urls": [channel_url]}}
    response = requests.post(
        url=api_url,
        json=post_json)
    result = json.loads(response.text)
    claim = result['result'][channel_url]

    # Descriptive metadata is looked up defensively so that a claim missing
    # e.g. a cover image yields None instead of aborting with a KeyError.
    # NOTE(review): presumably these fields are optional in the claim
    # metadata -- confirm against the LBRY claim schema.
    value = claim.get('value') or {}
    cover = value.get('cover') or {}
    thumbnail = value.get('thumbnail') or {}

    info = {
        'channel_id': claim['claim_id'],
        'title': value.get('title'),
        'created': claim['timestamp'],
        'description': value.get('description'),
        'cover_image': cover.get('url'),
        'thumbnail_image': thumbnail.get('url'),
        'raw': response.text}
    return info
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_subscribers(claim_id):
    """Return the subscriber count reported for a channel.

    Parameters
    ----------
    claim_id: str
        Claim ID of the channel.

    Returns
    -------
    subscribers
        First element of the ``'data'`` list in the ``sub_count`` response.
    """
    form_fields = {
        'auth_token': AUTH_TOKEN,
        'claim_id': claim_id}
    response = requests.post(
        url='https://api.odysee.com/subscription/sub_count',
        data=form_fields)
    payload = json.loads(response.text)
    return payload['data'][0]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_all_videos(channel_id):
    """Collect every claim returned by ``claim_search`` for a channel.

    Pages of 30 items, ordered by ``release_time``, are requested one after
    another until a page comes back empty.

    Parameters
    ----------
    channel_id: str
        Claim ID of the channel whose videos are listed.

    Returns
    -------
    all_videos: list<dict>
        List of dictionaries, one per item of the ``claim_search`` responses.
    """
    endpoint = 'https://api.na-backend.odysee.com/api/v1/proxy'
    collected = []
    page_number = 0
    while True:
        page_number += 1
        request_body = {
            "jsonrpc": "2.0",
            "method": "claim_search",
            "params": {
                "page_size": 30,
                "page": page_number,
                "order_by": ["release_time"],
                "channel_ids": [channel_id]}}
        response = requests.post(url=endpoint, json=request_body)
        batch = json.loads(response.text)['result']['items']
        # An empty page marks the end of the listing.
        if not batch:
            return collected
        collected.extend(batch)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_views(claim_id):
    """Return the view count reported for a video.

    Parameters
    ----------
    claim_id: str
        Claim ID of the video.

    Returns
    -------
    views
        First element of the ``'data'`` list in the ``view_count`` response.
    """
    query = {
        'auth_token': AUTH_TOKEN,
        'claim_id': claim_id}
    response = requests.get(
        'https://api.odysee.com/file/view_count',
        params=query)
    payload = json.loads(response.text)
    return payload['data'][0]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_video_reactions(claim_id):
    """Return the like and dislike counts reported for a video.

    Parameters
    ----------
    claim_id: str
        Claim ID of the video.

    Returns
    -------
    (likes, dislikes): tuple
        The ``'like'`` and ``'dislike'`` entries stored under
        ``'others_reactions'`` for this claim in the ``reaction/list``
        response.
    """
    form_fields = {
        'auth_token': AUTH_TOKEN,
        'claim_ids': claim_id}
    response = requests.post(
        url='https://api.odysee.com/reaction/list',
        data=form_fields)
    reactions_by_claim = json.loads(response.text)['data']['others_reactions']
    counts = reactions_by_claim[claim_id]
    likes = counts['like']
    dislikes = counts['dislike']
    return likes, dislikes
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_all_comments(claim_id):
    """Collect every comment posted to a single video.

    Pages of 10 comments are requested through ``comment.List`` until a
    response no longer contains an ``'items'`` entry. Each page is passed
    through ``append_comment_reactions`` before being added to the result.

    Parameters
    ----------
    claim_id: str
        Claim ID for the video whose comments are to be scraped
        e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'``

    Returns
    -------
    all_comments: list<dict>
        List of dictionaries, one per comment, as returned by
        ``append_comment_reactions``.
    """
    endpoint = 'https://comments.odysee.com/api/v2'
    collected = []
    page_number = 1
    while True:
        request_body = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "comment.List",
            "params": {
                "page": page_number,
                "claim_id": claim_id,
                "page_size": 10,
                "top_level": False,
                "sort_by": 3}}
        response = requests.post(url=endpoint, json=request_body)
        page_result = json.loads(response.text)['result']
        # A result without 'items' means there are no further comments.
        if 'items' not in page_result:
            break
        page_comments = append_comment_reactions(comments=page_result['items'])
        collected.extend(page_comments)
        page_number += 1
    return collected
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def append_comment_reactions(comments):
    """Fetch reaction counts for a batch of comments and store them on each
    comment as ``'likes'`` and ``'dislikes'``.

    All comment IDs are sent in a single ``reaction.List`` request. The
    dictionaries in ``comments`` are modified in place.

    Parameters
    ----------
    comments: list<dict>
        List of dictionaries, with each dict corresponding to a JSON response
        containing data about a single comment. Each dict must have a
        ``'comment_id'`` key.

    Returns
    -------
    comments: list<dict>
        The same list that was passed in, with ``'likes'`` and ``'dislikes'``
        keys added to every comment. An empty list is returned unchanged
        without contacting the API.
    """
    # Nothing to look up: skip the request instead of sending an empty ID list.
    if not comments:
        return comments

    comment_ids = ','.join(c['comment_id'] for c in comments)
    post_data = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "reaction.List",
        "params": {
            "comment_ids": comment_ids}}
    api_url = 'https://comments.odysee.com/api/v2'
    response = requests.post(url=api_url, json=post_data)
    result = json.loads(response.text)
    reactions = result['result']['others_reactions']
    for comment in comments:
        comment_reactions = reactions[comment['comment_id']]
        comment['likes'] = comment_reactions['like']
        comment['dislikes'] = comment_reactions['dislike']
    return comments
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_recommended(title, claim_id):
    """Return resolved claim info for the videos recommended alongside a video.

    Queries the recommendation search endpoint with the video title and claim
    ID, resolves every returned name with ``name_to_video_info`` and keeps
    only the entries whose ``'value_type'`` is ``'stream'``.

    Parameters
    ----------
    title: str
        Title of the video recommendations are requested for.
    claim_id: str
        Claim ID of that video, sent as ``related_to``.

    Returns
    -------
    recommended_video_info: list<dict>
        List of resolved claim dictionaries, one per recommended stream.
    """
    api_url = 'https://recsys.odysee.com/search'
    # NOTE(review): the title is percent-encoded here and ``requests`` encodes
    # query parameters again, so the server presumably receives a
    # double-encoded ``s`` -- confirm whether that is intended.
    name = quote(title)
    params = {
        's': name,
        'size': '20',
        'from': '0',
        'related_to': claim_id}
    response = requests.get(api_url, params=params)
    result = json.loads(response.text)
    resolved = [name_to_video_info(r['name']) for r in result]
    # ``.get`` keeps an entry without 'value_type' (e.g. a name that failed to
    # resolve) from raising KeyError; such entries are not streams and are
    # dropped by the filter.
    recommended_video_info = [
        vi for vi in resolved if vi.get('value_type') == 'stream']
    return recommended_video_info
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def name_to_video_info(name):
    """Resolve a claim name and return the matching entry of the response.

    Parameters
    ----------
    name: str
        Claim name, i.e. the part that follows ``lbry://``.

    Returns
    -------
    dict
        Entry of the ``resolve`` response's ``'result'`` stored under the
        ``lbry://`` URL built from ``name``.
    """
    lbry_url = f"lbry://{name}"
    request_body = {
        "jsonrpc": "2.0",
        "method": "resolve",
        "params": {
            "urls": [lbry_url]}}
    response = requests.post(
        url='https://api.na-backend.odysee.com/api/v1/proxy',
        json=request_body)
    resolved = json.loads(response.text)['result']
    return resolved[lbry_url]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -8,7 +8,7 @@
import json
from urllib.parse import quote
import requests
from polyphemus import api
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -24,136 +24,42 @@ class OdyseeChannel:
def __init__(self, channel_name):
self._channel_name = channel_name
self.get_channel_info()
#-------------------------------------------------------------------------#
def get_channel_info(self):
"""Get the channel information and ID from the channel name.
"""
channel_url = f'lbry://@{self._channel_name}'
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
post_json = {
"jsonrpc":"2.0",
"method":"resolve",
"params":{
"urls":[channel_url]}}
response = requests.post(
url = api_url,
json = post_json)
result = json.loads(response.text)
info = result['result'][channel_url]
info = {
'channel_id' : info['claim_id'],
'title' : info['value']['title'],
'created': info['timestamp'],
'description': info['value']['description'],
'cover_image': info['value']['cover']['url'],
'thumbnail_image': info['value']['thumbnail']['url'],
'raw' : response.text}
info = api.get_channel_info(channel_name = self._channel_name)
self.info = info
self._channel_id = self.info['channel_id']
self.get_subscribers()
self.info['subscribers'] = api.get_subscribers(claim_id = self.info['channel_id'])
#-------------------------------------------------------------------------#
def get_subscribers(self):
"""Get the number of subscribers for a channel.
"""
api_url = 'https://api.odysee.com/subscription/sub_count'
post_data = {
'auth_token': AUTH_TOKEN,
'claim_id': self.info['channel_id'] }
response = requests.post(url = api_url, data = post_data)
result = json.loads(response.text)
subscribers = result['data'][0]
self.info['subscribers'] = subscribers
#-------------------------------------------------------------------------#
def get_all_videos(self):
"""Get a list of all videos posted by a specified channel name.
Returns
-------
all_videos: list<dict>
List of dictionaries, with each dict corresponding to a JSON response
containing data about a single video.
"""Return list of OdyseeVideo objects for all videos posted by the channel
"""
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
all_video_info = api.get_all_videos(channel_id=self.info['channel_id'])
self.all_videos = [OdyseeVideo(video) for video in all_video_info]
return self.all_videos
all_videos = []
page = 1
while True:
post_data = {
"jsonrpc":"2.0",
"method":"claim_search",
"params":{
"page_size":30,
"page":page,
"order_by":["release_time"],
"channel_ids":[self._channel_id]}}
response = requests.post(
url = api_url,
json = post_data)
result = json.loads(response.text)
videos = result['result']['items']
if not videos:
break
else:
all_videos.extend(videos)
page += 1
self._all_videos = all_videos
#-------------------------------------------------------------------------#
def process_all_videos(self):
self.get_all_videos()
all_videos_processed = [OdyseeVideo(video) for video in self._all_videos]
return all_videos_processed
#-------------------------------------------------------------------------#
def get_all_videos_and_comments(self):
def process_all_videos_and_comments(self):
self.get_all_videos()
all_videos = [OdyseeVideo(video) for video in self._all_videos]
all_videos_processed = [video for video in all_videos]
all_comments_processed = []
"""Return list of OdyseeVideo and OdyseeComment objects for all videos
posted by the channel and all comments posted to those videos
"""
all_videos = self.get_all_videos()
all_comments = []
for video in all_videos:
all_comments_processed.extend(video.process_all_comments())
all_comments.extend(video.get_all_comments())
return all_videos_processed, all_comments_processed
return all_videos, all_comments
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -178,129 +84,26 @@ class OdyseeVideo:
self._claim_id = self.info ['claim_id']
self.get_views()
self.get_video_reactions()
self.info['views'] = api.get_views(claim_id=self._claim_id)
#-------------------------------------------------------------------------#
self.info['likes'], self.info['dislikes']= api.get_video_reactions(
claim_id = self._claim_id)
def get_views(self):
"""Get the number of views for a given video.
"""
api_url = 'https://api.odysee.com/file/view_count'
params = {
'auth_token': AUTH_TOKEN,
'claim_id': self._claim_id }
response = requests.get(api_url, params = params)
views = json.loads(response.text)['data'][0]
self.info['views'] = views
#-------------------------------------------------------------------------#
def get_video_reactions(self):
"""Get all reactions for a given video.
"""
api_url = 'https://api.odysee.com/reaction/list'
post_data = {
'auth_token': AUTH_TOKEN,
'claim_ids': self._claim_id }
response = requests.post(url = api_url, data = post_data)
result = json.loads(response.text)
reactions = result['data']['others_reactions'][self._claim_id ]
self.info['likes'] = reactions['like']
self.info['dislikes'] = reactions['dislike']
#-------------------------------------------------------------------------#
def get_all_comments(self):
"""Get a list of all comments for a single video.
Parameters
----------
claim_id: str
Claim ID for the video whose comments are to be scraped
e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'``
Returns
-------
all_comments: list<dict>
List of dictionaries, with each dict corresponding to a JSON response
containing data about a single comment for the specified video.
"""
api_url = 'https://comments.odysee.com/api/v2'
all_comments = []
page = 1
while True:
post_data = {
"jsonrpc":"2.0",
"id":1,
"method":"comment.List",
"params":{
"page":page,
"claim_id":self._claim_id,
"page_size":10,
"top_level":False,
"sort_by":3}}
response = requests.post(
url = api_url,
json = post_data)
result = json.loads(response.text)
if 'items' not in result['result']:
break
else:
_comments = result['result']['items']
comments = append_comment_reactions(comments = _comments)
all_comments.extend(comments)
page += 1
self._all_comments = all_comments
#-------------------------------------------------------------------------#
def process_all_comments(self):
all_comment_info = api.get_all_comments(claim_id=self._claim_id)
self.all_comments = [OdyseeComment(comment) for comment in all_comment_info]
self.get_all_comments()
all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments]
return all_comments_processed
return self.all_comments
#-------------------------------------------------------------------------#
def get_recommended(self):
api_url = 'https://recsys.odysee.com/search'
name = quote(self.info['title'])
params = {
's':name,
'size':'20',
'from':'0',
'related_to':self._claim_id}
response = requests.get(api_url, params = params)
result = json.loads(response.text)
recommended_video_info = [_name_to_video_info(r['name']) for r in result]
recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream']
recommended_video_info = api.get_recommended(
title=self.info['title'], claim_id=self._claim_id)
recommended_videos = [OdyseeVideo(video_info) for video_info in recommended_video_info]
return recommended_videos
@@ -322,67 +125,4 @@ class OdyseeComment:
'dislikes' : full_comment_info['dislikes'],
'raw' : json.dumps(full_comment_info)}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def append_comment_reactions(comments):
"""Get reaction data for each comment and insert ``'reactions'`` key into
dict for each comment.
Parameters
----------
comments: list<dict>
List of dictionaries, with each dict corresponding to a JSON response
containing data about a single comment for the specified video.
Returns
-------
comments: list<dict>
List of dictionaries, with each dict corresponding to a JSON response
containing data about a single comment for the specified video, with
additional ``'reactions'`` field containing reaction information for
each comment.
"""
comment_ids = ','.join([c['comment_id'] for c in comments])
post_data = {
"jsonrpc":"2.0",
"id":1,
"method":"reaction.List",
"params":{
"comment_ids":comment_ids}}
api_url = 'https://comments.odysee.com/api/v2'
response = requests.post(url = api_url, json = post_data)
result = json.loads(response.text)
reactions = result['result']['others_reactions']
for comment in comments:
comment['likes'] = reactions[comment['comment_id']]['like']
comment['dislikes'] = reactions[comment['comment_id']]['dislike']
return comments
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def name_to_video_info(name):
url = f"lbry://{name}"
post_data = {
"jsonrpc":"2.0",
"method":"resolve",
"params":{
"urls":[url]}}
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
response = requests.post(url = api_url, json = post_data)
result = json.loads(response.text)
return result['result'][url]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,77 +0,0 @@
# -*- coding: UTF-8 -*-
"""Utility functions for scraping video data from Odysee video platform.
"""
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
import json
import requests
from .base import OdyseeVideo
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
ODYSEE_DOMAIN = 'https://odysee.com/'
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def _name_to_video_info(name):
    """Resolve a claim name and return the matching entry of the response.

    The name is turned into a ``lbry://`` URL, sent to the proxy API's
    ``resolve`` method, and the ``'result'`` entry stored under that URL is
    returned.
    """
    lbry_url = f"lbry://{name}"
    request_body = {
        "jsonrpc": "2.0",
        "method": "resolve",
        "params": {
            "urls": [lbry_url]}}
    response = requests.post(
        url='https://api.na-backend.odysee.com/api/v1/proxy',
        json=request_body)
    resolved = json.loads(response.text)['result']
    return resolved[lbry_url]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def _url_to_video_info(url):
"""Resolve a video URL through the proxy API's ``resolve`` method and return
the entry of the response's ``'result'`` stored under the resolved URL.
"""
# A URL starting with ODYSEE_DOMAIN has that prefix split off to obtain the
# claim name, which is then turned into a ``lbry://`` URL.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the ``lbry://`` rewrite below sits inside this ``if`` -- confirm
# against the original file.
if url.startswith(ODYSEE_DOMAIN):
name = url.split(ODYSEE_DOMAIN)[1]
url = f"lbry://{name}"
post_data = {
"jsonrpc":"2.0",
"method":"resolve",
"params":{
"urls":[url]}}
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
response = requests.post(url = api_url, json = post_data)
result = json.loads(response.text)
# The response's 'result' is indexed by the same URL that was sent.
return result['result'][url]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def name_to_video(name):
    """Build an ``OdyseeVideo`` from a claim name.

    The name is resolved with ``_name_to_video_info`` and the resulting info
    is passed to the ``OdyseeVideo`` constructor.
    """
    return OdyseeVideo(_name_to_video_info(name))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def url_to_video(name):
    """Build an ``OdyseeVideo`` from a video URL.

    The argument (named ``name``) is passed to ``_url_to_video_info`` and the
    resulting info is passed to the ``OdyseeVideo`` constructor.
    """
    return OdyseeVideo(_url_to_video_info(name))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#