mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-07 19:08:33 +03:00
491 lines
15 KiB
Python
491 lines
15 KiB
Python
# -*- coding: UTF-8 -*-
|
|
|
|
"""Functions to request and process information from Odysee APIs
|
|
"""
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
import json
|
|
from urllib.parse import quote
|
|
from typing import Tuple, Optional, List, Callable
|
|
|
|
import time
|
|
|
|
import requests
|
|
|
|
# Endpoints of the Odysee services queried by this module
#-----------------------------------------------------------------------------#

# JSON-RPC proxy used for the `resolve`, `claim_search` and `get` methods
BACKEND_API_URL = 'https://api.na-backend.odysee.com/api/v1/proxy'

# Subscriber count of a channel (see `get_subscribers`)
SUBSCRIBER_API_URL = 'https://api.odysee.com/subscription/sub_count'

# View count of a video (see `get_views`)
VIEW_API_URL = 'https://api.odysee.com/file/view_count'

# Likes / dislikes of a video (see `get_video_reactions`)
REACTION_API_URL = 'https://api.odysee.com/reaction/list'

# Comments and comment reactions (`comment.List` and `reaction.List` methods)
COMMENT_API_URL = 'https://comments.odysee.com/api/v2'

# Recommended videos for a given video (see `get_recommended`)
RECOMMENDATION_API_URL = 'https://recsys.odysee.com/search'

# Creation of a new anonymous user, which yields an auth token
NEW_USER_API_URL = 'https://api.odysee.com/user/new'

# JSON error codes that `make_request` returns to the caller instead of
# retrying. This allows responses to `get_streaming_url` that contain no
# `streaming_url` field.
ALLOWED_ERROR_CODES = [-32603]
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def make_request(request: Callable, kwargs: dict) -> requests.Response:
    """Call a ``requests`` function, retrying with exponential backoff.

    Three kinds of failure trigger a retry: a Python exception raised by the
    request itself (e.g. a connection error), an HTTP status code other than
    200 (e.g. 429, 403), and a JSON body that reports an ``error`` whose
    ``code`` is not listed in ``ALLOWED_ERROR_CODES``. A body that is not
    valid JSON, or that is neither a JSON array nor a JSON object, is retried
    as well.

    Up to 10 attempts are made. Before attempt number ``n`` (counting from 0)
    the function sleeps ``2 ** n - 1`` seconds, so the first attempt is
    immediate and the last one is preceded by a wait of 511 seconds.

    Parameters
    ----------
    request: function
        The requests function to be called.
        One of {requests.get and requests.post}
    kwargs: dict
        Keyword arguments for the ``request`` function. Must include ``url`` key.
        e.g. ``{'url': 'https://api.odysee.com/user/new'}``
        A ``timeout`` of 15 seconds is used unless one is supplied. The dict
        passed in by the caller is not modified.

    Returns
    -------
    response: requests.Response
        The first response with status code 200 whose JSON body is a list, an
        object without an error, or an object whose error code is allowed.

    Raises
    ------
    ValueError
        If ``request`` is neither ``requests.get`` nor ``requests.post``, or
        if every attempt failed. In the latter case the message lists the
        reason recorded for each failed attempt.
    """

    if request not in [requests.get, requests.post]:
        msg = f'`request` argument must be either `requests.get` or `requests.post`, not {request!r}'
        raise ValueError(msg)

    # Build a new dict so the default timeout never leaks into the caller's
    # own kwargs; an explicitly supplied timeout takes precedence.
    kwargs = {'timeout': 15, **kwargs}

    max_attempts = 10
    retry_reasons = []

    for attempt in range(max_attempts):

        # Exponential backoff: 0, 1, 3, 7, ... seconds before each attempt
        time.sleep(2 ** attempt - 1)

        # Deliberately broad: any failure of the call itself is retried
        try:
            response = request(**kwargs)
        except Exception as exception:
            retry_reasons.append(f'Python exception: {exception}')
            continue

        if response.status_code != 200:
            retry_reasons.append(f'HTTP status code: {response.status_code}')
            continue

        # `json.JSONDecodeError` is a subclass of `ValueError`
        try:
            parsed_response = json.loads(response.text)
        except ValueError as exception:
            retry_reasons.append(f'Python exception: {exception}')
            continue

        # A JSON array cannot carry an `error` member, accept it as is
        if isinstance(parsed_response, list):
            return response

        if not isinstance(parsed_response, dict):
            retry_reasons.append(f'Unexpected JSON response: {parsed_response!r}')
            continue

        error = parsed_response.get('error')
        if error is None:
            return response

        # The error member may not be an object; treat that as "no code"
        code = error.get('code', None) if isinstance(error, dict) else None
        if code in ALLOWED_ERROR_CODES:
            return response

        retry_reasons.append(f'JSON response error: {error}')

    msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Retry reasons: {retry_reasons}'
    raise ValueError(msg)
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_auth_token() -> str:
    """Request a new authorization token for API calls that need one.

    A POST request is sent to ``NEW_USER_API_URL`` and the ``auth_token``
    field found under ``data`` in the JSON response is returned.

    Note: calling this function many times in quick succession may result in a
    503 error.

    Returns
    -------
    auth_token: str
    """

    request_kwargs = {'url': NEW_USER_API_URL}
    response = make_request(request=requests.post, kwargs=request_kwargs)

    payload = json.loads(response.text)
    return payload['data']['auth_token']
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_channel_info(channel_name: str) -> dict:
    """Resolve a channel name and return a summary of the channel.

    Parameters
    ----------
    channel_name: str
        Name of the channel without the leading ``@``; the function builds
        the URL ``lbry://@<channel_name>`` from it.

    Returns
    -------
    info: dict
        Dictionary with the keys ``channel_id``, ``title``, ``created``,
        ``description``, ``cover_image``, ``thumbnail_image`` and ``raw``.
        ``raw`` holds the unparsed text of the response. Optional fields that
        are absent from the response are ``None``.
    """

    channel_url = f'lbry://@{channel_name}'

    payload = {
        "jsonrpc": "2.0",
        "method": "resolve",
        "params": {"urls": [channel_url]},
    }
    request_kwargs = {'url': BACKEND_API_URL, 'json': payload}
    response = make_request(request=requests.post, kwargs=request_kwargs)

    # The result is keyed by the URL that was resolved
    claim = json.loads(response.text)['result'][channel_url]

    channel_id = claim['claim_id']
    value = claim['value']
    created = claim['timestamp']

    return {
        'channel_id': channel_id,
        'title': value.get('title'),
        'created': created,
        'description': value.get('description'),
        'cover_image': value.get('cover', {}).get('url'),
        'thumbnail_image': value.get('thumbnail', {}).get('url'),
        'raw': response.text,
    }
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_subscribers(channel_id: str, auth_token: str = None) -> int:
    """Return the subscriber count of a channel.

    Parameters
    ----------
    channel_id: str
        Claim ID of the channel.
    auth_token: str, optional
        Authorization token. When omitted, a fresh one is requested with
        ``get_auth_token``.

    Returns
    -------
    subscribers: int
        First element of the ``data`` list in the JSON response.
    """

    token = get_auth_token() if auth_token is None else auth_token

    # Sent as form data (``data``), not as a JSON body
    form_fields = {'auth_token': token, 'claim_id': channel_id}
    request_kwargs = {'url': SUBSCRIBER_API_URL, 'data': form_fields}
    response = make_request(request=requests.post, kwargs=request_kwargs)

    return json.loads(response.text)['data'][0]
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_raw_video_info_list(channel_id: str) -> List[dict]:
    """Get a list of all videos posted by the channel with the given ID.

    Odysee's ``claim_search`` API (which is used on the browser and LBRY
    desktop app) only allows up to 1000 videos to be fetched for a single value
    of the ``release_time`` parameter. You can check this by going to an Odysee
    channel with a lot of videos (e.g. @etresouverain) and holding the
    "Page Down" button until you reach the bottom, there will only be 1000
    videos.

    This function pages through the results for one ``release_time`` bound
    until a page brings no video that has not been seen yet. It then lowers
    the bound to the minimum ``creation_timestamp`` of all videos collected so
    far and starts again from page 1. When the first page after lowering the
    bound also brings nothing new, every video has been collected.

    Parameters
    ----------
    channel_id: str
        Claim ID of the channel whose videos are to be listed.

    Returns
    -------
    raw_video_info_list: list<dict>
        List of dictionaries, with each dict corresponding to a JSON response
        containing data about a single video. Each claim ID appears once.

    """

    claim_id_to_raw_video_info = {}
    page = 1
    # Initial upper bound: one day (86400 seconds) after the current time
    release_time = int(time.time()) + 86400
    # True right after the bound was lowered, until a page yields new videos
    hit_video_limit = False

    while True:

        json_data = {
            "jsonrpc": "2.0",
            "method": "claim_search",
            "params": {
                "page_size": 30,
                "page": page,
                "order_by": ["release_time"],
                "channel_ids": [channel_id],
                "release_time": f"<{release_time}",
            },
        }

        response = make_request(
            request=requests.post,
            kwargs={'url': BACKEND_API_URL, 'json': json_data})

        videos = json.loads(response.text)['result']['items']
        new_videos = {
            video['claim_id']: video
            for video in videos
            if video['claim_id'] not in claim_id_to_raw_video_info
        }

        if new_videos:
            # Unseen videos on this page: store them and go to the next page
            claim_id_to_raw_video_info.update(new_videos)
            page += 1
            hit_video_limit = False
            continue

        if hit_video_limit:
            # The bound was lowered on the previous iteration and the first
            # page still brought nothing new, so everything has been scraped
            break

        # Nothing new for the current bound: lower `release_time` to the
        # oldest creation timestamp seen so far (0 if nothing was collected)
        # and restart from the first page
        hit_video_limit = True
        release_time = min(
            (raw_video_info['meta']['creation_timestamp']
             for raw_video_info in claim_id_to_raw_video_info.values()),
            default=0)
        page = 1

    return list(claim_id_to_raw_video_info.values())
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_views(video_id: str, auth_token: str = None) -> int:
    """Return the view count of a video.

    Parameters
    ----------
    video_id: str
        Claim ID of the video.
    auth_token: str, optional
        Authorization token. When omitted, a fresh one is requested with
        ``get_auth_token``.

    Returns
    -------
    views: int
        First element of the ``data`` list in the JSON response.
    """

    token = get_auth_token() if auth_token is None else auth_token

    # Sent as query-string parameters of a GET request
    query = {'auth_token': token, 'claim_id': video_id}
    request_kwargs = {'url': VIEW_API_URL, 'params': query}
    response = make_request(request=requests.get, kwargs=request_kwargs)

    return json.loads(response.text)['data'][0]
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_video_reactions(video_id: str, auth_token: str = None) -> Tuple[Optional[int], Optional[int]]:
    """Return the number of likes and dislikes of a video.

    Parameters
    ----------
    video_id: str
        Claim ID of the video.
    auth_token: str, optional
        Authorization token. When omitted, a fresh one is requested with
        ``get_auth_token``.

    Returns
    -------
    likes, dislikes: tuple
        The ``like`` and ``dislike`` values reported for the video, or
        ``(None, None)`` when the ``success`` field of the response is falsy.
    """

    token = get_auth_token() if auth_token is None else auth_token

    # Sent as form data (``data``), not as a JSON body
    form_fields = {'auth_token': token, 'claim_ids': video_id}
    request_kwargs = {'url': REACTION_API_URL, 'data': form_fields}
    response = make_request(request=requests.post, kwargs=request_kwargs)

    result = json.loads(response.text)

    if not result['success']:
        return None, None

    # Reactions are keyed by the claim ID they belong to
    counts = result['data']['others_reactions'][video_id]
    return counts['like'], counts['dislike']
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_all_comments(video_id: str) -> List[dict]:
    """Collect every comment of a single video, page by page.

    Pages of 10 comments are requested with the ``comment.List`` method until
    a response arrives whose ``result`` has no ``items`` key. The comments of
    every page are passed through ``append_comment_reactions`` before they
    are added to the output.

    Parameters
    ----------
    video_id: str
        Claim ID for the video whose comments are to be scraped
        e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'``

    Returns
    -------
    all_comments: list<dict>
        List of dictionaries, with each dict corresponding to a JSON response
        containing data about a single comment for the specified video.
    """

    collected = []
    page = 0

    while True:
        page += 1

        payload = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "comment.List",
            "params": {
                "page": page,
                "claim_id": video_id,
                "page_size": 10,
                "top_level": False,
                "sort_by": 3,
            },
        }
        request_kwargs = {'url': COMMENT_API_URL, 'json': payload}
        response = make_request(request=requests.post, kwargs=request_kwargs)

        page_result = json.loads(response.text)['result']

        # A result without `items` marks the end of the comments
        if 'items' not in page_result:
            return collected

        page_comments = append_comment_reactions(
            comment_info_list=page_result['items'])
        collected.extend(page_comments)
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]:
    """Get reaction data for each comment and insert ``'likes'`` and
    ``'dislikes'`` keys into the dict of each comment.

    The reactions of all comments are fetched with a single ``reaction.List``
    request. The dictionaries are modified in place and the list that was
    passed in is returned. An empty list is returned unchanged without any
    request being made.

    Parameters
    ----------
    comment_info_list: list<dict>
        List of dictionaries, with each dict corresponding to a JSON response
        containing data about a single comment for the specified video. Each
        dict must have a ``'comment_id'`` key.

    Returns
    -------
    comments: list<dict>
        The same list, with additional ``'likes'`` and ``'dislikes'`` fields
        on each comment.

    """

    # Nothing to look up: do not send a request with an empty ID list
    if not comment_info_list:
        return comment_info_list

    # The API takes the comment IDs as one comma-separated string
    comment_ids = ','.join(c['comment_id'] for c in comment_info_list)

    json_data = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "reaction.List",
        "params": {"comment_ids": comment_ids},
    }

    response = make_request(
        request=requests.post,
        kwargs={'url': COMMENT_API_URL, 'json': json_data})

    # Reactions are keyed by comment ID
    reactions = json.loads(response.text)['result']['others_reactions']

    for comment in comment_info_list:
        comment_reactions = reactions[comment['comment_id']]
        comment['likes'] = comment_reactions['like']
        comment['dislikes'] = comment_reactions['dislike']

    return comment_info_list
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_recommended(video_title: str, video_id: str) -> List[dict]:
    """Get list of raw video info dicts recommended for a specified video
    title and video claim_id.

    The recommendation API is queried for up to 20 results related to the
    video. The ``name`` of each result is resolved with
    ``normalized_names_to_video_info``, and only entries that are streams
    containing a ``video`` or ``audio`` section are kept.

    Parameters
    ----------
    video_title: str
        Title of the video, used as the search string.
    video_id: str
        Claim ID of the video the recommendations should relate to.

    Returns
    -------
    recommended_video_info: list<dict>
        Raw video info dicts of the recommended videos.
    """

    # NOTE(review): the title is percent-encoded here and is then passed as a
    # query parameter, which `requests` presumably encodes once more - confirm
    # that this double encoding is what the recommendation API expects.
    name = quote(video_title)

    params = {
        's': name,
        'size': '20',
        'from': '0',
        'related_to': video_id,
    }

    response = make_request(
        request=requests.get,
        kwargs={'url': RECOMMENDATION_API_URL, 'params': params})

    result = json.loads(response.text)
    candidates = normalized_names_to_video_info([r['name'] for r in result])

    # Logical `and` (not bitwise `&`): the media check is skipped for
    # entries that are not streams
    return [
        vi for vi in candidates
        if vi.get('value_type') == 'stream'
        and any(key in vi.get('value', {}) for key in ('video', 'audio'))
    ]
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def normalized_names_to_video_info(normalized_names: List[str]) -> List[dict]:
    """Convert a list of normalized names of videos to a list of raw video
    dicts for those videos.

    Example of a "normalized name" is:

    ``'si-une-tude-montre-que-le-masque-permet'``,

    corresponding to the video:

    ``https://odysee.com/@filsdepangolin#e/si-une-tude-montre-que-le-masque-permet#e``.

    All names are resolved with a single ``resolve`` request. An empty input
    returns an empty list without any request being made.

    Parameters
    ----------
    normalized_names: list<str>
        Normalized names of the videos to resolve.

    Returns
    -------
    raw_video_info_list: list<dict>
        One entry of the ``resolve`` result per input name, in input order.
    """

    # Nothing to resolve: do not send a request with an empty URL list
    if not normalized_names:
        return []

    video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names]

    json_data = {
        "jsonrpc": "2.0",
        "method": "resolve",
        "params": {"urls": video_urls},
    }

    response = make_request(
        request=requests.post,
        kwargs={'url': BACKEND_API_URL, 'json': json_data})

    # The result is keyed by the URLs that were resolved
    resolved = json.loads(response.text)['result']

    return [resolved[video_url] for video_url in video_urls]
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
|
|
|
def get_streaming_url(canonical_url: str) -> str:
    """Look up the `streaming_url` of a video with the ``get`` method.

    Parameters
    ----------
    canonical_url: str
        URL of the video, sent as the ``uri`` parameter of the request.

    Returns
    -------
    video_url: str
        The ``streaming_url`` field of the result. ``None`` is returned when
        the response has no ``result`` or the result has no
        ``streaming_url`` field.
    """

    payload = {
        "jsonrpc": "2.0",
        "method": "get",
        "params": {"uri": canonical_url},
    }
    request_kwargs = {'url': BACKEND_API_URL, 'json': payload}
    response = make_request(request=requests.post, kwargs=request_kwargs)

    parsed = json.loads(response.text)
    result = parsed.get('result', {})
    return result.get('streaming_url')
|
|
|
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# |