mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-08 03:18:32 +03:00
added capabilities for handling edge cases for video inputs
This commit is contained in:
@@ -4,5 +4,8 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/).
|
|||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
- Implement CLI
|
- Implement CLI
|
||||||
|
- Profile run-time, look into implementing async requests
|
||||||
- Add error handling/backoff waiting to requests
|
- Add error handling/backoff waiting to requests
|
||||||
|
- Implement basic test suite
|
||||||
|
- Formalize network graph generation into class/module
|
||||||
- Work on reverse-engineering auth_token instead of having it hard-coded
|
- Work on reverse-engineering auth_token instead of having it hard-coded
|
||||||
|
|||||||
66
examples/generate_network.py
Normal file
66
examples/generate_network.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
"""Build a recommendation network starting from one Odysee channel.

Starting from every video of ``CHANNEL_NAME``, the script repeatedly asks the
recommendation endpoint for videos related to each not-yet-processed video,
records a ``(source_claim_id, recommended_claim_id)`` edge for each
recommendation, and adds newly seen videos to the pool for the next iteration.
The resulting video dictionary and edge list are pickled into ``OUTPUT_DIR``.
"""

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

from pathlib import Path
import pickle
import os

import polyphemus

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

# Channel whose videos seed the crawl.
CHANNEL_NAME = 'PatriotFront'

# Number of expansion rounds over the recommendation graph.
ITERATIONS = 3

# Directory (created if missing) that receives the pickled outputs.
OUTPUT_DIR = '../../data'

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

if __name__ == '__main__':

    odysee_channel = polyphemus.base.OdyseeChannel(channel_name = CHANNEL_NAME)

    # Directed edges: (claim_id of a video, claim_id of a video recommended for it).
    edge_list = list()

    # Claim ids whose recommendations were already fetched. Only used for
    # membership tests, so a set keeps the lookups O(1).
    already_done = set()

    new_videos = odysee_channel.get_all_videos()

    # Every video seen so far, keyed by claim id.
    master_video_dict = dict(zip([v.info['claim_id'] for v in new_videos], new_videos))

    for iteration in range(ITERATIONS):

        print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n')

        for i, video in enumerate(new_videos):
            claim_id = video.info['claim_id']
            title = video.info['title']

            print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n')

            recommended_video_info = polyphemus.api.get_recommended(title, claim_id)

            for rec_video_info in recommended_video_info:
                rec_claim_id = rec_video_info['claim_id']
                print(f'REC_CLAIM_ID: {rec_claim_id}')

                edge_list.append((claim_id, rec_claim_id))

                # Only build an OdyseeVideo for videos not seen before.
                if rec_claim_id not in master_video_dict:
                    master_video_dict[rec_claim_id] = polyphemus.base.OdyseeVideo(rec_video_info)

            already_done.add(claim_id)

        # The next round processes only videos discovered but not yet expanded.
        new_videos = [video for video in master_video_dict.values() if video.info['claim_id'] not in already_done]

    #-------------------------------------------------------------------------#

    os.makedirs(OUTPUT_DIR, exist_ok = True)

    with open(Path(OUTPUT_DIR, 'master_video_dict.pkl'), 'wb') as f:
        pickle.dump(master_video_dict, f)

    with open(Path(OUTPUT_DIR, 'edge_list.pkl'), 'wb') as f:
        # pickle.dump requires the target file object as its second argument;
        # omitting it raises TypeError and leaves edge_list.pkl empty.
        pickle.dump(edge_list, f)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
@@ -15,6 +15,16 @@ import requests
|
|||||||
#TODO Figure out how to reverse-engineer this
|
#TODO Figure out how to reverse-engineer this
|
||||||
AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'
|
AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'
|
||||||
|
|
||||||
|
# API endpoints for Odysee data
|
||||||
|
#-----------------------------------------------------------------------------#
|
||||||
|
|
||||||
|
BACKEND_API_URL = 'https://api.na-backend.odysee.com/api/v1/proxy'
|
||||||
|
SUBSCRIBER_API_URL = 'https://api.odysee.com/subscription/sub_count'
|
||||||
|
VIEW_API_URL = 'https://api.odysee.com/file/view_count'
|
||||||
|
REACTION_API_URL = 'https://api.odysee.com/reaction/list'
|
||||||
|
COMMENT_API_URL = 'https://comments.odysee.com/api/v2'
|
||||||
|
RECOMMENDATION_API_URL = 'https://recsys.odysee.com/search'
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
def get_channel_info(channel_name):
|
def get_channel_info(channel_name):
|
||||||
@@ -24,8 +34,6 @@ def get_channel_info(channel_name):
|
|||||||
|
|
||||||
channel_url = f'lbry://@{channel_name}'
|
channel_url = f'lbry://@{channel_name}'
|
||||||
|
|
||||||
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
|
|
||||||
|
|
||||||
post_json = {
|
post_json = {
|
||||||
"jsonrpc":"2.0",
|
"jsonrpc":"2.0",
|
||||||
"method":"resolve",
|
"method":"resolve",
|
||||||
@@ -33,7 +41,7 @@ def get_channel_info(channel_name):
|
|||||||
"urls":[channel_url]}}
|
"urls":[channel_url]}}
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url = api_url,
|
url = BACKEND_API_URL,
|
||||||
json = post_json)
|
json = post_json)
|
||||||
|
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
@@ -58,13 +66,11 @@ def get_subscribers(claim_id):
|
|||||||
"""Get the number of subscribers for a channel.
|
"""Get the number of subscribers for a channel.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
api_url = 'https://api.odysee.com/subscription/sub_count'
|
json_data = {
|
||||||
|
|
||||||
post_data = {
|
|
||||||
'auth_token': AUTH_TOKEN,
|
'auth_token': AUTH_TOKEN,
|
||||||
'claim_id': claim_id }
|
'claim_id': claim_id }
|
||||||
|
|
||||||
response = requests.post(url = api_url, data = post_data)
|
response = requests.post(url = SUBSCRIBER_API_URL, data = json_data)
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
subscribers = result['data'][0]
|
subscribers = result['data'][0]
|
||||||
|
|
||||||
@@ -84,15 +90,13 @@ def get_all_videos(channel_id):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
|
|
||||||
|
|
||||||
all_videos = []
|
all_videos = []
|
||||||
|
|
||||||
page = 1
|
page = 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
post_data = {
|
json_data = {
|
||||||
"jsonrpc":"2.0",
|
"jsonrpc":"2.0",
|
||||||
"method":"claim_search",
|
"method":"claim_search",
|
||||||
"params":{
|
"params":{
|
||||||
@@ -102,8 +106,8 @@ def get_all_videos(channel_id):
|
|||||||
"channel_ids":[channel_id]}}
|
"channel_ids":[channel_id]}}
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url = api_url,
|
url = BACKEND_API_URL,
|
||||||
json = post_data)
|
json = json_data)
|
||||||
|
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
|
|
||||||
@@ -124,13 +128,11 @@ def get_views(claim_id):
|
|||||||
"""Get the number of views for a given video.
|
"""Get the number of views for a given video.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
api_url = 'https://api.odysee.com/file/view_count'
|
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
'auth_token': AUTH_TOKEN,
|
'auth_token': AUTH_TOKEN,
|
||||||
'claim_id': claim_id }
|
'claim_id': claim_id }
|
||||||
|
|
||||||
response = requests.get(api_url, params = params)
|
response = requests.get(url = VIEW_API_URL, params = params)
|
||||||
views = json.loads(response.text)['data'][0]
|
views = json.loads(response.text)['data'][0]
|
||||||
|
|
||||||
return views
|
return views
|
||||||
@@ -142,17 +144,18 @@ def get_video_reactions(claim_id):
|
|||||||
"""Get all reactions for a given video.
|
"""Get all reactions for a given video.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
api_url = 'https://api.odysee.com/reaction/list'
|
|
||||||
|
|
||||||
post_data = {
|
post_data = {
|
||||||
'auth_token': AUTH_TOKEN,
|
'auth_token': AUTH_TOKEN,
|
||||||
'claim_ids': claim_id }
|
'claim_ids': claim_id }
|
||||||
|
|
||||||
response = requests.post(url = api_url, data = post_data)
|
response = requests.post(url = REACTION_API_URL, data = post_data)
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
reactions = result['data']['others_reactions'][claim_id ]
|
|
||||||
|
|
||||||
return reactions['like'], reactions['dislike']
|
if result['success']:
|
||||||
|
reactions = result['data']['others_reactions'][claim_id ]
|
||||||
|
return reactions['like'], reactions['dislike']
|
||||||
|
else:
|
||||||
|
return None, None
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
@@ -173,15 +176,13 @@ def get_all_comments(claim_id):
|
|||||||
containing data about a single comment for the specified video.
|
containing data about a single comment for the specified video.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
api_url = 'https://comments.odysee.com/api/v2'
|
|
||||||
|
|
||||||
all_comments = []
|
all_comments = []
|
||||||
|
|
||||||
page = 1
|
page = 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
post_data = {
|
json_data = {
|
||||||
"jsonrpc":"2.0",
|
"jsonrpc":"2.0",
|
||||||
"id":1,
|
"id":1,
|
||||||
"method":"comment.List",
|
"method":"comment.List",
|
||||||
@@ -193,8 +194,8 @@ def get_all_comments(claim_id):
|
|||||||
"sort_by":3}}
|
"sort_by":3}}
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url = api_url,
|
url = COMMENT_API_URL,
|
||||||
json = post_data)
|
json = json_data)
|
||||||
|
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
|
|
||||||
@@ -233,15 +234,14 @@ def append_comment_reactions(comments):
|
|||||||
|
|
||||||
comment_ids = ','.join([c['comment_id'] for c in comments])
|
comment_ids = ','.join([c['comment_id'] for c in comments])
|
||||||
|
|
||||||
post_data = {
|
json_data = {
|
||||||
"jsonrpc":"2.0",
|
"jsonrpc":"2.0",
|
||||||
"id":1,
|
"id":1,
|
||||||
"method":"reaction.List",
|
"method":"reaction.List",
|
||||||
"params":{
|
"params":{
|
||||||
"comment_ids":comment_ids}}
|
"comment_ids":comment_ids}}
|
||||||
|
|
||||||
api_url = 'https://comments.odysee.com/api/v2'
|
response = requests.post(url = COMMENT_API_URL, json = json_data)
|
||||||
response = requests.post(url = api_url, json = post_data)
|
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
|
|
||||||
reactions = result['result']['others_reactions']
|
reactions = result['result']['others_reactions']
|
||||||
@@ -256,8 +256,6 @@ def append_comment_reactions(comments):
|
|||||||
|
|
||||||
def get_recommended(title, claim_id):
|
def get_recommended(title, claim_id):
|
||||||
|
|
||||||
api_url = 'https://recsys.odysee.com/search'
|
|
||||||
|
|
||||||
name = quote(title)
|
name = quote(title)
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
@@ -266,11 +264,11 @@ def get_recommended(title, claim_id):
|
|||||||
'from':'0',
|
'from':'0',
|
||||||
'related_to':claim_id}
|
'related_to':claim_id}
|
||||||
|
|
||||||
response = requests.get(api_url, params = params)
|
response = requests.get(url = RECOMMENDATION_API_URL, params = params)
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
|
|
||||||
recommended_video_info = [ name_to_video_info(r['name']) for r in result]
|
recommended_video_info = [ name_to_video_info(r['name']) for r in result]
|
||||||
recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream']
|
recommended_video_info = [vi for vi in recommended_video_info if ((vi.get('value_type') == 'stream') & any(key in vi.get('value', []) for key in ('video', 'audio')))]
|
||||||
|
|
||||||
return recommended_video_info
|
return recommended_video_info
|
||||||
|
|
||||||
@@ -278,19 +276,17 @@ def get_recommended(title, claim_id):
|
|||||||
|
|
||||||
def name_to_video_info(name):
|
def name_to_video_info(name):
|
||||||
|
|
||||||
url = f"lbry://{name}"
|
video_url = f"lbry://{name}"
|
||||||
|
|
||||||
post_data = {
|
json_data = {
|
||||||
"jsonrpc":"2.0",
|
"jsonrpc":"2.0",
|
||||||
"method":"resolve",
|
"method":"resolve",
|
||||||
"params":{
|
"params":{
|
||||||
"urls":[url]}}
|
"urls":[video_url]}}
|
||||||
|
|
||||||
api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
|
response = requests.post(url = BACKEND_API_URL, json = json_data)
|
||||||
|
|
||||||
response = requests.post(url = api_url, json = post_data)
|
|
||||||
result = json.loads(response.text)
|
result = json.loads(response.text)
|
||||||
|
|
||||||
return result['result'][url]
|
return result['result'][video_url]
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
@@ -68,21 +68,58 @@ class OdyseeVideo:
|
|||||||
#-------------------------------------------------------------------------#
|
#-------------------------------------------------------------------------#
|
||||||
|
|
||||||
def __init__(self, full_video_info):
|
def __init__(self, full_video_info):
|
||||||
|
|
||||||
|
# Handle edge cases
|
||||||
|
#.....................................................................#
|
||||||
|
|
||||||
|
if 'video' in full_video_info['value']:
|
||||||
|
video_type = 'video'
|
||||||
|
duration = full_video_info['value']['video'].get('duration')
|
||||||
|
elif 'audio' in full_video_info['value']:
|
||||||
|
video_type = 'audio'
|
||||||
|
duration = full_video_info['value']['audio'].get('duration')
|
||||||
|
else:
|
||||||
|
raise KeyError(f'nether `video` or `audio` keys are in `full_video_info["value"]`, only {full_video_info["value"].keys()}')
|
||||||
|
|
||||||
|
if 'signing_channel' in full_video_info:
|
||||||
|
channel_name = full_video_info['signing_channel'].get('name')
|
||||||
|
if 'claim_id' in full_video_info['signing_channel']:
|
||||||
|
channel_id = full_video_info['signing_channel']['claim_id']
|
||||||
|
else:
|
||||||
|
channel_id = full_video_info['signing_channel']['channel_id']
|
||||||
|
else:
|
||||||
|
channel_name = None
|
||||||
|
channel_id = None
|
||||||
|
|
||||||
|
if 'release_time' in full_video_info['value']:
|
||||||
|
created = full_video_info['value']['release_time']
|
||||||
|
else:
|
||||||
|
created = full_video_info['meta']['creation_timestamp']
|
||||||
|
|
||||||
|
if 'thumbnail' in full_video_info['value']:
|
||||||
|
thumbnail = full_video_info['value']['thumbnail'].get('url', None)
|
||||||
|
else:
|
||||||
|
thumbnail = None
|
||||||
|
|
||||||
|
# Store relevant information in flat dict
|
||||||
|
#.....................................................................#
|
||||||
|
|
||||||
self.info = {
|
self.info = {
|
||||||
'canonical_url' : full_video_info['canonical_url'],
|
'canonical_url' : full_video_info['canonical_url'],
|
||||||
'channel' : full_video_info['signing_channel']['name'],
|
'type' : video_type,
|
||||||
|
'channel_id' : channel_id,
|
||||||
|
'channel' : channel_name,
|
||||||
'claim_id' : full_video_info['claim_id'],
|
'claim_id' : full_video_info['claim_id'],
|
||||||
'created' : full_video_info['value']['release_time'],
|
'created' : created,
|
||||||
'description' : full_video_info['value'].get('description'),
|
'description' : full_video_info['value'].get('description'),
|
||||||
'languages' : full_video_info['value'].get('languages'),
|
'languages' : full_video_info['value'].get('languages'),
|
||||||
'tags' : full_video_info['value'].get('tags',[]),
|
'tags' : full_video_info['value'].get('tags',[]),
|
||||||
'title' : full_video_info['value']['title'],
|
'title' : full_video_info['value']['title'],
|
||||||
'duration' : full_video_info['value']['video']['duration'],
|
'duration' : duration,
|
||||||
'thumbnail' : full_video_info['value']['thumbnail']['url'],
|
'thumbnail' : thumbnail,
|
||||||
'raw' : json.dumps(full_video_info)}
|
'raw' : json.dumps(full_video_info)}
|
||||||
|
|
||||||
self._claim_id = self.info ['claim_id']
|
self._claim_id = self.info['claim_id']
|
||||||
|
|
||||||
self.info['views'] = api.get_views(claim_id=self._claim_id)
|
self.info['views'] = api.get_views(claim_id=self._claim_id)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user