mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-11 04:48:32 +03:00
Updated examples to use the refactored scraper; increased the speed of recommendation engine fetching by implementing the normalized_names_to_video_info routine, which allows requesting multiple videos at a time
This commit is contained in:
@@ -20,21 +20,23 @@ OUTPUT_DIR = '../../data'
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
odysee_channel = polyphemus.base.OdyseeChannel(channel_name = CHANNEL_NAME)
|
||||
auth_token = polyphemus.api.get_auth_token()
|
||||
|
||||
scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token)
|
||||
|
||||
edge_list = list()
|
||||
already_done = list()
|
||||
|
||||
new_videos = odysee_channel.get_all_videos()
|
||||
master_video_dict = dict(zip([v.info['claim_id'] for v in new_videos], new_videos))
|
||||
new_videos = list(scraper.get_all_videos())
|
||||
master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos))
|
||||
|
||||
for iteration in range(ITERATIONS):
|
||||
|
||||
print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n')
|
||||
|
||||
for i, video in enumerate(new_videos):
|
||||
claim_id = video.info['claim_id']
|
||||
title = video.info['title']
|
||||
claim_id = video.claim_id
|
||||
title = video.title
|
||||
|
||||
print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n')
|
||||
|
||||
@@ -47,20 +49,23 @@ if __name__ == '__main__':
|
||||
edge_list.append((claim_id, rec_claim_id))
|
||||
|
||||
if rec_video_info['claim_id'] not in master_video_dict:
|
||||
master_video_dict[rec_claim_id] = polyphemus.base.OdyseeVideo(rec_video_info)
|
||||
master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info(
|
||||
raw_video_info = rec_video_info,
|
||||
auth_token = auth_token,
|
||||
additional_fields = False)
|
||||
|
||||
already_done.append(claim_id)
|
||||
|
||||
new_videos = [video for video in master_video_dict.values() if video.info['claim_id'] not in already_done]
|
||||
new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done]
|
||||
|
||||
#-------------------------------------------------------------------------#
|
||||
|
||||
os.makedirs(OUTPUT_DIR, exist_ok = True)
|
||||
|
||||
with open(Path(OUTPUT_DIR, 'master_video_dict.pkl'), 'wb') as f:
|
||||
with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f:
|
||||
pickle.dump(master_video_dict, f)
|
||||
|
||||
with open(Path(OUTPUT_DIR, 'edge_list.pkl'), 'wb') as f:
|
||||
pickle.dump(edge_list)
|
||||
with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f:
|
||||
pickle.dump(edge_list, f)
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -11,7 +11,7 @@ import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from polyphemus.base import OdyseeChannelScraper
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -22,13 +22,13 @@ OUTPUT_DIR = Path('.').resolve().parents[1]/'data'
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)
|
||||
odysee_channel = OdyseeChannelScraper(channel_name = CHANNEL_NAME)
|
||||
|
||||
video_list, comment_list = odysee_channel.get_all_videos_and_comments()
|
||||
|
||||
channel_df = pd.DataFrame([odysee_channel.info])
|
||||
video_df = pd.DataFrame([v.info for v in video_list])
|
||||
comment_df = pd.DataFrame([c.info for c in comment_list])
|
||||
channel_df = pd.DataFrame([odysee_channel.get_entity().__dict__])
|
||||
video_df = pd.DataFrame([v.__dict__ for v in video_list])
|
||||
comment_df = pd.DataFrame([c.__dict__ for c in comment_list])
|
||||
|
||||
output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
|
||||
os.makedirs(output_subdir, exist_ok = True)
|
||||
|
||||
@@ -7,7 +7,8 @@
|
||||
|
||||
import json
|
||||
from urllib.parse import quote
|
||||
from typing import Tuple, Optional, List
|
||||
from typing import Tuple, Optional, List, Callable
|
||||
|
||||
import time
|
||||
|
||||
import requests
|
||||
@@ -25,7 +26,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new'
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def make_request(request: str, kwargs: dict) -> requests.Response:
|
||||
def make_request(request: Callable, kwargs: dict) -> requests.Response:
|
||||
|
||||
"""Wrapper for retrying request multiple times.
|
||||
"""
|
||||
@@ -42,6 +43,9 @@ def make_request(request: str, kwargs: dict) -> requests.Response:
|
||||
response = requests.Response()
|
||||
response.status_code = 418
|
||||
|
||||
exceptions = []
|
||||
status_codes = []
|
||||
|
||||
while n_retries < 5:
|
||||
time.sleep(2 ** n_retries - 1)
|
||||
try:
|
||||
@@ -49,15 +53,14 @@ def make_request(request: str, kwargs: dict) -> requests.Response:
|
||||
if response.status_code == 200:
|
||||
return response
|
||||
else:
|
||||
status_codes.append(response.status_code)
|
||||
n_retries += 1
|
||||
except Exception:
|
||||
except Exception as exception:
|
||||
exceptions.append(exception)
|
||||
n_retries += 1
|
||||
|
||||
if response.status_code != 200:
|
||||
msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}'
|
||||
raise ValueError(msg)
|
||||
|
||||
return response
|
||||
msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Status codes: {status_codes}; exceptions: {exceptions}'
|
||||
raise ValueError(msg)
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -359,8 +362,7 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]:
|
||||
'params': params})
|
||||
|
||||
result = json.loads(response.text)
|
||||
|
||||
recommended_video_info = [ normalized_name_to_video_info(r['name']) for r in result]
|
||||
recommended_video_info = normalized_names_to_video_info([r['name'] for r in result])
|
||||
recommended_video_info = [vi for vi in recommended_video_info if ((vi.get('value_type') == 'stream') & any(key in vi.get('value', []) for key in ('video', 'audio')))]
|
||||
|
||||
return recommended_video_info
|
||||
@@ -389,6 +391,28 @@ def normalized_name_to_video_info(normalized_name: str) -> dict:
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def normalized_names_to_video_info(normalized_names: List[str]) -> List[dict]:

    """Resolve several normalized names with a single backend request.

    Every name is turned into an ``lbry://`` URL and all URLs are submitted
    together in one JSON-RPC ``resolve`` call, instead of one call per name.

    Args:
        normalized_names: Normalized names to look up.

    Returns:
        One entry per input name, in the same order as ``normalized_names``,
        taken from the ``result`` mapping of the response. An empty input
        yields an empty list without contacting the backend.

    Raises:
        ValueError: Propagated from ``make_request`` when all retries fail.
        KeyError: If the response lacks ``result`` or an entry for a
            requested URL.
    """

    # Nothing to resolve: skip the network round trip (and its retry back-off).
    if not normalized_names:
        return []

    video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names]

    json_data = {
        "jsonrpc":"2.0",
        "method":"resolve",
        "params":{
            "urls":video_urls}}

    response = make_request(
        request = requests.post,
        kwargs = {
            'url' : BACKEND_API_URL,
            'json': json_data})

    result = json.loads(response.text)

    # The response maps each requested URL to its info; index by URL so the
    # output order follows the input order rather than the response order.
    resolved = result['result']

    return [resolved[video_url] for video_url in video_urls]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def get_streaming_url(canonical_url: str) -> str:
|
||||
|
||||
json_data = {
|
||||
|
||||
@@ -29,13 +29,13 @@ class Channel:
|
||||
@dataclass
|
||||
class Video:
|
||||
canonical_url: str
|
||||
streaming_url: str
|
||||
type: str
|
||||
claim_id: str
|
||||
created: datetime
|
||||
title: str
|
||||
views: int
|
||||
raw: str
|
||||
views: typing.Optional[int] = None
|
||||
streaming_url: typing.Optional[str] = None
|
||||
text: typing.Optional[str] = None
|
||||
thumbnail : typing.Optional[str] = None
|
||||
channel_id: typing.Optional[str] = None
|
||||
@@ -83,6 +83,9 @@ class OdyseeChannelScraper:
|
||||
|
||||
def get_entity(self) -> Channel:
|
||||
|
||||
"""Return Channel object containing information about the specified channel.
|
||||
"""
|
||||
|
||||
subscribers = api.get_subscribers(
|
||||
channel_id = self._channel_id,
|
||||
auth_token = self.auth_token)
|
||||
@@ -101,7 +104,7 @@ class OdyseeChannelScraper:
|
||||
|
||||
def get_all_videos(self) -> typing.Generator[Video, None, None]:
|
||||
|
||||
"""Return list of Video objects for all videos posted by the channel
|
||||
"""Return list of Video objects for all videos posted by the specified channel
|
||||
"""
|
||||
|
||||
raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id)
|
||||
@@ -130,7 +133,7 @@ class OdyseeChannelScraper:
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video:
|
||||
def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additional_fields: bool = True) -> Video:
|
||||
|
||||
if auth_token is None:
|
||||
auth_token = api.get_auth_token()
|
||||
@@ -180,16 +183,21 @@ def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video:
|
||||
|
||||
# Retrieve additional fields
|
||||
#.....................................................................#
|
||||
|
||||
|
||||
claim_id = raw_video_info['claim_id']
|
||||
|
||||
views = api.get_views(video_id=claim_id, auth_token = auth_token)
|
||||
if additional_fields:
|
||||
streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
|
||||
views = api.get_views(video_id=claim_id, auth_token = auth_token)
|
||||
likes, dislikes = api.get_video_reactions(
|
||||
video_id = claim_id,
|
||||
auth_token = auth_token)
|
||||
|
||||
likes, dislikes = api.get_video_reactions(
|
||||
video_id = claim_id,
|
||||
auth_token = auth_token)
|
||||
|
||||
streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
|
||||
else:
|
||||
streaming_url = None
|
||||
views = None
|
||||
likes = None
|
||||
dislikes = None
|
||||
|
||||
# Return Video object
|
||||
#.....................................................................#
|
||||
|
||||
@@ -29,6 +29,7 @@ KWARGS_LIST = [
|
||||
('get_all_comments', ['video_id']),
|
||||
('append_comment_reactions', ['comment_info_list']),
|
||||
('normalized_name_to_video_info', ['normalized_name']),
|
||||
('normalized_names_to_video_info', ['normalized_names']),
|
||||
('get_streaming_url', ['canonical_url']),
|
||||
('get_recommended', ['video_title', 'video_id']),]
|
||||
|
||||
|
||||
@@ -89,6 +89,7 @@ def resources():
|
||||
video_id = VIDEO_ID,
|
||||
video_title = VIDEO_TITLE,
|
||||
normalized_name = NORMALIZED_NAME,
|
||||
normalized_names = [NORMALIZED_NAME],
|
||||
canonical_url = CANONICAL_URL,
|
||||
full_video_info = FULL_VIDEO_INFO,
|
||||
full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}},
|
||||
|
||||
Reference in New Issue
Block a user