From 71eecf7c9ebd52ef25735745a6b2ec255d118817 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Apr 2022 02:45:01 -0500 Subject: [PATCH] added recommendation engine and updated example, handled additional edge cases --- examples/generate_network.py | 55 ++++++--------------- polyphemus/api.py | 2 +- polyphemus/base.py | 92 ++++++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 51 deletions(-) diff --git a/examples/generate_network.py b/examples/generate_network.py index 8d58d0a..37dc800 100644 --- a/examples/generate_network.py +++ b/examples/generate_network.py @@ -6,66 +6,39 @@ from pathlib import Path import pickle import os +import networkx as nx + import polyphemus #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# CHANNEL_NAME = 'PatriotFront' -ITERATIONS = 3 +ITERATIONS = 2 -OUTPUT_DIR = '../../data' +OUTPUT_DIR = Path('../../data', f'{CHANNEL_NAME}_recommendation_iterations={ITERATIONS}') #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# if __name__ == '__main__': - auth_token = polyphemus.api.get_auth_token() + engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME]) - scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token) + weighted_edge_list, claim_id_to_video = engine.generate(iterations = 1) - edge_list = list() - already_done = list() - - new_videos = list(scraper.get_all_videos()) - master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos)) - - for iteration in range(ITERATIONS): - - print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n') - - for i, video in enumerate(new_videos): - claim_id = video.claim_id - title = video.title - - print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n') - - recommended_video_info = polyphemus.api.get_recommended(title, claim_id) - - for rec_video_info in recommended_video_info: - rec_claim_id = rec_video_info['claim_id'] - print(f'REC_CLAIM_ID: {rec_claim_id}') - - edge_list.append((claim_id, rec_claim_id)) - - if rec_video_info['claim_id'] not in master_video_dict: - master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info( - raw_video_info = rec_video_info, - auth_token = auth_token, - additional_fields = False) - - already_done.append(claim_id) - - new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done] + G = nx.DiGraph() + G.add_weighted_edges_from(weighted_edge_list) #-------------------------------------------------------------------------# os.makedirs(OUTPUT_DIR, exist_ok = True) - with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f: - pickle.dump(master_video_dict, f) + nx.write_gexf(G = G, path = Path(OUTPUT_DIR, 'network.gexf')) - with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f: - pickle.dump(edge_list, f) + with open(Path(OUTPUT_DIR, f'weighted_edge_list.pkl'), 'wb') as f: + pickle.dump(weighted_edge_list, f) + + with open(Path(OUTPUT_DIR, f'claim_id_to_video.pkl'), 'wb') as f: + pickle.dump(claim_id_to_video, f) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/api.py b/polyphemus/api.py index 955b71f..13c460a 100644 --- a/polyphemus/api.py +++ b/polyphemus/api.py @@ -427,7 +427,7 @@ def get_streaming_url(canonical_url: str) -> str: 'url' : BACKEND_API_URL, 'json': json_data}) - video_url = json.loads(response.text)['result'].get('streaming_url') + video_url = json.loads(response.text).get('result', {}).get('streaming_url') return video_url diff --git a/polyphemus/base.py b/polyphemus/base.py index d26e58c..4be9e48 100644 --- a/polyphemus/base.py +++ b/polyphemus/base.py @@ -10,6 +10,7 @@ from urllib.parse import unquote from dataclasses import dataclass import typing from datetime import datetime +from collections import Counter from polyphemus import api @@ -102,13 +103,13 @@ class OdyseeChannelScraper: #-------------------------------------------------------------------------# - def get_all_videos(self) -> typing.Generator[Video, None, None]: + def get_all_videos(self, additional_fields: bool = True) -> typing.Generator[Video, None, None]: """Return list of Video objects for all videos posted by the specified channel """ raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id) - videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list) + videos = (process_raw_video_info(raw_video_info = raw_video_info, auth_token = self.auth_token, additional_fields = additional_fields) for raw_video_info in raw_video_info_list) return videos @@ -140,6 +141,10 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio else: auth_token = auth_token + raw = json.dumps(raw_video_info) + + claim_id = raw_video_info['claim_id'] + # Handle edge cases #.....................................................................# @@ -152,8 +157,12 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio elif 'claim_hash' in raw_video_info['value']: video_type = 'repost' duration = None - raw_video_info['value'] = raw_video_info['reposted_claim']['value'] - raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + if 'reposted_claim' in raw_video_info: + raw_video_info['value'] = raw_video_info['reposted_claim']['value'] + raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] + claim_id = raw_video_info['reposted_claim']['claim_id'] + else: + raw_video_info['value'] = {} elif 'image' in raw_video_info['value']: video_type = 'image' duration = None @@ -184,10 +193,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio # Retrieve additional fields #.....................................................................# - claim_id = raw_video_info['claim_id'] - if additional_fields: - streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) + if raw_video_info['name'] == 'live': + streaming_url = None + else: + streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) views = api.get_views(video_id=claim_id, auth_token = auth_token) likes, dislikes = api.get_video_reactions( video_id = claim_id, @@ -212,11 +222,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio text = raw_video_info['value'].get('description'), languages = raw_video_info['value'].get('languages'), tags = raw_video_info['value'].get('tags',[]), - title = raw_video_info['value']['title'], + title = raw_video_info['value'].get('title'), duration = duration, thumbnail = thumbnail, is_comment = False, - raw = json.dumps(raw_video_info), + raw = raw, views = views, likes = likes, dislikes = dislikes, @@ -254,4 +264,68 @@ def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video' return recommended_videos +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +class RecommendationEngine: + + #-------------------------------------------------------------------------# + + def __init__(self, channel_list): + + self.channel_list = channel_list + self.auth_token = api.get_auth_token() + + self.edge_list = [] + self.new_videos = [] + + self.already_done_claim_ids = [] + self.claim_id_to_video = {} + + #-------------------------------------------------------------------------# + + def generate(self, iterations = 1): + + for channel_name in self.channel_list: + print(channel_name) + scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token) + + self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False))) + + self.claim_id_to_video = dict(zip([v.claim_id for v in self.new_videos], self.new_videos)) + + for iteration in range(int(iterations)): + + for i, video in enumerate(self.new_videos): + claim_id = video.claim_id + title = video.title + + print(f'ITERATION: {iteration} | VIDEO: {i} / {len(self.new_videos)} | CLAIM_ID: {claim_id}') + + recommended_video_info = api.get_recommended(video_title = title, video_id = claim_id) + + for rec_video_info in recommended_video_info: + rec_claim_id = rec_video_info['claim_id'] + + self.edge_list.append((claim_id, rec_claim_id)) + + if rec_video_info['claim_id'] not in self.claim_id_to_video: + + self.claim_id_to_video[rec_claim_id] = process_raw_video_info( + raw_video_info = rec_video_info, + auth_token = self.auth_token, + additional_fields = False) + + self.already_done_claim_ids.append(claim_id) + + self.new_videos = [video for video in self.claim_id_to_video.values() if video.claim_id not in self.already_done_claim_ids] + + claim_id_to_channel = {claim_id : video.channel_name for claim_id, video in self.claim_id_to_video.items()} + _channel_edge_list = [(claim_id_to_channel[target], claim_id_to_channel[source]) for target, source in self.edge_list] + channel_edge_list = [(source, target) for source, target in _channel_edge_list if all(item is not None for item in (source, target))] + + c = Counter(channel_edge_list) + self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()] + + return self.weighted_edge_list, self.claim_id_to_video + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file