added recommendation engine and updated example, handled additional edge cases

This commit is contained in:
Tristan Lee
2022-04-12 02:45:01 -05:00
parent 0aac7493a4
commit 71eecf7c9e
3 changed files with 98 additions and 51 deletions

View File

@@ -6,66 +6,39 @@ from pathlib import Path
import pickle import pickle
import os import os
import networkx as nx
import polyphemus import polyphemus
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
CHANNEL_NAME = 'PatriotFront' CHANNEL_NAME = 'PatriotFront'
ITERATIONS = 3 ITERATIONS = 2
OUTPUT_DIR = '../../data' OUTPUT_DIR = Path('../../data', f'{CHANNEL_NAME}_recommendation_iterations={ITERATIONS}')
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
if __name__ == '__main__':
    # Example: crawl Odysee recommendations outward from CHANNEL_NAME and
    # save the resulting channel-to-channel recommendation network.

    engine = polyphemus.base.RecommendationEngine(channel_list = [CHANNEL_NAME])

    # Crawl depth comes from ITERATIONS so the data on disk matches the
    # iteration count embedded in the OUTPUT_DIR name (this was previously
    # hard-coded to 1 regardless of ITERATIONS).
    weighted_edge_list, claim_id_to_video = engine.generate(iterations = ITERATIONS)

    # Directed graph: (source_channel, target_channel, weight) triples.
    G = nx.DiGraph()
    G.add_weighted_edges_from(weighted_edge_list)

    #-------------------------------------------------------------------------#

    os.makedirs(OUTPUT_DIR, exist_ok = True)

    # GEXF export of the network.
    nx.write_gexf(G = G, path = Path(OUTPUT_DIR, 'network.gexf'))

    # Raw results are pickled next to the graph so it can be rebuilt or
    # re-analysed without crawling again.
    with open(Path(OUTPUT_DIR, 'weighted_edge_list.pkl'), 'wb') as f:
        pickle.dump(weighted_edge_list, f)

    with open(Path(OUTPUT_DIR, 'claim_id_to_video.pkl'), 'wb') as f:
        pickle.dump(claim_id_to_video, f)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -427,7 +427,7 @@ def get_streaming_url(canonical_url: str) -> str:
'url' : BACKEND_API_URL, 'url' : BACKEND_API_URL,
'json': json_data}) 'json': json_data})
video_url = json.loads(response.text)['result'].get('streaming_url') video_url = json.loads(response.text).get('result', {}).get('streaming_url')
return video_url return video_url

View File

@@ -10,6 +10,7 @@ from urllib.parse import unquote
from dataclasses import dataclass from dataclasses import dataclass
import typing import typing
from datetime import datetime from datetime import datetime
from collections import Counter
from polyphemus import api from polyphemus import api
@@ -102,13 +103,13 @@ class OdyseeChannelScraper:
#-------------------------------------------------------------------------# #-------------------------------------------------------------------------#
def get_all_videos(self, additional_fields: bool = True) -> typing.Generator[Video, None, None]:
    """Lazily yield a Video object for every video posted by this channel.

    The raw video info list is fetched from the API when this method is
    called; each entry is only converted to a Video as the returned
    generator is consumed.

    additional_fields is passed straight through to process_raw_video_info.
    """
    channel_raw_infos = api.get_raw_video_info_list(channel_id=self._channel_id)
    return (
        process_raw_video_info(
            raw_video_info = info,
            auth_token = self.auth_token,
            additional_fields = additional_fields)
        for info in channel_raw_infos
    )
@@ -140,6 +141,10 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
else: else:
auth_token = auth_token auth_token = auth_token
raw = json.dumps(raw_video_info)
claim_id = raw_video_info['claim_id']
# Handle edge cases # Handle edge cases
#.....................................................................# #.....................................................................#
@@ -152,8 +157,12 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
elif 'claim_hash' in raw_video_info['value']: elif 'claim_hash' in raw_video_info['value']:
video_type = 'repost' video_type = 'repost'
duration = None duration = None
raw_video_info['value'] = raw_video_info['reposted_claim']['value'] if 'reposted_claim' in raw_video_info:
raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url'] raw_video_info['value'] = raw_video_info['reposted_claim']['value']
raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
claim_id = raw_video_info['reposted_claim']['claim_id']
else:
raw_video_info['value'] = {}
elif 'image' in raw_video_info['value']: elif 'image' in raw_video_info['value']:
video_type = 'image' video_type = 'image'
duration = None duration = None
@@ -184,10 +193,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
# Retrieve additional fields # Retrieve additional fields
#.....................................................................# #.....................................................................#
claim_id = raw_video_info['claim_id']
if additional_fields: if additional_fields:
streaming_url = api.get_streaming_url(raw_video_info['canonical_url']) if raw_video_info['name'] == 'live':
streaming_url = None
else:
streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
views = api.get_views(video_id=claim_id, auth_token = auth_token) views = api.get_views(video_id=claim_id, auth_token = auth_token)
likes, dislikes = api.get_video_reactions( likes, dislikes = api.get_video_reactions(
video_id = claim_id, video_id = claim_id,
@@ -212,11 +222,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
text = raw_video_info['value'].get('description'), text = raw_video_info['value'].get('description'),
languages = raw_video_info['value'].get('languages'), languages = raw_video_info['value'].get('languages'),
tags = raw_video_info['value'].get('tags',[]), tags = raw_video_info['value'].get('tags',[]),
title = raw_video_info['value']['title'], title = raw_video_info['value'].get('title'),
duration = duration, duration = duration,
thumbnail = thumbnail, thumbnail = thumbnail,
is_comment = False, is_comment = False,
raw = json.dumps(raw_video_info), raw = raw,
views = views, views = views,
likes = likes, likes = likes,
dislikes = dislikes, dislikes = dislikes,
@@ -254,4 +264,68 @@ def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video'
return recommended_videos return recommended_videos
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
class RecommendationEngine:
    """Crawl video recommendations outward from a list of seed channels.

    Starting from every video of the seed channels, the engine repeatedly
    asks the API for the videos recommended next to each not-yet-processed
    video, records a (claim_id, recommended_claim_id) edge for each
    recommendation, and finally collapses those video-level edges into
    weighted channel-to-channel edges.

    Attributes:
        channel_list: names of the seed channels.
        auth_token: token obtained from api.get_auth_token().
        edge_list: (claim_id, recommended_claim_id) pairs, one per recommendation.
        new_videos: videos still waiting to be processed (the frontier).
        already_done_claim_ids: claim ids whose recommendations were fetched.
        claim_id_to_video: every video seen so far, keyed by claim id.
        weighted_edge_list: (source_channel, target_channel, weight) triples
            produced by the last call to generate().
    """

    #-------------------------------------------------------------------------#

    def __init__(self, channel_list):
        self.channel_list = channel_list
        self.auth_token = api.get_auth_token()
        self.edge_list = []
        self.new_videos = []
        self.already_done_claim_ids = []
        self.claim_id_to_video = {}
        # Defined up front so the attribute exists before generate() runs.
        self.weighted_edge_list = []

    #-------------------------------------------------------------------------#

    def generate(self, iterations = 1):
        """Run the crawl and return the channel-level recommendation network.

        Args:
            iterations: number of expansion rounds. Round 0 processes the
                seed channels' videos; each later round processes the videos
                discovered so far that have not been processed yet.

        Returns:
            A tuple (weighted_edge_list, claim_id_to_video) where
            weighted_edge_list holds (source_channel, target_channel, weight)
            triples ordered by descending weight.
        """
        # Set mirror of already_done_claim_ids for O(1) membership tests; the
        # list attribute is still maintained for callers that read it.
        done = set(self.already_done_claim_ids)

        for channel_name in self.channel_list:
            print(channel_name)
            scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
            self.new_videos.extend(scraper.get_all_videos(additional_fields = False))

        # Merge instead of replacing, so videos found by an earlier call (and
        # still referenced by edge_list) are not dropped from the mapping.
        self.claim_id_to_video.update((video.claim_id, video) for video in self.new_videos)
        # Do not re-crawl seeds handled by an earlier call (no-op on first use).
        self.new_videos = [video for video in self.new_videos if video.claim_id not in done]

        for iteration in range(int(iterations)):
            for i, video in enumerate(self.new_videos):
                claim_id = video.claim_id
                title = video.title
                print(f'ITERATION: {iteration} | VIDEO: {i} / {len(self.new_videos)} | CLAIM_ID: {claim_id}')

                recommended_video_info = api.get_recommended(video_title = title, video_id = claim_id)
                for rec_video_info in recommended_video_info:
                    rec_claim_id = rec_video_info['claim_id']
                    self.edge_list.append((claim_id, rec_claim_id))
                    if rec_claim_id not in self.claim_id_to_video:
                        self.claim_id_to_video[rec_claim_id] = process_raw_video_info(
                            raw_video_info = rec_video_info,
                            auth_token = self.auth_token,
                            additional_fields = False)

                self.already_done_claim_ids.append(claim_id)
                done.add(claim_id)

            # Next frontier: every known video not processed yet.
            self.new_videos = [video for video in self.claim_id_to_video.values() if video.claim_id not in done]

        self.weighted_edge_list = self._build_weighted_edge_list()
        return self.weighted_edge_list, self.claim_id_to_video

    #-------------------------------------------------------------------------#

    def _build_weighted_edge_list(self):
        """Collapse video-level edges into weighted channel-level edges."""
        claim_id_to_channel = {
            claim_id : video.channel_name
            for claim_id, video in self.claim_id_to_video.items()}

        channel_edge_counts = Counter()
        for source_claim_id, target_claim_id in self.edge_list:
            # .get(): a frontier video's claim_id is not guaranteed to be a
            # key of claim_id_to_video (the mapping is keyed by the id the
            # recommendation carried), so an unknown endpoint is skipped
            # rather than raising KeyError.
            source = claim_id_to_channel.get(source_claim_id)
            target = claim_id_to_channel.get(target_claim_id)
            if source is None or target is None:
                continue
            channel_edge_counts[(source, target)] += 1

        return [(source, target, weight) for (source, target), weight in channel_edge_counts.most_common()]
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#