From 71eecf7c9ebd52ef25735745a6b2ec255d118817 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Tue, 12 Apr 2022 02:45:01 -0500
Subject: [PATCH] added recommendation engine and updated example, handled
 additional edge cases

---
 examples/generate_network.py | 55 ++++++---------------
 polyphemus/api.py            |  2 +-
 polyphemus/base.py           | 92 ++++++++++++++++++++++++++++++++----
 3 files changed, 98 insertions(+), 51 deletions(-)

diff --git a/examples/generate_network.py b/examples/generate_network.py
index 8d58d0a..37dc800 100644
--- a/examples/generate_network.py
+++ b/examples/generate_network.py
@@ -6,66 +6,39 @@ from pathlib import Path
 import pickle
 import os
 
+import networkx as nx
+
 import polyphemus
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 CHANNEL_NAME = 'PatriotFront'
 
-ITERATIONS = 3
+ITERATIONS = 2
 
-OUTPUT_DIR = '../../data'
+OUTPUT_DIR = Path('../../data', f'{CHANNEL_NAME}_recommendation_iterations={ITERATIONS}')
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 if __name__ == '__main__':
 
-    auth_token = polyphemus.api.get_auth_token()
+    engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME])
 
-    scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token)
+    weighted_edge_list, claim_id_to_video = engine.generate(iterations = ITERATIONS)  # crawl to the same depth OUTPUT_DIR is named after
 
-    edge_list = list()
-    already_done = list()
-
-    new_videos = list(scraper.get_all_videos())
-    master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos))
-
-    for iteration in range(ITERATIONS):
-        
-        print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n')
-
-        for i, video in enumerate(new_videos):
-            claim_id = video.claim_id
-            title = video.title
-
-            print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n')
-
-            recommended_video_info = polyphemus.api.get_recommended(title, claim_id)
-
-            for rec_video_info in recommended_video_info:
-                rec_claim_id = rec_video_info['claim_id']
-                print(f'REC_CLAIM_ID: {rec_claim_id}')
-
-                edge_list.append((claim_id, rec_claim_id))
-
-                if rec_video_info['claim_id'] not in master_video_dict:
-                    master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info(
-                        raw_video_info = rec_video_info,
-                        auth_token = auth_token,
-                        additional_fields = False)
-
-            already_done.append(claim_id)
-
-        new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done]
+    G = nx.DiGraph()
+    G.add_weighted_edges_from(weighted_edge_list)
 
     #-------------------------------------------------------------------------#
 
     os.makedirs(OUTPUT_DIR, exist_ok = True)
 
-    with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f:
-        pickle.dump(master_video_dict, f)
+    nx.write_gexf(G = G, path = Path(OUTPUT_DIR, 'network.gexf'))
 
-    with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f:
-        pickle.dump(edge_list, f)
+    with open(Path(OUTPUT_DIR, 'weighted_edge_list.pkl'), 'wb') as f:  # first value returned by engine.generate()
+        pickle.dump(weighted_edge_list, f)
+
+    with open(Path(OUTPUT_DIR, 'claim_id_to_video.pkl'), 'wb') as f:  # second value returned by engine.generate()
+        pickle.dump(claim_id_to_video, f)
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file
diff --git a/polyphemus/api.py b/polyphemus/api.py
index 955b71f..13c460a 100644
--- a/polyphemus/api.py
+++ b/polyphemus/api.py
@@ -427,7 +427,7 @@ def get_streaming_url(canonical_url: str) -> str:
             'url' : BACKEND_API_URL, 
             'json': json_data})
 
-    video_url = json.loads(response.text)['result'].get('streaming_url')
+    video_url = json.loads(response.text).get('result', {}).get('streaming_url')
 
     return video_url
 
diff --git a/polyphemus/base.py b/polyphemus/base.py
index d26e58c..4be9e48 100644
--- a/polyphemus/base.py
+++ b/polyphemus/base.py
@@ -10,6 +10,7 @@ from urllib.parse import unquote
 from dataclasses import dataclass
 import typing
 from datetime import datetime 
+from collections import Counter
 
 from polyphemus import api
 
@@ -102,13 +103,13 @@ class OdyseeChannelScraper:
         
     #-------------------------------------------------------------------------#
 
-    def get_all_videos(self) -> typing.Generator[Video, None, None]:
+    def get_all_videos(self, additional_fields: bool = True) -> typing.Generator[Video, None, None]:
 
         """Return list of Video objects for all videos posted by the specified channel
         """
 
         raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id)
-        videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list)
+        videos = (process_raw_video_info(raw_video_info = raw_video_info, auth_token = self.auth_token, additional_fields = additional_fields) for raw_video_info in raw_video_info_list)
         
         return videos
 
@@ -140,6 +141,10 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
     else:
         auth_token = auth_token
 
+    raw = json.dumps(raw_video_info)
+
+    claim_id = raw_video_info['claim_id']
+
     # Handle edge cases
     #.....................................................................#
 
@@ -152,8 +157,12 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
     elif 'claim_hash' in raw_video_info['value']:
         video_type = 'repost'
         duration = None
-        raw_video_info['value'] = raw_video_info['reposted_claim']['value']
-        raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
+        if 'reposted_claim' in raw_video_info:
+            raw_video_info['value'] = raw_video_info['reposted_claim']['value']
+            raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
+            claim_id = raw_video_info['reposted_claim']['claim_id']
+        else:
+            raw_video_info['value'] = {}
     elif 'image' in raw_video_info['value']:
         video_type = 'image'
         duration = None
@@ -184,10 +193,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
     # Retrieve additional fields
     #.....................................................................#
 
-    claim_id = raw_video_info['claim_id']
-
     if additional_fields:
-        streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
+        if raw_video_info['name'] == 'live':
+            streaming_url = None
+        else:
+            streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
         views = api.get_views(video_id=claim_id, auth_token = auth_token)
         likes, dislikes = api.get_video_reactions(
             video_id = claim_id,
@@ -212,11 +222,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
         text = raw_video_info['value'].get('description'),
         languages = raw_video_info['value'].get('languages'),
         tags = raw_video_info['value'].get('tags',[]),
-        title = raw_video_info['value']['title'],
+        title = raw_video_info['value'].get('title'),
         duration = duration,
         thumbnail = thumbnail,
         is_comment = False,
-        raw = json.dumps(raw_video_info),
+        raw = raw,
         views = views,
         likes = likes,
         dislikes = dislikes,
@@ -254,4 +264,68 @@ def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video'
 
     return recommended_videos
 
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+class RecommendationEngine:
+    """Crawl Odysee recommendations outward from seed channels and
+    aggregate the video-to-video links into weighted channel edges."""
+
+    #-------------------------------------------------------------------------#
+
+    def __init__(self, channel_list):
+        """Store the seed channel names and fetch an API auth token."""
+        self.channel_list = channel_list
+        self.auth_token = api.get_auth_token()
+
+        self.edge_list = []  # (source claim_id, recommended claim_id) pairs
+        self.new_videos = []  # frontier: videos whose recommendations are not fetched yet
+        self.already_done_claim_ids = []
+        self.claim_id_to_video = {}
+        self.weighted_edge_list = []  # filled in by generate()
+
+    #-------------------------------------------------------------------------#
+
+    def generate(self, iterations = 1):
+        """Expand the recommendation graph `iterations` levels deep.
+
+        Returns (weighted_edge_list, claim_id_to_video): a list of
+        (source_channel, target_channel, weight) tuples, most frequent first,
+        and the dict of every Video collected, keyed by claim_id.
+        """
+
+        for channel_name in self.channel_list:
+            print(channel_name)
+            scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
+            self.new_videos.extend(scraper.get_all_videos(additional_fields = False))
+
+        self.claim_id_to_video = {video.claim_id : video for video in self.new_videos}
+        done = set(self.already_done_claim_ids)  # set mirror of the list: O(1) membership tests
+
+        for iteration in range(int(iterations)):
+            for i, video in enumerate(self.new_videos):
+                claim_id = video.claim_id
+                print(f'ITERATION: {iteration} | VIDEO: {i} / {len(self.new_videos)} | CLAIM_ID: {claim_id}')
+
+                for rec_video_info in api.get_recommended(video_title = video.title, video_id = claim_id):
+                    rec_claim_id = rec_video_info['claim_id']
+                    self.edge_list.append((claim_id, rec_claim_id))
+
+                    if rec_claim_id not in self.claim_id_to_video:
+                        self.claim_id_to_video[rec_claim_id] = process_raw_video_info(
+                            raw_video_info = rec_video_info,
+                            auth_token = self.auth_token,
+                            additional_fields = False)
+
+                self.already_done_claim_ids.append(claim_id)
+                done.add(claim_id)
+
+            self.new_videos = [video for video in self.claim_id_to_video.values() if video.claim_id not in done]
+
+        # .get(): an edge endpoint may be missing from the dict (presumably re-keyed reposts - TODO confirm); drop such edges
+        channel_of = {claim_id : video.channel_name for claim_id, video in self.claim_id_to_video.items()}
+        channel_edges = [(channel_of.get(source), channel_of.get(target)) for source, target in self.edge_list]
+        c = Counter(edge for edge in channel_edges if None not in edge)
+        self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()]
+        return self.weighted_edge_list, self.claim_id_to_video
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file