mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-08 03:18:32 +03:00
added recommendation engine and updated example, handled additional edge cases
This commit is contained in:
@@ -6,66 +6,39 @@ from pathlib import Path
|
|||||||
import pickle
|
import pickle
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
import polyphemus
|
import polyphemus
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
CHANNEL_NAME = 'PatriotFront'
|
CHANNEL_NAME = 'PatriotFront'
|
||||||
|
|
||||||
ITERATIONS = 3
|
ITERATIONS = 2
|
||||||
|
|
||||||
OUTPUT_DIR = '../../data'
|
OUTPUT_DIR = Path('../../data', f'{CHANNEL_NAME}_recommendation_iterations={ITERATIONS}')
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
auth_token = polyphemus.api.get_auth_token()
|
engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME])
|
||||||
|
|
||||||
scraper = polyphemus.base.OdyseeChannelScraper(channel_name = CHANNEL_NAME, auth_token = auth_token)
|
weighted_edge_list, claim_id_to_video = engine.generate(iterations = 1)
|
||||||
|
|
||||||
edge_list = list()
|
G = nx.DiGraph()
|
||||||
already_done = list()
|
G.add_weighted_edges_from(weighted_edge_list)
|
||||||
|
|
||||||
new_videos = list(scraper.get_all_videos())
|
|
||||||
master_video_dict = dict(zip([v.claim_id for v in new_videos], new_videos))
|
|
||||||
|
|
||||||
for iteration in range(ITERATIONS):
|
|
||||||
|
|
||||||
print(f'\n\nITERATION: {iteration}, N_VIDEOS: {len(new_videos)}\n\n')
|
|
||||||
|
|
||||||
for i, video in enumerate(new_videos):
|
|
||||||
claim_id = video.claim_id
|
|
||||||
title = video.title
|
|
||||||
|
|
||||||
print(f'\nVIDEO: {i}; CLAIM_ID: {claim_id}\n')
|
|
||||||
|
|
||||||
recommended_video_info = polyphemus.api.get_recommended(title, claim_id)
|
|
||||||
|
|
||||||
for rec_video_info in recommended_video_info:
|
|
||||||
rec_claim_id = rec_video_info['claim_id']
|
|
||||||
print(f'REC_CLAIM_ID: {rec_claim_id}')
|
|
||||||
|
|
||||||
edge_list.append((claim_id, rec_claim_id))
|
|
||||||
|
|
||||||
if rec_video_info['claim_id'] not in master_video_dict:
|
|
||||||
master_video_dict[rec_claim_id] = polyphemus.base.process_raw_video_info(
|
|
||||||
raw_video_info = rec_video_info,
|
|
||||||
auth_token = auth_token,
|
|
||||||
additional_fields = False)
|
|
||||||
|
|
||||||
already_done.append(claim_id)
|
|
||||||
|
|
||||||
new_videos = [video for video in master_video_dict.values() if video.claim_id not in already_done]
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------#
|
#-------------------------------------------------------------------------#
|
||||||
|
|
||||||
os.makedirs(OUTPUT_DIR, exist_ok = True)
|
os.makedirs(OUTPUT_DIR, exist_ok = True)
|
||||||
|
|
||||||
with open(Path(OUTPUT_DIR, f'master_video_dict_iterations={ITERATIONS}.pkl'), 'wb') as f:
|
nx.write_gexf(G = G, path = Path(OUTPUT_DIR, 'network.gexf'))
|
||||||
pickle.dump(master_video_dict, f)
|
|
||||||
|
|
||||||
with open(Path(OUTPUT_DIR, f'edge_list_iterations={ITERATIONS}.pkl'), 'wb') as f:
|
with open(Path(OUTPUT_DIR, f'weighted_edge_list.pkl'), 'wb') as f:
|
||||||
pickle.dump(edge_list, f)
|
pickle.dump(weighted_edge_list, f)
|
||||||
|
|
||||||
|
with open(Path(OUTPUT_DIR, f'claim_id_to_video.pkl'), 'wb') as f:
|
||||||
|
pickle.dump(claim_id_to_video, f)
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
@@ -427,7 +427,7 @@ def get_streaming_url(canonical_url: str) -> str:
|
|||||||
'url' : BACKEND_API_URL,
|
'url' : BACKEND_API_URL,
|
||||||
'json': json_data})
|
'json': json_data})
|
||||||
|
|
||||||
video_url = json.loads(response.text)['result'].get('streaming_url')
|
video_url = json.loads(response.text).get('result', {}).get('streaming_url')
|
||||||
|
|
||||||
return video_url
|
return video_url
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from urllib.parse import unquote
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import typing
|
import typing
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
from polyphemus import api
|
from polyphemus import api
|
||||||
|
|
||||||
@@ -102,13 +103,13 @@ class OdyseeChannelScraper:
|
|||||||
|
|
||||||
#-------------------------------------------------------------------------#
|
#-------------------------------------------------------------------------#
|
||||||
|
|
||||||
def get_all_videos(self) -> typing.Generator[Video, None, None]:
|
def get_all_videos(self, additional_fields: bool = True) -> typing.Generator[Video, None, None]:
|
||||||
|
|
||||||
"""Return list of Video objects for all videos posted by the specified channel
|
"""Return list of Video objects for all videos posted by the specified channel
|
||||||
"""
|
"""
|
||||||
|
|
||||||
raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id)
|
raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id)
|
||||||
videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list)
|
videos = (process_raw_video_info(raw_video_info = raw_video_info, auth_token = self.auth_token, additional_fields = additional_fields) for raw_video_info in raw_video_info_list)
|
||||||
|
|
||||||
return videos
|
return videos
|
||||||
|
|
||||||
@@ -140,6 +141,10 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
|
|||||||
else:
|
else:
|
||||||
auth_token = auth_token
|
auth_token = auth_token
|
||||||
|
|
||||||
|
raw = json.dumps(raw_video_info)
|
||||||
|
|
||||||
|
claim_id = raw_video_info['claim_id']
|
||||||
|
|
||||||
# Handle edge cases
|
# Handle edge cases
|
||||||
#.....................................................................#
|
#.....................................................................#
|
||||||
|
|
||||||
@@ -152,8 +157,12 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
|
|||||||
elif 'claim_hash' in raw_video_info['value']:
|
elif 'claim_hash' in raw_video_info['value']:
|
||||||
video_type = 'repost'
|
video_type = 'repost'
|
||||||
duration = None
|
duration = None
|
||||||
raw_video_info['value'] = raw_video_info['reposted_claim']['value']
|
if 'reposted_claim' in raw_video_info:
|
||||||
raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
|
raw_video_info['value'] = raw_video_info['reposted_claim']['value']
|
||||||
|
raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
|
||||||
|
claim_id = raw_video_info['reposted_claim']['claim_id']
|
||||||
|
else:
|
||||||
|
raw_video_info['value'] = {}
|
||||||
elif 'image' in raw_video_info['value']:
|
elif 'image' in raw_video_info['value']:
|
||||||
video_type = 'image'
|
video_type = 'image'
|
||||||
duration = None
|
duration = None
|
||||||
@@ -184,10 +193,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
|
|||||||
# Retrieve additional fields
|
# Retrieve additional fields
|
||||||
#.....................................................................#
|
#.....................................................................#
|
||||||
|
|
||||||
claim_id = raw_video_info['claim_id']
|
|
||||||
|
|
||||||
if additional_fields:
|
if additional_fields:
|
||||||
streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
|
if raw_video_info['name'] == 'live':
|
||||||
|
streaming_url = None
|
||||||
|
else:
|
||||||
|
streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
|
||||||
views = api.get_views(video_id=claim_id, auth_token = auth_token)
|
views = api.get_views(video_id=claim_id, auth_token = auth_token)
|
||||||
likes, dislikes = api.get_video_reactions(
|
likes, dislikes = api.get_video_reactions(
|
||||||
video_id = claim_id,
|
video_id = claim_id,
|
||||||
@@ -212,11 +222,11 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
|
|||||||
text = raw_video_info['value'].get('description'),
|
text = raw_video_info['value'].get('description'),
|
||||||
languages = raw_video_info['value'].get('languages'),
|
languages = raw_video_info['value'].get('languages'),
|
||||||
tags = raw_video_info['value'].get('tags',[]),
|
tags = raw_video_info['value'].get('tags',[]),
|
||||||
title = raw_video_info['value']['title'],
|
title = raw_video_info['value'].get('title'),
|
||||||
duration = duration,
|
duration = duration,
|
||||||
thumbnail = thumbnail,
|
thumbnail = thumbnail,
|
||||||
is_comment = False,
|
is_comment = False,
|
||||||
raw = json.dumps(raw_video_info),
|
raw = raw,
|
||||||
views = views,
|
views = views,
|
||||||
likes = likes,
|
likes = likes,
|
||||||
dislikes = dislikes,
|
dislikes = dislikes,
|
||||||
@@ -254,4 +264,68 @@ def get_recommended(video: Video, auth_token: str = None) -> typing.List['Video'
|
|||||||
|
|
||||||
return recommended_videos
|
return recommended_videos
|
||||||
|
|
||||||
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
|
class RecommendationEngine:
    """Crawl video recommendations outward from seed channels and build a weighted channel graph.

    Starting from every video of the channels in ``channel_list``, the engine
    repeatedly asks the API for the videos recommended next to each video,
    records a (video -> recommended video) edge for every recommendation, and
    finally collapses those video-level edges into channel-level edges
    weighted by how often they occur.

    Attributes:
        channel_list: names of the seed channels.
        auth_token: token obtained from ``api.get_auth_token()``.
        edge_list: ``(claim_id, recommended_claim_id)`` tuples, one per recommendation seen.
        new_videos: videos still waiting to be expanded (the crawl frontier).
        already_done_claim_ids: claim ids of the videos that were already expanded.
        claim_id_to_video: every video seen so far, keyed by claim id.
        weighted_edge_list: ``(source_channel, target_channel, weight)`` tuples
            produced by the most recent ``generate`` call.
    """

    #-------------------------------------------------------------------------#

    def __init__(self, channel_list):
        """Store the seed channels, fetch an auth token and reset the crawl state."""

        self.channel_list = channel_list
        self.auth_token = api.get_auth_token()

        self.edge_list = []
        self.new_videos = []

        self.already_done_claim_ids = []
        self.claim_id_to_video = {}

        # Defined up front so the attribute exists even before `generate` runs.
        self.weighted_edge_list = []

    #-------------------------------------------------------------------------#

    def generate(self, iterations = 1):
        """Run the recommendation crawl and return the weighted channel graph.

        Args:
            iterations: number of expansion rounds; coerced with ``int()``.
                Round 0 expands the seed channels' videos, each further round
                expands the videos first discovered in the previous round.

        Returns:
            A ``(weighted_edge_list, claim_id_to_video)`` tuple.
            ``weighted_edge_list`` holds ``(source_channel, target_channel, weight)``
            tuples ordered from most to least frequent; ``claim_id_to_video``
            maps claim ids to the video objects seen during the crawl.
        """

        # Seed the frontier with every video of every requested channel.
        for channel_name in self.channel_list:
            print(channel_name)
            scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
            self.new_videos.extend(scraper.get_all_videos(additional_fields = False))

        # Merge rather than overwrite, so videos discovered by an earlier call
        # to `generate` are not forgotten (their edges are still in `edge_list`).
        self.claim_id_to_video.update({video.claim_id: video for video in self.new_videos})

        # Set mirror of `already_done_claim_ids` for O(1) membership tests; the
        # list attribute itself is kept (and kept in sync) for existing callers.
        done_claim_ids = set(self.already_done_claim_ids)

        for iteration in range(int(iterations)):

            n_videos = len(self.new_videos)

            for i, video in enumerate(self.new_videos):
                claim_id = video.claim_id
                title = video.title

                print(f'ITERATION: {iteration} | VIDEO: {i} / {n_videos} | CLAIM_ID: {claim_id}')

                recommended_video_info = api.get_recommended(video_title = title, video_id = claim_id)

                for rec_video_info in recommended_video_info:
                    rec_claim_id = rec_video_info['claim_id']

                    self.edge_list.append((claim_id, rec_claim_id))

                    # Only process a recommended video the first time it is seen.
                    if rec_claim_id not in self.claim_id_to_video:
                        self.claim_id_to_video[rec_claim_id] = process_raw_video_info(
                            raw_video_info = rec_video_info,
                            auth_token = self.auth_token,
                            additional_fields = False)

                self.already_done_claim_ids.append(claim_id)
                done_claim_ids.add(claim_id)

            # Next frontier: every known video that has not been expanded yet.
            self.new_videos = [
                video for video in self.claim_id_to_video.values()
                if video.claim_id not in done_claim_ids]

        # Project video-level edges onto channels.  `.get` is used because an
        # edge endpoint is not guaranteed to be a key of `claim_id_to_video`
        # (a video's own `claim_id` may differ from the key it was stored
        # under, e.g. presumably for reposts); such edges, and edges touching
        # a video without a channel name, are skipped instead of raising.
        claim_id_to_channel = {
            claim_id: video.channel_name
            for claim_id, video in self.claim_id_to_video.items()}

        channel_edge_list = []
        for source_claim_id, target_claim_id in self.edge_list:
            source = claim_id_to_channel.get(source_claim_id)
            target = claim_id_to_channel.get(target_claim_id)
            if source is not None and target is not None:
                channel_edge_list.append((source, target))

        edge_counts = Counter(channel_edge_list)
        self.weighted_edge_list = [
            (source, target, weight)
            for (source, target), weight in edge_counts.most_common()]

        return self.weighted_edge_list, self.claim_id_to_video
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
Reference in New Issue
Block a user