mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-07 19:08:33 +03:00
fixed problems in recommendationengine class, updated README
This commit is contained in:
@@ -4,8 +4,4 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/).
|
|||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
- Implement CLI
|
- Implement CLI
|
||||||
- Profile run-time, look into implementing async requests
|
- Profile run-time, look into implementing async requests
|
||||||
- Add error handling/backoff waiting to requests
|
|
||||||
- Implement basic test suite
|
|
||||||
- Formalize network graph generation into class/module
|
|
||||||
- Work on reverse-engineering auth_token instead of having it hard-coded
|
|
||||||
@@ -24,7 +24,7 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME])
|
engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME])
|
||||||
|
|
||||||
weighted_edge_list, claim_id_to_video = engine.generate(iterations = 1)
|
weighted_edge_list, channels, claim_id_to_video = engine.generate(iterations = ITERATIONS)
|
||||||
|
|
||||||
G = nx.DiGraph()
|
G = nx.DiGraph()
|
||||||
G.add_weighted_edges_from(weighted_edge_list)
|
G.add_weighted_edges_from(weighted_edge_list)
|
||||||
|
|||||||
8
polyphemus/_cli.py
Normal file
8
polyphemus/_cli.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
|
||||||
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
|
from . import api
|
||||||
|
from . import base
|
||||||
|
|
||||||
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
@@ -68,7 +68,7 @@ def make_request(request: Callable, kwargs: dict) -> requests.Response:
|
|||||||
retry_reasons = []
|
retry_reasons = []
|
||||||
|
|
||||||
# TODO this looks a bit gross, try to refactor
|
# TODO this looks a bit gross, try to refactor
|
||||||
while n_retries < 5:
|
while n_retries < 10:
|
||||||
time.sleep(2 ** n_retries - 1)
|
time.sleep(2 ** n_retries - 1)
|
||||||
try:
|
try:
|
||||||
response = request(**kwargs)
|
response = request(**kwargs)
|
||||||
|
|||||||
@@ -218,7 +218,7 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
|
|||||||
channel_id = channel_id,
|
channel_id = channel_id,
|
||||||
channel_name = channel_name,
|
channel_name = channel_name,
|
||||||
claim_id = raw_video_info['claim_id'],
|
claim_id = raw_video_info['claim_id'],
|
||||||
created = datetime.fromtimestamp(int(created)),
|
created = datetime.fromtimestamp(max(int(created), 0)),
|
||||||
text = raw_video_info['value'].get('description'),
|
text = raw_video_info['value'].get('description'),
|
||||||
languages = raw_video_info['value'].get('languages'),
|
languages = raw_video_info['value'].get('languages'),
|
||||||
tags = raw_video_info['value'].get('tags',[]),
|
tags = raw_video_info['value'].get('tags',[]),
|
||||||
@@ -269,14 +269,15 @@ class RecommendationEngine:
|
|||||||
#-------------------------------------------------------------------------#
|
#-------------------------------------------------------------------------#
|
||||||
|
|
||||||
def generate(self, iterations = 1):
|
def generate(self, iterations = 1):
|
||||||
|
|
||||||
for channel_name in self.channel_list:
|
if not self.new_videos:
|
||||||
print(channel_name)
|
for channel_name in self.channel_list:
|
||||||
scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
|
print(channel_name)
|
||||||
|
scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
|
||||||
self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False)))
|
|
||||||
|
self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False)))
|
||||||
self.claim_id_to_video = dict(zip([v.claim_id for v in self.new_videos], self.new_videos))
|
|
||||||
|
self.claim_id_to_video.update(dict(zip([v.claim_id for v in self.new_videos], self.new_videos)))
|
||||||
|
|
||||||
for iteration in range(int(iterations)):
|
for iteration in range(int(iterations)):
|
||||||
|
|
||||||
@@ -311,6 +312,15 @@ class RecommendationEngine:
|
|||||||
c = Counter(channel_edge_list)
|
c = Counter(channel_edge_list)
|
||||||
self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()]
|
self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()]
|
||||||
|
|
||||||
return self.weighted_edge_list, self.claim_id_to_video
|
usernames = set([channel.strip('@') for edge in self.weighted_edge_list for channel in edge[:2]])
|
||||||
|
|
||||||
|
self.channels = {}
|
||||||
|
for username in usernames:
|
||||||
|
try:
|
||||||
|
self.channels['@' + username] = OdyseeChannelScraper(channel_name = username, auth_token=self.auth_token).get_entity().__dict__
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return self.weighted_edge_list, self.channels, self.claim_id_to_video
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
55
setup.py
55
setup.py
@@ -10,36 +10,39 @@ from setuptools import setup
|
|||||||
def readme( ):
|
def readme( ):
|
||||||
|
|
||||||
with open( os.path.abspath(
|
with open( os.path.abspath(
|
||||||
os.path.join(
|
os.path.join(
|
||||||
os.path.dirname( __file__ ),
|
os.path.dirname( __file__ ),
|
||||||
'README.md' ) ) ) as f:
|
'README.md' ) ) ) as f:
|
||||||
|
|
||||||
return f.read( )
|
return f.read( )
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name = 'polyphemus',
|
name = 'polyphemus',
|
||||||
version = '0.1',
|
version = '0.1',
|
||||||
description = 'Scraping Odysee video data',
|
description = 'Scraping Odysee video data',
|
||||||
long_description = readme( ),
|
long_description = readme(),
|
||||||
author = 'Bellingcat',
|
author = 'Bellingcat',
|
||||||
packages = [
|
packages = [
|
||||||
'polyphemus' ],
|
'polyphemus'],
|
||||||
install_requires = [
|
install_requires = [
|
||||||
'requests >= 2.27.0',
|
'requests >= 2.27.0',
|
||||||
'beautifulsoup4 >= 4.10.0',
|
'beautifulsoup4 >= 4.10.0',
|
||||||
'pandas >= 1.4.0'],
|
'pandas >= 1.4.0'],
|
||||||
extras_require = {
|
extras_require = {
|
||||||
'docs': [
|
'docs': [
|
||||||
'sphinx >= 3.3.1',
|
'sphinx >= 3.3.1',
|
||||||
'sphinx_rtd_theme >= 0.5',],
|
'sphinx_rtd_theme >= 0.5',],
|
||||||
'tests': [
|
'tests': [
|
||||||
'pytest >= 6.1.2',
|
'pytest >= 6.1.2',
|
||||||
'pytest-cov >= 2.10.1',
|
'pytest-cov >= 2.10.1',
|
||||||
'pytest-html >= 3.0.0',
|
'pytest-html >= 3.0.0',
|
||||||
'pytest-metadata >= 1.10.0']},
|
'pytest-metadata >= 1.10.0']},
|
||||||
include_package_data = True,
|
include_package_data = True,
|
||||||
zip_safe = False )
|
zip_safe = False,
|
||||||
|
entry_points = {
|
||||||
|
'console_scripts': [
|
||||||
|
'polyphemus = polyphemus._cli:main']})
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
Reference in New Issue
Block a user