Fixed problems in RecommendationEngine class, updated README

This commit is contained in:
Tristan Lee
2022-08-09 09:45:40 -05:00
parent b18e5591fa
commit cb30169ece
6 changed files with 60 additions and 43 deletions

View File

@@ -4,8 +4,4 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/).
### TODO ### TODO
- Implement CLI - Implement CLI
- Profile run-time, look into implementing async requests - Profile run-time, look into implementing async requests
- Add error handling/backoff waiting to requests
- Implement basic test suite
- Formalize network graph generation into class/module
- Work on reverse-engineering auth_token instead of having it hard-coded

View File

@@ -24,7 +24,7 @@ if __name__ == '__main__':
engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME]) engine = polyphemus.base.RecommendationEngine(channel_list= [CHANNEL_NAME])
weighted_edge_list, claim_id_to_video = engine.generate(iterations = 1) weighted_edge_list, channels, claim_id_to_video = engine.generate(iterations = ITERATIONS)
G = nx.DiGraph() G = nx.DiGraph()
G.add_weighted_edges_from(weighted_edge_list) G.add_weighted_edges_from(weighted_edge_list)

8
polyphemus/_cli.py Normal file
View File

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
from . import api
from . import base
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -68,7 +68,7 @@ def make_request(request: Callable, kwargs: dict) -> requests.Response:
retry_reasons = [] retry_reasons = []
# TODO this looks a bit gross, try to refactor # TODO this looks a bit gross, try to refactor
while n_retries < 5: while n_retries < 10:
time.sleep(2 ** n_retries - 1) time.sleep(2 ** n_retries - 1)
try: try:
response = request(**kwargs) response = request(**kwargs)

View File

@@ -218,7 +218,7 @@ def process_raw_video_info(raw_video_info: dict, auth_token: str = None, additio
channel_id = channel_id, channel_id = channel_id,
channel_name = channel_name, channel_name = channel_name,
claim_id = raw_video_info['claim_id'], claim_id = raw_video_info['claim_id'],
created = datetime.fromtimestamp(int(created)), created = datetime.fromtimestamp(max(int(created), 0)),
text = raw_video_info['value'].get('description'), text = raw_video_info['value'].get('description'),
languages = raw_video_info['value'].get('languages'), languages = raw_video_info['value'].get('languages'),
tags = raw_video_info['value'].get('tags',[]), tags = raw_video_info['value'].get('tags',[]),
@@ -269,14 +269,15 @@ class RecommendationEngine:
#-------------------------------------------------------------------------# #-------------------------------------------------------------------------#
def generate(self, iterations = 1): def generate(self, iterations = 1):
for channel_name in self.channel_list: if not self.new_videos:
print(channel_name) for channel_name in self.channel_list:
scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token) print(channel_name)
scraper = OdyseeChannelScraper(channel_name = channel_name, auth_token = self.auth_token)
self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False)))
self.new_videos.extend(list(scraper.get_all_videos(additional_fields = False)))
self.claim_id_to_video = dict(zip([v.claim_id for v in self.new_videos], self.new_videos))
self.claim_id_to_video.update(dict(zip([v.claim_id for v in self.new_videos], self.new_videos)))
for iteration in range(int(iterations)): for iteration in range(int(iterations)):
@@ -311,6 +312,15 @@ class RecommendationEngine:
c = Counter(channel_edge_list) c = Counter(channel_edge_list)
self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()] self.weighted_edge_list = [(source, target, weight) for (source, target), weight in c.most_common()]
return self.weighted_edge_list, self.claim_id_to_video usernames = set([channel.strip('@') for edge in self.weighted_edge_list for channel in edge[:2]])
self.channels = {}
for username in usernames:
try:
self.channels['@' + username] = OdyseeChannelScraper(channel_name = username, auth_token=self.auth_token).get_entity().__dict__
except KeyError:
pass
return self.weighted_edge_list, self.channels, self.claim_id_to_video
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -10,36 +10,39 @@ from setuptools import setup
def readme( ): def readme( ):
with open( os.path.abspath( with open( os.path.abspath(
os.path.join( os.path.join(
os.path.dirname( __file__ ), os.path.dirname( __file__ ),
'README.md' ) ) ) as f: 'README.md' ) ) ) as f:
return f.read( ) return f.read( )
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
setup( setup(
name = 'polyphemus', name = 'polyphemus',
version = '0.1', version = '0.1',
description = 'Scraping Odysee video data', description = 'Scraping Odysee video data',
long_description = readme( ), long_description = readme(),
author = 'Bellingcat', author = 'Bellingcat',
packages = [ packages = [
'polyphemus' ], 'polyphemus'],
install_requires = [ install_requires = [
'requests >= 2.27.0', 'requests >= 2.27.0',
'beautifulsoup4 >= 4.10.0', 'beautifulsoup4 >= 4.10.0',
'pandas >= 1.4.0'], 'pandas >= 1.4.0'],
extras_require = { extras_require = {
'docs': [ 'docs': [
'sphinx >= 3.3.1', 'sphinx >= 3.3.1',
'sphinx_rtd_theme >= 0.5',], 'sphinx_rtd_theme >= 0.5',],
'tests': [ 'tests': [
'pytest >= 6.1.2', 'pytest >= 6.1.2',
'pytest-cov >= 2.10.1', 'pytest-cov >= 2.10.1',
'pytest-html >= 3.0.0', 'pytest-html >= 3.0.0',
'pytest-metadata >= 1.10.0']}, 'pytest-metadata >= 1.10.0']},
include_package_data = True, include_package_data = True,
zip_safe = False ) zip_safe = False,
entry_points = {
'console_scripts': [
'polyphemus = polyphemus._cli:main']})
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#