diff --git a/.gitignore b/.gitignore index f8ca565..92279f2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.csv -**/data/ \ No newline at end of file +**/data/ + +**/__pycache__/ +*.pyc \ No newline at end of file diff --git a/examples/scrape.py b/examples/scrape.py new file mode 100644 index 0000000..ca32fe3 --- /dev/null +++ b/examples/scrape.py @@ -0,0 +1,52 @@ +# -*- coding: UTF-8 -*- + +"""Scrape all video and comment data from a specified Odysee channel +""" + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +import csv +from pathlib import Path +import os + +import pandas as pd + +from polyphemus.base import OdyseeChannel + + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +CHANNEL_NAME = 'PatriotFront' +OUTPUT_DIR = Path('.').resolve().parents[1]/'data' + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +if __name__ == '__main__': + + odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME) + + video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments() + + channel_df = pd.DataFrame([odysee_channel.info]) + video_df = pd.DataFrame(video_info_list) + comment_df = pd.DataFrame(comment_info_list) + + output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME) + os.makedirs(output_subdir, exist_ok = True) + + channel_df.to_csv( + path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_channel.csv'), + index = False, + quoting = csv.QUOTE_NONNUMERIC ) + + video_df.to_csv( + path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_videos.csv'), + index = False, + quoting = csv.QUOTE_NONNUMERIC ) + + comment_df.to_csv( + path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_comments.csv'), + index = False, + quoting = csv.QUOTE_NONNUMERIC ) + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/polyphemus/__init__.py b/polyphemus/__init__.py new file mode 100644 index 0000000..b555347 --- /dev/null +++ b/polyphemus/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: UTF-8 -*- + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +from . import base +from . import utils + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus.py b/polyphemus/base.py similarity index 81% rename from polyphemus.py rename to polyphemus/base.py index 5c219ef..46dc5d6 100644 --- a/polyphemus.py +++ b/polyphemus/base.py @@ -1,26 +1,20 @@ # -*- coding: UTF-8 -*- -"""Functions and classes for scraping video data from Odysee video platform. +"""Base classes and methods for scraping video data from Odysee video platform. """ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# import json -import csv -from pathlib import Path -import os +from urllib.parse import quote import requests -import pandas as pd #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #TODO Figure out how to reverse-engineer this AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U' -CHANNEL_NAME = 'PatriotFront' -OUTPUT_DIR = Path('.').resolve().parent/'data' - #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# class OdyseeChannel: @@ -57,7 +51,7 @@ class OdyseeChannel: info = result['result'][channel_url] - channel_info = { + info = { 'channel_id' : info['claim_id'], 'title' : info['value']['title'], 'created': info['timestamp'], @@ -66,8 +60,8 @@ class OdyseeChannel: 'thumbnail_image': info['value']['thumbnail']['url'], 'raw' : response.text} - self._channel_info = channel_info - self._channel_id = self._channel_info['channel_id'] + self.info = info + self._channel_id = self.info['channel_id'] #-------------------------------------------------------------------------# @@ -121,7 +115,7 @@ class OdyseeChannel: def process_all_videos(self): self.get_all_videos() - all_videos_processed = [OdyseeVideo(video)._video_info for video in self._all_videos] + all_videos_processed = [OdyseeVideo(video).info for video in self._all_videos] return all_videos_processed @@ -131,7 +125,7 @@ class OdyseeChannel: self.get_all_videos() all_videos = [OdyseeVideo(video) for video in self._all_videos] - all_videos_processed = [video._video_info for video in all_videos] + all_videos_processed = [video.info for video in all_videos] all_comments_processed = [] @@ -148,19 +142,20 @@ class OdyseeVideo: def __init__(self, full_video_info): - self._video_info = { + self.info = { 'canonical_url' : full_video_info['canonical_url'], + 'channel' : full_video_info['signing_channel']['name'], 'claim_id' : full_video_info['claim_id'], 'created' : full_video_info['value']['release_time'], - 'description' : full_video_info['value']['description'], - 'languages' : full_video_info['value']['languages'], + 'description' : full_video_info['value'].get('description'), + 'languages' : full_video_info['value'].get('languages'), 'tags' : full_video_info['value'].get('tags',[]), 'title' : full_video_info['value']['title'], 'duration' : full_video_info['value']['video']['duration'], 'thumbnail' : full_video_info['value']['thumbnail']['url'], 'raw' : json.dumps(full_video_info)} - self._claim_id = self._video_info ['claim_id'] + self._claim_id = self.info ['claim_id'] self.get_views() self.get_video_reactions() @@ -181,7 +176,7 @@ class OdyseeVideo: response = requests.get(api_url, params = params) views = json.loads(response.text)['data'][0] - self._video_info['views'] = views + self.info['views'] = views #-------------------------------------------------------------------------# @@ -200,8 +195,8 @@ class OdyseeVideo: result = json.loads(response.text) reactions = result['data']['others_reactions'][self._claim_id ] - self._video_info['likes'] = reactions['like'] - self._video_info['dislikes'] = reactions['dislike'] + self.info['likes'] = reactions['like'] + self.info['dislikes'] = reactions['dislike'] #-------------------------------------------------------------------------# @@ -262,17 +257,40 @@ class OdyseeVideo: def process_all_comments(self): self.get_all_comments() - all_comments_processed = [OdyseeComment(comment)._comment_info for comment in self._all_comments] + all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments] return all_comments_processed + #-------------------------------------------------------------------------# + + def get_recommended(self, n = 20): + + api_url = 'https://recsys.odysee.com/search' + + name = quote(self.info['title']) + + params = { + 's':name, + 'size':str(int(n)), + 'from':'0', + 'related_to':self._claim_id} + + response = requests.get(api_url, params = params) + result = json.loads(response.text) + + recommended_video_info = [name_to_video_info(r['name']) for r in result] + recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream'] + recommended_videos = [OdyseeVideo(video_info) for video_info in recommended_video_info] + + return recommended_videos + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# class OdyseeComment: def __init__(self, full_comment_info): - self._comment_info = { + self.info = { 'comment' : full_comment_info['comment'], 'created' : full_comment_info['timestamp'], 'video_claim_id' : full_comment_info['claim_id'], @@ -329,37 +347,21 @@ def append_comment_reactions(comments): #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -if __name__ == '__main__': - - odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME) - - video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments() - - channel_df = pd.DataFrame([odysee_channel._channel_info]) - video_df = pd.DataFrame(video_info_list) - comment_df = pd.DataFrame(comment_info_list) - - output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME) - os.makedirs(output_subdir, exist_ok = True) - - channel_df.to_csv( - path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_channel.csv'), - index = False, - quoting = csv.QUOTE_NONNUMERIC ) - - video_df.to_csv( - path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_videos.csv'), - index = False, - quoting = csv.QUOTE_NONNUMERIC ) - - comment_df.to_csv( - path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_comments.csv'), - index = False, - quoting = csv.QUOTE_NONNUMERIC ) - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# - +def name_to_video_info(name): + url = f"lbry://{name}" + post_data = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":[url]}} + api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + response = requests.post(url = api_url, json = post_data) + result = json.loads(response.text) + + return result['result'][url] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/polyphemus/utils.py b/polyphemus/utils.py new file mode 100644 index 0000000..69672dd --- /dev/null +++ b/polyphemus/utils.py @@ -0,0 +1,43 @@ +# -*- coding: UTF-8 -*- + +"""Utility functions for scraping video data from Odysee video platform. +""" + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +import json + +import requests + +from .base import OdyseeVideo + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def name_to_video_info(name): + + url = f"lbry://{name}" + + post_data = { + "jsonrpc":"2.0", + "method":"resolve", + "params":{ + "urls":[url]}} + + api_url = 'https://api.na-backend.odysee.com/api/v1/proxy' + + response = requests.post(url = api_url, json = post_data) + result = json.loads(response.text) + + return result['result'][url] + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def name_to_video(name): + + video_info = name_to_video_info(name) + video = OdyseeVideo(video_info) + + return video + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..69637da --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +# -*- coding: UTF-8 -*- + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +import os.path +from setuptools import setup + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def readme( ): + + with open( os.path.abspath( + os.path.join( + os.path.dirname( __file__ ), + 'README.md' ) ) ) as f: + + return f.read( ) + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +setup( + name = 'polyphemus', + version = '0.1', + description = 'Scraping Odysee video data', + long_description = readme( ), + author = 'Bellingcat', + packages = [ + 'polyphemus' ], + install_requires = [ + 'requests >= 2.27.0', + 'beautifulsoup4 >= 4.10.0', + 'pandas >= 1.4.0'], + include_package_data = True, + zip_safe = False ) + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file