converted script into Python package

2026-06-11 21:08:32 +03:00 · 2022-02-11 14:53:36 -06:00
parent 80264bbe13
commit 1324983b49
6 changed files with 197 additions and 53 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 *.csv
-**/data/
+**/data/
+
+**/__pycache__/
+*.pyc
--- a/examples/scrape.py
+++ b/examples/scrape.py
@@ -0,0 +1,52 @@
+# -*- coding: UTF-8 -*-
+
+"""Scrape all video and comment data from a specified Odysee channel
+"""
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+import csv
+from pathlib import Path 
+import os 
+
+import pandas as pd
+
+from polyphemus.base import OdyseeChannel
+
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+# Odysee channel to scrape; also names the output subdirectory and files.
+CHANNEL_NAME = 'PatriotFront'
+# NOTE(review): anchored to the current working directory, not to this file,
+# so the output location depends on where the script is launched from.
+OUTPUT_DIR = Path('.').resolve().parents[1]/'data'
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+if __name__ == '__main__':
+
+    # Fetch the channel, then the processed info of all of its videos and
+    # of the comments on those videos.
+    channel = OdyseeChannel(channel_name = CHANNEL_NAME)
+    videos, comments = channel.process_all_videos_and_comments()
+
+    # (file name, table) pairs; the channel info forms a one-row table.
+    outputs = [
+        (f'{CHANNEL_NAME}_channel.csv', pd.DataFrame([channel.info])),
+        (f'{CHANNEL_NAME}_videos.csv', pd.DataFrame(videos)),
+        (f'{CHANNEL_NAME}_comments.csv', pd.DataFrame(comments))]
+
+    # Every CSV goes to <OUTPUT_DIR>/<CHANNEL_NAME>/; create it if missing.
+    target_dir = Path(OUTPUT_DIR, CHANNEL_NAME)
+    os.makedirs(target_dir, exist_ok = True)
+
+    # QUOTE_NONNUMERIC: non-numeric fields are quoted, numeric fields are
+    # written bare; the DataFrame index is not written.
+    for file_name, table in outputs:
+        table.to_csv(
+            path_or_buf = Path(target_dir, file_name),
+            index = False,
+            quoting = csv.QUOTE_NONNUMERIC )
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/__init__.py
+++ b/polyphemus/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: UTF-8 -*-
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+from . import base 
+from . import utils 
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/base.py
+++ b/polyphemus/base.py
@@ -1,26 +1,20 @@
 # -*- coding: UTF-8 -*-

-"""Functions and classes for scraping video data from Odysee video platform.
+"""Base classes and methods for scraping video data from Odysee video platform.
 """

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 import json
-import csv
-from pathlib import Path 
-import os 
+from urllib.parse import quote

 import requests
-import pandas as pd

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 #TODO Figure out how to reverse-engineer this
 AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'

-CHANNEL_NAME = 'PatriotFront'
-OUTPUT_DIR = Path('.').resolve().parent/'data'
-
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 class OdyseeChannel:
@@ -57,7 +51,7 @@ class OdyseeChannel:
        
        info = result['result'][channel_url]
        
-        channel_info = {
+        info = {
            'channel_id' : info['claim_id'],
            'title' : info['value']['title'],
            'created': info['timestamp'],
@@ -66,8 +60,8 @@ class OdyseeChannel:
            'thumbnail_image': info['value']['thumbnail']['url'],
            'raw' : response.text}

-        self._channel_info = channel_info
-        self._channel_id = self._channel_info['channel_id']
+        self.info = info
+        self._channel_id = self.info['channel_id']
    
    #-------------------------------------------------------------------------#

@@ -121,7 +115,7 @@ class OdyseeChannel:
    def process_all_videos(self):
        
        self.get_all_videos()
-        all_videos_processed = [OdyseeVideo(video)._video_info for video in self._all_videos]
+        all_videos_processed = [OdyseeVideo(video).info for video in self._all_videos]
        
        return all_videos_processed
    
@@ -131,7 +125,7 @@ class OdyseeChannel:
        
        self.get_all_videos()
        all_videos = [OdyseeVideo(video) for video in self._all_videos]
-        all_videos_processed = [video._video_info for video in all_videos]
+        all_videos_processed = [video.info for video in all_videos]
        
        all_comments_processed = []
        
@@ -148,19 +142,20 @@ class OdyseeVideo:
    
    def __init__(self, full_video_info):
        
-        self._video_info = {
+        self.info = {
            'canonical_url' : full_video_info['canonical_url'],
+            'channel' : full_video_info['signing_channel']['name'],
            'claim_id' : full_video_info['claim_id'],
            'created' : full_video_info['value']['release_time'],
-            'description' : full_video_info['value']['description'],
-            'languages' : full_video_info['value']['languages'],
+            'description' : full_video_info['value'].get('description'),
+            'languages' : full_video_info['value'].get('languages'),
            'tags' : full_video_info['value'].get('tags',[]),
            'title' : full_video_info['value']['title'],
            'duration' : full_video_info['value']['video']['duration'],
            'thumbnail' : full_video_info['value']['thumbnail']['url'],
            'raw' : json.dumps(full_video_info)}
        
-        self._claim_id = self._video_info ['claim_id']
+        self._claim_id = self.info ['claim_id']

        self.get_views()
        self.get_video_reactions()
@@ -181,7 +176,7 @@ class OdyseeVideo:
        response = requests.get(api_url, params = params)
        views = json.loads(response.text)['data'][0]

-        self._video_info['views'] = views
+        self.info['views'] = views
    
    #-------------------------------------------------------------------------#

@@ -200,8 +195,8 @@ class OdyseeVideo:
        result = json.loads(response.text)
        reactions = result['data']['others_reactions'][self._claim_id ]

-        self._video_info['likes'] = reactions['like']
-        self._video_info['dislikes'] = reactions['dislike']
+        self.info['likes'] = reactions['like']
+        self.info['dislikes'] = reactions['dislike']
    
    #-------------------------------------------------------------------------#

@@ -262,17 +257,40 @@ class OdyseeVideo:
    def process_all_comments(self):
        
        self.get_all_comments()
-        all_comments_processed = [OdyseeComment(comment)._comment_info for comment in self._all_comments]
+        all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments]
        
        return all_comments_processed

+    #-------------------------------------------------------------------------#
+    
+    def get_recommended(self, n = 20):
+        """Return OdyseeVideo objects for the videos recommended for this one.
+
+        Requests `n` results from the recsys search endpoint using this
+        video's title and claim id, resolves each hit by name and keeps only
+        the claims whose 'value_type' is 'stream'."""
+        api_url = 'https://recsys.odysee.com/search'
+
+        # The title is passed raw: requests percent-encodes `params` itself,
+        # so pre-quoting it double-encodes (a space would arrive as "%2520").
+        params = {
+            's':self.info['title'],
+            'size':str(int(n)),
+            'from':'0',
+            'related_to':self._claim_id}
+        result = json.loads(requests.get(api_url, params = params).text)
+
+        # .get(): entries lacking 'value_type' (e.g. a name that did not
+        # resolve) are skipped instead of raising KeyError.
+        resolved = [name_to_video_info(r['name']) for r in result]
+        return [OdyseeVideo(vi) for vi in resolved if vi.get('value_type') == 'stream']
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 class OdyseeComment:

    def __init__(self, full_comment_info):
        
-        self._comment_info = {
+        self.info = {
            'comment' : full_comment_info['comment'],
            'created' : full_comment_info['timestamp'],
            'video_claim_id' : full_comment_info['claim_id'],
@@ -329,37 +347,21 @@ def append_comment_reactions(comments):

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

-if __name__ == '__main__':
-
-    odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)
-
-    video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments()
-
-    channel_df = pd.DataFrame([odysee_channel._channel_info])
-    video_df = pd.DataFrame(video_info_list)
-    comment_df = pd.DataFrame(comment_info_list)
-
-    output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
-    os.makedirs(output_subdir, exist_ok = True)
-
-    channel_df.to_csv(
-        path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_channel.csv'),
-        index = False,
-        quoting = csv.QUOTE_NONNUMERIC )
-
-    video_df.to_csv(
-        path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_videos.csv'),
-        index = False,
-        quoting = csv.QUOTE_NONNUMERIC )
-
-    comment_df.to_csv(
-        path_or_buf = Path(output_subdir, f'{CHANNEL_NAME}_comments.csv'),
-        index = False,
-        quoting = csv.QUOTE_NONNUMERIC )
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
+def name_to_video_info(name):

+    """Resolve LBRY claim `name` and return the API's entry for it."""
    
+    lbry_url = f"lbry://{name}"
+    endpoint = 'https://api.na-backend.odysee.com/api/v1/proxy'
+    # JSON-RPC "resolve" request covering this single URL.
+    rpc_params = {"urls":[lbry_url]}
+    payload = {"jsonrpc":"2.0", "method":"resolve", "params":rpc_params}

+    response = requests.post(url = endpoint, json = payload)

+    # The "result" object of the reply is keyed by the requested URL.
+    # NOTE(review): polyphemus/utils.py carries an identical copy of this.
+    resolved = json.loads(response.text)['result']
+    return resolved[lbry_url]
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/utils.py
+++ b/polyphemus/utils.py
@@ -0,0 +1,43 @@
+# -*- coding: UTF-8 -*-
+
+"""Utility functions for scraping video data from Odysee video platform.
+"""
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+import json
+
+import requests 
+
+from .base import OdyseeVideo
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def name_to_video_info(name):
+    """Look up an LBRY claim by name through the Odysee API endpoint.
+
+    Posts a JSON-RPC "resolve" call for ``lbry://<name>`` and returns the
+    element of the reply's "result" object stored under that URL.
+    """
+    url = f"lbry://{name}"
+    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
+
+    request_body = {
+        "jsonrpc":"2.0",
+        "method":"resolve",
+        "params":{"urls":[url]}}
+
+    reply = requests.post(url = api_url, json = request_body)
+    return json.loads(reply.text)['result'][url]
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def name_to_video(name):
+    """Build an OdyseeVideo from the claim info resolved for `name`."""
+    # Resolution is delegated to name_to_video_info; its return value is
+    # handed to OdyseeVideo as the sole constructor argument.
+    return OdyseeVideo(name_to_video_info(name))
+
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,36 @@
+# -*- coding: UTF-8 -*-
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+import os.path
+from setuptools import setup
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def readme( ):
+  """Return the text of the README.md located next to this setup script."""
+  path = os.path.join( os.path.dirname( __file__ ), 'README.md' )
+
+  # Explicit UTF-8: without it open() uses the platform's default encoding,
+  # which may fail to decode non-ASCII characters in the README.
+  with open( os.path.abspath( path ), encoding = 'utf-8' ) as f:
+    return f.read( )
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+setup(
+  name = 'polyphemus',
+  version = '0.1',
+  description = 'Scraping Odysee video data',
+  long_description = readme( ),
+  long_description_content_type = 'text/markdown', # readme( ) reads README.md
+  author = 'Bellingcat',
+  packages = [ 'polyphemus' ],
+  install_requires = [
+    'requests >= 2.27.0',
+    'beautifulsoup4 >= 4.10.0',
+    'pandas >= 1.4.0'],
+  include_package_data = True,
+  zip_safe = False )
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#