Refactored code to avoid circular imports; isolated all API requests in the api.py module

2026-06-08 03:18:32 +03:00 · 2022-02-17 12:45:40 -06:00
parent c4b6d023c5
commit a6d2527bc7
6 changed files with 328 additions and 368 deletions
--- a/README.md
+++ b/README.md
@@ -4,4 +4,5 @@ Scraper for alt-tech video sharing platform [Odysee](https://odysee.com/).

 ### TODO
 - Implement CLI
- Work on reverse-engineering auth_token instead of having it hard-coded
+- Add error handling/backoff waiting to requests
+- Work on reverse-engineering auth_token instead of having it hard-coded
--- a/examples/scrape.py
+++ b/examples/scrape.py
@@ -24,11 +24,11 @@ if __name__ == '__main__':

    odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)

-    video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments()
+    video_list, comment_list = odysee_channel.get_all_videos_and_comments()

    channel_df = pd.DataFrame([odysee_channel.info])
-    video_df = pd.DataFrame(video_info_list)
-    comment_df = pd.DataFrame(comment_info_list)
+    video_df = pd.DataFrame([v.info for v in video_list])
+    comment_df = pd.DataFrame([c.info for c in comment_list])

    output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
    os.makedirs(output_subdir, exist_ok = True)
--- a/polyphemus/init.py
+++ b/polyphemus/init.py
@@ -2,7 +2,7 @@

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

+from . import api
 from . import base 
-from . import utils 

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/api.py
+++ b/polyphemus/api.py
@@ -0,0 +1,296 @@
+# -*- coding: UTF-8 -*-
+
+"""Functions to request and process information from Odysee APIs
+"""
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+import json
+from urllib.parse import quote
+
+import requests
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+#TODO Figure out how to reverse-engineer this
+AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_channel_info(channel_name):
+
+    """Get the channel information and ID from the channel name. 
+    """
+
+    channel_url = f'lbry://@{channel_name}'
+
+    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
+
+    post_json = {
+        "jsonrpc":"2.0",
+        "method":"resolve",
+        "params":{
+            "urls":[channel_url]}}
+
+    response = requests.post(
+        url = api_url, 
+        json = post_json)
+
+    result = json.loads(response.text)
+    
+    info = result['result'][channel_url]
+    
+    info = {
+        'channel_id' : info['claim_id'],
+        'title' : info['value']['title'],
+        'created': info['timestamp'],
+        'description': info['value']['description'],
+        'cover_image': info['value']['cover']['url'],
+        'thumbnail_image': info['value']['thumbnail']['url'],
+        'raw' : response.text}
+
+    return info 
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_subscribers(claim_id):
+
+    """Get the number of subscribers for a channel.  
+    """
+
+    api_url = 'https://api.odysee.com/subscription/sub_count'
+
+    post_data = {
+        'auth_token': AUTH_TOKEN,
+        'claim_id': claim_id }
+
+    response = requests.post(url = api_url, data = post_data)
+    result = json.loads(response.text)
+    subscribers = result['data'][0]
+
+    return subscribers
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_all_videos(channel_id):
+
+    """Get a list of all videos posted by the channel with the specified ID.
+
+    Returns
+    -------
+    all_videos: list<dict>
+        List of dictionaries, with each dict corresponding to a JSON response 
+        containing data about a single video.
+
+    """
+
+    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
+
+    all_videos = []
+
+    page = 1
+
+    while True:
+
+        post_data = {
+            "jsonrpc":"2.0",
+            "method":"claim_search",
+            "params":{
+                "page_size":30,
+                "page":page,
+                "order_by":["release_time"],
+                "channel_ids":[channel_id]}}
+
+        response = requests.post(
+            url = api_url, 
+            json = post_data)
+
+        result = json.loads(response.text)
+
+        videos = result['result']['items']
+
+        if not videos:
+            break
+        else:
+            all_videos.extend(videos)
+            page += 1
+
+    return all_videos
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_views(claim_id):
+
+    """Get the number of views for a given video.
+    """
+
+    api_url = 'https://api.odysee.com/file/view_count'
+
+    params = {
+        'auth_token': AUTH_TOKEN,
+        'claim_id': claim_id }
+
+    response = requests.get(api_url, params = params)
+    views = json.loads(response.text)['data'][0]
+
+    return views
+    
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_video_reactions(claim_id):
+
+    """Get all reactions for a given video.  
+    """
+
+    api_url = 'https://api.odysee.com/reaction/list'
+
+    post_data = {
+        'auth_token': AUTH_TOKEN,
+        'claim_ids': claim_id }
+
+    response = requests.post(url = api_url, data = post_data)
+    result = json.loads(response.text)
+    reactions = result['data']['others_reactions'][claim_id ]
+
+    return reactions['like'], reactions['dislike']
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_all_comments(claim_id):
+
+    """Get a list of all comments for a single video. 
+
+    Parameters
+    ----------
+    claim_id: str
+        Claim ID for the video whose comments are to be scraped
+        e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'`` 
+
+    Returns
+    -------
+    all_comments: list<dict>
+        List of dictionaries, with each dict corresponding to a JSON response 
+        containing data about a single comment for the specified video.
+    """
+
+    api_url = 'https://comments.odysee.com/api/v2'
+
+    all_comments = []
+
+    page = 1
+
+    while True:
+
+        post_data = {
+            "jsonrpc":"2.0",
+            "id":1,
+            "method":"comment.List",
+            "params":{
+                "page":page,
+                "claim_id":claim_id,
+                "page_size":10,
+                "top_level":False,
+                "sort_by":3}}
+
+        response = requests.post(
+            url = api_url, 
+            json = post_data)
+
+        result = json.loads(response.text)
+
+        if 'items' not in result['result']:
+            break
+        else:
+            _comments = result['result']['items']
+            comments = append_comment_reactions(comments = _comments)
+            all_comments.extend(comments)
+            page += 1
+
+    return all_comments
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def append_comment_reactions(comments):
+    
+    """Get reaction data for each comment and insert ``'likes'`` and
+    ``'dislikes'`` keys into the dict for each comment.
+
+    Parameters
+    ----------
+    comments: list<dict>
+        List of dictionaries, with each dict corresponding to a JSON response 
+        containing data about a single comment for the specified video.
+
+    Returns
+    -------
+    comments: list<dict>
+        List of dictionaries, with each dict corresponding to a JSON response 
+        containing data about a single comment for the specified video, with
+        additional ``'likes'`` and ``'dislikes'`` fields taken from the
+        reaction data for each comment.
+
+    """
+    
+    comment_ids = ','.join([c['comment_id'] for c in comments])
+
+    post_data = {
+        "jsonrpc":"2.0",
+        "id":1,
+        "method":"reaction.List",
+        "params":{
+            "comment_ids":comment_ids}}
+
+    api_url = 'https://comments.odysee.com/api/v2'
+    response = requests.post(url = api_url, json = post_data)
+    result = json.loads(response.text)
+
+    reactions = result['result']['others_reactions']
+    
+    for comment in comments:
+        comment['likes'] = reactions[comment['comment_id']]['like']
+        comment['dislikes'] = reactions[comment['comment_id']]['dislike']
+        
+    return comments
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_recommended(title, claim_id):
+    
+    api_url = 'https://recsys.odysee.com/search'
+
+    name = quote(title)
+
+    params = {
+        's':name,
+        'size':'20',
+        'from':'0',
+        'related_to':claim_id}
+    
+    response = requests.get(api_url, params = params)
+    result = json.loads(response.text)
+    
+    recommended_video_info = [ name_to_video_info(r['name']) for r in result]
+    recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream']
+
+    return recommended_video_info
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def name_to_video_info(name):
+
+    url = f"lbry://{name}"
+    
+    post_data = {
+        "jsonrpc":"2.0",
+        "method":"resolve",
+        "params":{
+            "urls":[url]}}
+
+    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
+
+    response = requests.post(url = api_url, json = post_data)
+    result = json.loads(response.text)
+    
+    return result['result'][url]
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/base.py
+++ b/polyphemus/base.py
@@ -8,7 +8,7 @@
 import json
 from urllib.parse import quote

-import requests
+from polyphemus import api

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

@@ -24,136 +24,42 @@ class OdyseeChannel:
    def __init__(self, channel_name):
        
        self._channel_name = channel_name
-        self.get_channel_info()
-    
-    #-------------------------------------------------------------------------#

-    def get_channel_info(self):
-    
-        """Get the channel information and ID from the channel name. 
-        """
-
-        channel_url = f'lbry://@{self._channel_name}'
-
-        api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
-
-        post_json = {
-            "jsonrpc":"2.0",
-            "method":"resolve",
-            "params":{
-                "urls":[channel_url]}}
-
-        response = requests.post(
-            url = api_url, 
-            json = post_json)
-
-        result = json.loads(response.text)
-        
-        info = result['result'][channel_url]
-        
-        info = {
-            'channel_id' : info['claim_id'],
-            'title' : info['value']['title'],
-            'created': info['timestamp'],
-            'description': info['value']['description'],
-            'cover_image': info['value']['cover']['url'],
-            'thumbnail_image': info['value']['thumbnail']['url'],
-            'raw' : response.text}
+        info = api.get_channel_info(channel_name = self._channel_name)

        self.info = info
        self._channel_id = self.info['channel_id']

-        self.get_subscribers()
+        self.info['subscribers'] = api.get_subscribers(claim_id = self.info['channel_id'])
    
    #-------------------------------------------------------------------------#

-    def get_subscribers(self):
-
-        """Get the number of subscribers for a channel.  
-        """
-
-        api_url = 'https://api.odysee.com/subscription/sub_count'
-
-        post_data = {
-            'auth_token': AUTH_TOKEN,
-            'claim_id': self.info['channel_id'] }
-
-        response = requests.post(url = api_url, data = post_data)
-        result = json.loads(response.text)
-        subscribers = result['data'][0]
-
-        self.info['subscribers'] = subscribers
-
-    #-------------------------------------------------------------------------#
-
    def get_all_videos(self):

-        """Get a list of all videos posted by a specified channel name. 
-
-        Returns
-        -------
-        all_videos: list<dict>
-            List of dictionaries, with each dict corresponding to a JSON response 
-            containing data about a single video.
-
+        """Return list of OdyseeVideo objects for all videos posted by the channel
        """

-        api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
+        all_video_info = api.get_all_videos(channel_id=self.info['channel_id'])
+        self.all_videos = [OdyseeVideo(video) for video in all_video_info]
+        
+        return self.all_videos

-        all_videos = []
-
-        page = 1
-
-        while True:
-
-            post_data = {
-                "jsonrpc":"2.0",
-                "method":"claim_search",
-                "params":{
-                    "page_size":30,
-                    "page":page,
-                    "order_by":["release_time"],
-                    "channel_ids":[self._channel_id]}}
-
-            response = requests.post(
-                url = api_url, 
-                json = post_data)
-
-            result = json.loads(response.text)
-
-            videos = result['result']['items']
-
-            if not videos:
-                break
-            else:
-                all_videos.extend(videos)
-                page += 1
-
-        self._all_videos = all_videos
-    
    #-------------------------------------------------------------------------#

-    def process_all_videos(self):
-        
-        self.get_all_videos()
-        all_videos_processed = [OdyseeVideo(video) for video in self._all_videos]
-        
-        return all_videos_processed
-    
-    #-------------------------------------------------------------------------#
+    def get_all_videos_and_comments(self):

-    def process_all_videos_and_comments(self):
-        
-        self.get_all_videos()
-        all_videos = [OdyseeVideo(video) for video in self._all_videos]
-        all_videos_processed = [video for video in all_videos]
-        
-        all_comments_processed = []
+        """Return list of OdyseeVideo and OdyseeComment objects for all videos 
+        posted by the channel and all comments posted to those videos
+        """
+
+        all_videos = self.get_all_videos()
+
+        all_comments = []
        
        for video in all_videos:
-            all_comments_processed.extend(video.process_all_comments())
+            all_comments.extend(video.get_all_comments())
        
-        return all_videos_processed, all_comments_processed
+        return all_videos, all_comments
    
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

@@ -178,129 +84,26 @@ class OdyseeVideo:
        
        self._claim_id = self.info ['claim_id']

-        self.get_views()
-        self.get_video_reactions()
+        self.info['views'] = api.get_views(claim_id=self._claim_id)

-    #-------------------------------------------------------------------------#
+        self.info['likes'], self.info['dislikes']= api.get_video_reactions(
+            claim_id = self._claim_id)
        
-    def get_views(self):
-
-        """Get the number of views for a given video.
-        """
-
-        api_url = 'https://api.odysee.com/file/view_count'
-
-        params = {
-            'auth_token': AUTH_TOKEN,
-            'claim_id': self._claim_id }
-
-        response = requests.get(api_url, params = params)
-        views = json.loads(response.text)['data'][0]
-
-        self.info['views'] = views
-    
-    #-------------------------------------------------------------------------#
-
-    def get_video_reactions(self):
-
-        """Get all reactions for a given video.  
-        """
-
-        api_url = 'https://api.odysee.com/reaction/list'
-
-        post_data = {
-            'auth_token': AUTH_TOKEN,
-            'claim_ids': self._claim_id }
-
-        response = requests.post(url = api_url, data = post_data)
-        result = json.loads(response.text)
-        reactions = result['data']['others_reactions'][self._claim_id ]
-
-        self.info['likes'] = reactions['like']
-        self.info['dislikes'] = reactions['dislike']
-    
    #-------------------------------------------------------------------------#

    def get_all_comments(self):
-
-        """Get a list of all comments for a single video. 
-
-        Parameters
-        ----------
-        claim_id: str
-            Claim ID for the video whose comments are to be scraped
-            e.g. ``'84d2a91e910bee523af5422439a639f677b9c78f'`` 
-
-        Returns
-        -------
-        all_comments: list<dict>
-            List of dictionaries, with each dict corresponding to a JSON response 
-            containing data about a single comment for the specified video.
-        """
-
-        api_url = 'https://comments.odysee.com/api/v2'
-
-        all_comments = []
-
-        page = 1
-
-        while True:
-
-            post_data = {
-                "jsonrpc":"2.0",
-                "id":1,
-                "method":"comment.List",
-                "params":{
-                    "page":page,
-                    "claim_id":self._claim_id,
-                    "page_size":10,
-                    "top_level":False,
-                    "sort_by":3}}
-
-            response = requests.post(
-                url = api_url, 
-                json = post_data)
-
-            result = json.loads(response.text)
-
-            if 'items' not in result['result']:
-                break
-            else:
-                _comments = result['result']['items']
-                comments = append_comment_reactions(comments = _comments)
-                all_comments.extend(comments)
-                page += 1
-
-        self._all_comments = all_comments
        
-    #-------------------------------------------------------------------------#
-
-    def process_all_comments(self):
+        all_comment_info = api.get_all_comments(claim_id=self._claim_id)
+        self.all_comments = [OdyseeComment(comment) for comment in all_comment_info]
        
-        self.get_all_comments()
-        all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments]
-        
-        return all_comments_processed
+        return self.all_comments

    #-------------------------------------------------------------------------#
    
    def get_recommended(self):
        
-        api_url = 'https://recsys.odysee.com/search'
-
-        name = quote(self.info['title'])
-
-        params = {
-            's':name,
-            'size':'20',
-            'from':'0',
-            'related_to':self._claim_id}
-        
-        response = requests.get(api_url, params = params)
-        result = json.loads(response.text)
-        
-        recommended_video_info = [_name_to_video_info(r['name']) for r in result]
-        recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream']
+        recommended_video_info = api.get_recommended(
+            title=self.info['title'], claim_id=self._claim_id)
        recommended_videos = [OdyseeVideo(video_info) for video_info in recommended_video_info]

        return recommended_videos
@@ -322,67 +125,4 @@ class OdyseeComment:
            'dislikes' : full_comment_info['dislikes'],
            'raw' : json.dumps(full_comment_info)}

-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def append_comment_reactions(comments):
-    
-    """Get reaction data for each comment and insert ``'reactions'`` key into 
-    dict for each comment.
-
-    Parameters
-    ----------
-    comments: list<dict>
-        List of dictionaries, with each dict corresponding to a JSON response 
-        containing data about a single comment for the specified video.
-
-    Returns
-    -------
-    comments: list<dict>
-        List of dictionaries, with each dict corresponding to a JSON response 
-        containing data about a single comment for the specified video, with 
-        additional ``'reactions'`` field containing reaction information for 
-        each comment.
-
-    """
-    
-    comment_ids = ','.join([c['comment_id'] for c in comments])
-
-    post_data = {
-        "jsonrpc":"2.0",
-        "id":1,
-        "method":"reaction.List",
-        "params":{
-            "comment_ids":comment_ids}}
-
-    api_url = 'https://comments.odysee.com/api/v2'
-    response = requests.post(url = api_url, json = post_data)
-    result = json.loads(response.text)
-
-    reactions = result['result']['others_reactions']
-    
-    for comment in comments:
-        comment['likes'] = reactions[comment['comment_id']]['like']
-        comment['dislikes'] = reactions[comment['comment_id']]['dislike']
-        
-    return comments
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def name_to_video_info(name):
-
-    url = f"lbry://{name}"
-    
-    post_data = {
-        "jsonrpc":"2.0",
-        "method":"resolve",
-        "params":{
-            "urls":[url]}}
-
-    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
-
-    response = requests.post(url = api_url, json = post_data)
-    result = json.loads(response.text)
-    
-    return result['result'][url]
-
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/polyphemus/utils.py
+++ b/polyphemus/utils.py
@@ -1,77 +0,0 @@
-# -*- coding: UTF-8 -*-
-
-"""Utility functions for scraping video data from Odysee video platform.
-"""
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-import json
-
-import requests 
-
-from .base import OdyseeVideo
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-ODYSEE_DOMAIN = 'https://odysee.com/'
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def _name_to_video_info(name):
-
-    url = f"lbry://{name}"
-    
-    post_data = {
-        "jsonrpc":"2.0",
-        "method":"resolve",
-        "params":{
-            "urls":[url]}}
-
-    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
-
-    response = requests.post(url = api_url, json = post_data)
-    result = json.loads(response.text)
-    
-    return result['result'][url]
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def _url_to_video_info(url):
-
-    if url.startswith(ODYSEE_DOMAIN):
-        name = url.split(ODYSEE_DOMAIN)[1]
-        url = f"lbry://{name}"
-    
-    post_data = {
-        "jsonrpc":"2.0",
-        "method":"resolve",
-        "params":{
-            "urls":[url]}}
-
-    api_url = 'https://api.na-backend.odysee.com/api/v1/proxy'
-
-    response = requests.post(url = api_url, json = post_data)
-    result = json.loads(response.text)
-    
-    return result['result'][url]
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def name_to_video(name):
-
-    video_info = _name_to_video_info(name)
-    video = OdyseeVideo(video_info)
-
-    return video
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
-def url_to_video(name):
-
-    video_info = _url_to_video_info(name)
-    video = OdyseeVideo(video_info)
-
-    return video
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-