Refactored base classes to have a structure more similar to snscrape; made scraper 'get' methods return dataclasses or lists of dataclasses rather than dicts

2026-06-13 13:58:32 +03:00 · 2022-04-11 10:27:12 -05:00
parent 3fd841f76a
commit 44a673f889
5 changed files with 252 additions and 177 deletions
--- a/polyphemus/api.py
+++ b/polyphemus/api.py
@@ -7,6 +7,8 @@
 import json
 from urllib.parse import quote
 from typing import Tuple, Optional, List
 import time
 import requests
@@ -23,7 +25,7 @@ NEW_USER_API_URL = 'https://api.odysee.com/user/new'
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def make_request(request, kwargs):
+def make_request(request: str, kwargs: dict) -> requests.Response:
    """Wrapper for retrying request multiple times.
    """
@@ -32,12 +34,24 @@ def make_request(request, kwargs):
        msg = f'`request` argument must be either `requests.get` or `requests.post`, not {type(request)}'
        raise ValueError(msg)
-    n_retries = 0
+    if 'timeout' not in kwargs:
-    response = request(**kwargs)
+        kwargs['timeout'] = 15
-    while response.status_code != 200 and n_retries < 5:
+    n_retries = 0
-        n_retries += 1
+
-        response = request(**kwargs)
+    response = requests.Response()
    response.status_code = 418
    while n_retries < 5:
        time.sleep(2 ** n_retries - 1)
        try:
            response = request(**kwargs)
            if response.status_code == 200:
                return response
            else:
                n_retries += 1
        except Exception:
            n_retries += 1            
    if response.status_code != 200:
        msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}: status code {response.status_code}'
@@ -47,9 +61,12 @@ def make_request(request, kwargs):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_auth_token():
+def get_auth_token() -> str:
-    """Get a fresh authorization token, to use for API calls that require it. 
+    """Get a fresh authorization token, to use for API calls that require it.
    Note: calling this function many times in quick succession may result in a 
    503 error. 
    """
    response = make_request(
@@ -63,7 +80,7 @@ def get_auth_token():
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_channel_info(channel_name):
+def get_channel_info(channel_name: str) -> dict:
    """Get the channel information and ID from the channel name. 
    """
@@ -99,7 +116,7 @@ def get_channel_info(channel_name):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_subscribers(channel_id, auth_token = None):
+def get_subscribers(channel_id: str, auth_token: str = None) -> int:
    """Get the number of subscribers for a channel.  
    """
@@ -124,19 +141,19 @@ def get_subscribers(channel_id, auth_token = None):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_all_videos(channel_id):
+def get_raw_video_info_list(channel_id: str) -> dict:
    """Get a list of all videos posted by a specified channel name. 
    Returns
    -------
-    all_videos: list<dict>
+    raw_video_info_list: list<dict>
        List of dictionaries, with each dict corresponding to a JSON response 
        containing data about a single video.
    """
-    all_videos = []
+    raw_video_info_list = []
    page = 1
@@ -164,14 +181,14 @@ def get_all_videos(channel_id):
        if not videos:
            break
        else:
-            all_videos.extend(videos)
+            raw_video_info_list.extend(videos)
            page += 1
-    return all_videos
+    return raw_video_info_list
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_views(video_id, auth_token = None):
+def get_views(video_id: str, auth_token: str = None) -> int:
    """Get the number of views for a given video.
    """
@@ -195,7 +212,7 @@ def get_views(video_id, auth_token = None):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_video_reactions(video_id, auth_token = None):
+def get_video_reactions(video_id: str, auth_token: str = None) -> Tuple[Optional[int], Optional[int]]:
    """Get all reactions for a given video.  
    """
@@ -223,7 +240,7 @@ def get_video_reactions(video_id, auth_token = None):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_all_comments(video_id):
+def get_all_comments(video_id: str) -> List[dict]:
    """Get a list of all comments for a single video. 
@@ -277,7 +294,7 @@ def get_all_comments(video_id):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def append_comment_reactions(comment_info_list):
+def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]:
    """Get reaction data for each comment and insert ``'reactions'`` key into 
    dict for each comment.
@@ -325,7 +342,7 @@ def append_comment_reactions(comment_info_list):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_recommended(video_title, video_id):
+def get_recommended(video_title: str, video_id: str) -> List[dict]:
    name = quote(video_title)
@@ -350,7 +367,7 @@ def get_recommended(video_title, video_id):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def normalized_name_to_video_info(normalized_name):
+def normalized_name_to_video_info(normalized_name: str) -> dict:
    video_url = f"lbry://{normalized_name}"
@@ -372,7 +389,7 @@ def normalized_name_to_video_info(normalized_name):
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-def get_streaming_url(canonical_url):
+def get_streaming_url(canonical_url: str) -> str:
    json_data = {
        "jsonrpc":"2.0",
--- a/polyphemus/base.py
+++ b/polyphemus/base.py
@@ -7,48 +7,111 @@
 import json
 from urllib.parse import unquote
 from dataclasses import dataclass
 import typing
 from datetime import datetime 
 from polyphemus import api
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class OdyseeChannel:
+@dataclass
 class Channel:
    """Flat record of one channel's metadata.

    Built by ``OdyseeChannelScraper.get_entity`` from the dict returned by
    ``api.get_channel_info`` plus a subscriber count fetched with
    ``api.get_subscribers``.
    """
    # Taken from the 'channel_id' key of the raw channel info.
    channel_id: str
    # ``datetime.fromtimestamp`` of the raw 'created' value; no timezone is
    # passed, so the result is a naive datetime in local time.
    created: datetime
    # Subscriber count as returned by ``api.get_subscribers``.
    subscribers: int
    # Value of the raw channel info's 'raw' key; presumably the serialized
    # API response -- TODO confirm against ``api.get_channel_info``.
    raw : str
    # The fields below are optional on the dataclass, although ``get_entity``
    # currently reads each of them with a plain ``[...]`` lookup.
    title : typing.Optional[str] = None
    description: typing.Optional[str] = None
    cover_image: typing.Optional[str] = None
    thumbnail_image: typing.Optional[str] = None
@dataclass
 class Video:
    """Flat record of one claim (video, audio, repost, image or other).

    Built by ``process_raw_video_info`` from a raw claim dict plus the
    results of extra API calls for views, reactions and the streaming URL.
    """
    # 'canonical_url' of the raw claim; for reposts it is overwritten with
    # the reposted claim's canonical URL before being stored.
    canonical_url: str
    # Result of ``api.get_streaming_url`` for the canonical URL.
    streaming_url: str
    # One of 'video', 'audio', 'repost', 'image' or 'other', as decided by
    # ``process_raw_video_info``.
    type: str
    # 'claim_id' of the raw claim.
    claim_id: str
    # ``datetime.fromtimestamp`` of the claim's 'release_time' when present,
    # otherwise of its meta 'creation_timestamp' (naive, local time).
    created: datetime
    # 'title' from the claim's 'value' dict.
    title: str
    # View count as returned by ``api.get_views``.
    views: int
    # ``json.dumps`` of the raw claim dict.
    raw: str
    # 'description' from the claim's 'value' dict, if any.
    text: typing.Optional[str] = None
    # 'url' of the claim's thumbnail entry, if any.
    thumbnail : typing.Optional[str] = None
    # Claim ID (or, failing that, 'channel_id') and name of the signing
    # channel; both are None when the claim has no 'signing_channel'.
    channel_id: typing.Optional[str] = None
    channel_name: typing.Optional[str] = None
    # 'duration' of the video/audio payload; None for the other types.
    # Units are not established here -- presumably seconds, TODO confirm.
    duration: typing.Optional[int] = None
    languages : typing.Optional[typing.List[str]] = None
    # ``process_raw_video_info`` passes an empty list when the claim has no
    # 'tags' key, so None only appears if the field is left unset.
    tags: typing.Optional[typing.List[str]] = None
    # Pair returned by ``api.get_video_reactions``; either may be None.
    likes: typing.Optional[int] = None
    dislikes: typing.Optional[int] = None
    # Discriminator shared with ``Comment``; always False for videos.
    is_comment: bool = False
@dataclass
 class Comment:
    """Flat record of one comment posted on a video.

    Built by ``process_raw_comment_info`` from a raw comment dict.
    """
    # 'comment' key of the raw comment dict.
    text: str
    # NOTE(review): ``process_raw_comment_info`` passes the raw 'timestamp'
    # value through unconverted, so this may not actually be a datetime --
    # confirm and convert if needed.
    created: datetime
    # Read with ``.get('comment_id')``, so this can be None despite the
    # ``str`` annotation.
    claim_id : str
    # 'claim_id' key of the raw comment dict, i.e. the claim commented on.
    video_claim_id : str
    # 'channel_id' and 'channel_name' keys of the raw comment dict;
    # presumably the commenter's channel -- TODO confirm.
    channel_id: str
    channel_name : str
    # 'replies' key of the raw comment dict, defaulting to 0 when absent.
    replies: int
    # 'likes' and 'dislikes' keys of the raw comment dict.
    likes: int
    dislikes: int
    # ``json.dumps`` of the raw comment dict.
    raw : str
    # Discriminator shared with ``Video``; always True for comments.
    is_comment: bool = True
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 class OdyseeChannelScraper:
    #-------------------------------------------------------------------------#
-    def __init__(self, channel_name, auth_token = None):
+    def __init__(self, channel_name: str, auth_token: str = None):
        self._channel_name = unquote(channel_name)
        info = api.get_channel_info(channel_name = self._channel_name)
        self.info = info
        self._channel_id = self.info['channel_id']
        if auth_token is None:
            self.auth_token = api.get_auth_token()
        else:
            self.auth_token = auth_token
-        self.info['subscribers'] = api.get_subscribers(
+        self._raw_channel_info = api.get_channel_info(channel_name = self._channel_name)
-            channel_id = self.info['channel_id'],
+        self._channel_id = self._raw_channel_info['channel_id']
            auth_token = self.auth_token)
    #-------------------------------------------------------------------------#
-    def get_all_videos(self):
+    def get_entity(self) -> Channel:
-        """Return list of OdyseeVideo objects for all videos posted by the channel
+        subscribers = api.get_subscribers(
            channel_id = self._channel_id,
            auth_token = self.auth_token)
        return Channel(
            channel_id=self._raw_channel_info['channel_id'],
            title=self._raw_channel_info['title'],
            created=datetime.fromtimestamp(self._raw_channel_info['created']),
            description=self._raw_channel_info['description'],
            cover_image=self._raw_channel_info['cover_image'],
            thumbnail_image=self._raw_channel_info['thumbnail_image'],
            raw=self._raw_channel_info['raw'],
            subscribers=subscribers)
    #-------------------------------------------------------------------------#
    def get_all_videos(self) -> typing.Generator[Video, None, None]:
        """Return list of Video objects for all videos posted by the channel
        """
-        all_video_info = api.get_all_videos(channel_id=self.info['channel_id'])
+        raw_video_info_list = api.get_raw_video_info_list(channel_id=self._channel_id)
-        self.all_videos = (OdyseeVideo(video, self.auth_token) for video in all_video_info)
+        videos = (process_raw_video_info(raw_video_info, self.auth_token) for raw_video_info in raw_video_info_list)
-        return self.all_videos
+        return videos
    #-------------------------------------------------------------------------#
-    def get_all_videos_and_comments(self):
+    def get_all_videos_and_comments(self) -> typing.Tuple[typing.List['Video'], typing.List['Comment']]:
        """Return list of OdyseeVideo and OdyseeComment objects for all videos 
        posted by the channel and all comments posted to those videos
@@ -56,133 +119,131 @@ class OdyseeChannel:
        all_videos = list(self.get_all_videos())
-        all_comments = []
+        raw_comment_info_list = []
        for video in all_videos:
-            all_comments.extend(video.get_all_comments())
+            raw_comment_info_list.extend(api.get_all_comments(video_id=video.claim_id))
        all_comments = [process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list]
        return all_videos, all_comments
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class OdyseeVideo:
+def process_raw_video_info(raw_video_info: dict, auth_token = None) -> Video:
-    #-------------------------------------------------------------------------#
+    if auth_token is None:
        auth_token = api.get_auth_token()
    else:
        auth_token = auth_token
    # Handle edge cases
    #.....................................................................#
    if 'video' in raw_video_info['value']:
        video_type = 'video'
        duration = raw_video_info['value']['video'].get('duration')
    elif 'audio' in raw_video_info['value']:
        video_type = 'audio'
        duration = raw_video_info['value']['audio'].get('duration')
    elif 'claim_hash' in raw_video_info['value']:
        video_type = 'repost'
        duration = None
        raw_video_info['value'] = raw_video_info['reposted_claim']['value']
        raw_video_info['canonical_url'] = raw_video_info['reposted_claim']['canonical_url']
    elif 'image' in raw_video_info['value']:
        video_type = 'image'
        duration = None
    else:
        video_type = 'other'
        duration = None
    if 'signing_channel' in raw_video_info:
        channel_name = raw_video_info['signing_channel'].get('name')
        if 'claim_id' in raw_video_info['signing_channel']:
            channel_id = raw_video_info['signing_channel']['claim_id']
        else:
            channel_id = raw_video_info['signing_channel']['channel_id']
    else:
        channel_name = None
        channel_id = None
    if 'release_time' in raw_video_info['value']:
        created = raw_video_info['value']['release_time']
    else:
        created = raw_video_info['meta']['creation_timestamp']
    if 'thumbnail' in raw_video_info['value']:
        thumbnail = raw_video_info['value']['thumbnail'].get('url', None)
    else:
        thumbnail = None
-    def __init__(self, full_video_info, auth_token = None):
+    # Retrieve additional fields
-
+    #.....................................................................#
        if auth_token is None:
            self.auth_token = api.get_auth_token()
        else:
            self.auth_token = auth_token
        # Handle edge cases
        #.....................................................................#
        if 'video' in full_video_info['value']:
            video_type = 'video'
            duration = full_video_info['value']['video'].get('duration')
        elif 'audio' in full_video_info['value']:
            video_type = 'audio'
            duration = full_video_info['value']['audio'].get('duration')
        elif 'claim_hash' in full_video_info['value']:
            video_type = 'repost'
            duration = None
            full_video_info['value'] = full_video_info['reposted_claim']['value']
            full_video_info['canonical_url'] = full_video_info['reposted_claim']['canonical_url']
        elif 'image' in full_video_info['value']:
            video_type = 'image'
            duration = None
        else:
            video_type = 'other'
            duration = None
        if 'signing_channel' in full_video_info:
            channel_name = full_video_info['signing_channel'].get('name')
            if 'claim_id' in full_video_info['signing_channel']:
                channel_id = full_video_info['signing_channel']['claim_id']
            else:
                channel_id = full_video_info['signing_channel']['channel_id']
        else:
            channel_name = None
            channel_id = None
        if 'release_time' in full_video_info['value']:
            created = full_video_info['value']['release_time']
        else:
            created = full_video_info['meta']['creation_timestamp']
        if 'thumbnail' in full_video_info['value']:
            thumbnail = full_video_info['value']['thumbnail'].get('url', None)
        else:
            thumbnail = None
        # Store relevant information in flat dict
        #.....................................................................#
        self.info = {
            'canonical_url' : full_video_info['canonical_url'],
            'type' : video_type,
            'channel_id' : channel_id,
            'channel_name' : channel_name,
            'claim_id' : full_video_info['claim_id'],
            'created' : int(created),
            'text' : full_video_info['value'].get('description'),
            'languages' : full_video_info['value'].get('languages'),
            'tags' : full_video_info['value'].get('tags',[]),
            'title' : full_video_info['value']['title'],
            'duration' : duration,
            'thumbnail' : thumbnail,
            'is_comment' : False,
            'raw' : json.dumps(full_video_info)}
        self.claim_id = self.info['claim_id']
        self.info['views'] = api.get_views(video_id=self.claim_id, auth_token = self.auth_token)
        self.info['likes'], self.info['dislikes'] = api.get_video_reactions(
            video_id = self.claim_id,
            auth_token = self.auth_token)
        self.info['streaming_url'] = api.get_streaming_url(self.info['canonical_url'])
    #-------------------------------------------------------------------------#
    def get_all_comments(self):
        all_comment_info = api.get_all_comments(video_id=self.claim_id)
        self.all_comments = (OdyseeComment(comment) for comment in all_comment_info)
        return self.all_comments
    #-------------------------------------------------------------------------#
-    def get_recommended(self):
+    claim_id = raw_video_info['claim_id']
        recommended_video_info = api.get_recommended(
            video_title=self.info['title'], video_id=self.claim_id)
        recommended_videos = [OdyseeVideo(video_info, self.auth_token) for video_info in recommended_video_info]
-        return recommended_videos
+    views = api.get_views(video_id=claim_id, auth_token = auth_token)
    likes, dislikes = api.get_video_reactions(
        video_id = claim_id,
        auth_token = auth_token)
    streaming_url = api.get_streaming_url(raw_video_info['canonical_url'])
    # Return Video object
    #.....................................................................#
    return Video(
        canonical_url = raw_video_info['canonical_url'],
        type = video_type,
        channel_id = channel_id,
        channel_name = channel_name,
        claim_id = raw_video_info['claim_id'],
        created = datetime.fromtimestamp(int(created)),
        text = raw_video_info['value'].get('description'),
        languages = raw_video_info['value'].get('languages'),
        tags = raw_video_info['value'].get('tags',[]),
        title = raw_video_info['value']['title'],
        duration = duration,
        thumbnail = thumbnail,
        is_comment = False,
        raw = json.dumps(raw_video_info),
        views = views,
        likes = likes,
        dislikes = dislikes,
        streaming_url = streaming_url)
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class OdyseeComment:
+def process_raw_comment_info(raw_comment_info: dict) -> Comment:
-    def __init__(self, full_comment_info):
+    return Comment(
-        
+        text = raw_comment_info['comment'],
-        # Store relevant information in flat dict
+        created = raw_comment_info['timestamp'],
-        self.info = {
+        claim_id = raw_comment_info.get('comment_id'),
-            'text' : full_comment_info['comment'],
+        video_claim_id = raw_comment_info['claim_id'],
-            'created' : full_comment_info['timestamp'],
+        channel_id = raw_comment_info['channel_id'],
-            'claim_id' : full_comment_info.get('comment_id'),
+        channel_name = raw_comment_info['channel_name'],
-            'video_claim_id' : full_comment_info['claim_id'],
+        replies = raw_comment_info.get('replies', 0),
-            'channel_id' : full_comment_info['channel_id'],
+        likes = raw_comment_info['likes'],
-            'channel_name' : full_comment_info['channel_name'],
+        dislikes = raw_comment_info['dislikes'],
-            'replies' : full_comment_info.get('replies', 0),
+        is_comment = True,
-            'likes' : full_comment_info['likes'],
+        raw = json.dumps(raw_comment_info))
            'dislikes' : full_comment_info['dislikes'],
            'is_comment' : True,
            'raw' : json.dumps(full_comment_info)}
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 def get_recommended(video: Video, auth_token: typing.Optional[str] = None) -> typing.List['Video']:
    """Return ``Video`` objects for the recommendations of ``video``.

    The raw recommendation list comes from ``api.get_recommended``, called
    with the video's title and claim ID. Each raw entry is converted with
    ``process_raw_video_info``, which makes further API calls per entry
    (views, reactions, streaming URL).

    Parameters
    ----------
    video: Video
        Video whose ``title`` and ``claim_id`` are used for the lookup.
    auth_token: str, optional
        Token forwarded to ``process_raw_video_info``. When None, a fresh
        one is requested once with ``api.get_auth_token`` and reused for
        every entry.

    Returns
    -------
    recommended_videos: list<Video>
        One ``Video`` per entry returned by ``api.get_recommended``.
    """
    if auth_token is None:
        auth_token = api.get_auth_token()
    else:
        # No-op branch: the caller-supplied token is used as is.
        auth_token = auth_token
    recommended_video_info_list = api.get_recommended(
        video_title=video.title, video_id=video.claim_id)
    recommended_videos = [process_raw_video_info(raw_video_info, auth_token) for raw_video_info in recommended_video_info_list]
    return recommended_videos
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/tests/api.py
+++ b/tests/api.py
@@ -23,7 +23,7 @@ KWARGS_LIST = [
    ('get_auth_token', []),
    ('get_channel_info', ['channel_name']),
    ('get_subscribers', ['channel_id', 'auth_token']),
-    ('get_all_videos', ['channel_id']),
+    ('get_raw_video_info_list', ['channel_id']),
    ('get_views', ['video_id', 'auth_token']),
    ('get_video_reactions', ['video_id', 'auth_token']),
    ('get_all_comments', ['video_id']),
@@ -34,12 +34,12 @@ KWARGS_LIST = [
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-@pytest.mark.parametrize( 'function_str,kwargs', KWARGS_LIST )
+@pytest.mark.parametrize('function_str,kwargs', KWARGS_LIST)
-def test_minimal_init( resources, function_str, kwargs ):
+def test_minimal_init(resources, function_str, kwargs):
-  function = eval( f'api.{function_str}')
+  function = eval(f'api.{function_str}')
-  function_kwargs = { kwarg : resources[ kwarg ] for kwarg in kwargs }
+  function_kwargs = {kwarg: resources[kwarg] for kwarg in kwargs}
-  function( **function_kwargs )
+  function(**function_kwargs)
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/tests/base.py
+++ b/tests/base.py
@@ -19,38 +19,35 @@ from polyphemus import base
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class TestOdyseeChannel:
+class TestOdyseeChannelScraper:
    @pytest.fixture(autouse=True)
    def test_simple_init(self, resources):
-        self.channel = base.OdyseeChannel(channel_name = resources['channel_name'])
+        self.scraper = base.OdyseeChannelScraper(channel_name = resources['channel_name'])
    def test_get_entity(self):
        self.scraper.get_entity()
    def test_get_all_videos(self):
-        self.channel.get_all_videos()
+        self.scraper.get_all_videos()
    def test_get_all_videos_and_comments(self):
-        self.channel.get_all_videos_and_comments()
+        self.scraper.get_all_videos_and_comments()
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class TestOdyseeVideo:
+def test_process_raw_video_info(resources):
    video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token'])
    @pytest.fixture(autouse=True)
    def test_simple_init(self, resources):
        self.video = base.OdyseeVideo(full_video_info = resources['full_video_info'])
    def test_get_all_comments(self):
        self.video.get_all_comments()
    def test_get_recommended(self):
        self.video.get_recommended()
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-class TestOdyseeComment:
+def test_get_recommended(resources):
    video = base.process_raw_video_info(raw_video_info = resources['full_video_info'], auth_token = resources['auth_token'])
    base.get_recommended(video = video)
-    @pytest.fixture(autouse=True)
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-    def test_simple_init(self, resources):
+
-        self.comment = base.OdyseeComment(full_comment_info = resources['full_comment_info'])
+def test_process_raw_comment_info(resources):
    base.process_raw_comment_info(raw_comment_info = resources['full_comment_info'])
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,7 +91,7 @@ def resources():
        normalized_name = NORMALIZED_NAME,
        canonical_url = CANONICAL_URL,
        full_video_info = FULL_VIDEO_INFO,
-        full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes' : 8, 'dislikes' : 0}},
+        full_comment_info = {**COMMENT_INFO_LIST[0], **{'likes': 8, 'dislikes': 0}},
        comment_info_list = COMMENT_INFO_LIST,
        auth_token = get_auth_token())