From bcb68a17fb2859a4be1a1eb27851192372890660 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Tue, 12 Apr 2022 22:46:51 -0500
Subject: [PATCH] implemented method for retrieving ALL videos from a channel,
 not just the first 1000, increased robustness of make_requests wrapper, added
 missing unit tests

---
 polyphemus/api.py | 129 +++++++++++++++++++++++++++++++++-------------
 tests/api.py      |   5 +-
 tests/base.py     |  11 ++++
 3 files changed, 106 insertions(+), 39 deletions(-)

diff --git a/polyphemus/api.py b/polyphemus/api.py
index 13c460a..e0e2464 100644
--- a/polyphemus/api.py
+++ b/polyphemus/api.py
@@ -24,11 +24,33 @@ COMMENT_API_URL = 'https://comments.odysee.com/api/v2'
 RECOMMENDATION_API_URL = 'https://recsys.odysee.com/search'
 NEW_USER_API_URL = 'https://api.odysee.com/user/new'
 
+# JSON error codes accepted by `make_request` without retrying (e.g. `get_streaming_url` responses that contain no `streaming_url` field)
+ALLOWED_ERROR_CODES = [-32603]
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 def make_request(request: Callable, kwargs: dict) -> requests.Response:
 
-    """Wrapper for retrying request multiple times.
+    """Wrapper for retrying request multiple times and handling errors.
+
+    This function handles Python exceptions (e.g. ``requests.ConnectionError``),
+    unsuccessful HTTP status codes (e.g. 429, 403), and errors in the
+    JSON response. If after 5 retries (using exponential backoff) the request
+    is unsuccessful, a ``ValueError`` is raised.
+
+    Parameters
+    ----------
+    request: function
+        The requests function to be called.
+        One of {requests.get, requests.post}
+    kwargs: dict
+        Keyword arguments for the ``request`` function. Must include ``url`` key.
+        e.g. ``{'url': 'https://api.odysee.com/user/new'}``
+        Uses a default timeout of 15 seconds.
+
+    Returns
+    -------
+    response: requests.Response
     """
 
     if request not in [requests.get, requests.post]:
@@ -43,23 +65,33 @@ def make_request(request: Callable, kwargs: dict) -> requests.Response:
     response = requests.Response()
     response.status_code = 418
 
-    exceptions = []
-    status_codes = []
+    retry_reasons = []
 
+    # TODO this looks a bit gross, try to refactor
     while n_retries < 5:
         time.sleep(2 ** n_retries - 1)
         try:
             response = request(**kwargs)
             if response.status_code == 200:
-                return response
+                parsed_response = json.loads(response.text)
+                if isinstance(parsed_response, list):
+                    return response
+                if parsed_response.get('error') is not None:
+                    if parsed_response['error'].get('code', None) not in ALLOWED_ERROR_CODES:
+                        retry_reasons.append(f'JSON response error: {parsed_response["error"]}')
+                        n_retries += 1
+                    else:
+                        return response
+                else:
+                    return response
             else:
-                status_codes.append(response.status_code)
+                retry_reasons.append(f'HTTP status code: {response.status_code}')
                 n_retries += 1
         except Exception as exception:
-            exceptions.append(exception)
+            retry_reasons.append(f'Python exception: {exception}')
             n_retries += 1            
 
-    msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Status codes: {status_codes}; exceptions: {exceptions}'
+    msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Retry reasons: {retry_reasons}'
     raise ValueError(msg)
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -148,6 +180,19 @@ def get_raw_video_info_list(channel_id: str) -> dict:
 
     """Get a list of all videos posted by a specified channel name. 
 
+    Odysee's ``claim_search`` API (which is used by the browser and the LBRY
+    desktop app) only allows up to 1000 videos to be fetched for a single value
+    of the ``release_time`` parameter. You can check this by going to an Odysee
+    channel with a lot of videos (e.g. @etresouverain) and holding the
+    "Page Down" button until you reach the bottom: only 1000 videos will be
+    listed.
+    
+    This function loops over all pages for a single ``release_time`` until a
+    page yields no new videos (e.g. at that 1000 video limit), then uses the
+    minimum ``creation_timestamp`` of all videos fetched so far as the new
+    ``release_time`` and starts over from page 1. It stops when the first page
+    for a new ``release_time`` also yields no new videos.
+
     Returns
     -------
     raw_video_info_list: list<dict>
@@ -156,9 +201,10 @@ def get_raw_video_info_list(channel_id: str) -> dict:
 
     """
 
-    raw_video_info_list = []
-
+    claim_id_to_raw_video_info = {}
     page = 1
+    release_time = int(time.time()) + 86400
+    hit_video_limit = False
 
     while True:
 
@@ -169,7 +215,8 @@ def get_raw_video_info_list(channel_id: str) -> dict:
                 "page_size":30,
                 "page":page,
                 "order_by":["release_time"],
-                "channel_ids":[channel_id]}}
+                "channel_ids":[channel_id],
+                "release_time": f"<{release_time}"}}
 
         response = make_request(
             request = requests.post,
@@ -180,14 +227,30 @@ def get_raw_video_info_list(channel_id: str) -> dict:
         result = json.loads(response.text)
 
         videos = result['result']['items']
+        new_videos = {video['claim_id'] : video for video in videos if video['claim_id'] not in claim_id_to_raw_video_info}
 
-        if not videos:
-            break
+        if len(new_videos) == 0:
+            # if there are no new videos that haven't already been scraped
+            if hit_video_limit:
+                # if Odysee's limit of 1000 videos for a given timestamp was 
+                # reached (which updates the `release_time`) on the last 
+                # request, this means we have scraped all videos on the channel, 
+                # so we break the loop.
+                break
+            else:
+                # either Odysee's limit of 1000 videos for a given timestamp was
+                # hit or no videos are left: update `release_time`, reset `page`
+                hit_video_limit = True
+                release_time = min([raw_video_info['meta']['creation_timestamp'] for raw_video_info in claim_id_to_raw_video_info.values()], default = 0)
+                page = 1
         else:
-            raw_video_info_list.extend(videos)
+            # there were unscraped videos from the last request, so we keep 
+            # going in the loop and increment the `page` variable
+            claim_id_to_raw_video_info.update(new_videos)
             page += 1
+            hit_video_limit = False
 
-    return raw_video_info_list
+    return list(claim_id_to_raw_video_info.values())
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
@@ -346,6 +409,10 @@ def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]:
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 def get_recommended(video_title: str, video_id: str) -> List[dict]:
+
+    """Get list of raw video info dicts of videos recommended for a specified
+    video title and video claim_id.
+    """
     
     name = quote(video_title)
 
@@ -369,30 +436,17 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]:
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
-def normalized_name_to_video_info(normalized_name: str) -> dict:
-
-    video_url = f"lbry://{normalized_name}"
-    
-    json_data = {
-        "jsonrpc":"2.0",
-        "method":"resolve",
-        "params":{
-            "urls":[video_url]}}
-
-    response = make_request(
-        request = requests.post,
-        kwargs = {
-            'url' : BACKEND_API_URL, 
-            'json': json_data})
-
-    result = json.loads(response.text)
-    
-    return result['result'][video_url]
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
-
 def normalized_names_to_video_info(normalized_names: List[str]) -> dict:
 
+    """Convert a list of normalized names of videos to a list of raw video dicts for those videos. An example of a "normalized name" is:
+
+        ``'si-une-tude-montre-que-le-masque-permet'``, 
+    
+    corresponding to the video:
+    
+        ``https://odysee.com/@filsdepangolin#e/si-une-tude-montre-que-le-masque-permet#e``.
+    """
+
     video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names]
     
     json_data = {
@@ -414,6 +468,9 @@ def normalized_names_to_video_info(normalized_names: List[str]) -> dict:
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 def get_streaming_url(canonical_url: str) -> str:
+
+    """Retrieve the `streaming_url` for a specified video.
+    """
     
     json_data = {
         "jsonrpc":"2.0",
diff --git a/tests/api.py b/tests/api.py
index 3a2fd0f..a6b40e0 100644
--- a/tests/api.py
+++ b/tests/api.py
@@ -28,10 +28,9 @@ KWARGS_LIST = [
     ('get_video_reactions', ['video_id', 'auth_token']),
     ('get_all_comments', ['video_id']),
     ('append_comment_reactions', ['comment_info_list']),
-    ('normalized_name_to_video_info', ['normalized_name']),
+    ('get_recommended', ['video_title', 'video_id']),
     ('normalized_names_to_video_info', ['normalized_names']),
-    ('get_streaming_url', ['canonical_url']),
-    ('get_recommended', ['video_title', 'video_id']),]
+    ('get_streaming_url', ['canonical_url']),]
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
diff --git a/tests/base.py b/tests/base.py
index 2a0387e..7d1287d 100644
--- a/tests/base.py
+++ b/tests/base.py
@@ -50,4 +50,15 @@ def test_get_recommended(resources):
 def test_process_raw_comment_info(resources):
     base.process_raw_comment_info(raw_comment_info = resources['full_comment_info'])
 
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+class TestRecommendationEngine:
+
+    @pytest.fixture(autouse=True)
+    def test_simple_init(self, resources):
+        self.engine = base.RecommendationEngine(channel_list = [resources['channel_name']])
+
+    def test_generate(self):
+        self.engine.generate(iterations = 1)
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file