mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-08 03:18:32 +03:00
implemented method for retrieving ALL videos from a channel, not just the first 1000, increased robustness of make_request wrapper, added missing unit tests
This commit is contained in:
@@ -24,11 +24,33 @@ COMMENT_API_URL = 'https://comments.odysee.com/api/v2'
|
||||
RECOMMENDATION_API_URL = 'https://recsys.odysee.com/search'
|
||||
NEW_USER_API_URL = 'https://api.odysee.com/user/new'
|
||||
|
||||
# Allow responses to `get_streaming_url` that contain no `streaming_url` field
|
||||
ALLOWED_ERROR_CODES = [-32603]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def make_request(request: Callable, kwargs: dict) -> requests.Response:
|
||||
|
||||
"""Wrapper for retrying request multiple times.
|
||||
"""Wrapper for retrying request multiple times and handling errors.
|
||||
|
||||
This function handles Python exceptions (e.g. HTTPConnectionPool),
|
||||
unsuccessful HTTP error codes (e.g. 429, 403), and errors in the
|
||||
JSON response. If after 5 retries (using exponential backoff) the request
|
||||
is unsuccessful, an exception is raised.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
request: function
|
||||
The requests function to be called.
|
||||
One of {requests.get, requests.post}
|
||||
kwargs: dict
|
||||
Keyword arguments for the ``request`` function. Must include ``url`` key.
|
||||
e.g. ``{'url': 'https://api.odysee.com/user/new'}``
|
||||
Uses a default timeout of 15 seconds.
|
||||
|
||||
Returns
|
||||
-------
|
||||
response: requests.Response
|
||||
"""
|
||||
|
||||
if request not in [requests.get, requests.post]:
|
||||
@@ -43,23 +65,33 @@ def make_request(request: Callable, kwargs: dict) -> requests.Response:
|
||||
response = requests.Response()
|
||||
response.status_code = 418
|
||||
|
||||
exceptions = []
|
||||
status_codes = []
|
||||
retry_reasons = []
|
||||
|
||||
# TODO this looks a bit gross, try to refactor
|
||||
while n_retries < 5:
|
||||
time.sleep(2 ** n_retries - 1)
|
||||
try:
|
||||
response = request(**kwargs)
|
||||
if response.status_code == 200:
|
||||
return response
|
||||
parsed_response = json.loads(response.text)
|
||||
if isinstance(parsed_response, list):
|
||||
return response
|
||||
if parsed_response.get('error') is not None:
|
||||
if parsed_response['error'].get('code', None) not in ALLOWED_ERROR_CODES:
|
||||
retry_reasons.append(f'JSON response error: {parsed_response["error"]}')
|
||||
n_retries += 1
|
||||
else:
|
||||
return response
|
||||
else:
|
||||
return response
|
||||
else:
|
||||
status_codes.append(response.status_code)
|
||||
retry_reasons.append(f'HTTP status code: {response.status_code}')
|
||||
n_retries += 1
|
||||
except Exception as exception:
|
||||
exceptions.append(exception)
|
||||
retry_reasons.append(f'Python exception: {exception}')
|
||||
n_retries += 1
|
||||
|
||||
msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Status codes: {status_codes}; exceptions: {exceptions}'
|
||||
msg = f'Maximum number of retries reached for request {request} with kwargs {kwargs}. Retry reasons: {retry_reasons}'
|
||||
raise ValueError(msg)
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -148,6 +180,19 @@ def get_raw_video_info_list(channel_id: str) -> dict:
|
||||
|
||||
"""Get a list of all videos posted by the channel with the specified channel ID.
|
||||
|
||||
Odysee's ``claim_search`` API (which is used on the browser and LBRY
|
||||
desktop app) only allows up to 1000 videos to be fetched for a single value
|
||||
of the ``release_time`` parameter. You can check this by going to an Odysee
|
||||
channel with a lot of videos (e.g. @etresouverain) and holding the
|
||||
"Page Down" button until you reach the bottom; there will only be 1000
|
||||
videos.
|
||||
|
||||
This function loops over all pages for a single ``release_time`` and
|
||||
fetches the raw video info for all videos until it reaches that 1000 video
|
||||
limit, then uses the minimum of the ``creation_timestamp`` for all videos
|
||||
as the new ``release_time``, and starts over looping over all pages for
|
||||
that new ``release_time``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
raw_video_info_list: list<dict>
|
||||
@@ -156,9 +201,10 @@ def get_raw_video_info_list(channel_id: str) -> dict:
|
||||
|
||||
"""
|
||||
|
||||
raw_video_info_list = []
|
||||
|
||||
claim_id_to_raw_video_info = {}
|
||||
page = 1
|
||||
release_time = int(time.time()) + 86400
|
||||
hit_video_limit = False
|
||||
|
||||
while True:
|
||||
|
||||
@@ -169,7 +215,8 @@ def get_raw_video_info_list(channel_id: str) -> dict:
|
||||
"page_size":30,
|
||||
"page":page,
|
||||
"order_by":["release_time"],
|
||||
"channel_ids":[channel_id]}}
|
||||
"channel_ids":[channel_id],
|
||||
"release_time": f"<{release_time}"}}
|
||||
|
||||
response = make_request(
|
||||
request = requests.post,
|
||||
@@ -180,14 +227,30 @@ def get_raw_video_info_list(channel_id: str) -> dict:
|
||||
result = json.loads(response.text)
|
||||
|
||||
videos = result['result']['items']
|
||||
new_videos = {video['claim_id'] : video for video in videos if video['claim_id'] not in claim_id_to_raw_video_info}
|
||||
|
||||
if not videos:
|
||||
break
|
||||
if len(new_videos) == 0:
|
||||
# if there are no new videos that haven't already been scraped
|
||||
if hit_video_limit:
|
||||
# if Odysee's limit of 1000 videos for a given timestamp was
|
||||
# reached (which updates the `release_time`) on the last
|
||||
# request, this means we have scraped all videos on the channel,
|
||||
# so we break the loop.
|
||||
break
|
||||
else:
|
||||
# we have hit Odysee's limit of 1000 videos for a given
|
||||
# timestamp, so we update `release_time` and reset `page`
|
||||
hit_video_limit = True
|
||||
release_time = min([raw_video_info['meta']['creation_timestamp'] for raw_video_info in claim_id_to_raw_video_info.values()], default = 0)
|
||||
page = 1
|
||||
else:
|
||||
raw_video_info_list.extend(videos)
|
||||
# there were unscraped videos from the last request, so we keep
|
||||
# going in the loop and increment the `page` variable
|
||||
claim_id_to_raw_video_info.update(new_videos)
|
||||
page += 1
|
||||
hit_video_limit = False
|
||||
|
||||
return raw_video_info_list
|
||||
return list(claim_id_to_raw_video_info.values())
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -346,6 +409,10 @@ def append_comment_reactions(comment_info_list: List[dict]) -> List[dict]:
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def get_recommended(video_title: str, video_id: str) -> List[dict]:
|
||||
|
||||
"""Get list of raw video info dicts for a specified video title and video
|
||||
claim_id.
|
||||
"""
|
||||
|
||||
name = quote(video_title)
|
||||
|
||||
@@ -369,30 +436,17 @@ def get_recommended(video_title: str, video_id: str) -> List[dict]:
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def normalized_name_to_video_info(normalized_name: str) -> dict:
|
||||
|
||||
video_url = f"lbry://{normalized_name}"
|
||||
|
||||
json_data = {
|
||||
"jsonrpc":"2.0",
|
||||
"method":"resolve",
|
||||
"params":{
|
||||
"urls":[video_url]}}
|
||||
|
||||
response = make_request(
|
||||
request = requests.post,
|
||||
kwargs = {
|
||||
'url' : BACKEND_API_URL,
|
||||
'json': json_data})
|
||||
|
||||
result = json.loads(response.text)
|
||||
|
||||
return result['result'][video_url]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def normalized_names_to_video_info(normalized_names: List[str]) -> dict:
|
||||
|
||||
"""Convert a list of normalized names of videos to a list of raw video dicts for those videos. Example of a "normalized name" is:
|
||||
|
||||
``'si-une-tude-montre-que-le-masque-permet'``,
|
||||
|
||||
corresponding to the video:
|
||||
|
||||
``https://odysee.com/@filsdepangolin#e/si-une-tude-montre-que-le-masque-permet#e``.
|
||||
"""
|
||||
|
||||
video_urls = [f"lbry://{normalized_name}" for normalized_name in normalized_names]
|
||||
|
||||
json_data = {
|
||||
@@ -414,6 +468,9 @@ def normalized_names_to_video_info(normalized_names: List[str]) -> dict:
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def get_streaming_url(canonical_url: str) -> str:
|
||||
|
||||
"""Retrieve the `streaming_url` for a specified video.
|
||||
"""
|
||||
|
||||
json_data = {
|
||||
"jsonrpc":"2.0",
|
||||
|
||||
@@ -28,10 +28,9 @@ KWARGS_LIST = [
|
||||
('get_video_reactions', ['video_id', 'auth_token']),
|
||||
('get_all_comments', ['video_id']),
|
||||
('append_comment_reactions', ['comment_info_list']),
|
||||
('normalized_name_to_video_info', ['normalized_name']),
|
||||
('get_recommended', ['video_title', 'video_id']),
|
||||
('normalized_names_to_video_info', ['normalized_names']),
|
||||
('get_streaming_url', ['canonical_url']),
|
||||
('get_recommended', ['video_title', 'video_id']),]
|
||||
('get_streaming_url', ['canonical_url']),]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
|
||||
@@ -50,4 +50,15 @@ def test_get_recommended(resources):
|
||||
def test_process_raw_comment_info(resources):
|
||||
base.process_raw_comment_info(raw_comment_info = resources['full_comment_info'])
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
class TestRecommendationEngine:
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def test_simple_init(self, resources):
|
||||
self.engine = base.RecommendationEngine(channel_list = [resources['channel_name']])
|
||||
|
||||
def test_generate(self):
|
||||
self.engine.generate(iterations = 1)
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
Reference in New Issue
Block a user