feat(llm): make LLM request queue rate limits configurable via environment variables and more conservative by default
Co-authored-by: Ahmed Allam <ahmed39652003@gmail.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -26,7 +27,15 @@ def should_retry_exception(exception: Exception) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
class LLMRequestQueue:
|
class LLMRequestQueue:
|
||||||
def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 1.0):
|
def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 5.0):
|
||||||
|
rate_limit_delay = os.getenv("LLM_RATE_LIMIT_DELAY")
|
||||||
|
if rate_limit_delay:
|
||||||
|
delay_between_requests = float(rate_limit_delay)
|
||||||
|
|
||||||
|
rate_limit_concurrent = os.getenv("LLM_RATE_LIMIT_CONCURRENT")
|
||||||
|
if rate_limit_concurrent:
|
||||||
|
max_concurrent = int(rate_limit_concurrent)
|
||||||
|
|
||||||
self.max_concurrent = max_concurrent
|
self.max_concurrent = max_concurrent
|
||||||
self.delay_between_requests = delay_between_requests
|
self.delay_between_requests = delay_between_requests
|
||||||
self._semaphore = threading.BoundedSemaphore(max_concurrent)
|
self._semaphore = threading.BoundedSemaphore(max_concurrent)
|
||||||
@@ -52,8 +61,8 @@ class LLMRequestQueue:
|
|||||||
self._semaphore.release()
|
self._semaphore.release()
|
||||||
|
|
||||||
@retry( # type: ignore[misc]
|
@retry( # type: ignore[misc]
|
||||||
stop=stop_after_attempt(5),
|
stop=stop_after_attempt(7),
|
||||||
wait=wait_exponential(multiplier=2, min=1, max=30),
|
wait=wait_exponential(multiplier=6, min=12, max=150),
|
||||||
retry=retry_if_exception(should_retry_exception),
|
retry=retry_if_exception(should_retry_exception),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user