fix: improve request queue reliability and reduce stuck requests
This commit is contained in:
@@ -17,4 +17,4 @@ class LLMConfig:
|
|||||||
self.enable_prompt_caching = enable_prompt_caching
|
self.enable_prompt_caching = enable_prompt_caching
|
||||||
self.prompt_modules = prompt_modules or []
|
self.prompt_modules = prompt_modules or []
|
||||||
|
|
||||||
self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "600"))
|
self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "300"))
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ def should_retry_exception(exception: Exception) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
class LLMRequestQueue:
|
class LLMRequestQueue:
|
||||||
def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 5.0):
|
def __init__(self, max_concurrent: int = 1, delay_between_requests: float = 4.0):
|
||||||
rate_limit_delay = os.getenv("LLM_RATE_LIMIT_DELAY")
|
rate_limit_delay = os.getenv("LLM_RATE_LIMIT_DELAY")
|
||||||
if rate_limit_delay:
|
if rate_limit_delay:
|
||||||
delay_between_requests = float(rate_limit_delay)
|
delay_between_requests = float(rate_limit_delay)
|
||||||
@@ -61,8 +61,8 @@ class LLMRequestQueue:
|
|||||||
self._semaphore.release()
|
self._semaphore.release()
|
||||||
|
|
||||||
@retry( # type: ignore[misc]
|
@retry( # type: ignore[misc]
|
||||||
stop=stop_after_attempt(7),
|
stop=stop_after_attempt(3),
|
||||||
wait=wait_exponential(multiplier=6, min=12, max=150),
|
wait=wait_exponential(multiplier=8, min=8, max=64),
|
||||||
retry=retry_if_exception(should_retry_exception),
|
retry=retry_if_exception(should_retry_exception),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user