feat(llm): make LLM request queue rate limits configurable and more conservative

Co-authored-by: Ahmed Allam <ahmed39652003@gmail.com>
This commit is contained in:
cyberseall
2025-11-22 09:07:43 -08:00
committed by GitHub
parent 0c811845f1
commit 86e6ed49bb

View File

@@ -1,5 +1,6 @@
import asyncio
import logging
import os
import threading
import time
from typing import Any
@@ -26,7 +27,15 @@ def should_retry_exception(exception: Exception) -> bool:
class LLMRequestQueue:
def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 1.0):
def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 5.0):
rate_limit_delay = os.getenv("LLM_RATE_LIMIT_DELAY")
if rate_limit_delay:
delay_between_requests = float(rate_limit_delay)
rate_limit_concurrent = os.getenv("LLM_RATE_LIMIT_CONCURRENT")
if rate_limit_concurrent:
max_concurrent = int(rate_limit_concurrent)
self.max_concurrent = max_concurrent
self.delay_between_requests = delay_between_requests
self._semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -52,8 +61,8 @@ class LLMRequestQueue:
self._semaphore.release()
@retry( # type: ignore[misc]
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=2, min=1, max=30),
stop=stop_after_attempt(7),
wait=wait_exponential(multiplier=6, min=12, max=150),
retry=retry_if_exception(should_retry_exception),
reraise=True,
)