feat(llm): make LLM request queue rate limits configurable and more conservative

Co-authored-by: Ahmed Allam <ahmed39652003@gmail.com>
Commit 86e6ed49bb (parent 0c811845f1), authored by cyberseall on 2025-11-22 09:07:43 -08:00, committed via GitHub.

View File

@@ -1,5 +1,6 @@
 import asyncio
 import logging
+import os
 import threading
 import time
 from typing import Any
@@ -26,7 +27,15 @@ def should_retry_exception(exception: Exception) -> bool:
 class LLMRequestQueue:
-    def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 1.0):
+    def __init__(self, max_concurrent: int = 6, delay_between_requests: float = 5.0):
+        rate_limit_delay = os.getenv("LLM_RATE_LIMIT_DELAY")
+        if rate_limit_delay:
+            delay_between_requests = float(rate_limit_delay)
+        rate_limit_concurrent = os.getenv("LLM_RATE_LIMIT_CONCURRENT")
+        if rate_limit_concurrent:
+            max_concurrent = int(rate_limit_concurrent)
         self.max_concurrent = max_concurrent
         self.delay_between_requests = delay_between_requests
         self._semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -52,8 +61,8 @@ class LLMRequestQueue:
             self._semaphore.release()

     @retry(  # type: ignore[misc]
-        stop=stop_after_attempt(5),
-        wait=wait_exponential(multiplier=2, min=1, max=30),
+        stop=stop_after_attempt(7),
+        wait=wait_exponential(multiplier=6, min=12, max=150),
         retry=retry_if_exception(should_retry_exception),
         reraise=True,
     )