Tune speed profiles based on real-world Olla analysis

Log analysis: a 30s timeout is too short (model load takes ~50s), while 300s wastes time. A real embed takes 1-20s when the model is warm and 40-60s on a cold load.

Tuned profiles:
- fast: 90s timeout, 5 retries, batch 5, 1s delay
- medium: 120s timeout, 3 retries, batch 5, 2s delay
- slow: 300s timeout, 3 retries, batch 5, 5s delay

Result: 1-2s/batch when the model is warm (was 200s/batch avg).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 11:08:28 +03:00
parent 0a07045e17
commit 6c5a828b13
1 changed files with 9 additions and 9 deletions
--- a/setup.py
+++ b/setup.py
@@ -63,23 +63,23 @@ SKIP_EXT = set()
 SPEED_PROFILES = {
    "fast": {
-        "embed_timeout": 30,     # 30s — real embeds take ~18s, fail fast on bad route
+        "embed_timeout": 90,     # 90s — covers cold model load (~50s) + embed (~20s)
-        "embed_retries": 7,      # more retries since they're cheap at 30s
+        "embed_retries": 5,      # enough retries for Olla routing issues
-        "embed_batch": 10,       # larger batches when connection is good
+        "embed_batch": 5,        # safe — 10 can be slow during model load
-        "batch_delay": 1,        # minimal delay
+        "batch_delay": 1,        # minimal delay — model stays warm between batches
        "verify_interval": 20,   # check LanceDB every 20 batches
-        "description": "Aggressive — 30s timeout, 7 retries, batch 10, 1s delay",
+        "description": "Fast — 90s timeout, 5 retries, batch 5, 1s delay",
    },
    "medium": {
-        "embed_timeout": 60,     # 60s — tolerates some slow responses
+        "embed_timeout": 120,    # 120s — comfortable for model load + embed
-        "embed_retries": 5,      # standard retries
+        "embed_retries": 3,      # fewer retries needed with longer timeout
        "embed_batch": 5,        # safe batch size
        "batch_delay": 2,        # reasonable delay
        "verify_interval": 10,   # check every 10 batches
-        "description": "Balanced — 60s timeout, 5 retries, batch 5, 2s delay",
+        "description": "Balanced — 120s timeout, 3 retries, batch 5, 2s delay",
    },
    "slow": {
-        "embed_timeout": 300,    # 300s — waits for cold model loads
+        "embed_timeout": 300,    # 300s — waits for cold model loads on slow GPU
        "embed_retries": 3,      # fewer retries since each is expensive
        "embed_batch": 5,        # safe batch size
        "batch_delay": 5,        # generous delay