Tune speed profiles based on real-world Olla analysis

Log analysis: a 30s timeout is too short (model load takes ~50s), while a 300s timeout wastes time.
A real embed takes 1-20s when the model is warm, and 40-60s on a cold load.

Tuned profiles:
  fast:   90s timeout, 5 retries, batch 5, 1s delay
  medium: 120s timeout, 3 retries, batch 5, 2s delay
  slow:   300s timeout, 3 retries, batch 5, 5s delay

Result: 1-2s/batch when the model is warm (previously 200s/batch on average)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
salvacybersec
2026-04-07 11:08:28 +03:00
parent 0a07045e17
commit 6c5a828b13

View File

@@ -63,23 +63,23 @@ SKIP_EXT = set()
SPEED_PROFILES = { SPEED_PROFILES = {
"fast": { "fast": {
"embed_timeout": 30, # 30s — real embeds take ~18s, fail fast on bad route "embed_timeout": 90, # 90s — covers cold model load (~50s) + embed (~20s)
"embed_retries": 7, # more retries since they're cheap at 30s "embed_retries": 5, # enough retries for Olla routing issues
"embed_batch": 10, # larger batches when connection is good "embed_batch": 5, # safe — 10 can be slow during model load
"batch_delay": 1, # minimal delay "batch_delay": 1, # minimal delay — model stays warm between batches
"verify_interval": 20, # check LanceDB every 20 batches "verify_interval": 20, # check LanceDB every 20 batches
"description": "Aggressive30s timeout, 7 retries, batch 10, 1s delay", "description": "Fast90s timeout, 5 retries, batch 5, 1s delay",
}, },
"medium": { "medium": {
"embed_timeout": 60, # 60s — tolerates some slow responses "embed_timeout": 120, # 120s — comfortable for model load + embed
"embed_retries": 5, # standard retries "embed_retries": 3, # fewer retries needed with longer timeout
"embed_batch": 5, # safe batch size "embed_batch": 5, # safe batch size
"batch_delay": 2, # reasonable delay "batch_delay": 2, # reasonable delay
"verify_interval": 10, # check every 10 batches "verify_interval": 10, # check every 10 batches
"description": "Balanced — 60s timeout, 5 retries, batch 5, 2s delay", "description": "Balanced — 120s timeout, 3 retries, batch 5, 2s delay",
}, },
"slow": { "slow": {
"embed_timeout": 300, # 300s — waits for cold model loads "embed_timeout": 300, # 300s — waits for cold model loads on slow GPU
"embed_retries": 3, # fewer retries since each is expensive "embed_retries": 3, # fewer retries since each is expensive
"embed_batch": 5, # safe batch size "embed_batch": 5, # safe batch size
"batch_delay": 5, # generous delay "batch_delay": 5, # generous delay