From 6c5a828b13e21f40e172f4493210786defa603ad Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Tue, 7 Apr 2026 11:08:28 +0300 Subject: [PATCH] Tune speed profiles based on real-world Olla analysis Log analysis: 30s too short (model load ~50s), 300s wastes time. Real embed takes 1-20s when model is warm, 40-60s on cold load. Tuned profiles: fast: 90s timeout, 5 retries, batch 5, 1s delay medium: 120s timeout, 3 retries, batch 5, 2s delay slow: 300s timeout, 3 retries, batch 5, 5s delay Result: 1-2s/batch when model warm (was 200s/batch avg) Co-Authored-By: Claude Opus 4.6 (1M context) --- setup.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 412e809..9963c3b 100644 --- a/setup.py +++ b/setup.py @@ -63,23 +63,23 @@ SKIP_EXT = set() SPEED_PROFILES = { "fast": { - "embed_timeout": 30, # 30s — real embeds take ~18s, fail fast on bad route - "embed_retries": 7, # more retries since they're cheap at 30s - "embed_batch": 10, # larger batches when connection is good - "batch_delay": 1, # minimal delay + "embed_timeout": 90, # 90s — covers cold model load (~50s) + embed (~20s) + "embed_retries": 5, # enough retries for Olla routing issues + "embed_batch": 5, # safe — 10 can be slow during model load + "batch_delay": 1, # minimal delay — model stays warm between batches "verify_interval": 20, # check LanceDB every 20 batches - "description": "Aggressive — 30s timeout, 7 retries, batch 10, 1s delay", + "description": "Fast — 90s timeout, 5 retries, batch 5, 1s delay", }, "medium": { - "embed_timeout": 60, # 60s — tolerates some slow responses - "embed_retries": 5, # standard retries + "embed_timeout": 120, # 120s — comfortable for model load + embed + "embed_retries": 3, # fewer retries needed with longer timeout "embed_batch": 5, # safe batch size "batch_delay": 2, # reasonable delay "verify_interval": 10, # check every 10 batches - "description": "Balanced — 60s timeout, 5 retries, batch 5, 2s delay", + "description": "Balanced — 120s timeout, 3 retries, batch 5, 2s delay", }, "slow": { - "embed_timeout": 300, # 300s — waits for cold model loads + "embed_timeout": 300, # 300s — waits for cold model loads on slow GPU "embed_retries": 3, # fewer retries since each is expensive "embed_batch": 5, # safe batch size "batch_delay": 5, # generous delay