Tune speed profiles based on real-world Olla analysis
Log analysis: a 30s timeout is too short (model load takes ~50s), while 300s wastes time. A real embed takes 1-20s when the model is warm and 40-60s on a cold load.

Tuned profiles:
- fast: 90s timeout, 5 retries, batch 5, 1s delay
- medium: 120s timeout, 3 retries, batch 5, 2s delay
- slow: 300s timeout, 3 retries, batch 5, 5s delay

Result: 1-2s/batch when the model is warm (was 200s/batch on average).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
18
setup.py
18
setup.py
@@ -63,23 +63,23 @@ SKIP_EXT = set()
|
|||||||
|
|
||||||
SPEED_PROFILES = {
|
SPEED_PROFILES = {
|
||||||
"fast": {
|
"fast": {
|
||||||
"embed_timeout": 30, # 30s — real embeds take ~18s, fail fast on bad route
|
"embed_timeout": 90, # 90s — covers cold model load (~50s) + embed (~20s)
|
||||||
"embed_retries": 7, # more retries since they're cheap at 30s
|
"embed_retries": 5, # enough retries for Olla routing issues
|
||||||
"embed_batch": 10, # larger batches when connection is good
|
"embed_batch": 5, # safe — 10 can be slow during model load
|
||||||
"batch_delay": 1, # minimal delay
|
"batch_delay": 1, # minimal delay — model stays warm between batches
|
||||||
"verify_interval": 20, # check LanceDB every 20 batches
|
"verify_interval": 20, # check LanceDB every 20 batches
|
||||||
"description": "Aggressive — 30s timeout, 7 retries, batch 10, 1s delay",
|
"description": "Fast — 90s timeout, 5 retries, batch 5, 1s delay",
|
||||||
},
|
},
|
||||||
"medium": {
|
"medium": {
|
||||||
"embed_timeout": 60, # 60s — tolerates some slow responses
|
"embed_timeout": 120, # 120s — comfortable for model load + embed
|
||||||
"embed_retries": 5, # standard retries
|
"embed_retries": 3, # fewer retries needed with longer timeout
|
||||||
"embed_batch": 5, # safe batch size
|
"embed_batch": 5, # safe batch size
|
||||||
"batch_delay": 2, # reasonable delay
|
"batch_delay": 2, # reasonable delay
|
||||||
"verify_interval": 10, # check every 10 batches
|
"verify_interval": 10, # check every 10 batches
|
||||||
"description": "Balanced — 60s timeout, 5 retries, batch 5, 2s delay",
|
"description": "Balanced — 120s timeout, 3 retries, batch 5, 2s delay",
|
||||||
},
|
},
|
||||||
"slow": {
|
"slow": {
|
||||||
"embed_timeout": 300, # 300s — waits for cold model loads
|
"embed_timeout": 300, # 300s — waits for cold model loads on slow GPU
|
||||||
"embed_retries": 3, # fewer retries since each is expensive
|
"embed_retries": 3, # fewer retries since each is expensive
|
||||||
"embed_batch": 5, # safe batch size
|
"embed_batch": 5, # safe batch size
|
||||||
"batch_delay": 5, # generous delay
|
"batch_delay": 5, # generous delay
|
||||||
|
|||||||
Reference in New Issue
Block a user