From 6c5a828b13e21f40e172f4493210786defa603ad Mon Sep 17 00:00:00 2001
From: salvacybersec <salva@opsecti.local>
Date: Tue, 7 Apr 2026 11:08:28 +0300
Subject: [PATCH] Tune speed profiles based on real-world Olla analysis

Log analysis: a 30s timeout is too short (model load takes ~50s), and a
300s timeout wastes time. A real embed takes 1-20s when the model is
warm and 40-60s on a cold load.

Tuned profiles:
  fast:   90s timeout, 5 retries, batch 5, 1s delay
  medium: 120s timeout, 3 retries, batch 5, 2s delay
  slow:   300s timeout, 3 retries, batch 5, 5s delay

Result: 1-2s/batch when the model is warm (was 200s/batch on average)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 setup.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 412e809..9963c3b 100644
--- a/setup.py
+++ b/setup.py
@@ -63,23 +63,23 @@ SKIP_EXT = set()
 
 SPEED_PROFILES = {
     "fast": {
-        "embed_timeout": 30,     # 30s — real embeds take ~18s, fail fast on bad route
-        "embed_retries": 7,      # more retries since they're cheap at 30s
-        "embed_batch": 10,       # larger batches when connection is good
-        "batch_delay": 1,        # minimal delay
+        "embed_timeout": 90,     # 90s — covers cold model load (~50s) + embed (~20s)
+        "embed_retries": 5,      # enough retries for Olla routing issues
+        "embed_batch": 5,        # safe — 10 can be slow during model load
+        "batch_delay": 1,        # minimal delay — model stays warm between batches
         "verify_interval": 20,   # check LanceDB every 20 batches
-        "description": "Aggressive — 30s timeout, 7 retries, batch 10, 1s delay",
+        "description": "Fast — 90s timeout, 5 retries, batch 5, 1s delay",
     },
     "medium": {
-        "embed_timeout": 60,     # 60s — tolerates some slow responses
-        "embed_retries": 5,      # standard retries
+        "embed_timeout": 120,    # 120s — comfortable for model load + embed
+        "embed_retries": 3,      # fewer retries needed with longer timeout
         "embed_batch": 5,        # safe batch size
         "batch_delay": 2,        # reasonable delay
         "verify_interval": 10,   # check every 10 batches
-        "description": "Balanced — 60s timeout, 5 retries, batch 5, 2s delay",
+        "description": "Balanced — 120s timeout, 3 retries, batch 5, 2s delay",
     },
     "slow": {
-        "embed_timeout": 300,    # 300s — waits for cold model loads
+        "embed_timeout": 300,    # 300s — waits for cold model loads on slow GPU
         "embed_retries": 3,      # fewer retries since each is expensive
         "embed_batch": 5,        # safe batch size
         "batch_delay": 5,        # generous delay