Add structured logging + log panel to monitor

- setup.py: logging module with file (setup.log) + console output
  - Line-buffered output (fixes background execution buffering)
  - API calls with timeout (300s), retry (3x), debug logging
  - Per-batch progress: [1/29] persona batch 1/20 (20 docs)
  - --verbose flag for debug-level console
- monitor.py: log tail in CLI + web dashboard
  - CLI: colorized last 15 log lines
  - Web: scrollable log panel with level-based colors
- Smaller embed batches (20 instead of 50) for reliability

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
salvacybersec
2026-04-07 00:30:29 +03:00
parent 9105c03b4b
commit 1028d11507
3 changed files with 177 additions and 26 deletions

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
# State files (machine-specific, regenerated by script)
upload_progress.json
setup.log
# OCR output (large binary files)
ocr_output/

View File

@@ -25,6 +25,7 @@ except ImportError:
CONFIG_PATH = Path(__file__).parent / "config.yaml"
PROGRESS_PATH = Path(__file__).parent / "upload_progress.json"
LOG_PATH = Path(__file__).parent / "setup.log"
LANCEDB_PATH = Path.home() / ".config/anythingllm-desktop/storage/lancedb"
DOCS_PATH = Path.home() / ".config/anythingllm-desktop/storage/documents"
VCACHE_PATH = Path.home() / ".config/anythingllm-desktop/storage/vector-cache"
@@ -152,6 +153,16 @@ def collect_status():
api_ok = check_api(config)
script_running = check_script_running()
# Read last N lines from setup.log
log_lines = []
if LOG_PATH.exists():
try:
with open(LOG_PATH, "r", encoding="utf-8") as f:
all_lines = f.readlines()
log_lines = [l.rstrip() for l in all_lines[-15:]]
except Exception:
pass
return {
"personas": personas,
"clusters": clusters,
@@ -165,6 +176,7 @@ def collect_status():
"api_online": api_ok,
"script_running": script_running,
"timestamp": time.strftime("%H:%M:%S"),
"log_tail": log_lines,
}
@@ -231,6 +243,22 @@ def cli_output(status):
lines.append("")
# Log tail
log_tail = status.get("log_tail", [])
if log_tail:
lines.append(f" {BOLD}── Log (setup.log) ──{RESET}")
for ll in log_tail:
# Colorize log levels
if "[ERROR]" in ll:
lines.append(f" {RED}{ll}{RESET}")
elif "[WARNING]" in ll:
lines.append(f" \033[33m{ll}{RESET}")
elif "" in ll:
lines.append(f" {GREEN}{ll}{RESET}")
else:
lines.append(f" {DIM}{ll}{RESET}")
lines.append("")
return "\n".join(lines)
@@ -297,6 +325,13 @@ HTML_TEMPLATE = """<!DOCTYPE html>
.summary-card .label { color: #565f89; font-size: 11px; text-transform: uppercase; }
.summary-card .value { color: #7aa2f7; font-size: 20px; font-weight: bold; margin-top: 2px; }
.summary-card .unit { color: #565f89; font-size: 12px; }
.log-panel { background: #0d0d12; border: 1px solid #1a1b26; border-radius: 8px; padding: 12px 16px; margin-top: 20px; max-height: 300px; overflow-y: auto; }
.log-panel h3 { color: #565f89; font-size: 12px; text-transform: uppercase; margin-bottom: 8px; }
.log-line { font-size: 12px; line-height: 1.6; color: #565f89; white-space: pre-wrap; word-break: break-all; }
.log-line.error { color: #f7768e; }
.log-line.warning { color: #e0af68; }
.log-line.success { color: #9ece6a; }
.log-line.info { color: #7aa2f7; }
</style>
</head>
<body>
@@ -306,6 +341,7 @@ HTML_TEMPLATE = """<!DOCTYPE html>
<div class="summary" id="summary"></div>
<div class="status-bar" id="statusbar"></div>
<div id="clusters"></div>
<div class="log-panel" id="logpanel"><h3>Log (setup.log)</h3><div id="loglines">No log data</div></div>
<script>
const CLUSTER_ORDER = ['intel', 'cyber', 'military', 'humanities', 'engineering'];
@@ -358,6 +394,23 @@ function render(data) {
html += '</table></div>';
});
document.getElementById('clusters').innerHTML = html;
// Log panel
const logLines = data.log_tail || [];
if (logLines.length > 0) {
let logHtml = '';
logLines.forEach(line => {
let cls = '';
if (line.includes('[ERROR]')) cls = 'error';
else if (line.includes('[WARNING]')) cls = 'warning';
else if (line.includes('')) cls = 'success';
else if (line.includes('[INFO]')) cls = 'info';
logHtml += `<div class="log-line ${cls}">${line.replace(/</g,'&lt;')}</div>`;
});
document.getElementById('loglines').innerHTML = logHtml;
const panel = document.getElementById('logpanel');
panel.scrollTop = panel.scrollHeight;
}
}
async function poll() {

143
setup.py
View File

@@ -26,11 +26,13 @@ Usage:
import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
import yaml
@@ -44,9 +46,43 @@ except ImportError:
CONFIG_PATH = Path(__file__).parent / "config.yaml"
PROGRESS_PATH = Path(__file__).parent / "upload_progress.json"
LOG_PATH = Path(__file__).parent / "setup.log"
ANYTHINGLLM_STORAGE = Path.home() / ".config/anythingllm-desktop/storage"
SKIP_EXT = set()
# ──────────────────────────────────────────────────────────
# LOGGING
# ──────────────────────────────────────────────────────────
log = logging.getLogger("anythingllm")
def setup_logging(verbose=False):
    """Configure the module logger: file handler (always DEBUG) plus a
    console handler on stdout (INFO, or DEBUG when *verbose* is True).

    Safe to call more than once — existing handlers are removed first so
    repeated invocations do not duplicate every log line.
    """
    log.setLevel(logging.DEBUG)
    # Reset so a second call does not stack handlers (each stacked handler
    # would emit every record once more).
    for old in list(log.handlers):
        log.removeHandler(old)
        old.close()
    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S")
    # File handler — always debug level.  StreamHandler.emit() flushes after
    # every record, so the log file stays current without any re-buffering.
    fh = logging.FileHandler(LOG_PATH, encoding="utf-8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt)
    log.addHandler(fh)
    # Console handler — info or debug based on --verbose
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.DEBUG if verbose else logging.INFO)
    ch.setFormatter(fmt)
    log.addHandler(ch)
    # Line-buffer stdout (critical for background/piped execution).  This
    # replaces the old os.dup()/os.fdopen() stream swap, which leaked the
    # original file descriptor and broke the handlers' close() semantics.
    try:
        sys.stdout.reconfigure(line_buffering=True)
    except (AttributeError, ValueError):
        # stdout was replaced by something without reconfigure(); the
        # per-record flush above still keeps output timely.
        pass
def log_print(msg, level="info"):
    """Log and print (backward compat for functions that still use print)."""
    emit = getattr(log, level)
    emit(msg)
CLUSTERS = {
"intel": ["frodo", "echo", "ghost", "oracle", "wraith", "scribe", "polyglot"],
"cyber": ["neo", "bastion", "sentinel", "specter", "phantom", "cipher", "vortex"],
@@ -80,16 +116,36 @@ def save_progress(progress):
# API
# ──────────────────────────────────────────────────────────
def api_request(config, method, endpoint, timeout=120, retries=3, **kwargs):
    """Issue an AnythingLLM API call with timeout and retry.

    config   -- parsed config.yaml (reads anythingllm.base_url / api_key)
    method   -- requests verb name ("get", "post", ...)
    endpoint -- path appended to the base URL
    timeout  -- per-request timeout in seconds
    retries  -- total attempts before giving up
    Returns the decoded JSON body on HTTP 200/201, or None after all
    attempts fail (non-2xx status, timeout, connection error, bad JSON).
    """
    url = f"{config['anythingllm']['base_url']}{endpoint}"
    headers = {"Authorization": f"Bearer {config['anythingllm']['api_key']}"}
    if "json" in kwargs:
        headers["Content-Type"] = "application/json"
    for attempt in range(retries):
        try:
            log.debug(f"API {method.upper()} {endpoint} (attempt {attempt+1})")
            resp = getattr(requests, method)(url, headers=headers, timeout=timeout, **kwargs)
            if resp.status_code not in (200, 201):
                log.error(f"API {resp.status_code}: {resp.text[:300]}")
                if attempt < retries - 1:
                    time.sleep(3)
                    continue
                return None
            log.debug(f"API {method.upper()} {endpoint} -> {resp.status_code}")
            return resp.json()
        except requests.exceptions.Timeout:
            log.warning(f"API timeout ({timeout}s) on {endpoint} (attempt {attempt+1}/{retries})")
            if attempt < retries - 1:
                time.sleep(5)
        except requests.exceptions.ConnectionError as e:
            log.error(f"API connection error: {e}")
            if attempt < retries - 1:
                time.sleep(5)
        except Exception as e:
            # Unexpected failures (e.g. non-JSON body on a 200) are retried
            # like transport errors instead of aborting on the first attempt.
            log.error(f"API unexpected error: {e}")
            if attempt < retries - 1:
                time.sleep(5)
    return None
def api_upload(config, file_path, folder_name=None):
@@ -436,43 +492,81 @@ def upload_file_batch(config, folder_name, files, progress, batch_size, delay):
def assign_to_workspaces(config, persona_folders, progress, batch_size, delay):
"""Phase C2: assign uploaded docs to persona workspaces."""
print("── Assigning to workspaces ──\n")
log.info("── Assigning to workspaces ──")
existing_ws = get_existing_workspaces(config)
for codename, folders in sorted(persona_folders.items()):
if not existing_ws:
log.error("Could not fetch workspaces from API")
return
total_personas = len(persona_folders)
total_embedded = 0
total_failed = 0
for idx, (codename, folders) in enumerate(sorted(persona_folders.items()), 1):
ws_name = config["workspaces"][codename]["name"]
ws_info = existing_ws.get(ws_name)
if not ws_info:
log.warning(f"[{idx}/{total_personas}] {codename}: workspace '{ws_name}' not found, skipping")
continue
slug = ws_info["slug"]
doc_locs = []
for fn in folders:
folder_docs = 0
for fpath, info in progress["uploaded_files"].items():
if info.get("folder") == fn and info.get("location"):
doc_locs.append(info["location"])
folder_docs += 1
if folder_docs > 0:
log.debug(f" {codename}/{fn}: {folder_docs} docs")
already = set(progress.get("workspace_docs", {}).get(codename, []))
new_docs = [loc for loc in doc_locs if loc not in already]
if not new_docs:
if doc_locs:
print(f" {codename}: {len(doc_locs)} docs assigned")
log.info(f"[{idx}/{total_personas}]{codename}: {len(doc_locs)} docs already assigned")
else:
log.info(f"[{idx}/{total_personas}] ○ {codename}: no uploaded docs found")
continue
print(f" {codename} ({slug}): {len(new_docs)} docs")
for bs in range(0, len(new_docs), batch_size):
batch = new_docs[bs:bs + batch_size]
log.info(f"[{idx}/{total_personas}]{codename} ({slug}): {len(new_docs)} docs to embed")
# Use smaller batches for embedding (10-20 is safer than 50)
embed_batch = min(batch_size, 20)
persona_ok = 0
persona_fail = 0
for bs in range(0, len(new_docs), embed_batch):
batch = new_docs[bs:bs + embed_batch]
batch_num = bs // embed_batch + 1
total_batches = (len(new_docs) + embed_batch - 1) // embed_batch
log.debug(f" {codename} batch {batch_num}/{total_batches} ({len(batch)} docs)")
result = api_request(config, "post", f"/workspace/{slug}/update-embeddings",
json={"adds": batch, "deletes": []})
json={"adds": batch, "deletes": []},
timeout=300, retries=3)
if result:
progress.setdefault("workspace_docs", {}).setdefault(codename, []).extend(batch)
print(f"{len(batch)} docs embedded")
persona_ok += len(batch)
log.info(f"{codename} batch {batch_num}/{total_batches}: "
f"{len(batch)} embedded ({persona_ok}/{len(new_docs)})")
else:
print(f" ✗ batch failed")
if bs + batch_size < len(new_docs):
time.sleep(delay)
persona_fail += len(batch)
log.error(f"{codename} batch {batch_num}/{total_batches}: FAILED")
# Save after every batch
save_progress(progress)
print()
if bs + embed_batch < len(new_docs):
time.sleep(delay)
total_embedded += persona_ok
total_failed += persona_fail
log.info(f" {codename} done: {persona_ok} ok, {persona_fail} failed")
log.info(f"── Assignment complete: {total_embedded} embedded, {total_failed} failed ──")
def upload_documents(config, persona_list=None, priority_filter=None,
@@ -628,10 +722,10 @@ def show_status(config):
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
"""Re-assign already-uploaded docs to workspaces without scanning/uploading.
Skips the slow folder scan — uses upload_progress.json directly."""
print("═══ Re-assign Workspaces ═══\n")
log.info("═══ Re-assign Workspaces ═══")
if not check_api(config):
print("AnythingLLM API not reachable.")
log.error("AnythingLLM API not reachable")
return
progress = load_progress()
@@ -642,10 +736,10 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
if persona_list:
for p in persona_list:
progress.get("workspace_docs", {}).pop(p, None)
print(f"Reset assignments for: {', '.join(persona_list)}\n")
log.info(f"Reset assignments for: {', '.join(persona_list)}")
else:
progress["workspace_docs"] = {}
print("Reset all workspace assignments\n")
log.info("Reset all workspace assignments")
save_progress(progress)
# Build persona_folders from config (no disk scan needed)
@@ -659,8 +753,8 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
]
uploaded = len(progress.get("uploaded_files", {}))
print(f" Uploaded files in progress: {uploaded}")
print(f" Personas to assign: {len(persona_folders)}\n")
log.info(f"Uploaded files in progress: {uploaded}")
log.info(f"Personas to assign: {len(persona_folders)}")
if dry_run:
existing_ws = get_existing_workspaces(config)
@@ -673,11 +767,11 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
if info.get("folder") == fn and info.get("location"):
doc_count += 1
already = len(progress.get("workspace_docs", {}).get(codename, []))
print(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
log.info(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
return
assign_to_workspaces(config, persona_folders, progress, batch_size, delay)
print(" Done.\n")
log.info("Re-assign complete.")
def resolve_persona_list(args, config):
@@ -709,8 +803,11 @@ def main():
parser.add_argument("--max-size", type=int, default=100, help="Max file MB (default: 100)")
parser.add_argument("--dry-run", action="store_true")
parser.add_argument("--resume", action="store_true")
parser.add_argument("--verbose", "-v", action="store_true", help="Debug-level console output")
args = parser.parse_args()
setup_logging(verbose=args.verbose)
log.info(f"AnythingLLM Integration started — args: {vars(args)}")
config = load_config()
if not any([args.storage_setup, args.create_workspaces,