Add structured logging + log panel to monitor

- setup.py: logging module with file (setup.log) + console output
  - Line-buffered output (fixes background execution buffering)
  - API calls with timeout (300s), retry (3x), debug logging
  - Per-batch progress: [1/29] persona batch 1/20 (20 docs)
  - --verbose flag for debug-level console
- monitor.py: log tail in CLI + web dashboard
  - CLI: colorized last 15 log lines
  - Web: scrollable log panel with level-based colors
- Smaller embed batches (20 instead of 50) for reliability

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
salvacybersec
2026-04-07 00:30:29 +03:00
parent 9105c03b4b
commit 1028d11507
3 changed files with 177 additions and 26 deletions

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
# State files (machine-specific, regenerated by script) # State files (machine-specific, regenerated by script)
upload_progress.json upload_progress.json
setup.log
# OCR output (large binary files) # OCR output (large binary files)
ocr_output/ ocr_output/

View File

@@ -25,6 +25,7 @@ except ImportError:
CONFIG_PATH = Path(__file__).parent / "config.yaml" CONFIG_PATH = Path(__file__).parent / "config.yaml"
PROGRESS_PATH = Path(__file__).parent / "upload_progress.json" PROGRESS_PATH = Path(__file__).parent / "upload_progress.json"
LOG_PATH = Path(__file__).parent / "setup.log"
LANCEDB_PATH = Path.home() / ".config/anythingllm-desktop/storage/lancedb" LANCEDB_PATH = Path.home() / ".config/anythingllm-desktop/storage/lancedb"
DOCS_PATH = Path.home() / ".config/anythingllm-desktop/storage/documents" DOCS_PATH = Path.home() / ".config/anythingllm-desktop/storage/documents"
VCACHE_PATH = Path.home() / ".config/anythingllm-desktop/storage/vector-cache" VCACHE_PATH = Path.home() / ".config/anythingllm-desktop/storage/vector-cache"
@@ -152,6 +153,16 @@ def collect_status():
api_ok = check_api(config) api_ok = check_api(config)
script_running = check_script_running() script_running = check_script_running()
# Read last N lines from setup.log
log_lines = []
if LOG_PATH.exists():
try:
with open(LOG_PATH, "r", encoding="utf-8") as f:
all_lines = f.readlines()
log_lines = [l.rstrip() for l in all_lines[-15:]]
except Exception:
pass
return { return {
"personas": personas, "personas": personas,
"clusters": clusters, "clusters": clusters,
@@ -165,6 +176,7 @@ def collect_status():
"api_online": api_ok, "api_online": api_ok,
"script_running": script_running, "script_running": script_running,
"timestamp": time.strftime("%H:%M:%S"), "timestamp": time.strftime("%H:%M:%S"),
"log_tail": log_lines,
} }
@@ -231,6 +243,22 @@ def cli_output(status):
lines.append("") lines.append("")
# Log tail
log_tail = status.get("log_tail", [])
if log_tail:
lines.append(f" {BOLD}── Log (setup.log) ──{RESET}")
for ll in log_tail:
# Colorize log levels
if "[ERROR]" in ll:
lines.append(f" {RED}{ll}{RESET}")
elif "[WARNING]" in ll:
lines.append(f" \033[33m{ll}{RESET}")
elif "" in ll:
lines.append(f" {GREEN}{ll}{RESET}")
else:
lines.append(f" {DIM}{ll}{RESET}")
lines.append("")
return "\n".join(lines) return "\n".join(lines)
@@ -297,6 +325,13 @@ HTML_TEMPLATE = """<!DOCTYPE html>
.summary-card .label { color: #565f89; font-size: 11px; text-transform: uppercase; } .summary-card .label { color: #565f89; font-size: 11px; text-transform: uppercase; }
.summary-card .value { color: #7aa2f7; font-size: 20px; font-weight: bold; margin-top: 2px; } .summary-card .value { color: #7aa2f7; font-size: 20px; font-weight: bold; margin-top: 2px; }
.summary-card .unit { color: #565f89; font-size: 12px; } .summary-card .unit { color: #565f89; font-size: 12px; }
.log-panel { background: #0d0d12; border: 1px solid #1a1b26; border-radius: 8px; padding: 12px 16px; margin-top: 20px; max-height: 300px; overflow-y: auto; }
.log-panel h3 { color: #565f89; font-size: 12px; text-transform: uppercase; margin-bottom: 8px; }
.log-line { font-size: 12px; line-height: 1.6; color: #565f89; white-space: pre-wrap; word-break: break-all; }
.log-line.error { color: #f7768e; }
.log-line.warning { color: #e0af68; }
.log-line.success { color: #9ece6a; }
.log-line.info { color: #7aa2f7; }
</style> </style>
</head> </head>
<body> <body>
@@ -306,6 +341,7 @@ HTML_TEMPLATE = """<!DOCTYPE html>
<div class="summary" id="summary"></div> <div class="summary" id="summary"></div>
<div class="status-bar" id="statusbar"></div> <div class="status-bar" id="statusbar"></div>
<div id="clusters"></div> <div id="clusters"></div>
<div class="log-panel" id="logpanel"><h3>Log (setup.log)</h3><div id="loglines">No log data</div></div>
<script> <script>
const CLUSTER_ORDER = ['intel', 'cyber', 'military', 'humanities', 'engineering']; const CLUSTER_ORDER = ['intel', 'cyber', 'military', 'humanities', 'engineering'];
@@ -358,6 +394,23 @@ function render(data) {
html += '</table></div>'; html += '</table></div>';
}); });
document.getElementById('clusters').innerHTML = html; document.getElementById('clusters').innerHTML = html;
// Log panel
const logLines = data.log_tail || [];
if (logLines.length > 0) {
let logHtml = '';
logLines.forEach(line => {
let cls = '';
if (line.includes('[ERROR]')) cls = 'error';
else if (line.includes('[WARNING]')) cls = 'warning';
else if (line.includes('')) cls = 'success';
else if (line.includes('[INFO]')) cls = 'info';
logHtml += `<div class="log-line ${cls}">${line.replace(/</g,'&lt;')}</div>`;
});
document.getElementById('loglines').innerHTML = logHtml;
const panel = document.getElementById('logpanel');
panel.scrollTop = panel.scrollHeight;
}
} }
async function poll() { async function poll() {

143
setup.py
View File

@@ -26,11 +26,13 @@ Usage:
import argparse import argparse
import json import json
import logging
import os import os
import shutil import shutil
import subprocess import subprocess
import sys import sys
import time import time
from datetime import datetime
from pathlib import Path from pathlib import Path
import yaml import yaml
@@ -44,9 +46,43 @@ except ImportError:
CONFIG_PATH = Path(__file__).parent / "config.yaml" CONFIG_PATH = Path(__file__).parent / "config.yaml"
PROGRESS_PATH = Path(__file__).parent / "upload_progress.json" PROGRESS_PATH = Path(__file__).parent / "upload_progress.json"
LOG_PATH = Path(__file__).parent / "setup.log"
ANYTHINGLLM_STORAGE = Path.home() / ".config/anythingllm-desktop/storage" ANYTHINGLLM_STORAGE = Path.home() / ".config/anythingllm-desktop/storage"
SKIP_EXT = set() SKIP_EXT = set()
# ──────────────────────────────────────────────────────────
# LOGGING
# ──────────────────────────────────────────────────────────
log = logging.getLogger("anythingllm")
def setup_logging(verbose=False):
    """Attach a file handler and a console handler to the module logger ``log``.

    The file handler writes to ``LOG_PATH`` as UTF-8 and always records at
    DEBUG level. The console handler writes to ``sys.stdout`` and records at
    DEBUG level when ``verbose`` is true, otherwise at INFO level.

    Calling this function again replaces the handlers it installed earlier
    instead of stacking new ones, so every record is emitted once per
    destination and a changed ``verbose`` value takes effect.

    Args:
        verbose: When true, the console handler also shows DEBUG records.
    """
    log.setLevel(logging.DEBUG)
    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S")

    # Remove (and close) only the handlers a previous call installed; handlers
    # attached by other code are left alone.
    for stale in [h for h in log.handlers if getattr(h, "_from_setup_logging", False)]:
        log.removeHandler(stale)
        stale.close()

    # File handler — always debug level
    fh = logging.FileHandler(LOG_PATH, encoding="utf-8")
    fh.setLevel(logging.DEBUG)

    # Console handler — info or debug based on --verbose
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.DEBUG if verbose else logging.INFO)

    # No stream re-wrapping is needed for background/piped execution:
    # StreamHandler.emit() flushes the stream after every record. Keeping the
    # handlers' own streams also keeps the UTF-8 encoding requested above and
    # works when sys.stdout has no underlying file descriptor.
    for handler in (fh, ch):
        handler.setFormatter(fmt)
        handler._from_setup_logging = True
        log.addHandler(handler)
def log_print(msg, level="info"):
    """Send ``msg`` to the module logger at the named level.

    Backward-compatibility shim for call sites that used ``print``: ``level``
    is the name of a logger method (``"info"``, ``"warning"``, ``"error"``,
    ...), looked up on ``log`` and called with ``msg``.
    """
    emit = getattr(log, level)
    emit(msg)
CLUSTERS = { CLUSTERS = {
"intel": ["frodo", "echo", "ghost", "oracle", "wraith", "scribe", "polyglot"], "intel": ["frodo", "echo", "ghost", "oracle", "wraith", "scribe", "polyglot"],
"cyber": ["neo", "bastion", "sentinel", "specter", "phantom", "cipher", "vortex"], "cyber": ["neo", "bastion", "sentinel", "specter", "phantom", "cipher", "vortex"],
@@ -80,16 +116,36 @@ def save_progress(progress):
# API # API
# ────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────
def api_request(config, method, endpoint, **kwargs): def api_request(config, method, endpoint, timeout=120, retries=3, **kwargs):
url = f"{config['anythingllm']['base_url']}{endpoint}" url = f"{config['anythingllm']['base_url']}{endpoint}"
headers = {"Authorization": f"Bearer {config['anythingllm']['api_key']}"} headers = {"Authorization": f"Bearer {config['anythingllm']['api_key']}"}
if "json" in kwargs: if "json" in kwargs:
headers["Content-Type"] = "application/json" headers["Content-Type"] = "application/json"
resp = getattr(requests, method)(url, headers=headers, **kwargs)
for attempt in range(retries):
try:
log.debug(f"API {method.upper()} {endpoint} (attempt {attempt+1})")
resp = getattr(requests, method)(url, headers=headers, timeout=timeout, **kwargs)
if resp.status_code not in (200, 201): if resp.status_code not in (200, 201):
print(f" API error {resp.status_code}: {resp.text[:300]}") log.error(f"API {resp.status_code}: {resp.text[:300]}")
if attempt < retries - 1:
time.sleep(3)
continue
return None return None
log.debug(f"API {method.upper()} {endpoint}{resp.status_code}")
return resp.json() return resp.json()
except requests.exceptions.Timeout:
log.warning(f"API timeout ({timeout}s) on {endpoint} (attempt {attempt+1}/{retries})")
if attempt < retries - 1:
time.sleep(5)
except requests.exceptions.ConnectionError as e:
log.error(f"API connection error: {e}")
if attempt < retries - 1:
time.sleep(5)
except Exception as e:
log.error(f"API unexpected error: {e}")
return None
return None
def api_upload(config, file_path, folder_name=None): def api_upload(config, file_path, folder_name=None):
@@ -436,43 +492,81 @@ def upload_file_batch(config, folder_name, files, progress, batch_size, delay):
def assign_to_workspaces(config, persona_folders, progress, batch_size, delay): def assign_to_workspaces(config, persona_folders, progress, batch_size, delay):
"""Phase C2: assign uploaded docs to persona workspaces.""" """Phase C2: assign uploaded docs to persona workspaces."""
print("── Assigning to workspaces ──\n") log.info("── Assigning to workspaces ──")
existing_ws = get_existing_workspaces(config) existing_ws = get_existing_workspaces(config)
for codename, folders in sorted(persona_folders.items()): if not existing_ws:
log.error("Could not fetch workspaces from API")
return
total_personas = len(persona_folders)
total_embedded = 0
total_failed = 0
for idx, (codename, folders) in enumerate(sorted(persona_folders.items()), 1):
ws_name = config["workspaces"][codename]["name"] ws_name = config["workspaces"][codename]["name"]
ws_info = existing_ws.get(ws_name) ws_info = existing_ws.get(ws_name)
if not ws_info: if not ws_info:
log.warning(f"[{idx}/{total_personas}] {codename}: workspace '{ws_name}' not found, skipping")
continue continue
slug = ws_info["slug"] slug = ws_info["slug"]
doc_locs = [] doc_locs = []
for fn in folders: for fn in folders:
folder_docs = 0
for fpath, info in progress["uploaded_files"].items(): for fpath, info in progress["uploaded_files"].items():
if info.get("folder") == fn and info.get("location"): if info.get("folder") == fn and info.get("location"):
doc_locs.append(info["location"]) doc_locs.append(info["location"])
folder_docs += 1
if folder_docs > 0:
log.debug(f" {codename}/{fn}: {folder_docs} docs")
already = set(progress.get("workspace_docs", {}).get(codename, [])) already = set(progress.get("workspace_docs", {}).get(codename, []))
new_docs = [loc for loc in doc_locs if loc not in already] new_docs = [loc for loc in doc_locs if loc not in already]
if not new_docs: if not new_docs:
if doc_locs: if doc_locs:
print(f" {codename}: {len(doc_locs)} docs assigned") log.info(f"[{idx}/{total_personas}]{codename}: {len(doc_locs)} docs already assigned")
else:
log.info(f"[{idx}/{total_personas}] ○ {codename}: no uploaded docs found")
continue continue
print(f" {codename} ({slug}): {len(new_docs)} docs") log.info(f"[{idx}/{total_personas}]{codename} ({slug}): {len(new_docs)} docs to embed")
for bs in range(0, len(new_docs), batch_size):
batch = new_docs[bs:bs + batch_size] # Use smaller batches for embedding (10-20 is safer than 50)
embed_batch = min(batch_size, 20)
persona_ok = 0
persona_fail = 0
for bs in range(0, len(new_docs), embed_batch):
batch = new_docs[bs:bs + embed_batch]
batch_num = bs // embed_batch + 1
total_batches = (len(new_docs) + embed_batch - 1) // embed_batch
log.debug(f" {codename} batch {batch_num}/{total_batches} ({len(batch)} docs)")
result = api_request(config, "post", f"/workspace/{slug}/update-embeddings", result = api_request(config, "post", f"/workspace/{slug}/update-embeddings",
json={"adds": batch, "deletes": []}) json={"adds": batch, "deletes": []},
timeout=300, retries=3)
if result: if result:
progress.setdefault("workspace_docs", {}).setdefault(codename, []).extend(batch) progress.setdefault("workspace_docs", {}).setdefault(codename, []).extend(batch)
print(f"{len(batch)} docs embedded") persona_ok += len(batch)
log.info(f"{codename} batch {batch_num}/{total_batches}: "
f"{len(batch)} embedded ({persona_ok}/{len(new_docs)})")
else: else:
print(f" ✗ batch failed") persona_fail += len(batch)
if bs + batch_size < len(new_docs): log.error(f"{codename} batch {batch_num}/{total_batches}: FAILED")
time.sleep(delay)
# Save after every batch
save_progress(progress) save_progress(progress)
print()
if bs + embed_batch < len(new_docs):
time.sleep(delay)
total_embedded += persona_ok
total_failed += persona_fail
log.info(f" {codename} done: {persona_ok} ok, {persona_fail} failed")
log.info(f"── Assignment complete: {total_embedded} embedded, {total_failed} failed ──")
def upload_documents(config, persona_list=None, priority_filter=None, def upload_documents(config, persona_list=None, priority_filter=None,
@@ -628,10 +722,10 @@ def show_status(config):
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False): def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
"""Re-assign already-uploaded docs to workspaces without scanning/uploading. """Re-assign already-uploaded docs to workspaces without scanning/uploading.
Skips the slow folder scan — uses upload_progress.json directly.""" Skips the slow folder scan — uses upload_progress.json directly."""
print("═══ Re-assign Workspaces ═══\n") log.info("═══ Re-assign Workspaces ═══")
if not check_api(config): if not check_api(config):
print("AnythingLLM API not reachable.") log.error("AnythingLLM API not reachable")
return return
progress = load_progress() progress = load_progress()
@@ -642,10 +736,10 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
if persona_list: if persona_list:
for p in persona_list: for p in persona_list:
progress.get("workspace_docs", {}).pop(p, None) progress.get("workspace_docs", {}).pop(p, None)
print(f"Reset assignments for: {', '.join(persona_list)}\n") log.info(f"Reset assignments for: {', '.join(persona_list)}")
else: else:
progress["workspace_docs"] = {} progress["workspace_docs"] = {}
print("Reset all workspace assignments\n") log.info("Reset all workspace assignments")
save_progress(progress) save_progress(progress)
# Build persona_folders from config (no disk scan needed) # Build persona_folders from config (no disk scan needed)
@@ -659,8 +753,8 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
] ]
uploaded = len(progress.get("uploaded_files", {})) uploaded = len(progress.get("uploaded_files", {}))
print(f" Uploaded files in progress: {uploaded}") log.info(f"Uploaded files in progress: {uploaded}")
print(f" Personas to assign: {len(persona_folders)}\n") log.info(f"Personas to assign: {len(persona_folders)}")
if dry_run: if dry_run:
existing_ws = get_existing_workspaces(config) existing_ws = get_existing_workspaces(config)
@@ -673,11 +767,11 @@ def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
if info.get("folder") == fn and info.get("location"): if info.get("folder") == fn and info.get("location"):
doc_count += 1 doc_count += 1
already = len(progress.get("workspace_docs", {}).get(codename, [])) already = len(progress.get("workspace_docs", {}).get(codename, []))
print(f" {codename} ({slug}): {doc_count} docs, {already} already assigned") log.info(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
return return
assign_to_workspaces(config, persona_folders, progress, batch_size, delay) assign_to_workspaces(config, persona_folders, progress, batch_size, delay)
print(" Done.\n") log.info("Re-assign complete.")
def resolve_persona_list(args, config): def resolve_persona_list(args, config):
@@ -709,8 +803,11 @@ def main():
parser.add_argument("--max-size", type=int, default=100, help="Max file MB (default: 100)") parser.add_argument("--max-size", type=int, default=100, help="Max file MB (default: 100)")
parser.add_argument("--dry-run", action="store_true") parser.add_argument("--dry-run", action="store_true")
parser.add_argument("--resume", action="store_true") parser.add_argument("--resume", action="store_true")
parser.add_argument("--verbose", "-v", action="store_true", help="Debug-level console output")
args = parser.parse_args() args = parser.parse_args()
setup_logging(verbose=args.verbose)
log.info(f"AnythingLLM Integration started — args: {vars(args)}")
config = load_config() config = load_config()
if not any([args.storage_setup, args.create_workspaces, if not any([args.storage_setup, args.create_workspaces,