commit 9e9b75e0b37cb5ec0c6bddda4eb023c2da5299b0 Author: salvacybersec Date: Mon Apr 6 23:07:44 2026 +0300 Initial commit: AnythingLLM persona RAG integration 28 persona workspace with document upload, OCR pipeline, and vector embedding assignment via AnythingLLM API. Supports 5 clusters (intel, cyber, military, humanities, engineering) with batch processing and resume capability. Co-Authored-By: Claude Opus 4.6 (1M context) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ebb715b --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# State files (machine-specific, regenerated by script) +upload_progress.json + +# OCR output (large binary files) +ocr_output/ + +# Python +__pycache__/ +*.pyc +.venv/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..071ac6b --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +# AnythingLLM × Persona RAG Integration + +28 persona workspace'i olan, kitap kütüphanesinden beslenen RAG sistemi. Her persona kendi uzmanlık alanındaki dokümanlarla vektör embed edilmiş durumda. 
+ +## Mimari + +- **AnythingLLM Desktop** — `http://localhost:3001` +- **LLM:** Ollama local (qwen3:14b) +- **Embedding:** Google Gemini (gemini-embedding-001) +- **Vector DB:** LanceDB +- **OCR:** ocrmypdf (tur+eng) +- **Kitap Kaynağı:** `/mnt/storage/Common/Books/` + +## Personalar (5 Cluster) + +| Cluster | Personalar | +|---------|-----------| +| Intel | Frodo, Echo, Ghost, Oracle, Wraith, Scribe, Polyglot | +| Cyber | Neo, Bastion, Sentinel, Specter, Phantom, Cipher, Vortex | +| Military | Marshal, Centurion, Corsair, Warden, Medic | +| Humanities | Chronos, Tribune, Arbiter, Ledger, Sage, Herald, Scholar, Gambit | +| Engineering | Forge, Architect | + +## Kullanım + +```bash +# Durum kontrolü +python3 setup.py --status + +# Workspace oluştur / güncelle +python3 setup.py --create-workspaces + +# Tam pipeline (upload + OCR + embed) +python3 setup.py --upload-documents --resume + +# Tek cluster veya persona +python3 setup.py --upload-documents --cluster cyber --resume +python3 setup.py --upload-documents --persona neo --priority 1 --resume + +# Önizleme +python3 setup.py --upload-documents --dry-run +``` + +## Pipeline + +``` +Phase A: Text dosyaları upload +Phase B: Scanned PDF'leri OCR (ocrmypdf) +Phase C: OCR'lı dosyaları upload +Final: Workspace'lere assign/embed +``` + +## Recovery + +Vektör DB silinirse: +1. `upload_progress.json`'da `workspace_docs` → `{}` sıfırla +2. 
`python3 setup.py --upload-documents --resume` (sadece re-embed yapar) + +## Dosyalar + +- `setup.py` — Ana entegrasyon scripti (upload, OCR, workspace assignment) +- `config.yaml` — Persona-klasör eşlemeleri, API config, batch ayarları +- `upload_progress.json` — Upload/atama state tracker (gitignore'd) diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..7b75466 --- /dev/null +++ b/config.yaml @@ -0,0 +1,425 @@ +# AnythingLLM × Persona Library Integration Config +# Maps personas to book folders for workspace-based RAG +# +# Usage: python3 setup.py [--dry-run] [--persona ] [--upload-documents] + +anythingllm: + base_url: "http://localhost:3001/api/v1" + api_key: "SXQGXH3-AQ64B8E-KQNMDWC-WZBQAFW" + +storage: + book_library: "/mnt/storage/Common/Books" + personas_dir: "/home/salva/Documents/personas/personas" + # AnythingLLM copies uploaded originals to direct-uploads/ + # This symlink sends them to HDD so SSD stays clean + hdd_storage: "/mnt/storage/anythingllm" + +embedding: + primary: + engine: "gemini" + model: "gemini-embedding-001" + fallback: + engine: "ollama" + base_path: "http://127.0.0.1:40114/olla/ollama" + model: "nomic-embed-text" + +# Batch processing — avoid API rate limits +processing: + batch_size: 50 # files per batch + delay_between_batches: 5 # seconds + max_concurrent: 3 # parallel uploads + skip_extensions: # don't process these + - ".bin" + - ".gz" + - ".zip" + - ".html" + - ".php" + - ".jpg" + - ".pptx" + - ".ppt" + - ".doc" + +# ───────────────────────────────────────────────────────────── +# PERSONA → BOOK FOLDER MAPPINGS +# ───────────────────────────────────────────────────────────── +# priority: 1=core (always load), 2=extended (load if capacity allows) +# max_files: cap per folder to keep workspace focused + +workspaces: + + # ══════════════════════════════════════════════════════════ + # INTELLIGENCE CLUSTER + # ══════════════════════════════════════════════════════════ + + frodo: + name: "Frodo — Stratejik 
İstihbarat" + persona_file: "frodo/general.md" + folders: + - path: "Istihbarat/TeoriVeAnaliz" + priority: 1 + - path: "Istihbarat/Arastirmalar" + priority: 1 + - path: "UluslararasiIliskiler" + priority: 1 + - path: "GuvenlikStratejileri" + priority: 1 + - path: "SETA" + priority: 2 + - path: "ORSAM" + priority: 2 + - path: "Istihbarat/TurkIstihbarati" + priority: 2 + - path: "Istihbarat/RusIstihbarati" + priority: 2 + + echo: + name: "Echo — SIGINT/COMINT" + persona_file: "echo/general.md" + folders: + - path: "SiberGuvenlik/FOIA-IA-NSA-SIGINT" + priority: 1 + - path: "SiberGuvenlik/ElektronikGuvenlik" + priority: 1 + - path: "Istihbarat/FOIA-CIA-SogukSavas" + priority: 2 + - path: "SiberGuvenlik/FOIA-SiberSavas" + priority: 2 + + ghost: + name: "Ghost — PSYOP & Bilgi Savaşı" + persona_file: "ghost/general.md" + folders: + - path: "SiberGuvenlik/BilgiSavasi" + priority: 1 + - path: "Istihbarat/SorguTeknikleri" + priority: 1 + - path: "GuvenlikStratejileri" + priority: 2 + + oracle: + name: "Oracle — OSINT & Dijital İstihbarat" + persona_file: "oracle/general.md" + folders: + - path: "SiberGuvenlik/OSINT" + priority: 1 + - path: "Istihbarat/Arastirmalar" + priority: 2 + + wraith: + name: "Wraith — HUMINT & Karşı İstihbarat" + persona_file: "wraith/general.md" + folders: + - path: "Istihbarat/TeoriVeAnaliz" + priority: 1 + - path: "Istihbarat/BiyografiVeAnilar" + priority: 1 + - path: "Istihbarat/TurkIstihbarati" + priority: 1 + - path: "Istihbarat/RusIstihbarati" + priority: 1 + - path: "Istihbarat/SorguTeknikleri" + priority: 2 + - path: "Istihbarat/IstihbaratTarihi" + priority: 2 + + scribe: + name: "Scribe — FOIA Arşivci" + persona_file: "scribe/general.md" + folders: + - path: "FOIA" + priority: 1 + - path: "Istihbarat/CIA" + priority: 1 + - path: "Istihbarat/FOIA-CIA-OrtaDogu" + priority: 1 + - path: "Istihbarat/FOIA-CIA-SogukSavas" + priority: 1 + - path: "Istihbarat/FOIA-CIA-Turkey" + priority: 1 + - path: "Istihbarat/FOIA-FBI-COINTELPRO" + priority: 2 + - 
path: "Istihbarat/FOIA-FBI-Vault" + priority: 2 + - path: "Istihbarat/FOIA-IA-CIA-SogukSavas" + priority: 2 + - path: "Istihbarat/FOIA-IA-CIA-Kuba-OrtaDogu" + priority: 2 + - path: "Istihbarat/FOIA-IA-FBI" + priority: 2 + - path: "Istihbarat/FOIA-IA-WWII" + priority: 2 + - path: "SiberGuvenlik/FOIA-CyberWarfare" + priority: 2 + - path: "SiberGuvenlik/FOIA-IA-NSA-SIGINT" + priority: 2 + + polyglot: + name: "Polyglot — Dilbilim & LINGINT" + persona_file: "polyglot/general.md" + folders: + - path: "Egitim" + priority: 1 + + # ══════════════════════════════════════════════════════════ + # CYBERSECURITY CLUSTER + # ══════════════════════════════════════════════════════════ + + neo: + name: "Neo — Red Team & Exploit Dev" + persona_file: "neo/general.md" + folders: + - path: "SiberGuvenlik/PenetrasyonTesti" + priority: 1 + - path: "SiberGuvenlik/SaldiriTeknikleri" + priority: 1 + - path: "SiberGuvenlik/ZafiyetArastirmasi" + priority: 1 + - path: "SiberGuvenlik/WebGuvenligi" + priority: 2 + + bastion: + name: "Bastion — Blue Team & DFIR" + persona_file: "bastion/general.md" + folders: + - path: "SiberGuvenlik/AdliBilisim" + priority: 1 + - path: "SiberGuvenlik/GenelGuvenlik" + priority: 1 + - path: "SiberGuvenlik/AgGuvenligi" + priority: 2 + - path: "SiberGuvenlik/WindowsGuvenligi" + priority: 2 + + sentinel: + name: "Sentinel — Siber Tehdit İstihbaratı" + persona_file: "sentinel/general.md" + folders: + - path: "SiberGuvenlik/TehditIstihbarati" + priority: 1 + - path: "SiberGuvenlik/SiberSavas" + priority: 1 + - path: "SiberGuvenlik/SiberGuvenlikStratejisi" + priority: 1 + - path: "SiberGuvenlik/FOIA-CyberWarfare" + priority: 2 + + specter: + name: "Specter — Zararlı Yazılım & Tersine Mühendislik" + persona_file: "specter/general.md" + folders: + - path: "SiberGuvenlik/ZararliYazilimAnalizi" + priority: 1 + - path: "SiberGuvenlik/TersineMuhendislik" + priority: 1 + - path: "SiberGuvenlik/KernelGuvenligi" + priority: 2 + + phantom: + name: "Phantom — Web Uygulama 
Güvenliği" + persona_file: "phantom/general.md" + folders: + - path: "SiberGuvenlik/WebGuvenligi" + priority: 1 + - path: "SiberGuvenlik/PenetrasyonTesti" + priority: 2 + - path: "SiberGuvenlik/BulutGuvenligi" + priority: 2 + + cipher: + name: "Cipher — Kriptografi" + persona_file: "cipher/general.md" + folders: + - path: "SiberGuvenlik/Kriptografi" + priority: 1 + - path: "SiberGuvenlik/BilgiGuvenligi" + priority: 2 + + vortex: + name: "Vortex — Ağ Operasyonları" + persona_file: "vortex/general.md" + folders: + - path: "SiberGuvenlik/AgGuvenligi" + priority: 1 + - path: "SiberGuvenlik/DonaninGuvenligi" + priority: 2 + - path: "SiberGuvenlik/IoT" + priority: 2 + + # ══════════════════════════════════════════════════════════ + # MILITARY CLUSTER + # ══════════════════════════════════════════════════════════ + + marshal: + name: "Marshal — Askeri Doktrin & Strateji" + persona_file: "marshal/general.md" + folders: + - path: "AskeriDoktrin" + priority: 1 + - path: "NATO/Doktrin" + priority: 1 + - path: "GuvenlikStratejileri" + priority: 1 + - path: "NATO/Tatbikat" + priority: 2 + + centurion: + name: "Centurion — Askeri Tarih" + persona_file: "centurion/general.md" + folders: + - path: "AskeriTarih" + priority: 1 + - path: "AskeriDoktrin" + priority: 2 + - path: "DunyaTarihi" + priority: 2 + + corsair: + name: "Corsair — Özel Harekat & Düzensiz Savaş" + persona_file: "corsair/general.md" + folders: + - path: "AskeriDoktrin" + priority: 1 + - path: "Istihbarat/TerorMucadele" + priority: 1 + - path: "GuvenlikStratejileri" + priority: 2 + + warden: + name: "Warden — Savunma Analizi & Silah Sistemleri" + persona_file: "warden/general.md" + folders: + - path: "AskeriDoktrin" + priority: 1 + - path: "NATO/Teknik" + priority: 1 + - path: "GuvenlikStratejileri" + priority: 2 + - path: "Istihbarat/SavunmaBakanligiRaporlari" + priority: 2 + + medic: + name: "Medic — Biyomedikal & KBRN" + persona_file: "medic/general.md" + folders: + - path: "Biyomedikal" + priority: 1 + - path: 
"Istihbarat/KBRN" + priority: 1 + - path: "BilimVeArastirma" + priority: 2 + + # ══════════════════════════════════════════════════════════ + # HUMANITIES & ANALYSIS CLUSTER + # ══════════════════════════════════════════════════════════ + + chronos: + name: "Chronos — Dünya Tarihi & Medeniyet" + persona_file: "chronos/general.md" + folders: + - path: "DunyaTarihi" + priority: 1 + - path: "OsmanliTarihi" + priority: 1 + - path: "CumhuriyetTarihi" + priority: 1 + - path: "RusyaTarihi" + priority: 1 + - path: "YahudiTarihi" + priority: 2 + - path: "AskeriTarih" + priority: 2 + + tribune: + name: "Tribune — Siyaset Bilimi & Rejim Analizi" + persona_file: "tribune/general.md" + folders: + - path: "UluslararasiIliskiler" + priority: 1 + - path: "SETA" + priority: 1 + - path: "ORSAM" + priority: 1 + - path: "CumhuriyetTarihi" + priority: 2 + + arbiter: + name: "Arbiter — Uluslararası Hukuk" + persona_file: "arbiter/general.md" + folders: + - path: "Hukuk" + priority: 1 + - path: "UluslararasiIliskiler" + priority: 2 + - path: "NATO/Idari" + priority: 2 + + ledger: + name: "Ledger — Ekonomik İstihbarat & FININT" + persona_file: "ledger/general.md" + folders: + - path: "EkonomiVeFinans" + priority: 1 + + sage: + name: "Sage — Felsefe & İktidar Teorisi" + persona_file: "sage/general.md" + folders: + - path: "FelsefeVeEdebiyat" + priority: 1 + + herald: + name: "Herald — Medya Analizi & Stratejik İletişim" + persona_file: "herald/general.md" + folders: + - path: "SETA" + priority: 1 + - path: "ORSAM" + priority: 2 + - path: "UluslararasiIliskiler" + priority: 2 + + scholar: + name: "Scholar — Akademik Araştırma" + persona_file: "scholar/general.md" + folders: + - path: "BilimVeArastirma" + priority: 1 + - path: "Egitim" + priority: 1 + - path: "UluslararasiIliskiler" + priority: 2 + + gambit: + name: "Gambit — Satranç & Stratejik Düşünce" + persona_file: "gambit/general.md" + folders: + - path: "Satranc" + priority: 1 + + # 
══════════════════════════════════════════════════════════ + # ENGINEERING CLUSTER + # ══════════════════════════════════════════════════════════ + + forge: + name: "Forge — Yazılım & AI/ML" + persona_file: "forge/general.md" + folders: + - path: "AI" + priority: 1 + - path: "Teknoloji" + priority: 1 + - path: "SiberGuvenlik/Programlama" + priority: 2 + - path: "SiberGuvenlik/YapayZekaGuvenligi" + priority: 2 + + architect: + name: "Architect — DevOps & Altyapı" + persona_file: "architect/general.md" + folders: + - path: "Teknoloji" + priority: 1 + - path: "SiberGuvenlik/Linux" + priority: 2 + - path: "SiberGuvenlik/BulutGuvenligi" + priority: 2 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fc28a66 --- /dev/null +++ b/setup.py @@ -0,0 +1,680 @@ +#!/usr/bin/env python3 +""" +AnythingLLM × Persona Library Integration Setup + +Three-phase pipeline: + Phase A: Upload all text-based files + Phase B: OCR all scanned PDFs in-place + Phase C: Upload newly OCR'd files + assign to workspaces + +Usage: + python3 setup.py --storage-setup # Symlink direct-uploads/ to HDD + python3 setup.py --create-workspaces # Create workspaces + load persona prompts + python3 setup.py --upload-documents # Full pipeline: upload → OCR → upload → assign + python3 setup.py --persona frodo # Single persona + python3 setup.py --cluster intel # Intel cluster (frodo,echo,ghost,oracle,wraith,scribe,polyglot) + python3 setup.py --cluster cyber # Cyber cluster + python3 setup.py --cluster military # Military cluster + python3 setup.py --cluster humanities # Humanities cluster + python3 setup.py --cluster engineering # Engineering cluster + python3 setup.py --priority 1 # Only priority 1 (core) folders + python3 setup.py --dry-run # Preview + python3 setup.py --status # Show state +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import time +from pathlib import Path + +import yaml + +try: + import requests +except ImportError: + 
def api_request(config, method, endpoint, timeout=120, **kwargs):
    """Send an authenticated request to the AnythingLLM API.

    Args:
        config: Parsed config dict; reads config["anythingllm"]["base_url"]
            and config["anythingllm"]["api_key"].
        method: requests function name ("get", "post", ...).
        endpoint: Path appended to base_url (must start with "/").
        timeout: Seconds before the request is aborted. New, defaulted
            parameter — existing callers are unaffected.
        **kwargs: Forwarded to requests (e.g. json=payload).

    Returns:
        The decoded JSON body on HTTP 200/201, otherwise None (the error
        status and a snippet of the response body are printed).
    """
    url = f"{config['anythingllm']['base_url']}{endpoint}"
    headers = {"Authorization": f"Bearer {config['anythingllm']['api_key']}"}
    if "json" in kwargs:
        headers["Content-Type"] = "application/json"
    # BUGFIX: requests has no default timeout, so a stalled server would hang
    # this script forever. Respect an explicit caller override via kwargs.
    kwargs.setdefault("timeout", timeout)
    resp = getattr(requests, method)(url, headers=headers, **kwargs)
    if resp.status_code not in (200, 201):
        print(f"    API error {resp.status_code}: {resp.text[:300]}")
        return None
    return resp.json()
def get_existing_workspaces(config):
    """Return a mapping of workspace display name -> workspace dict.

    Queries the /workspaces endpoint; an unreachable API or a response
    without a "workspaces" key yields an empty mapping.
    """
    data = api_request(config, "get", "/workspaces")
    if not data or "workspaces" not in data:
        return {}
    return {entry["name"]: entry for entry in data["workspaces"]}
def ocr_pdf(file_path, language="tur+eng", dpi=200):
    """Run ocrmypdf on a scanned PDF and overwrite it in place.

    The temporary output file is created in the same directory as the
    source so the final rename is an atomic same-filesystem replace.
    Returns True on success, False on any failure (source left intact).
    """
    import tempfile
    fd, scratch = tempfile.mkstemp(suffix=".pdf", dir=file_path.parent)
    os.close(fd)
    scratch = Path(scratch)
    cmd = [
        "ocrmypdf", "--skip-text", "--rotate-pages", "--deskew",
        "--jobs", "4", "--image-dpi", str(dpi), "-l", language,
        "--output-type", "pdf", "--quiet",
        str(file_path), str(scratch),
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
        # Exit code 6 is accepted as success in addition to 0
        # (ocrmypdf reports 6 when pages already carried a text layer).
        produced_output = (
            proc.returncode in (0, 6)
            and scratch.exists()
            and scratch.stat().st_size > 0
        )
        if produced_output:
            scratch.replace(file_path)
            return True
    except Exception:
        pass
    scratch.unlink(missing_ok=True)
    return False
def extract_system_prompt(config, persona_file):
    """Load a persona's system prompt from its markdown or _meta.yaml file.

    Looks up ``persona_file`` under storage.personas_dir; if that file is
    missing, falls back to ``<persona dir>/_meta.yaml``. Markdown files may
    carry a YAML frontmatter block (``--- ... ---``) whose ``tone`` key is
    prepended to the prompt body.

    Returns the prompt text, or None when neither file exists.
    """
    personas_dir = Path(config["storage"]["personas_dir"])
    fp = personas_dir / persona_file
    if not fp.exists():
        alt = personas_dir / persona_file.replace("/general.md", "") / "_meta.yaml"
        fp = alt if alt.exists() else None
    if not fp:
        return None

    content = fp.read_text(encoding="utf-8")
    if fp.suffix == ".yaml":
        meta = yaml.safe_load(content)
        # BUGFIX: safe_load returns None for an empty file (and may return a
        # scalar for degenerate YAML); calling .get() on that would crash.
        if not isinstance(meta, dict):
            return ""
        return meta.get("system_prompt", meta.get("description", ""))

    parts = content.split("---")
    if len(parts) >= 3:
        try:
            fm = yaml.safe_load(parts[1])
        except yaml.YAMLError:
            fm = {}
        # BUGFIX: an empty frontmatter block parses to None, and malformed
        # (but loadable) YAML can parse to a scalar — either broke fm.get().
        if not isinstance(fm, dict):
            fm = {}
        tone = fm.get("tone", "")
        body = "---".join(parts[2:]).strip()
        return f"Tone: {tone}\n\n{body}" if tone else body
    return content
def collect_all_files(config, book_library, persona_list=None, priority_filter=None, max_size_mb=100):
    """Scan all configured folders and classify files as text-based or scanned.

    Args:
        config: Parsed config dict; iterates config["workspaces"].
        book_library: Path to the book library root.
        persona_list: Optional whitelist of persona codenames.
        priority_filter: If set, skip folders with priority greater than this.
        max_size_mb: Skip files larger than this many MB.

    Returns a 4-tuple:
        text_files:      folder_name -> [Path] files assumed text-extractable
        scanned_files:   folder_name -> [Path] image-only PDFs needing OCR
        persona_folders: codename -> [folder_name] folders feeding that persona
        folder_to_path:  folder_name -> source directory Path
    """
    text_files = {}
    scanned_files = {}
    persona_folders = {}
    folder_to_path = {}

    for codename, ws_config in config["workspaces"].items():
        if persona_list and codename not in persona_list:
            continue
        persona_folders[codename] = []

        for entry in ws_config.get("folders", []):
            folder_path = entry["path"]
            priority = entry.get("priority", 2)
            if priority_filter and priority > priority_filter:
                continue

            folder_name = folder_path.replace("/", "_")
            persona_folders[codename].append(folder_name)

            if folder_name in text_files:
                continue  # shared folder already scanned for another persona

            src = book_library / folder_path
            folder_to_path[folder_name] = src
            text_files[folder_name] = []
            scanned_files[folder_name] = []

            if not src.exists():
                print(f"    ⚠ {folder_path} not found")
                continue

            all_files = _list_candidate_files(src, max_size_mb)
            print(f"    {folder_name}: {len(all_files)} files found", flush=True)

            # Classify every file (pdffonts is fast: ~0.04s per file)
            for i, f in enumerate(all_files):
                if is_scanned_pdf(f):
                    scanned_files[folder_name].append(f)
                else:
                    text_files[folder_name].append(f)
                if (i + 1) % 500 == 0:
                    print(f"      {folder_name}: {i+1}/{len(all_files)} classified...", flush=True)

            t = len(text_files[folder_name])
            s = len(scanned_files[folder_name])
            print(f"    {folder_name}: {t} text, {s} scanned")

    return text_files, scanned_files, persona_folders, folder_to_path


def _list_candidate_files(src, max_size_mb):
    """Return sorted non-empty files under src, skipping SKIP_EXT/oversized.

    Uses `find -printf "%s %p"` for speed on HDD (one pass, no per-file stat
    from Python); falls back to Path.rglob when find fails or times out.
    """
    max_bytes = (max_size_mb or 9999) * 1024 * 1024
    try:
        find_result = subprocess.run(
            ["find", str(src), "-type", "f", "-not", "-empty",
             "-printf", "%s %p\n"],
            capture_output=True, text=True, timeout=120,
        )
        # BUGFIX: a failing `find` (nonzero exit) used to produce an empty
        # listing silently instead of triggering the rglob fallback below.
        if find_result.returncode != 0:
            raise RuntimeError(find_result.stderr[:200])
        files = []
        for line in find_result.stdout.strip().split("\n"):
            if not line:
                continue
            parts = line.split(" ", 1)  # size, then path (path may hold spaces)
            if len(parts) != 2:
                continue
            size, path = int(parts[0]), Path(parts[1])
            if path.suffix.lower() not in SKIP_EXT and size <= max_bytes:
                files.append(path)
        files.sort()
        return files
    except Exception:
        files = []
        for f in src.rglob("*"):
            if not f.is_file() or f.suffix.lower() in SKIP_EXT:
                continue
            size = f.stat().st_size  # stat once instead of per condition
            if 0 < size <= max_bytes:
                files.append(f)
        files.sort()
        return files
def assign_to_workspaces(config, persona_folders, progress, batch_size, delay):
    """Phase C2: embed already-uploaded documents into persona workspaces.

    For each persona, gathers the upload locations of every file that came
    from one of its folders, skips locations already recorded under
    progress["workspace_docs"], and pushes the remainder to the workspace's
    update-embeddings endpoint in batches, sleeping `delay` seconds between
    batches. Progress is persisted after every batch.
    """
    print("── Assigning to workspaces ──\n")
    workspaces = get_existing_workspaces(config)

    for codename, folder_names in sorted(persona_folders.items()):
        ws_name = config["workspaces"][codename]["name"]
        ws_info = workspaces.get(ws_name)
        if not ws_info:
            continue  # workspace was never created — nothing to assign to

        slug = ws_info["slug"]
        # Every uploaded document whose source folder feeds this persona.
        doc_locs = [
            info["location"]
            for fn in folder_names
            for info in progress["uploaded_files"].values()
            if info.get("folder") == fn and info.get("location")
        ]

        done = set(progress.get("workspace_docs", {}).get(codename, []))
        pending = [loc for loc in doc_locs if loc not in done]
        if not pending:
            if doc_locs:
                print(f"  ✓ {codename}: {len(doc_locs)} docs assigned")
            continue

        print(f"  → {codename} ({slug}): {len(pending)} docs")
        for start in range(0, len(pending), batch_size):
            batch = pending[start:start + batch_size]
            result = api_request(
                config, "post", f"/workspace/{slug}/update-embeddings",
                json={"adds": batch, "deletes": []},
            )
            if result:
                progress.setdefault("workspace_docs", {}).setdefault(codename, []).extend(batch)
                print(f"    ✓ {len(batch)} docs embedded")
            else:
                print(f"    ✗ batch failed")
            if start + batch_size < len(pending):
                time.sleep(delay)
            # Persist after every batch so --resume never re-embeds.
            save_progress(progress)
    print()
print(f" Scanned PDFs: {total_scanned}") + print(f" Already uploaded: {already}") + print(f" Personas: {len(persona_folders)}\n") + + if dry_run: + for fn in sorted(set(list(text_files.keys()) + list(scanned_files.keys()))): + t = len([f for f in text_files.get(fn, []) if str(f) not in progress["uploaded_files"]]) + s = len([f for f in scanned_files.get(fn, []) if str(f) not in progress["ocr_done"]]) + if t or s: + print(f" {fn}: {t} text, {s} scanned") + print(f"\n Personas:") + for c, flds in sorted(persona_folders.items()): + print(f" {c}: {', '.join(flds)}") + return + + # ── Phase A: Upload text-based files ── + print("══ Phase A: Upload text-based files ══\n") + total_up = total_fail = 0 + for fn, files in sorted(text_files.items()): + up, fail = upload_file_batch(config, fn, files, progress, batch_size, delay) + total_up += up + total_fail += fail + print(f"\n Phase A done: {total_up} uploaded, {total_fail} failed\n") + + # ── Phase B: OCR scanned PDFs ── + total_scanned_remaining = sum( + 1 for files in scanned_files.values() + for f in files if str(f) not in progress.get("ocr_done", []) + ) + if total_scanned_remaining > 0: + print(f"══ Phase B: OCR {total_scanned_remaining} scanned PDFs ══\n") + ocr_ok = ocr_fail = 0 + for fn, files in sorted(scanned_files.items()): + pending = [f for f in files if str(f) not in progress.get("ocr_done", [])] + if not pending: + continue + print(f" → {fn}: {len(pending)} PDFs") + for i, pdf in enumerate(pending): + size_mb = pdf.stat().st_size / (1024 * 1024) + print(f" [{i+1}/{len(pending)}] {pdf.name} ({size_mb:.1f}MB)", end="", flush=True) + if ocr_pdf(pdf): + progress.setdefault("ocr_done", []).append(str(pdf)) + ocr_ok += 1 + print(f" ✓") + else: + progress.setdefault("ocr_failed", []).append(str(pdf)) + ocr_fail += 1 + print(f" ✗") + if (ocr_ok + ocr_fail) % 5 == 0: + save_progress(progress) + save_progress(progress) + print(f"\n Phase B done: {ocr_ok} OCR'd, {ocr_fail} failed\n") + + # ── Phase C: Upload OCR'd files 
── + ocr_to_upload = {fn: [f for f in files if str(f) in progress.get("ocr_done", [])] + for fn, files in scanned_files.items()} + total_ocr_upload = sum( + 1 for files in ocr_to_upload.values() + for f in files if str(f) not in progress["uploaded_files"] + ) + if total_ocr_upload > 0: + print(f"══ Phase C: Upload {total_ocr_upload} OCR'd files ══\n") + total_up2 = total_fail2 = 0 + for fn, files in sorted(ocr_to_upload.items()): + if not files: + continue + up, fail = upload_file_batch(config, fn, files, progress, batch_size, delay) + total_up2 += up + total_fail2 += fail + print(f"\n Phase C done: {total_up2} uploaded, {total_fail2} failed\n") + + # ── Assign to workspaces ── + assign_to_workspaces(config, persona_folders, progress, batch_size, delay) + + +# ────────────────────────────────────────────────────────── +# STATUS +# ────────────────────────────────────────────────────────── + +def show_status(config): + print("═══ Integration Status ═══\n") + + # Storage + du = ANYTHINGLLM_STORAGE / "direct-uploads" + if du.is_symlink(): + print(f" ✓ direct-uploads/ → {du.resolve()} (HDD)") + elif du.exists(): + print(f" ⚠ direct-uploads/ on SSD — run --storage-setup") + + for d in ["documents", "lancedb", "vector-cache"]: + p = ANYTHINGLLM_STORAGE / d + if p.exists(): + try: + sz = sum(f.stat().st_size for f in p.rglob("*") if f.is_file()) / (1024**2) + print(f" ● {d}/ ({sz:.0f} MB)") + except Exception: + print(f" ● {d}/") + + api_ok = check_api(config) if config["anythingllm"]["api_key"] else False + collector_ok = check_collector_alive() + print(f"\n API: {'✓' if api_ok else '✗'} Collector: {'✓' if collector_ok else '✗'}") + + progress = load_progress() + uploaded = len(progress.get("uploaded_files", {})) + ocr_done = len(progress.get("ocr_done", [])) + assigned = len(progress.get("workspace_docs", {})) + + if uploaded or ocr_done: + print(f"\n Uploaded: {uploaded} OCR'd: {ocr_done} Personas assigned: {assigned}") + folders = {} + for info in 
def resolve_persona_list(args, config):
    """Map the --persona / --cluster CLI flags to a list of persona codenames.

    Returns [persona] when --persona is given, the cluster's member list for
    --cluster (exiting with status 1 on an unknown cluster name), or None,
    which callers interpret as "all personas".
    """
    if args.persona:
        return [args.persona]
    if not args.cluster:
        return None  # no filter requested
    members = CLUSTERS.get(args.cluster)
    if not members:
        print(f"Unknown cluster: {args.cluster}")
        print(f"Available: {', '.join(CLUSTERS.keys())}")
        sys.exit(1)
    return members
args.create_workspaces or args.all: + create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run) + if args.upload_documents or args.all: + upload_documents(config, persona_list=persona_list, + priority_filter=args.priority, + dry_run=args.dry_run, + resume=args.resume or args.all, + max_size_mb=args.max_size) + + +if __name__ == "__main__": + main()