From c45efcb261e20a2a0d0961e1dcee764140e6975e Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Tue, 7 Apr 2026 00:04:36 +0300
Subject: [PATCH] Add --reassign mode for fast vector recovery without disk scanning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skips the slow folder scan (50K+ files) and upload phases — directly
re-embeds already-uploaded documents to workspaces using progress state.
Use with --reset to clear assignment tracking first.

With --dry-run the reset is only previewed: upload_progress.json is not
modified.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 README.md | 76 +++++++++++++++++++++++++++++++++++++++++++++----------
 setup.py  | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 144 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 071ac6b..6e6d137 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@
 - **OCR:** ocrmypdf (tur+eng)
 - **Kitap Kaynağı:** `/mnt/storage/Common/Books/`
 
+Kitaplar diskte tek kopya halinde durur (`/mnt/storage/Common/Books/`). AnythingLLM API'sine upload edildiğinde tokenize edilmiş metadata kopyası `documents/` klasörüne alınır. `config.yaml`'da aynı klasör birden fazla persona'ya atanabilir — diskte duplicate oluşmaz, her workspace kendi vektör embedding'ini tutar.
+
 ## Personalar (5 Cluster)
 
 | Cluster | Personalar |
@@ -23,24 +25,52 @@
 
 ## Kullanım
 
+### Durum Kontrolü
+
 ```bash
-# Durum kontrolü
 python3 setup.py --status
+```
 
-# Workspace oluştur / güncelle
+### Workspace Oluştur / Güncelle
+
+```bash
 python3 setup.py --create-workspaces
+python3 setup.py --create-workspaces --persona frodo
+```
 
-# Tam pipeline (upload + OCR + embed)
+### Tam Pipeline (upload + OCR + embed)
+
+```bash
 python3 setup.py --upload-documents --resume
-
-# Tek cluster veya persona
 python3 setup.py --upload-documents --cluster cyber --resume
 python3 setup.py --upload-documents --persona neo --priority 1 --resume
-
-# Önizleme
 python3 setup.py --upload-documents --dry-run
 ```
 
+### Re-assign (vektör recovery — tarama/upload yapmadan)
+
+Dokümanlar zaten upload edilmiş ama vektörler silinmişse veya workspace atamaları bozulmuşsa:
+
+```bash
+# Önizleme
+python3 setup.py --reassign --reset --dry-run
+
+# Tüm workspace'leri sıfırla + yeniden embed et
+python3 setup.py --reassign --reset
+
+# Sadece tek persona veya cluster
+python3 setup.py --reassign --reset --persona frodo
+python3 setup.py --reassign --reset --cluster intel
+
+# Sıfırlamadan sadece eksik atamaları tamamla
+python3 setup.py --reassign
+```
+
+| Flag | Açıklama |
+|------|----------|
+| `--reassign` | Disk taraması ve upload yapmadan, `upload_progress.json`'daki mevcut dosyaları workspace'lere embed eder |
+| `--reset` | `--reassign` ile birlikte kullanılır. Önce `workspace_docs` kaydını sıfırlar, sonra tümünü yeniden atar |
+
 ## Pipeline
 
 ```
@@ -50,14 +80,34 @@ Phase C: OCR'lı dosyaları upload
 Final: Workspace'lere assign/embed
 ```
 
+`--reassign` modu sadece "Final" adımını çalıştırır — diğer fazları atlar.
+
 ## Recovery
 
-Vektör DB silinirse:
-1. `upload_progress.json`'da `workspace_docs` → `{}` sıfırla
-2. `python3 setup.py --upload-documents --resume` (sadece re-embed yapar)
+### Vektör DB Silinirse
+
+```bash
+python3 setup.py --reassign --reset
+```
+
+### Tek Persona Vektörü Bozulursa
+
+```bash
+python3 setup.py --reassign --reset --persona frodo
+```
+
+### Tam Sıfırlama (her şey baştan)
+
+```bash
+rm upload_progress.json
+python3 setup.py --all
+```
 
 ## Dosyalar
 
-- `setup.py` — Ana entegrasyon scripti (upload, OCR, workspace assignment)
-- `config.yaml` — Persona-klasör eşlemeleri, API config, batch ayarları
-- `upload_progress.json` — Upload/atama state tracker (gitignore'd)
+| Dosya | Açıklama |
+|-------|----------|
+| `setup.py` | Ana entegrasyon scripti (upload, OCR, workspace assignment, reassign) |
+| `config.yaml` | Persona-klasör eşlemeleri, API config, batch ayarları |
+| `upload_progress.json` | Upload/atama state tracker (gitignore'd, makineye özel) |
+| `ocr_output/` | OCR çıktıları (gitignore'd, büyük dosyalar) |
diff --git a/setup.py b/setup.py
index fc28a66..724d6fa 100644
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,8 @@ Usage:
     python3 setup.py --storage-setup       # Symlink direct-uploads/ to HDD
     python3 setup.py --create-workspaces   # Create workspaces + load persona prompts
     python3 setup.py --upload-documents    # Full pipeline: upload → OCR → upload → assign
+    python3 setup.py --reassign            # Re-assign existing docs to workspaces (no scan/upload)
+    python3 setup.py --reassign --reset    # Reset assignment tracking + re-assign all
     python3 setup.py --persona frodo       # Single persona
     python3 setup.py --cluster intel       # Intel cluster (frodo,echo,ghost,oracle,wraith,scribe,polyglot)
     python3 setup.py --cluster cyber       # Cyber cluster
@@ -623,6 +625,79 @@ def show_status(config):
 # MAIN
 # ──────────────────────────────────────────────────────────
 
+def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
+    """Re-assign already-uploaded docs to workspaces without scanning/uploading.
+
+    Skips the slow folder scan — uses upload_progress.json directly.
+
+    Args:
+        config: Parsed config.yaml; reads config["processing"] and
+            config["workspaces"].
+        persona_list: Optional codenames to restrict the run to; falsy
+            means every workspace in the config.
+        reset: Clear the tracked workspace_docs entries before assigning.
+        dry_run: Print per-persona counts only. Nothing is assigned and
+            the progress file is left untouched, even with reset=True.
+    """
+    print("═══ Re-assign Workspaces ═══\n")
+
+    if not check_api(config):
+        print(" ✗ AnythingLLM API not reachable.")
+        return
+
+    progress = load_progress()
+    batch_size = config["processing"]["batch_size"]
+    delay = config["processing"]["delay_between_batches"]
+
+    if reset:
+        verb = "Would reset" if dry_run else "Reset"
+        if persona_list:
+            for p in persona_list:
+                progress.get("workspace_docs", {}).pop(p, None)
+            print(f" ✓ {verb} assignments for: {', '.join(persona_list)}\n")
+        else:
+            progress["workspace_docs"] = {}
+            print(f" ✓ {verb} all workspace assignments\n")
+        # A dry run is a preview: clear only the in-memory copy so the
+        # counts below reflect the reset, but never persist it.
+        if not dry_run:
+            save_progress(progress)
+
+    # Build persona_folders from config (no disk scan needed)
+    persona_folders = {}
+    for codename, ws_config in config["workspaces"].items():
+        if persona_list and codename not in persona_list:
+            continue
+        persona_folders[codename] = [
+            entry["path"].replace("/", "_")
+            for entry in ws_config.get("folders", [])
+        ]
+
+    uploaded_files = progress.get("uploaded_files", {})
+    print(f" Uploaded files in progress: {len(uploaded_files)}")
+    print(f" Personas to assign: {len(persona_folders)}\n")
+
+    if dry_run:
+        existing_ws = get_existing_workspaces(config)
+        assigned = progress.get("workspace_docs", {})
+        for codename, folders in sorted(persona_folders.items()):
+            ws_name = config["workspaces"][codename]["name"]
+            slug = existing_ws.get(ws_name, {}).get("slug", "?")
+            # Count each upload at most once, in a single pass over the
+            # progress entries rather than one pass per configured folder.
+            wanted = set(folders)
+            doc_count = sum(
+                1 for info in uploaded_files.values()
+                if info.get("folder") in wanted and info.get("location")
+            )
+            already = len(assigned.get(codename, []))
+            print(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
+        return
+
+    assign_to_workspaces(config, persona_folders, progress, batch_size, delay)
+    print(" Done.\n")
+
+
 def resolve_persona_list(args, config):
     """Resolve --persona / --cluster to a list of codenames."""
     if args.persona:
@@ -643,6 +718,8 @@ def main():
     parser.add_argument("--create-workspaces", action="store_true")
     parser.add_argument("--upload-documents", action="store_true")
     parser.add_argument("--all", action="store_true", help="Run all steps")
+    parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
+    parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
     parser.add_argument("--status", action="store_true")
     parser.add_argument("--persona", type=str, help="Single persona filter")
     parser.add_argument("--cluster", type=str, help="Cluster filter: intel, cyber, military, humanities, engineering")
@@ -655,7 +732,7 @@ def main():
     config = load_config()
 
     if not any([args.storage_setup, args.create_workspaces,
-                args.upload_documents, args.all, args.status]):
+                args.upload_documents, args.reassign, args.all, args.status]):
         parser.print_help()
         return
 
@@ -668,6 +745,9 @@ def main():
         storage_setup(config, dry_run=args.dry_run)
     if args.create_workspaces or args.all:
         create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
+    if args.reassign:
+        reassign_workspaces(config, persona_list=persona_list,
+                            reset=args.reset, dry_run=args.dry_run)
     if args.upload_documents or args.all:
         upload_documents(config, persona_list=persona_list,
                          priority_filter=args.priority,