Add --reassign mode for fast vector recovery without disk scanning

Skips the slow folder scan (50K+ files) and the upload phases, and directly
re-embeds already-uploaded documents into their workspaces using the saved
progress state (upload_progress.json).
Use with --reset to clear assignment tracking first.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
salvacybersec
2026-04-07 00:04:36 +03:00
parent 9e9b75e0b3
commit c45efcb261
2 changed files with 126 additions and 14 deletions

View File

@@ -11,6 +11,8 @@ Usage:
python3 setup.py --storage-setup # Symlink direct-uploads/ to HDD
python3 setup.py --create-workspaces # Create workspaces + load persona prompts
python3 setup.py --upload-documents # Full pipeline: upload → OCR → upload → assign
python3 setup.py --reassign # Re-assign existing docs to workspaces (no scan/upload)
python3 setup.py --reassign --reset # Reset assignment tracking + re-assign all
python3 setup.py --persona frodo # Single persona
python3 setup.py --cluster intel # Intel cluster (frodo,echo,ghost,oracle,wraith,scribe,polyglot)
python3 setup.py --cluster cyber # Cyber cluster
@@ -623,6 +625,61 @@ def show_status(config):
# MAIN
# ──────────────────────────────────────────────────────────
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
    """Re-assign already-uploaded docs to workspaces without scanning/uploading.

    Skips the slow folder scan: the persona -> folder mapping is derived from
    the config and the set of uploaded documents comes from the saved progress
    state (upload_progress.json, per load_progress()).

    Args:
        config: Loaded configuration. Reads config["processing"]["batch_size"],
            config["processing"]["delay_between_batches"] and
            config["workspaces"] (codename -> workspace settings with a
            "name" and an optional "folders" list of {"path": ...} entries).
        persona_list: Optional collection of persona codenames; when given,
            only these personas are reset and re-assigned.
        reset: Clear the "workspace_docs" assignment tracking (for the
            selected personas, or for all of them) before re-assigning.
        dry_run: Only print, per persona, how many uploaded docs match its
            folders and how many are already tracked as assigned. Nothing is
            assigned and nothing is written to the progress file.

    Returns:
        None. Returns early when the API is not reachable.
    """
    print("═══ Re-assign Workspaces ═══\n")

    if not check_api(config):
        print(" ✗ AnythingLLM API not reachable.")
        return

    progress = load_progress()
    batch_size = config["processing"]["batch_size"]
    delay = config["processing"]["delay_between_batches"]

    if reset:
        if persona_list:
            for codename in persona_list:
                progress.get("workspace_docs", {}).pop(codename, None)
            print(f" ✓ Reset assignments for: {', '.join(persona_list)}\n")
        else:
            progress["workspace_docs"] = {}
            print(" ✓ Reset all workspace assignments\n")
        if dry_run:
            # A dry run must not modify persisted state: the reset is applied
            # to the in-memory copy only, so the report below reflects it.
            print(" (dry run: reset not saved)\n")
        else:
            save_progress(progress)

    # Build persona_folders from config (no disk scan needed).
    # Folder keys use "_" in place of "/" — presumably matching the "folder"
    # value stored for each uploaded file; verify against the upload step.
    persona_folders = {
        codename: [
            entry["path"].replace("/", "_")
            for entry in ws_config.get("folders", [])
        ]
        for codename, ws_config in config["workspaces"].items()
        if not persona_list or codename in persona_list
    }

    uploaded_files = progress.get("uploaded_files", {})
    print(f" Uploaded files in progress: {len(uploaded_files)}")
    print(f" Personas to assign: {len(persona_folders)}\n")

    if dry_run:
        existing_ws = get_existing_workspaces(config)

        # Tally uploaded docs per folder once, instead of rescanning every
        # uploaded file for each folder of each persona.
        docs_per_folder = {}
        for info in uploaded_files.values():
            if info.get("location"):
                folder = info.get("folder")
                docs_per_folder[folder] = docs_per_folder.get(folder, 0) + 1

        assigned = progress.get("workspace_docs", {})
        for codename, folders in sorted(persona_folders.items()):
            ws_name = config["workspaces"][codename]["name"]
            slug = existing_ws.get(ws_name, {}).get("slug", "?")
            doc_count = sum(docs_per_folder.get(fn, 0) for fn in folders)
            already = len(assigned.get(codename, []))
            print(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
        return

    assign_to_workspaces(config, persona_folders, progress, batch_size, delay)
    print(" Done.\n")
def resolve_persona_list(args, config):
"""Resolve --persona / --cluster to a list of codenames."""
if args.persona:
@@ -643,6 +700,8 @@ def main():
parser.add_argument("--create-workspaces", action="store_true")
parser.add_argument("--upload-documents", action="store_true")
parser.add_argument("--all", action="store_true", help="Run all steps")
parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
parser.add_argument("--status", action="store_true")
parser.add_argument("--persona", type=str, help="Single persona filter")
parser.add_argument("--cluster", type=str, help="Cluster filter: intel, cyber, military, humanities, engineering")
@@ -655,7 +714,7 @@ def main():
config = load_config()
if not any([args.storage_setup, args.create_workspaces,
args.upload_documents, args.all, args.status]):
args.upload_documents, args.reassign, args.all, args.status]):
parser.print_help()
return
@@ -668,6 +727,9 @@ def main():
storage_setup(config, dry_run=args.dry_run)
if args.create_workspaces or args.all:
create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
if args.reassign:
reassign_workspaces(config, persona_list=persona_list,
reset=args.reset, dry_run=args.dry_run)
if args.upload_documents or args.all:
upload_documents(config, persona_list=persona_list,
priority_filter=args.priority,