Add --reassign mode for fast vector recovery without disk scanning
Skips the slow folder scan (50K+ files) and the upload phase, and directly re-embeds already-uploaded documents into their workspaces using the saved progress state. Combine with --reset to clear assignment tracking first. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
64
setup.py
64
setup.py
@@ -11,6 +11,8 @@ Usage:
|
||||
python3 setup.py --storage-setup # Symlink direct-uploads/ to HDD
|
||||
python3 setup.py --create-workspaces # Create workspaces + load persona prompts
|
||||
python3 setup.py --upload-documents # Full pipeline: upload → OCR → upload → assign
|
||||
python3 setup.py --reassign # Re-assign existing docs to workspaces (no scan/upload)
|
||||
python3 setup.py --reassign --reset # Reset assignment tracking + re-assign all
|
||||
python3 setup.py --persona frodo # Single persona
|
||||
python3 setup.py --cluster intel # Intel cluster (frodo,echo,ghost,oracle,wraith,scribe,polyglot)
|
||||
python3 setup.py --cluster cyber # Cyber cluster
|
||||
@@ -623,6 +625,61 @@ def show_status(config):
|
||||
# MAIN
|
||||
# ──────────────────────────────────────────────────────────
|
||||
|
||||
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
    """Re-assign already-uploaded docs to workspaces without scanning/uploading.

    Skips the slow folder scan: the persona → folder mapping is derived from
    ``config["workspaces"]`` and the uploaded documents come from the saved
    progress state returned by ``load_progress()``.

    Args:
        config: Loaded configuration. Reads ``config["processing"]``
            (``batch_size``, ``delay_between_batches``) and
            ``config["workspaces"]`` (per-codename ``name`` and ``folders``).
        persona_list: Optional collection of persona codenames to restrict
            the run to. A falsy value means every configured workspace.
        reset: When true, clear the tracked assignments first — only for
            ``persona_list`` if given, otherwise for all workspaces.
        dry_run: When true, only print a per-persona summary. Nothing is
            assigned and the progress state is never written, even when
            ``reset`` is also set.

    Returns:
        None. Results are reported on stdout.
    """
    print("═══ Re-assign Workspaces ═══\n")

    if not check_api(config):
        print(" ✗ AnythingLLM API not reachable.")
        return

    progress = load_progress()
    # Read up front so a malformed config fails before any state is changed.
    batch_size = config["processing"]["batch_size"]
    delay = config["processing"]["delay_between_batches"]

    if reset:
        if persona_list:
            tracked = progress.get("workspace_docs", {})
            for codename in persona_list:
                tracked.pop(codename, None)
            print(f" ✓ Reset assignments for: {', '.join(persona_list)}\n")
        else:
            progress["workspace_docs"] = {}
            print(" ✓ Reset all workspace assignments\n")
        if dry_run:
            # A dry run must not touch persisted state: the reset above is
            # applied to the in-memory copy only so the summary reflects it.
            print(" (dry run: reset not saved to progress state)\n")
        else:
            save_progress(progress)

    # Build persona_folders from config (no disk scan needed)
    persona_folders = {}
    for codename, ws_config in config["workspaces"].items():
        if persona_list and codename not in persona_list:
            continue
        persona_folders[codename] = [
            entry["path"].replace("/", "_")
            for entry in ws_config.get("folders", [])
        ]

    uploaded_files = progress.get("uploaded_files", {})
    print(f" Uploaded files in progress: {len(uploaded_files)}")
    print(f" Personas to assign: {len(persona_folders)}\n")

    if dry_run:
        # Tally located documents per folder once instead of rescanning every
        # uploaded file for each folder of each persona.
        docs_per_folder = {}
        for info in uploaded_files.values():
            folder = info.get("folder")
            if info.get("location") and isinstance(folder, str):
                docs_per_folder[folder] = docs_per_folder.get(folder, 0) + 1

        existing_ws = get_existing_workspaces(config)
        assigned = progress.get("workspace_docs", {})
        for codename, folders in sorted(persona_folders.items()):
            ws_name = config["workspaces"][codename]["name"]
            slug = existing_ws.get(ws_name, {}).get("slug", "?")
            doc_count = sum(docs_per_folder.get(fn, 0) for fn in folders)
            already = len(assigned.get(codename, []))
            print(f" {codename} ({slug}): {doc_count} docs, {already} already assigned")
        return

    assign_to_workspaces(config, persona_folders, progress, batch_size, delay)
    print(" Done.\n")
|
||||
|
||||
|
||||
def resolve_persona_list(args, config):
|
||||
"""Resolve --persona / --cluster to a list of codenames."""
|
||||
if args.persona:
|
||||
@@ -643,6 +700,8 @@ def main():
|
||||
parser.add_argument("--create-workspaces", action="store_true")
|
||||
parser.add_argument("--upload-documents", action="store_true")
|
||||
parser.add_argument("--all", action="store_true", help="Run all steps")
|
||||
parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
|
||||
parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
|
||||
parser.add_argument("--status", action="store_true")
|
||||
parser.add_argument("--persona", type=str, help="Single persona filter")
|
||||
parser.add_argument("--cluster", type=str, help="Cluster filter: intel, cyber, military, humanities, engineering")
|
||||
@@ -655,7 +714,7 @@ def main():
|
||||
config = load_config()
|
||||
|
||||
if not any([args.storage_setup, args.create_workspaces,
|
||||
args.upload_documents, args.all, args.status]):
|
||||
args.upload_documents, args.reassign, args.all, args.status]):
|
||||
parser.print_help()
|
||||
return
|
||||
|
||||
@@ -668,6 +727,9 @@ def main():
|
||||
storage_setup(config, dry_run=args.dry_run)
|
||||
if args.create_workspaces or args.all:
|
||||
create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
|
||||
if args.reassign:
|
||||
reassign_workspaces(config, persona_list=persona_list,
|
||||
reset=args.reset, dry_run=args.dry_run)
|
||||
if args.upload_documents or args.all:
|
||||
upload_documents(config, persona_list=persona_list,
|
||||
priority_filter=args.priority,
|
||||
|
||||
Reference in New Issue
Block a user