Add --ocr-only mode for standalone batch OCR without upload/embed
Scans the entire library for scanned PDFs (detected via pdffonts) and OCRs them in place with ocrmypdf (tur+eng, up to 3 attempts per file). No AnythingLLM API is needed, so it works offline. Supports the --persona, --cluster, and --priority filters and a --dry-run preview.

Usage:
  python3 setup.py --ocr-only --dry-run        # preview
  python3 setup.py --ocr-only                  # OCR all scanned PDFs
  python3 setup.py --ocr-only --cluster intel

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
92
setup.py
92
setup.py
@@ -928,6 +928,91 @@ def show_status(config):
|
|||||||
# MAIN
|
# MAIN
|
||||||
# ──────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def ocr_only(config, persona_list=None, priority_filter=None,
             dry_run=False, max_size_mb=100, workers=4):
    """Scan the library for scanned PDFs and OCR them in place.

    No upload and no embedding happen here; the goal is only to make the
    scanned PDFs text-searchable. Results are recorded under the
    ``"ocr_done"`` and ``"ocr_failed"`` keys of the progress data so files
    that were already handled are skipped on later runs.

    Args:
        config: Loaded configuration; ``config["storage"]["book_library"]``
            is used as the library root.
        persona_list: Forwarded unchanged to ``collect_all_files``.
        priority_filter: Forwarded unchanged to ``collect_all_files``.
        dry_run: When true, only log the per-folder pending counts and
            return without running OCR or saving progress.
        max_size_mb: Forwarded unchanged to ``collect_all_files``.
        workers: Accepted for interface compatibility; this function does
            not use it and processes files sequentially.

    Returns:
        None.
    """
    log.info("═══ OCR-Only Mode ═══")

    book_library = Path(config["storage"]["book_library"])
    progress = load_progress()

    # Scan & classify
    log.info("Scanning folders for scanned PDFs...")
    _text_files, scanned_files, _persona_folders, _ = collect_all_files(
        config, book_library, persona_list, priority_filter, max_size_mb,
    )

    # These two counts come straight from the progress data, so they may
    # include files outside the current filter selection.
    already_ocr = len(progress.get("ocr_done", []))
    already_failed = len(progress.get("ocr_failed", []))

    # Build the "already handled" lookup once: set membership is O(1),
    # whereas testing against the progress lists is O(n) per file.
    settled = set(progress.get("ocr_done", []))
    settled.update(progress.get("ocr_failed", []))

    def pending_in(files):
        """Return the files that are neither OCR'd nor marked as failed."""
        return [f for f in files if str(f) not in settled]

    total_scanned = sum(len(files) for files in scanned_files.values())
    pending_count = sum(
        len(pending_in(files)) for files in scanned_files.values()
    )

    log.info(f"Scanned PDFs found: {total_scanned}")
    log.info(f"Already OCR'd: {already_ocr}")
    log.info(f"Previously failed: {already_failed}")
    log.info(f"Pending OCR: {pending_count}")

    if dry_run:
        for fn, files in sorted(scanned_files.items()):
            pending = pending_in(files)
            if pending:
                log.info(f" {fn}: {len(pending)} PDFs to OCR")
        return

    if pending_count == 0:
        log.info("Nothing to OCR.")
        return

    max_attempts = 3
    retry_delay_s = 2
    ocr_ok = ocr_fail = 0

    try:
        for fn, files in sorted(scanned_files.items()):
            # Re-evaluated per folder so a path handled earlier in this run
            # is not processed again when it shows up under another folder.
            pending = pending_in(files)
            if not pending:
                continue

            log.info(f"→ {fn}: {len(pending)} PDFs")

            for pdf in pending:
                size_mb = pdf.stat().st_size / (1024 * 1024)
                log.info(f" [{ocr_ok + ocr_fail + 1}/{pending_count}] "
                         f"{pdf.name} ({size_mb:.1f}MB)")

                success = False
                for attempt in range(1, max_attempts + 1):
                    if ocr_pdf(pdf, language="tur+eng", dpi=200):
                        success = True
                        break
                    if attempt < max_attempts:
                        log.warning(f" OCR attempt {attempt}/{max_attempts} "
                                    f"failed, retrying...")
                        time.sleep(retry_delay_s)
                    else:
                        # Final attempt: nothing follows, so do not claim a
                        # retry and do not waste time sleeping.
                        log.warning(f" OCR attempt {attempt}/{max_attempts} "
                                    f"failed")

                pdf_key = str(pdf)
                settled.add(pdf_key)
                if success:
                    progress.setdefault("ocr_done", []).append(pdf_key)
                    ocr_ok += 1
                    log.info(" ✓ OCR OK")
                else:
                    progress.setdefault("ocr_failed", []).append(pdf_key)
                    ocr_fail += 1
                    log.error(f" ✗ OCR FAILED after {max_attempts} attempts")

                # Periodic checkpoint so a long batch can be resumed.
                if (ocr_ok + ocr_fail) % 5 == 0:
                    save_progress(progress)
    finally:
        # Persist whatever was recorded even if the batch is interrupted
        # (Ctrl-C, a file that vanished before stat(), ...).
        save_progress(progress)

    log.info(f"── OCR complete: {ocr_ok} OK, {ocr_fail} failed ──")
|
||||||
|
|
||||||
|
|
||||||
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
|
def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
|
||||||
"""Re-assign already-uploaded docs to workspaces without scanning/uploading.
|
"""Re-assign already-uploaded docs to workspaces without scanning/uploading.
|
||||||
Skips the slow folder scan — uses upload_progress.json directly."""
|
Skips the slow folder scan — uses upload_progress.json directly."""
|
||||||
@@ -1003,6 +1088,7 @@ def main():
|
|||||||
parser.add_argument("--create-workspaces", action="store_true")
|
parser.add_argument("--create-workspaces", action="store_true")
|
||||||
parser.add_argument("--upload-documents", action="store_true")
|
parser.add_argument("--upload-documents", action="store_true")
|
||||||
parser.add_argument("--all", action="store_true", help="Run all steps")
|
parser.add_argument("--all", action="store_true", help="Run all steps")
|
||||||
|
parser.add_argument("--ocr-only", action="store_true", help="Only scan and OCR scanned PDFs (no upload/embed)")
|
||||||
parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
|
parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
|
||||||
parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
|
parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
|
||||||
parser.add_argument("--status", action="store_true")
|
parser.add_argument("--status", action="store_true")
|
||||||
@@ -1020,7 +1106,7 @@ def main():
|
|||||||
config = load_config()
|
config = load_config()
|
||||||
|
|
||||||
if not any([args.storage_setup, args.create_workspaces,
|
if not any([args.storage_setup, args.create_workspaces,
|
||||||
args.upload_documents, args.reassign, args.all, args.status]):
|
args.upload_documents, args.ocr_only, args.reassign, args.all, args.status]):
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1033,6 +1119,10 @@ def main():
|
|||||||
storage_setup(config, dry_run=args.dry_run)
|
storage_setup(config, dry_run=args.dry_run)
|
||||||
if args.create_workspaces or args.all:
|
if args.create_workspaces or args.all:
|
||||||
create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
|
create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
|
||||||
|
if args.ocr_only:
|
||||||
|
ocr_only(config, persona_list=persona_list,
|
||||||
|
priority_filter=args.priority,
|
||||||
|
dry_run=args.dry_run, max_size_mb=args.max_size)
|
||||||
if args.reassign:
|
if args.reassign:
|
||||||
reassign_workspaces(config, persona_list=persona_list,
|
reassign_workspaces(config, persona_list=persona_list,
|
||||||
reset=args.reset, dry_run=args.dry_run)
|
reset=args.reset, dry_run=args.dry_run)
|
||||||
|
|||||||
Reference in New Issue
Block a user