diff --git a/setup.py b/setup.py
index 52f4ce6..6f47ba2 100644
--- a/setup.py
+++ b/setup.py
@@ -928,6 +928,103 @@ def show_status(config):
 # MAIN
 # ──────────────────────────────────────────────────────────
 
+def ocr_only(config, persona_list=None, priority_filter=None,
+             dry_run=False, max_size_mb=100, workers=4):
+    """Scan the library for scanned PDFs and OCR them in place.
+
+    No upload and no embedding; this only makes the PDFs text-searchable.
+    Outcomes are recorded in the progress data under "ocr_done" and
+    "ocr_failed"; files already listed under either key are skipped.
+
+    Args:
+        config: Loaded configuration; config["storage"]["book_library"]
+            is the library root.
+        persona_list: Passed through to collect_all_files().
+        priority_filter: Passed through to collect_all_files().
+        dry_run: Log per-folder pending counts and return without OCR.
+        max_size_mb: Passed through to collect_all_files().
+        workers: Currently unused; OCR runs sequentially.
+    """
+    log.info("═══ OCR-Only Mode ═══")
+
+    book_library = Path(config["storage"]["book_library"])
+    progress = load_progress()
+
+    log.info("Scanning folders for scanned PDFs...")
+    _, scanned_files, _, _ = collect_all_files(
+        config, book_library, persona_list, priority_filter, max_size_mb,
+    )
+
+    done = progress.get("ocr_done", [])
+    failed = progress.get("ocr_failed", [])
+    # Snapshot as a set: O(1) lookups instead of rescanning both lists per file.
+    handled = set(done) | set(failed)
+
+    # Pending PDFs per folder, computed once and reused by every step below.
+    pending_by_folder = {}
+    for fn, files in sorted(scanned_files.items()):
+        pending = [f for f in files if str(f) not in handled]
+        if pending:
+            pending_by_folder[fn] = pending
+
+    total_scanned = sum(len(v) for v in scanned_files.values())
+    pending_count = sum(len(v) for v in pending_by_folder.values())
+
+    log.info(f"Scanned PDFs found: {total_scanned}")
+    log.info(f"Already OCR'd: {len(done)}")
+    log.info(f"Previously failed: {len(failed)}")
+    log.info(f"Pending OCR: {pending_count}")
+
+    if dry_run:
+        for fn, pending in pending_by_folder.items():
+            log.info(f" {fn}: {len(pending)} PDFs to OCR")
+        return
+
+    if pending_count == 0:
+        log.info("Nothing to OCR.")
+        return
+
+    max_attempts = 3
+    ocr_ok = ocr_fail = 0
+    try:
+        for fn, pending in pending_by_folder.items():
+            log.info(f"→ {fn}: {len(pending)} PDFs")
+
+            for pdf in pending:
+                size_mb = pdf.stat().st_size / (1024 * 1024)
+                log.info(f" [{ocr_ok + ocr_fail + 1}/{pending_count}] "
+                         f"{pdf.name} ({size_mb:.1f}MB)")
+
+                success = False
+                for attempt in range(1, max_attempts + 1):
+                    if ocr_pdf(pdf, language="tur+eng", dpi=200):
+                        success = True
+                        break
+                    if attempt < max_attempts:
+                        # Announce and wait only when another try follows.
+                        log.warning(f" OCR attempt {attempt}/{max_attempts} "
+                                    "failed, retrying...")
+                        time.sleep(2)
+
+                if success:
+                    progress.setdefault("ocr_done", []).append(str(pdf))
+                    ocr_ok += 1
+                    log.info(" ✓ OCR OK")
+                else:
+                    progress.setdefault("ocr_failed", []).append(str(pdf))
+                    ocr_fail += 1
+                    log.error(f" ✗ OCR FAILED after {max_attempts} attempts")
+
+                # Checkpoint periodically so a long run can be resumed.
+                if (ocr_ok + ocr_fail) % 5 == 0:
+                    save_progress(progress)
+    finally:
+        # Persist results even if the run is interrupted or raises.
+        save_progress(progress)
+
+    log.info(f"── OCR complete: {ocr_ok} OK, {ocr_fail} failed ──")
+
+
 def reassign_workspaces(config, persona_list=None, reset=False, dry_run=False):
     """Re-assign already-uploaded docs to workspaces without scanning/uploading.
     Skips the slow folder scan — uses upload_progress.json directly."""
@@ -1003,6 +1100,7 @@ def main():
     parser.add_argument("--create-workspaces", action="store_true")
     parser.add_argument("--upload-documents", action="store_true")
     parser.add_argument("--all", action="store_true", help="Run all steps")
+    parser.add_argument("--ocr-only", action="store_true", help="Only scan and OCR scanned PDFs (no upload/embed)")
     parser.add_argument("--reassign", action="store_true", help="Re-assign uploaded docs to workspaces (no scan/upload)")
     parser.add_argument("--reset", action="store_true", help="Reset assignment tracking before reassign")
     parser.add_argument("--status", action="store_true")
@@ -1020,7 +1118,7 @@ def main():
     config = load_config()
 
     if not any([args.storage_setup, args.create_workspaces,
-                args.upload_documents, args.reassign, args.all, args.status]):
+                args.upload_documents, args.ocr_only, args.reassign, args.all, args.status]):
         parser.print_help()
         return
 
@@ -1033,6 +1131,10 @@ def main():
         storage_setup(config, dry_run=args.dry_run)
     if args.create_workspaces or args.all:
         create_workspaces(config, persona_list=persona_list, dry_run=args.dry_run)
+    if args.ocr_only:
+        ocr_only(config, persona_list=persona_list,
+                 priority_filter=args.priority,
+                 dry_run=args.dry_run, max_size_mb=args.max_size)
     if args.reassign:
         reassign_workspaces(config, persona_list=persona_list,
                             reset=args.reset, dry_run=args.dry_run)