# OCR Workflows
This page covers the common user-facing flows: single-image OCR, PDFs, multi-page photographed spreads, and async entry points.
## OCR One Image
Use `OCRClient` when each input image already represents one page.
from churro_ocr.ocr import OCRClient
from churro_ocr.providers import OCRBackendSpec, build_ocr_backend
# Describe which provider/model pair should perform the OCR.
spec = OCRBackendSpec(
    provider="litellm",
    model="vertex_ai/gemini-2.5-flash",
)
backend = build_ocr_backend(spec)

# The client treats the input image as a single page.
client = OCRClient(backend)
page = client.ocr_image(image_path="scan.png")

# Show the recognized text, then the provider and model recorded on the page.
print(page.text)
print(page.provider_name)
print(page.model_name)
## OCR A PDF
If you install the `pdf` extra, `DocumentOCRPipeline` can rasterize a PDF and OCR each page.
from churro_ocr import DocumentOCRPipeline
from churro_ocr.providers import OCRBackendSpec, build_ocr_backend
# Wrap the OCR backend in a document pipeline.
pipeline = DocumentOCRPipeline(
    build_ocr_backend(
        OCRBackendSpec(
            provider="litellm",
            model="vertex_ai/gemini-2.5-flash",
        )
    ),
    max_concurrency=4,
)

# Synchronous entry point: rasterize the PDF and OCR each page
# (requires the pdf extra).
result = pipeline.process_pdf_sync("document.pdf", dpi=300, trim_margin=30)

# Print each page's index alongside its recognized text.
for page in result.pages:
    print(page.page_index, page.text)
## Detect Pages And OCR A Photographed Spread
This flow is useful when one input image contains multiple pages.
from pathlib import Path
from churro_ocr import DocumentOCRPipeline, PageDetectionRequest
from churro_ocr.providers import (
LLMPageDetector,
LiteLLMTransportConfig,
OCRBackendSpec,
build_ocr_backend,
)
INPUT_IMAGE = Path("spread.jpg")
OUTPUT_DIR = Path("output")
MODEL = "vertex_ai/gemini-2.5-flash"

# One transport config is shared by the OCR backend and the page detector.
transport = LiteLLMTransportConfig()

ocr_backend = build_ocr_backend(
    OCRBackendSpec(
        provider="litellm",
        model=MODEL,
        transport=transport,
    )
)

# Passing a detection backend lets the pipeline handle an input image
# that contains multiple pages.
pipeline = DocumentOCRPipeline(
    ocr_backend,
    detection_backend=LLMPageDetector(
        model=MODEL,
        transport=transport,
    ),
    max_concurrency=4,
)

result = pipeline.process_image_sync(
    PageDetectionRequest(
        image_path=INPUT_IMAGE,
        trim_margin=20,
    )
)

# Write one image file and one text file per page, named by the
# zero-padded page index.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for page in result.pages:
    image_path = OUTPUT_DIR / f"page_{page.page_index:04d}.png"
    text_path = OUTPUT_DIR / f"page_{page.page_index:04d}.txt"
    page.image.save(image_path)
    # Fall back to an empty file when the page has no text.
    text_path.write_text(page.text or "", encoding="utf-8")
If your input is already one page per image, skip the `detection_backend` and use `OCRClient`.
## Async Entry Points
Every sync helper has an async equivalent.
### Async OCR For One Page
import asyncio
from churro_ocr.ocr import OCRClient
from churro_ocr.providers import OCRBackendSpec, build_ocr_backend
async def main() -> None:
    """OCR one image through the async client API and print the result."""
    backend = build_ocr_backend(
        OCRBackendSpec(
            provider="litellm",
            model="vertex_ai/gemini-2.5-flash",
        )
    )
    # aocr_image is the awaitable call used here in place of ocr_image;
    # indices and metadata are supplied explicitly by the caller.
    page = await OCRClient(backend).aocr_image(
        image_path="scan.png",
        page_index=3,
        source_index=7,
        metadata={"job_id": "demo"},
    )
    print(page.text)
    print(page.metadata)


# Run the coroutine to completion from synchronous code.
asyncio.run(main())
### Async Document OCR
import asyncio
from churro_ocr import DocumentOCRPipeline, PageDetectionRequest
from churro_ocr.providers import OCRBackendSpec, build_ocr_backend
async def main() -> None:
    """Process one image through the async pipeline API and print its texts."""
    pipeline = DocumentOCRPipeline(
        build_ocr_backend(
            OCRBackendSpec(
                provider="litellm",
                model="vertex_ai/gemini-2.5-flash",
            )
        ),
        max_concurrency=4,
    )
    # process_image is the awaitable call used here in place of
    # process_image_sync.
    image_result = await pipeline.process_image(
        PageDetectionRequest(image_path="spread.jpg", trim_margin=20),
        ocr_metadata={"job_id": "demo-image"},
    )
    print(image_result.texts())


# Run the coroutine to completion from synchronous code.
asyncio.run(main())