"""Public OCR provider specs, options, and model profile resolution."""
from __future__ import annotations
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Literal, cast
from PIL import Image
from churro_ocr._internal.image import ensure_rgb
from churro_ocr.errors import ConfigurationError
from churro_ocr.providers._ocr_processing import (
chandra_image_preprocessor,
chandra_text_postprocessor,
deepseek_ocr_2_text_postprocessor,
default_ocr_image_preprocessor,
default_ocr_text_postprocessor,
firered_ocr_text_postprocessor,
glm_ocr_image_preprocessor,
glm_ocr_text_postprocessor,
identity_text_postprocessor,
infinity_parser_7b_text_postprocessor,
lfm2_5_vl_text_postprocessor,
nanonets_ocr2_3b_text_postprocessor,
olmocr_image_preprocessor,
olmocr_text_postprocessor,
paddleocr_vl_text_postprocessor,
qianfan_ocr_text_postprocessor,
)
from churro_ocr.templates import (
CHANDRA_OCR_2_MODEL_ID,
CHANDRA_OCR_2_OCR_TEMPLATE,
CHURRO_3B_MODEL_ID,
CHURRO_3B_XML_TEMPLATE,
DEEPSEEK_OCR_2_MODEL_ID,
DEEPSEEK_OCR_2_OCR_TEMPLATE,
DEFAULT_OCR_TEMPLATE,
DOTS_MOCR_MODEL_ID,
DOTS_MOCR_OCR_TEMPLATE,
DOTS_OCR_1_5_MODEL_ID,
DOTS_OCR_1_5_OCR_TEMPLATE,
FIRERED_OCR_MODEL_ID,
FIRERED_OCR_OCR_TEMPLATE,
GLM_OCR_MODEL_ID,
GLM_OCR_OCR_TEMPLATE,
INFINITY_PARSER_7B_MODEL_ID,
INFINITY_PARSER_7B_OCR_TEMPLATE,
LFM2_5_VL_1_6B_MODEL_ID,
LFM2_5_VL_1_6B_OCR_TEMPLATE,
MINERU2_5_2509_1_2B_MODEL_ID,
MINERU2_5_2509_1_2B_OCR_TEMPLATE,
NANONETS_OCR2_3B_MODEL_ID,
NANONETS_OCR2_3B_OCR_TEMPLATE,
OLMOCR_2_7B_1025_FP8_MODEL_ID,
OLMOCR_2_7B_1025_MODEL_ID,
OLMOCR_2_7B_1025_OCR_TEMPLATE,
PADDLEOCR_VL_1_5_MODEL_ID,
PADDLEOCR_VL_1_5_OCR_TEMPLATE,
QIANFAN_OCR_MODEL_ID,
QIANFAN_OCR_OCR_TEMPLATE,
OCRConversation,
OCRPromptTemplateLike,
)
from churro_ocr.types import MetadataDict
if TYPE_CHECKING:
from pathlib import Path
# Identifiers accepted for ``OCRBackendSpec.provider``.
OCRProvider = Literal["litellm", "openai-compatible", "azure", "mistral", "hf"]
# Pinned Mistral OCR model ids; kept in sync with ``MISTRAL_OCR_MODEL_IDS`` below.
MistralOCRModel = Literal["mistral-ocr-2505", "mistral-ocr-2512"]
# Callable signatures used by ``OCRModelProfile`` and ``HuggingFaceOptions``.
ImagePreprocessor = Callable[[Image.Image], Image.Image]
# A postprocessor may return just the text, or the text plus a metadata mapping.
TextPostprocessorResult = str | tuple[str, MetadataDict]
TextPostprocessor = Callable[[str], TextPostprocessorResult]
VisionInputBuilder = Callable[[OCRConversation], object]
# Token budgets. The built-in profiles in this module pass these as LiteLLM
# ``max_tokens`` and/or Hugging Face ``max_new_tokens``.
DEFAULT_OCR_MAX_TOKENS = 25_000
CHANDRA_OCR_MAX_TOKENS = 12_384
DEEPSEEK_OCR_2_MAX_TOKENS = 8_192
FIRERED_OCR_MAX_TOKENS = 4_096
GLM_OCR_MAX_TOKENS = 8_192
INFINITY_PARSER_7B_MAX_TOKENS = 8_192
NANONETS_OCR2_3B_MAX_TOKENS = 15_000
OLMOCR_MAX_TOKENS = 8_000
PADDLEOCR_VL_MAX_TOKENS = 4_096
QIANFAN_OCR_MAX_TOKENS = 4_096
# Passed as processor ``min_pixels`` / ``max_pixels`` by ``infinity_parser_7b_profile``.
# NOTE(review): the 28 * 28 factor presumably reflects the model's patch size -- confirm.
INFINITY_PARSER_7B_MIN_PIXELS = 256 * 28 * 28
INFINITY_PARSER_7B_MAX_PIXELS = 2304 * 28 * 28
# Runtime counterpart of ``MistralOCRModel``; used by ``validate_mistral_ocr_model``.
MISTRAL_OCR_MODEL_IDS: tuple[MistralOCRModel, ...] = (
    "mistral-ocr-2505",
    "mistral-ocr-2512",
)
def _configuration_error(message: str) -> ConfigurationError:
    """Create (but do not raise) a ``ConfigurationError`` carrying ``message``.

    :param message: Human-readable description of the configuration problem.
    :returns: The freshly built exception instance; the caller raises it.
    """
    error = ConfigurationError(message)
    return error
[docs]
def validate_mistral_ocr_model(
    model: str | None,
    *,
    context: str = "OCR provider 'mistral'",
) -> MistralOCRModel:
    """Check ``model`` against the pinned Mistral OCR model ids.

    :param model: Candidate model id, or ``None`` when none was configured.
    :param context: Text placed at the start of the error message to identify
        who is asking for validation.
    :returns: ``model`` unchanged, narrowed to ``MistralOCRModel`` for type checkers.
    :raises ConfigurationError: If ``model`` is ``None`` or is not one of
        ``MISTRAL_OCR_MODEL_IDS``.
    """
    # Success path first: a supported id is handed straight back.
    if model is not None and model in MISTRAL_OCR_MODEL_IDS:
        return cast("MistralOCRModel", model)

    allowed = ", ".join(MISTRAL_OCR_MODEL_IDS)
    if model is None:
        message = f"{context} requires `model` to be one of: {allowed}."
    else:
        message = f"{context} only supports `model` values {allowed}; got {model!r}."
    raise _configuration_error(message)
[docs]
@dataclass(slots=True, frozen=True)
class LiteLLMTransportConfig:
"""Shared transport config for LiteLLM-based multimodal requests.
:param api_base: Optional API base URL override.
:param api_key: Optional API key forwarded to LiteLLM.
:param api_version: Optional API version string for providers that need one.
:param image_detail: Optional image-detail hint supported by some providers.
:param completion_kwargs: Extra completion kwargs merged into LiteLLM calls.
:param cache_dir: Optional disk-cache directory for LiteLLM request caching.
"""
api_base: str | None = None
api_key: str | None = None
api_version: str | None = None
image_detail: str | None = None
completion_kwargs: dict[str, object] = field(default_factory=dict)
cache_dir: str | Path | None = None
[docs]
@dataclass(slots=True, frozen=True)
class OpenAICompatibleOptions:
"""Provider options for OpenAI-compatible OCR servers.
:param model_prefix: Provider prefix prepended to the configured model name.
"""
model_prefix: str | None = None
[docs]
@dataclass(slots=True, frozen=True)
class HuggingFaceOptions:
"""Provider options for local Hugging Face OCR backends.
:param trust_remote_code: Whether to allow remote model code execution.
:param processor_kwargs: Extra kwargs passed to ``AutoProcessor.from_pretrained``.
:param model_kwargs: Extra kwargs passed to model loading.
:param generation_kwargs: Extra generation kwargs passed at inference time.
:param vision_input_builder: Optional override for building multimodal inputs.
:param backend_variant: Optional implementation preset such as ``"dots-ocr-1.5"``.
"""
trust_remote_code: bool | None = None
processor_kwargs: dict[str, object] = field(default_factory=dict)
model_kwargs: dict[str, object] = field(default_factory=dict)
generation_kwargs: dict[str, object] = field(default_factory=dict)
vision_input_builder: VisionInputBuilder | None = None
backend_variant: str | None = None
[docs]
@dataclass(slots=True, frozen=True)
class AzureDocumentIntelligenceOptions:
"""Provider options for Azure Document Intelligence OCR.
:param endpoint: Azure Document Intelligence endpoint URL.
:param api_key: Azure API key for the configured resource.
"""
endpoint: str | None = None
api_key: str | None = None
[docs]
@dataclass(slots=True, frozen=True)
class MistralOptions:
"""Provider options for Mistral OCR.
:param api_key: Mistral API key used for OCR requests.
"""
api_key: str | None = None
# Union of the provider-specific option dataclasses accepted by
# ``OCRBackendSpec.options``.
OCRProviderOptions = (
    OpenAICompatibleOptions | HuggingFaceOptions | AzureDocumentIntelligenceOptions | MistralOptions
)
[docs]
@dataclass(frozen=True, slots=True)
class OCRModelProfile:
    """Per-model OCR behavior that is shared by all provider adapters.

    Only ``profile_name`` is required; every other field has a default.

    :param profile_name: Stable identifier of the profile.
    :param template: Prompt template that renders the OCR input.
    :param image_preprocessor: Callable applied to the image before OCR.
    :param text_postprocessor: Callable applied to the text after OCR.
    :param display_name: Optional human-readable name of the model.
    :param transport: Default LiteLLM transport settings for the profile.
    :param huggingface: Default Hugging Face backend options for the profile.
    """

    profile_name: str
    template: OCRPromptTemplateLike = DEFAULT_OCR_TEMPLATE
    image_preprocessor: ImagePreprocessor = default_ocr_image_preprocessor
    text_postprocessor: TextPostprocessor = default_ocr_text_postprocessor
    display_name: str | None = None
    # Factories give each profile its own config objects rather than shared ones.
    transport: LiteLLMTransportConfig = field(default_factory=LiteLLMTransportConfig)
    huggingface: HuggingFaceOptions = field(default_factory=HuggingFaceOptions)
[docs]
@dataclass(slots=True, frozen=True)
class OCRBackendSpec:
"""Declarative builder input for OCR backends.
:param provider: OCR provider identifier.
:param model: Provider-specific model identifier.
:param profile: Optional built-in or custom model profile.
:param transport: Optional transport settings for LiteLLM-based providers.
:param options: Optional provider-specific options dataclass.
"""
provider: OCRProvider
model: str | None = None
profile: str | OCRModelProfile | None = None
transport: LiteLLMTransportConfig | None = None
options: OCRProviderOptions | None = None
[docs]
def default_ocr_profile() -> OCRModelProfile:
    """Build the generic OCR profile.

    :returns: Profile named ``"default"`` with every other
        ``OCRModelProfile`` field left at its declared default.
    """
    fallback = OCRModelProfile(profile_name="default")
    return fallback
def churro_3b_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``stanford-oval/churro-3B`` model.

    :returns: Profile named ``CHURRO_3B_MODEL_ID`` that pairs the CHURRO 3B XML
        template with ``identity_text_postprocessor``.
    """
    profile = OCRModelProfile(
        profile_name=CHURRO_3B_MODEL_ID,
        display_name="churro-3B",
        template=CHURRO_3B_XML_TEMPLATE,
        text_postprocessor=identity_text_postprocessor,
    )
    return profile
[docs]
def chandra_ocr_2_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``datalab-to/chandra-ocr-2`` model.

    :returns: Profile named ``CHANDRA_OCR_2_MODEL_ID`` with the Chandra template,
        image preprocessor and text postprocessor, plus default LiteLLM and
        Hugging Face settings capped at ``CHANDRA_OCR_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": CHANDRA_OCR_MAX_TOKENS,
            "temperature": 0.0,
            "top_p": 0.1,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={"max_new_tokens": CHANDRA_OCR_MAX_TOKENS},
        backend_variant="chandra-ocr-2",
    )
    return OCRModelProfile(
        profile_name=CHANDRA_OCR_2_MODEL_ID,
        display_name="chandra-ocr-2",
        template=CHANDRA_OCR_2_OCR_TEMPLATE,
        image_preprocessor=chandra_image_preprocessor,
        text_postprocessor=chandra_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def deepseek_ocr_2_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``deepseek-ai/DeepSeek-OCR-2`` model.

    :returns: Profile named ``DEEPSEEK_OCR_2_MODEL_ID`` with the DeepSeek OCR 2
        template and text postprocessor. Its Hugging Face defaults enable
        ``trust_remote_code`` and request safetensors weights.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": DEEPSEEK_OCR_2_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        trust_remote_code=True,
        model_kwargs={"use_safetensors": True},
        generation_kwargs={"max_new_tokens": DEEPSEEK_OCR_2_MAX_TOKENS},
        backend_variant="deepseek-ocr-2",
    )
    return OCRModelProfile(
        profile_name=DEEPSEEK_OCR_2_MODEL_ID,
        display_name="DeepSeek-OCR-2",
        template=DEEPSEEK_OCR_2_OCR_TEMPLATE,
        text_postprocessor=deepseek_ocr_2_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def firered_ocr_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``FireRedTeam/FireRed-OCR`` model.

    :returns: Profile named ``FIRERED_OCR_MODEL_ID`` with the FireRed template
        and text postprocessor, and the default OCR image preprocessor. Both
        backends are capped at ``FIRERED_OCR_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": FIRERED_OCR_MAX_TOKENS,
            "temperature": 0.0,
            "top_p": 1.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={
            "max_new_tokens": FIRERED_OCR_MAX_TOKENS,
            "do_sample": False,
        },
    )
    return OCRModelProfile(
        profile_name=FIRERED_OCR_MODEL_ID,
        display_name="FireRed-OCR",
        template=FIRERED_OCR_OCR_TEMPLATE,
        # Spelled out explicitly although it equals the dataclass default.
        image_preprocessor=default_ocr_image_preprocessor,
        text_postprocessor=firered_ocr_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def nanonets_ocr2_3b_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``nanonets/Nanonets-OCR2-3B`` model.

    :returns: Profile named ``NANONETS_OCR2_3B_MODEL_ID`` with the Nanonets
        template and text postprocessor. Both backends are capped at
        ``NANONETS_OCR2_3B_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": NANONETS_OCR2_3B_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={
            "max_new_tokens": NANONETS_OCR2_3B_MAX_TOKENS,
            "do_sample": False,
        },
    )
    return OCRModelProfile(
        profile_name=NANONETS_OCR2_3B_MODEL_ID,
        display_name="Nanonets-OCR2-3B",
        template=NANONETS_OCR2_3B_OCR_TEMPLATE,
        text_postprocessor=nanonets_ocr2_3b_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def qianfan_ocr_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``baidu/Qianfan-OCR`` model.

    :returns: Profile named ``QIANFAN_OCR_MODEL_ID`` that uses ``ensure_rgb`` as
        image preprocessor and the Qianfan text postprocessor. Its Hugging Face
        defaults enable ``trust_remote_code``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": QIANFAN_OCR_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        trust_remote_code=True,
        generation_kwargs={
            "max_new_tokens": QIANFAN_OCR_MAX_TOKENS,
            "do_sample": False,
        },
        backend_variant="qianfan-ocr",
    )
    return OCRModelProfile(
        profile_name=QIANFAN_OCR_MODEL_ID,
        display_name="Qianfan-OCR",
        template=QIANFAN_OCR_OCR_TEMPLATE,
        image_preprocessor=ensure_rgb,
        text_postprocessor=qianfan_ocr_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def glm_ocr_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``zai-org/GLM-OCR`` model.

    :returns: Profile named ``GLM_OCR_MODEL_ID`` with the GLM OCR template,
        image preprocessor and text postprocessor. Both backends are capped at
        ``GLM_OCR_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": GLM_OCR_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={
            "max_new_tokens": GLM_OCR_MAX_TOKENS,
            "do_sample": False,
        },
        backend_variant="glm-ocr",
    )
    return OCRModelProfile(
        profile_name=GLM_OCR_MODEL_ID,
        display_name="GLM-OCR",
        template=GLM_OCR_OCR_TEMPLATE,
        image_preprocessor=glm_ocr_image_preprocessor,
        text_postprocessor=glm_ocr_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
def dots_ocr_1_5_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``kristaller486/dots.ocr-1.5`` model.

    :returns: Profile named ``DOTS_OCR_1_5_MODEL_ID`` that pairs the Dots OCR 1.5
        template with ``identity_text_postprocessor``. Its Hugging Face defaults
        enable ``trust_remote_code``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            # NOTE(review): a literal here, unlike the named *_MAX_TOKENS
            # constants used by the other profiles -- confirm it is intended.
            "max_tokens": 2_048,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        trust_remote_code=True,
        backend_variant="dots-ocr-1.5",
    )
    return OCRModelProfile(
        profile_name=DOTS_OCR_1_5_MODEL_ID,
        display_name="dots.ocr-1.5",
        template=DOTS_OCR_1_5_OCR_TEMPLATE,
        text_postprocessor=identity_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
def dots_mocr_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``rednote-hilab/dots.mocr`` model.

    :returns: Profile named ``DOTS_MOCR_MODEL_ID`` that pairs the Dots mOCR
        template with ``identity_text_postprocessor``. LiteLLM requests are
        capped at ``DEFAULT_OCR_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": DEFAULT_OCR_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        trust_remote_code=True,
        backend_variant="dots-mocr",
    )
    return OCRModelProfile(
        profile_name=DOTS_MOCR_MODEL_ID,
        display_name="dots.mocr",
        template=DOTS_MOCR_OCR_TEMPLATE,
        text_postprocessor=identity_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def paddleocr_vl_1_5_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``PaddlePaddle/PaddleOCR-VL-1.5`` model.

    :returns: Profile named ``PADDLEOCR_VL_1_5_MODEL_ID`` with the PaddleOCR-VL
        template and text postprocessor. Both backends are capped at
        ``PADDLEOCR_VL_MAX_TOKENS``.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": PADDLEOCR_VL_MAX_TOKENS,
            "temperature": 0.0,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={
            "max_new_tokens": PADDLEOCR_VL_MAX_TOKENS,
            "do_sample": False,
        },
        backend_variant="paddleocr-vl-1.5",
    )
    return OCRModelProfile(
        profile_name=PADDLEOCR_VL_1_5_MODEL_ID,
        display_name="PaddleOCR-VL-1.5",
        template=PADDLEOCR_VL_1_5_OCR_TEMPLATE,
        text_postprocessor=paddleocr_vl_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def infinity_parser_7b_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``infly/Infinity-Parser-7B`` model.

    :returns: Profile named ``INFINITY_PARSER_7B_MODEL_ID`` that uses
        ``ensure_rgb`` as image preprocessor and the Infinity Parser text
        postprocessor. Its Hugging Face processor defaults set pixel bounds.
    """
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": INFINITY_PARSER_7B_MAX_TOKENS,
            "temperature": 0.0,
            "top_p": 0.95,
        },
    )
    pixel_bounds: dict[str, object] = {
        "min_pixels": INFINITY_PARSER_7B_MIN_PIXELS,
        "max_pixels": INFINITY_PARSER_7B_MAX_PIXELS,
    }
    hf_defaults = HuggingFaceOptions(
        processor_kwargs=pixel_bounds,
        # NOTE(review): literal 4_096 here, while the LiteLLM path above uses
        # INFINITY_PARSER_7B_MAX_TOKENS -- confirm the difference is intended.
        generation_kwargs={"max_new_tokens": 4_096},
    )
    return OCRModelProfile(
        profile_name=INFINITY_PARSER_7B_MODEL_ID,
        display_name="Infinity-Parser-7B",
        template=INFINITY_PARSER_7B_OCR_TEMPLATE,
        image_preprocessor=ensure_rgb,
        text_postprocessor=infinity_parser_7b_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def mineru2_5_2509_1_2b_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``opendatalab/MinerU2.5-2509-1.2B`` model.

    :returns: Profile named ``MINERU2_5_2509_1_2B_MODEL_ID`` that uses
        ``ensure_rgb`` and ``identity_text_postprocessor``. No transport
        override is given, so ``transport`` keeps its dataclass default.
    """
    hf_defaults = HuggingFaceOptions(
        processor_kwargs={"use_fast": True},
        backend_variant="mineru2.5",
    )
    return OCRModelProfile(
        profile_name=MINERU2_5_2509_1_2B_MODEL_ID,
        display_name="MinerU2.5-2509-1.2B",
        template=MINERU2_5_2509_1_2B_OCR_TEMPLATE,
        image_preprocessor=ensure_rgb,
        text_postprocessor=identity_text_postprocessor,
        huggingface=hf_defaults,
    )
def _olmocr_profile(*, profile_name: str, display_name: str) -> OCRModelProfile:
    """Build an olmOCR profile; variants differ only in their two names.

    :param profile_name: Stable identifier stored on the profile.
    :param display_name: Human-readable name stored on the profile.
    :returns: Profile using the olmOCR 2 7B 1025 template with the olmOCR image
        preprocessor and text postprocessor.
    """
    # The same sampling temperature is applied on both backends.
    sampling_temperature = 0.1
    litellm_defaults = LiteLLMTransportConfig(
        completion_kwargs={
            "max_tokens": OLMOCR_MAX_TOKENS,
            "temperature": sampling_temperature,
        },
    )
    hf_defaults = HuggingFaceOptions(
        generation_kwargs={
            "max_new_tokens": OLMOCR_MAX_TOKENS,
            "temperature": sampling_temperature,
            "do_sample": True,
        },
    )
    return OCRModelProfile(
        profile_name=profile_name,
        display_name=display_name,
        template=OLMOCR_2_7B_1025_OCR_TEMPLATE,
        image_preprocessor=olmocr_image_preprocessor,
        text_postprocessor=olmocr_text_postprocessor,
        transport=litellm_defaults,
        huggingface=hf_defaults,
    )
[docs]
def lfm2_5_vl_1_6b_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``LiquidAI/LFM2.5-VL-1.6B`` model.

    :returns: Profile named ``LFM2_5_VL_1_6B_MODEL_ID`` with the LFM2.5-VL
        template and text postprocessor. Only Hugging Face defaults are set;
        ``transport`` keeps its dataclass default.
    """
    generation_defaults: dict[str, object] = {
        "max_new_tokens": 512,
        "do_sample": False,
        "repetition_penalty": 1.05,
    }
    hf_defaults = HuggingFaceOptions(
        generation_kwargs=generation_defaults,
        backend_variant="lfm2.5-vl",
    )
    return OCRModelProfile(
        profile_name=LFM2_5_VL_1_6B_MODEL_ID,
        display_name="LFM2.5-VL-1.6B",
        template=LFM2_5_VL_1_6B_OCR_TEMPLATE,
        text_postprocessor=lfm2_5_vl_text_postprocessor,
        huggingface=hf_defaults,
    )
def olmocr_2_7b_1025_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``allenai/olmOCR-2-7B-1025`` model.

    :returns: The shared olmOCR profile named ``OLMOCR_2_7B_1025_MODEL_ID``.
    """
    profile = _olmocr_profile(
        display_name="olmOCR-2-7B-1025",
        profile_name=OLMOCR_2_7B_1025_MODEL_ID,
    )
    return profile
def olmocr_2_7b_1025_fp8_profile() -> OCRModelProfile:
    """Build the profile for the built-in ``allenai/olmOCR-2-7B-1025-FP8`` model.

    :returns: The shared olmOCR profile named ``OLMOCR_2_7B_1025_FP8_MODEL_ID``.
    """
    profile = _olmocr_profile(
        display_name="olmOCR-2-7B-1025-FP8",
        profile_name=OLMOCR_2_7B_1025_FP8_MODEL_ID,
    )
    return profile
def _profile_registry() -> dict[str, OCRModelProfile]:
    """Build a mapping from profile name to built-in profile.

    Every factory is invoked on each call, so each call returns newly
    constructed profile objects.

    :returns: Dict keyed by ``profile_name``, in the order the factories are
        listed below.
    """
    factories = (
        default_ocr_profile,
        churro_3b_profile,
        chandra_ocr_2_profile,
        deepseek_ocr_2_profile,
        firered_ocr_profile,
        glm_ocr_profile,
        dots_mocr_profile,
        dots_ocr_1_5_profile,
        infinity_parser_7b_profile,
        lfm2_5_vl_1_6b_profile,
        mineru2_5_2509_1_2b_profile,
        nanonets_ocr2_3b_profile,
        olmocr_2_7b_1025_profile,
        olmocr_2_7b_1025_fp8_profile,
        paddleocr_vl_1_5_profile,
        qianfan_ocr_profile,
    )
    # Build all profiles first, then index them by name.
    built = [make_profile() for make_profile in factories]
    return {entry.profile_name: entry for entry in built}
[docs]
def resolve_ocr_profile(
    *,
    model_id: str | None,
    profile: str | OCRModelProfile | None = None,
) -> OCRModelProfile:
    """Pick the OCR model profile for an explicit profile or a model id.

    Resolution order: a profile object is returned as-is; a profile name is
    looked up among the built-in profiles; otherwise ``model_id`` is looked up,
    falling back to the ``"default"`` profile when it is absent or unknown.

    :param model_id: Model identifier that may match a built-in profile name.
    :param profile: Explicit profile object or profile name to use.
    :returns: The resolved OCR model profile.
    :raises ValueError: If ``profile`` is a string that names no known profile.
    """
    if isinstance(profile, OCRModelProfile):
        return profile

    known = _profile_registry()
    if isinstance(profile, str):
        try:
            named = known[profile]
        except KeyError as exc:
            message = f"Unknown OCR profile '{profile}'."
            raise ValueError(message) from exc
        return named

    # No explicit profile: match on the model id, otherwise use the default.
    has_builtin = model_id is not None and model_id in known
    lookup_key = model_id if has_builtin else "default"
    return known[lookup_key]
# Public API of this module. Besides the names defined here, it re-exports the
# image pre-processors and text post-processors imported from
# ``churro_ocr.providers._ocr_processing``.
# NOTE(review): ``churro_3b_profile``, ``dots_ocr_1_5_profile``,
# ``dots_mocr_profile``, ``olmocr_2_7b_1025_profile``,
# ``olmocr_2_7b_1025_fp8_profile``, ``OCRProviderOptions`` and
# ``TextPostprocessorResult`` are defined in this module but not listed below --
# confirm the omission is intentional.
__all__ = [
    "DEFAULT_OCR_MAX_TOKENS",
    "MISTRAL_OCR_MODEL_IDS",
    "AzureDocumentIntelligenceOptions",
    "HuggingFaceOptions",
    "ImagePreprocessor",
    "LiteLLMTransportConfig",
    "MistralOCRModel",
    "MistralOptions",
    "OCRBackendSpec",
    "OCRModelProfile",
    "OCRProvider",
    "OpenAICompatibleOptions",
    "TextPostprocessor",
    "VisionInputBuilder",
    "chandra_image_preprocessor",
    "chandra_ocr_2_profile",
    "chandra_text_postprocessor",
    "deepseek_ocr_2_profile",
    "deepseek_ocr_2_text_postprocessor",
    "default_ocr_image_preprocessor",
    "default_ocr_profile",
    "default_ocr_text_postprocessor",
    "firered_ocr_profile",
    "firered_ocr_text_postprocessor",
    "glm_ocr_image_preprocessor",
    "glm_ocr_profile",
    "glm_ocr_text_postprocessor",
    "identity_text_postprocessor",
    "infinity_parser_7b_profile",
    "infinity_parser_7b_text_postprocessor",
    "lfm2_5_vl_1_6b_profile",
    "lfm2_5_vl_text_postprocessor",
    "mineru2_5_2509_1_2b_profile",
    "nanonets_ocr2_3b_profile",
    "nanonets_ocr2_3b_text_postprocessor",
    "olmocr_image_preprocessor",
    "olmocr_text_postprocessor",
    "paddleocr_vl_1_5_profile",
    "paddleocr_vl_text_postprocessor",
    "qianfan_ocr_profile",
    "qianfan_ocr_text_postprocessor",
    "resolve_ocr_profile",
    "validate_mistral_ocr_model",
]