# Source code for churro_ocr.prompts.ocr

"""Default OCR prompts."""

from __future__ import annotations

import re

# Name of the XML-style tag that wraps the transcription in a model response.
# It is interpolated into the default system and user prompts below and is the
# default ``output_tag`` for ``strip_ocr_output_tag``.
DEFAULT_OCR_OUTPUT_TAG = "output"

# System prompt: frames the task as diplomatic transcription of a historical
# document page and tells the model to emit only the transcribed text between
# the opening and closing DEFAULT_OCR_OUTPUT_TAG tags.
DEFAULT_OCR_SYSTEM_PROMPT = (
    "You are an expert in diplomatic transcription of historical documents from various "
    "languages. Your task is to extract the full text from a given page. Only output the "
    f"transcribed text between <{DEFAULT_OCR_OUTPUT_TAG}> and </{DEFAULT_OCR_OUTPUT_TAG}> tags."
)

# User prompt: six numbered instructions (transcribe everything visible, skip
# non-text elements, do not modernize, do not translate) followed by the
# expected response layout wrapped in DEFAULT_OCR_OUTPUT_TAG tags and a closing
# reminder about completeness and reading order.
# NOTE(review): instruction 4 quotes look-alike characters (long s "ſ", and what
# appears to be a Cyrillic "а" next to a Latin "a"); keep these literals
# byte-for-byte when editing — confirm the code points before touching them.
DEFAULT_OCR_USER_PROMPT = (
    "Follow these instructions:\n\n"
    "1. You will be provided with a scanned document page.\n\n"
    "2. Perform transcription on the entirety of the page, converting all visible text into "
    "the following format. Include handwritten and print text, if any. Include tables, "
    "captions, headers, main text and all other visible text.\n\n"
    "3. If you encounter any non-text elements, simply skip them without attempting to "
    "describe them.\n\n"
    "4. Do not modernize or standardize the text. For example, if the transcription is using "
    '"ſ" instead of "s" or "а" instead of "a", keep it that way.\n\n'
    "5. When you come across text in languages other than English, transcribe it as "
    "accurately as possible without translation.\n\n"
    "6. Output the OCR result in the following format:\n\n"
    f"<{DEFAULT_OCR_OUTPUT_TAG}>\n"
    "extracted text here\n"
    f"</{DEFAULT_OCR_OUTPUT_TAG}>\n\n"
    "Remember, your goal is to accurately transcribe the text from the scanned page as much "
    "as possible. Process the entire page, even if it contains a large amount of text, and "
    "provide clear, well-formatted output. Pay attention to the appropriate reading order "
    "and layout of the text."
)

# Alternative user prompt that asks for a Markdown transcription of the page.
# Unlike DEFAULT_OCR_USER_PROMPT, it does not mention the output wrapper tag.
DEFAULT_MARKDOWN_OCR_USER_PROMPT = (
    "Transcribe the full page in reading order as Markdown. Preserve headings, lists, "
    "tables, and line breaks when they are visible."
)


def strip_ocr_output_tag(text: str, *, output_tag: str = DEFAULT_OCR_OUTPUT_TAG) -> str:
    """Strip the OCR wrapper tag from a raw model response.

    Two strategies are tried in order:

    1. If the whole response (ignoring surrounding whitespace) is exactly one
       ``<tag>...</tag>`` pair, the text between the pair is returned. This
       match is case-sensitive and the opening tag may not carry attributes.
       Anything inside the pair is returned untouched, including any further
       tag tokens.
    2. Otherwise every opening or closing occurrence of the tag is deleted
       wherever it appears. This fallback is case-insensitive and also removes
       tags that carry extra content before the closing ``>``.

    The result is stripped of leading and trailing whitespace in both cases.

    :param text: Raw OCR response text.
    :param output_tag: Expected wrapper tag name; it is escaped before being
        placed in the regular expressions, so it is matched literally.
    :returns: OCR text with the wrapper (or stray tag tokens) removed.
    """
    escaped_tag = re.escape(output_tag)

    # DOTALL lets the lazy group span line breaks; the anchors force the
    # wrapper to enclose the entire response.
    wrapped = re.match(
        rf"^\s*<{escaped_tag}>\s*(.*?)\s*</{escaped_tag}>\s*$",
        text,
        flags=re.DOTALL,
    )
    if wrapped is None:
        # No clean outer wrapper (e.g. missing closing tag or extra text
        # around it): drop every individual tag token instead.
        without_tags = re.sub(
            rf"</?{escaped_tag}\b[^>]*>",
            "",
            text,
            flags=re.IGNORECASE,
        )
        return without_tags.strip()

    return wrapped.group(1).strip()