Code & formula

In [ ]:

Copied!





"""Example: Comparing CodeFormula models for code and formula extraction.

This example demonstrates how to use both the CodeFormulaV2 model
and the Granite Docling model for extracting code blocks and mathematical
formulas from PDF documents, allowing you to compare their outputs.
"""
"""Example: Comparing CodeFormula models for code and formula extraction.

This example demonstrates how to use both the CodeFormulaV2 model
and the Granite Docling model for extracting code blocks and mathematical
formulas from PDF documents, allowing you to compare their outputs.
"""

In [ ]:

Copied!

from pathlib import Path
from pathlib import Path

In [ ]:

Copied!

from docling_core.types.doc import CodeItem, FormulaItem
from docling_core.types.doc import CodeItem, FormulaItem

In [ ]:

Copied!





from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    CodeFormulaVlmOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    CodeFormulaVlmOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

In [ ]:

Copied!





def extract_with_preset(preset_name: str, input_doc: Path):
    """Convert a PDF with code/formula enrichment and print what was extracted.

    Builds ``CodeFormulaVlmOptions`` from the named preset, prints the
    preset's model details, converts ``input_doc`` with both code and
    formula enrichment enabled, and prints a short preview of every
    ``CodeItem`` and ``FormulaItem`` found in the converted document.

    Args:
        preset_name: Name of the preset to use ('codeformulav2' or 'granite_docling')
        input_doc: Path to the input PDF document

    Returns:
        The converted document
    """
    banner = "=" * 60
    preview_limit = 100

    def preview(text):
        """Return ``text`` cut to ``preview_limit`` characters, adding '...' if cut."""
        suffix = "..." if len(text) > preview_limit else ""
        return f"{text[:preview_limit]}{suffix}"

    print(f"\n{banner}")
    print(f"Processing with preset: {preset_name}")
    print(f"{banner}\n")

    # Create options with the specified preset
    code_formula_options = CodeFormulaVlmOptions.from_preset(preset_name)

    # Display preset information
    model_spec = code_formula_options.model_spec
    print(f"Model: {model_spec.name}")
    print(f"Repo ID: {model_spec.default_repo_id}")
    print(f"Scale: {code_formula_options.scale}")
    print(f"Max tokens: {model_spec.max_new_tokens}")
    print()

    # Configure the PDF pipeline to use code/formula enrichment
    pipeline_options = PdfPipelineOptions(
        do_code_enrichment=True,
        do_formula_enrichment=True,
        code_formula_options=code_formula_options,
    )

    # Create converter with the configured options
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Convert the document
    doc = converter.convert(input_doc).document

    # Walk the document once and sort items into the two categories we report.
    # The second element of each yielded pair is not needed here. Independent
    # `if`s (not `elif`) keep the membership rules identical to filtering the
    # document separately for each type.
    code_blocks = []
    formulas = []
    for item, _ in doc.iterate_items():
        if isinstance(item, CodeItem):
            code_blocks.append(item)
        if isinstance(item, FormulaItem):
            formulas.append(item)

    # Print extracted code blocks
    print(f"Code blocks found: {len(code_blocks)}")
    for index, code_item in enumerate(code_blocks, 1):
        print(f"\n  Code block {index}:")
        print(f"    Language: {code_item.code_language}")
        print(f"    Text: {preview(code_item.text)}")

    # Print extracted formulas
    print(f"\nFormulas found: {len(formulas)}")
    for index, formula_item in enumerate(formulas, 1):
        print(f"\n  Formula {index}:")
        print(f"    Text: {preview(formula_item.text)}")

    return doc

In [ ]:

Copied!





def main():
    """Run every CodeFormula preset on the sample PDF so the outputs can be compared.

    Prints an error and returns early when the sample PDF is missing;
    otherwise calls ``extract_with_preset`` once per preset and then prints
    a short summary of how the presets differ.
    """
    input_doc = Path("tests/data/pdf/code_and_formula.pdf")

    # Guard clause: nothing to convert if the sample document is not on disk.
    if not input_doc.exists():
        print(f"Error: Input file not found: {input_doc}")
        print("Please provide a valid PDF file with code and formulas.")
        return

    print("Comparing CodeFormula presets for code and formula extraction")
    print(f"Input document: {input_doc}")

    # Extract with the CodeFormulaV2 model first, then the Granite Docling model
    for preset_name in ("codeformulav2", "granite_docling"):
        extract_with_preset(preset_name, input_doc)

    banner = "=" * 60
    print(f"\n{banner}")
    print("Comparison complete!")
    print(f"{banner}")
    print("\nBoth presets have been tested. You can compare the outputs above.")
    print("\nKey differences:")
    print("- CodeFormulaV2: Uses specialized CodeFormulaV2 model")
    print(
        "- Granite Docling: Uses IBM Granite-Docling-258M with extended context (8192 tokens)"
    )

In [ ]:

Copied!

# Run the comparison exactly once, and only when this file is executed as a
# script rather than imported.
if __name__ == "__main__":
    main()