ASR pipeline with Whisper

Minimal ASR pipeline example: transcribe an audio file to Markdown text.

What this example does

Configures the ASR pipeline with a default model spec and converts one audio file.
Prints the recognized speech segments in Markdown with timestamps.

Prerequisites

Install Docling with ASR extras and any audio dependencies (ffmpeg, etc.).
Ensure your environment can download or access the configured ASR model.
Some formats require ffmpeg codecs; install ffmpeg and ensure it's on PATH.

How to run

From the repository root, run: python docs/examples/minimal_asr_pipeline.py.
The script prints the transcription to stdout.

Customizing the model

The script automatically selects the best model for your hardware (MLX Whisper for Apple Silicon, native Whisper otherwise).
Edit get_asr_converter() to manually override pipeline_options.asr_options with any model from asr_model_specs.
Keep InputFormat.AUDIO and AsrPipeline unchanged for a minimal setup.

Input audio

Defaults to tests/data/audio/sample_10s.mp3. Update audio_path to your own file if needed.

In [ ]:

Copied!





from pathlib import Path

from docling_core.types.doc import DoclingDocument

from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline


def get_asr_converter():
    """Create a DocumentConverter configured for ASR with automatic model selection.

    Uses `asr_model_specs.WHISPER_TURBO` which automatically selects the best
    implementation for your hardware:
    - MLX Whisper Turbo for Apple Silicon (M1/M2/M3) with mlx-whisper installed
    - Native Whisper Turbo as fallback

    You can swap in another model spec from `docling.datamodel.asr_model_specs`
    to experiment with different model sizes.
    """
    pipeline_options = AsrPipelineOptions()
    pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO

    converter = DocumentConverter(
        format_options={
            InputFormat.AUDIO: AudioFormatOption(
                pipeline_cls=AsrPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    return converter


def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
    """Run the ASR pipeline and return a `DoclingDocument` transcript."""
    # Check if the test audio file exists
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"

    converter = get_asr_converter()

    # Convert the audio file
    result: ConversionResult = converter.convert(audio_path)

    # Verify conversion was successful
    assert result.status == ConversionStatus.SUCCESS, (
        f"Conversion failed with status: {result.status}"
    )
    return result.document


if __name__ == "__main__":
    audio_path = Path("tests/data/audio/sample_10s.mp3")

    doc = asr_pipeline_conversion(audio_path=audio_path)
    print(doc.export_to_markdown())

    # Expected output:
    #
    # [time: 0.0-4.0]  Shakespeare on Scenery by Oscar Wilde
    #
    # [time: 5.28-9.96]  This is a LibriVox recording. All LibriVox recordings are in the public domain.

from pathlib import Path

from docling_core.types.doc import DoclingDocument

from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline


def get_asr_converter():
    """Create a DocumentConverter configured for ASR with automatic model selection.

    Uses `asr_model_specs.WHISPER_TURBO` which automatically selects the best
    implementation for your hardware:
    - MLX Whisper Turbo for Apple Silicon (M1/M2/M3) with mlx-whisper installed
    - Native Whisper Turbo as fallback

    You can swap in another model spec from `docling.datamodel.asr_model_specs`
    to experiment with different model sizes.
    """
    pipeline_options = AsrPipelineOptions()
    pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO

    converter = DocumentConverter(
        format_options={
            InputFormat.AUDIO: AudioFormatOption(
                pipeline_cls=AsrPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    return converter


def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
    """Run the ASR pipeline and return a `DoclingDocument` transcript."""
    # Check if the test audio file exists
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"

    converter = get_asr_converter()

    # Convert the audio file
    result: ConversionResult = converter.convert(audio_path)

    # Verify conversion was successful
    assert result.status == ConversionStatus.SUCCESS, (
        f"Conversion failed with status: {result.status}"
    )
    return result.document


if __name__ == "__main__":
    audio_path = Path("tests/data/audio/sample_10s.mp3")

    doc = asr_pipeline_conversion(audio_path=audio_path)
    print(doc.export_to_markdown())

    # Expected output:
    #
    # [time: 0.0-4.0]  Shakespeare on Scenery by Oscar Wilde
    #
    # [time: 5.28-9.96]  This is a LibriVox recording. All LibriVox recordings are in the public domain.