Picture description inline

Picture Description with Inline VLM Models

What this example does

Demonstrates picture description in standard PDF pipeline
Shows default preset, changing presets, and manual configuration without presets
Enriches documents with AI-generated image captions

Prerequisites

Install Docling with VLM extras: pip install "docling[vlm]" (the quotes keep shells such as zsh from interpreting the square brackets)
Ensure your environment can download model weights

How to run

From the repository root: python docs/examples/picture_description_inline.py

Notes

This uses the standard PDF pipeline (not VlmPipeline)
For API-based picture description, see pictures_description_api.py
For legacy PictureDescriptionVlmOptions approach, see picture_description_inline_legacy.py

In [ ]:

Copied!





import logging
import os
from pathlib import Path

from docling_core.types.doc import PictureItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmEngineOptions,
    PictureDescriptionVlmOptions,
)
from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
from docling.datamodel.stage_model_specs import VlmModelSpec
from docling.datamodel.vlm_engine_options import AutoInlineVlmEngineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Emit INFO-level log records while the examples run.
logging.basicConfig(level=logging.INFO)

# Sample PDF (contains images) that every example below converts.
input_doc_path = Path("tests/data/pdf/2206.01062.pdf")

# True when the CI environment variable holds "true", "1" or "yes"
# (compared case-insensitively); an unset variable counts as not-CI.
_ci_value = os.getenv("CI", "")
IS_CI = _ci_value.lower() in {"true", "1", "yes"}

###### EXAMPLE 1: Using default VLM for picture description (SmolVLM)

# Banner announcing the example.
print("=" * 60)
print("Example 1: Default picture description (SmolVLM preset)")
print("=" * 60)

# Only the feature flag is switched on; picture_description_options is left
# untouched, so the default (SmolVLM) is used.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True

# Register the options for PDF inputs and build the converter.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = converter.convert(input_doc_path)

# Print picture descriptions, skipping every non-picture item.
for element, _level in result.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue
    print(
        f"Picture {element.self_ref}\n"
        f"Caption: {element.caption_text(doc=result.document)}\n"
        f"Meta: {element.meta}"
    )


###### EXAMPLE 2: Change to Granite Vision preset (skipped in CI)

if IS_CI:
    # In CI this example is not executed; only a notice is printed.
    print("\n" + "=" * 60)
    print("Example 2: Skipped (running in CI environment)")
    print("=" * 60)
else:
    print("\n" + "=" * 60)
    print("Example 2: Using Granite Vision preset")
    print("=" * 60)

    # Picture description enabled, with the options object taken from the
    # "granite_vision" preset instead of the default.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    granite_options = PictureDescriptionVlmEngineOptions.from_preset("granite_vision")
    pipeline_options.picture_description_options = granite_options

    # Register the options for PDF inputs and build the converter.
    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )

    result = converter.convert(input_doc_path)

    # Print picture descriptions, skipping every non-picture item.
    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        print(
            f"Picture {element.self_ref}\n"
            f"Caption: {element.caption_text(doc=result.document)}\n"
            f"Meta: {element.meta}"
        )


###### EXAMPLE 3: Without presets - manually configuring model and runtime

print("\n" + "=" * 60)
print("Example 3: Manual configuration without presets")
print("=" * 60)

# You can manually configure the model spec and runtime options without using presets

# The same instruction text is handed to both the model spec and the
# picture-description options, so it is written once here.
description_prompt = "Provide a detailed technical description of this image, focusing on any diagrams, charts, or technical content."

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True

# Hand-written model spec pointing at the SmolVLM-256M-Instruct repository.
custom_model_spec = VlmModelSpec(
    name="SmolVLM-256M-Custom",
    default_repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
    prompt=description_prompt,
    response_format=ResponseFormat.PLAINTEXT,
)
pipeline_options.picture_description_options = PictureDescriptionVlmEngineOptions(
    model_spec=custom_model_spec,
    engine_options=AutoInlineVlmEngineOptions(),
    prompt=description_prompt,
)

# Register the options for PDF inputs and build the converter.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = converter.convert(input_doc_path)

# Print picture descriptions, skipping every non-picture item.
for element, _level in result.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue
    print(
        f"Picture {element.self_ref}\n"
        f"Caption: {element.caption_text(doc=result.document)}\n"
        f"Meta: {element.meta}"
    )

import logging
import os
from pathlib import Path

from docling_core.types.doc import PictureItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmEngineOptions,
    PictureDescriptionVlmOptions,
)
from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
from docling.datamodel.stage_model_specs import VlmModelSpec
from docling.datamodel.vlm_engine_options import AutoInlineVlmEngineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Emit INFO-level log records while the examples run.
logging.basicConfig(level=logging.INFO)

# Sample PDF (contains images) that every example below converts.
input_doc_path = Path("tests/data/pdf/2206.01062.pdf")

# True when the CI environment variable holds "true", "1" or "yes"
# (compared case-insensitively); an unset variable counts as not-CI.
_ci_value = os.getenv("CI", "")
IS_CI = _ci_value.lower() in {"true", "1", "yes"}

###### EXAMPLE 1: Using default VLM for picture description (SmolVLM)

# Banner announcing the example.
print("=" * 60)
print("Example 1: Default picture description (SmolVLM preset)")
print("=" * 60)

# Only the feature flag is switched on; picture_description_options is left
# untouched, so the default (SmolVLM) is used.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True

# Register the options for PDF inputs and build the converter.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = converter.convert(input_doc_path)

# Print picture descriptions, skipping every non-picture item.
for element, _level in result.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue
    print(
        f"Picture {element.self_ref}\n"
        f"Caption: {element.caption_text(doc=result.document)}\n"
        f"Meta: {element.meta}"
    )


###### EXAMPLE 2: Change to Granite Vision preset (skipped in CI)

if IS_CI:
    # In CI this example is not executed; only a notice is printed.
    print("\n" + "=" * 60)
    print("Example 2: Skipped (running in CI environment)")
    print("=" * 60)
else:
    print("\n" + "=" * 60)
    print("Example 2: Using Granite Vision preset")
    print("=" * 60)

    # Picture description enabled, with the options object taken from the
    # "granite_vision" preset instead of the default.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    granite_options = PictureDescriptionVlmEngineOptions.from_preset("granite_vision")
    pipeline_options.picture_description_options = granite_options

    # Register the options for PDF inputs and build the converter.
    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )

    result = converter.convert(input_doc_path)

    # Print picture descriptions, skipping every non-picture item.
    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        print(
            f"Picture {element.self_ref}\n"
            f"Caption: {element.caption_text(doc=result.document)}\n"
            f"Meta: {element.meta}"
        )


###### EXAMPLE 3: Without presets - manually configuring model and runtime

print("\n" + "=" * 60)
print("Example 3: Manual configuration without presets")
print("=" * 60)

# You can manually configure the model spec and runtime options without using presets

# The same instruction text is handed to both the model spec and the
# picture-description options, so it is written once here.
description_prompt = "Provide a detailed technical description of this image, focusing on any diagrams, charts, or technical content."

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True

# Hand-written model spec pointing at the SmolVLM-256M-Instruct repository.
custom_model_spec = VlmModelSpec(
    name="SmolVLM-256M-Custom",
    default_repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
    prompt=description_prompt,
    response_format=ResponseFormat.PLAINTEXT,
)
pipeline_options.picture_description_options = PictureDescriptionVlmEngineOptions(
    model_spec=custom_model_spec,
    engine_options=AutoInlineVlmEngineOptions(),
    prompt=description_prompt,
)

# Register the options for PDF inputs and build the converter.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = converter.convert(input_doc_path)

# Print picture descriptions, skipping every non-picture item.
for element, _level in result.document.iterate_items():
    if not isinstance(element, PictureItem):
        continue
    print(
        f"Picture {element.self_ref}\n"
        f"Caption: {element.caption_text(doc=result.document)}\n"
        f"Meta: {element.meta}"
    )

Summary

This example shows three approaches:

Default: No configuration needed, uses SmolVLM preset automatically
Preset-based: Use from_preset() to select a different model (e.g., granite_vision)
Manual configuration: Manually create a VlmModelSpec and engine (runtime) options without presets

Available presets: smolvlm, granite_vision, pixtral, qwen

For API-based picture description (vLLM, LM Studio, watsonx.ai), see pictures_description_api.py. For the legacy approach using PictureDescriptionVlmOptions, see picture_description_inline_legacy.py.