Skip to content

Document converter

This is an automatically generated API reference of the main components of Docling.

document_converter

Classes:

DocumentConverter

DocumentConverter(allowed_formats: Optional[List[InputFormat]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None)

Methods:

Attributes:

allowed_formats instance-attribute

allowed_formats = allowed_formats if allowed_formats is not None else list(InputFormat)

format_to_options instance-attribute

format_to_options = {format: _get_default_option(format=format) if (custom_option := get(format)) is None else custom_option for format in allowed_formats}

initialized_pipelines instance-attribute

initialized_pipelines: Dict[Tuple[Type[BasePipeline], str], BasePipeline] = {}

convert

convert(source: Union[Path, str, DocumentStream], headers: Optional[Dict[str, str]] = None, raises_on_error: bool = True, max_num_pages: int = maxsize, max_file_size: int = maxsize, page_range: PageRange = DEFAULT_PAGE_RANGE) -> ConversionResult

convert_all

convert_all(source: Iterable[Union[Path, str, DocumentStream]], headers: Optional[Dict[str, str]] = None, raises_on_error: bool = True, max_num_pages: int = maxsize, max_file_size: int = maxsize, page_range: PageRange = DEFAULT_PAGE_RANGE) -> Iterator[ConversionResult]

initialize_pipeline

initialize_pipeline(format: InputFormat)

Initialize the conversion pipeline for the selected format.

ConversionResult

Bases: BaseModel

Attributes:

assembled class-attribute instance-attribute

assembled: AssembledUnit = AssembledUnit()

confidence class-attribute instance-attribute

confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

document class-attribute instance-attribute

document: DoclingDocument = _EMPTY_DOCLING_DOC

errors class-attribute instance-attribute

errors: List[ErrorItem] = []

input instance-attribute

input: InputDocument

legacy_document property

legacy_document

pages class-attribute instance-attribute

pages: List[Page] = []

status class-attribute instance-attribute

status: ConversionStatus = PENDING

timings class-attribute instance-attribute

timings: Dict[str, ProfilingItem] = {}

ConversionStatus

Bases: str, Enum

Attributes:

FAILURE class-attribute instance-attribute

FAILURE = 'failure'

PARTIAL_SUCCESS class-attribute instance-attribute

PARTIAL_SUCCESS = 'partial_success'

PENDING class-attribute instance-attribute

PENDING = 'pending'

SKIPPED class-attribute instance-attribute

SKIPPED = 'skipped'

STARTED class-attribute instance-attribute

STARTED = 'started'

SUCCESS class-attribute instance-attribute

SUCCESS = 'success'

FormatOption

Bases: BaseModel

Methods:

Attributes:

backend instance-attribute

backend: Type[AbstractDocumentBackend]

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls instance-attribute

pipeline_cls: Type[BasePipeline]

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

InputFormat

Bases: str, Enum

A document format supported by document backend parsers.

Attributes:

ASCIIDOC class-attribute instance-attribute

ASCIIDOC = 'asciidoc'

CSV class-attribute instance-attribute

CSV = 'csv'

DOCX class-attribute instance-attribute

DOCX = 'docx'

HTML class-attribute instance-attribute

HTML = 'html'

IMAGE class-attribute instance-attribute

IMAGE = 'image'

JSON_DOCLING class-attribute instance-attribute

JSON_DOCLING = 'json_docling'

MD class-attribute instance-attribute

MD = 'md'

PDF class-attribute instance-attribute

PDF = 'pdf'

PPTX class-attribute instance-attribute

PPTX = 'pptx'

XLSX class-attribute instance-attribute

XLSX = 'xlsx'

XML_JATS class-attribute instance-attribute

XML_JATS = 'xml_jats'

XML_USPTO class-attribute instance-attribute

XML_USPTO = 'xml_uspto'

PdfFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = StandardPdfPipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

ImageFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = StandardPdfPipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

StandardPdfPipeline

StandardPdfPipeline(pipeline_options: PdfPipelineOptions)

Bases: PaginatedPipeline

Methods:

Attributes:

build_pipe instance-attribute

build_pipe = [PagePreprocessingModel(options=PagePreprocessingOptions(images_scale=images_scale, create_parsed_page=generate_parsed_pages)), ocr_model, LayoutModel(artifacts_path=artifacts_path, accelerator_options=accelerator_options), TableStructureModel(enabled=do_table_structure, artifacts_path=artifacts_path, options=table_structure_options, accelerator_options=accelerator_options), PageAssembleModel(options=PageAssembleOptions())]

enrichment_pipe instance-attribute

enrichment_pipe = [CodeFormulaModel(enabled=do_code_enrichment or do_formula_enrichment, artifacts_path=artifacts_path, options=CodeFormulaModelOptions(do_code_enrichment=do_code_enrichment, do_formula_enrichment=do_formula_enrichment), accelerator_options=accelerator_options), DocumentPictureClassifier(enabled=do_picture_classification, artifacts_path=artifacts_path, options=DocumentPictureClassifierOptions(), accelerator_options=accelerator_options), picture_description_model]

keep_backend instance-attribute

keep_backend = True

keep_images instance-attribute

keep_images = generate_page_images or generate_picture_images or generate_table_images

pipeline_options instance-attribute

pipeline_options: PdfPipelineOptions

reading_order_model instance-attribute

reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

download_models_hf staticmethod

download_models_hf(local_dir: Optional[Path] = None, force: bool = False) -> Path

execute

execute(in_doc: InputDocument, raises_on_error: bool) -> ConversionResult

get_default_options classmethod

get_default_options() -> PdfPipelineOptions

get_ocr_model

get_ocr_model(artifacts_path: Optional[Path] = None) -> BaseOcrModel

get_picture_description_model

get_picture_description_model(artifacts_path: Optional[Path] = None) -> Optional[PictureDescriptionBaseModel]

initialize_page

initialize_page(conv_res: ConversionResult, page: Page) -> Page

is_backend_supported classmethod

is_backend_supported(backend: AbstractDocumentBackend)

WordFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

PowerpointFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

MarkdownFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

AsciiDocFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = AsciiDocBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

HTMLFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

SimplePipeline

SimplePipeline(pipeline_options: PipelineOptions)

Bases: BasePipeline

SimpleModelPipeline.

This class is used at the moment for formats / backends which produce straight DoclingDocument output.

Methods:

Attributes:

build_pipe instance-attribute

build_pipe: List[Callable] = []

enrichment_pipe instance-attribute

enrichment_pipe: List[GenericEnrichmentModel[Any]] = []

keep_images instance-attribute

keep_images = False

pipeline_options instance-attribute

pipeline_options = pipeline_options

execute

execute(in_doc: InputDocument, raises_on_error: bool) -> ConversionResult

get_default_options classmethod

get_default_options() -> PipelineOptions

is_backend_supported classmethod

is_backend_supported(backend: AbstractDocumentBackend)