Skip to content

Export tables

Extract tables from a PDF and export them as CSV and HTML.

What this example does:

- Converts a PDF and iterates over the detected tables.
- Prints each table as Markdown to stdout, and saves it as CSV and HTML to `scratch/`.

Prerequisites:

- Install Docling and pandas.

How to run:

- From the repo root: `python docs/examples/export_tables.py`.
- Outputs are written to `scratch/`.

Input document:

- Defaults to `tests/data/pdf/2206.01062.pdf`. Change `input_doc_path` as needed.

Notes:

- `table.export_to_dataframe()` returns a pandas DataFrame for convenient export/processing.
- Printing via `DataFrame.to_markdown()` may require the optional `tabulate` package (`pip install tabulate`). If it is unavailable, skip the print or use `to_csv()` instead.

import logging
import time
from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

# Module-level logger; main() uses it to report export progress and timing.
_log = logging.getLogger(__name__)


def main():
    """Convert the sample PDF and export every detected table.

    Each table is printed to stdout as Markdown and written to the
    ``scratch`` directory (relative to the current working directory) as
    ``<document stem>-table-<n>.csv`` and ``<document stem>-table-<n>.html``,
    where ``n`` starts at 1. The total elapsed time is logged at the end.
    """
    logging.basicConfig(level=logging.INFO)

    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe(doc=conv_res.document)
        # NOTE: the printed heading uses the 0-based index, while the output
        # file names below are numbered from 1.
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as CSV
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info("Saving CSV table to %s", element_csv_filename)
        table_df.to_csv(element_csv_filename)

        # Save the table as HTML. The encoding is pinned to UTF-8 so that
        # non-ASCII table content is written identically on every platform
        # instead of depending on the locale's default encoding.
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info("Saving HTML table to %s", element_html_filename)
        element_html_filename.write_text(
            table.export_to_html(doc=conv_res.document), encoding="utf-8"
        )

    elapsed = time.perf_counter() - start_time

    _log.info("Document converted and tables exported in %.2f seconds.", elapsed)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()