Docling evaluate

In [ ]:

Copied!





# SPDX-License-Identifier: MIT
"""
Evaluate a Docling JSON export and suggest pipeline / option changes.

Typical flow (agent or human):

  docling input.pdf --to json --output /tmp/
  docling input.pdf --to md --output /tmp/
  python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md

Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn
"""
# SPDX-License-Identifier: MIT
"""
Evaluate a Docling JSON export and suggest pipeline / option changes.

Typical flow (agent or human):

  docling input.pdf --to json --output /tmp/
  docling input.pdf --to md --output /tmp/
  python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md

Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn
"""

In [ ]:

Copied!

from __future__ import annotations
from __future__ import annotations

In [ ]:

Copied!





import argparse
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any

In [ ]:

Copied!





def load_document(path: Path):
    """Read a Docling JSON export and try to validate it as a DoclingDocument.

    Args:
        path: Location of the UTF-8 encoded JSON export.

    Returns:
        A ``(document, raw)`` pair. ``raw`` is the decoded JSON payload.
        ``document`` is the result of ``DoclingDocument.model_validate(raw)``,
        or ``None`` when importing docling-core or validating the payload
        raised any ``Exception``.

    Raises:
        Errors from reading or decoding the file are not caught here; only
        the import/validation step is guarded.
    """
    raw = json.loads(path.read_text(encoding="utf-8"))
    document = None
    try:
        # Imported inside the try so that an ImportError is handled exactly
        # like a validation failure.
        from docling_core.types.doc.document import DoclingDocument

        document = DoclingDocument.model_validate(raw)
    except Exception:
        # Best-effort: callers fall back to the raw payload alone.
        document = None
    return document, raw

In [ ]:

Copied!





def page_numbers_from_doc(doc) -> set[int]:
    """Collect the distinct page numbers referenced by a document's items.

    Walks ``doc.iterate_items()`` and, for every entry in an item's ``prov``
    attribute (a missing or empty ``prov`` contributes nothing), reads its
    ``page_no`` attribute.

    Args:
        doc: Object exposing ``iterate_items()`` that yields
            ``(item, <second value>)`` pairs; the second value is ignored.

    Returns:
        The set of ``int(page_no)`` values for every ``page_no`` that is not
        ``None``.
    """
    page_candidates = (
        getattr(entry, "page_no", None)
        for element, _ in doc.iterate_items()
        for entry in (getattr(element, "prov", None) or ())
    )
    return {int(number) for number in page_candidates if number is not None}

In [ ]:

Copied!





def collect_text_samples(doc, limit: int = 200) -> list[str]:
    """Return up to ``limit`` stripped, non-empty text strings from ``doc``.

    Args:
        doc: Object exposing ``iterate_items()`` that yields
            ``(item, <second value>)`` pairs; the second value is ignored.
        limit: Maximum number of samples to return. A value of zero or less
            yields an empty list.

    Returns:
        The ``text`` attribute of each item, converted with ``str`` and
        stripped, in iteration order. Items whose text is missing, falsy, or
        whitespace-only are skipped. Iteration stops as soon as ``limit``
        samples have been gathered.
    """
    # Guard first: checking the limit only after appending would always let
    # one sample through, even for limit <= 0.
    if limit <= 0:
        return []

    samples: list[str] = []
    for item, _ in doc.iterate_items():
        raw = getattr(item, "text", None)
        if not raw:
            continue
        cleaned = str(raw).strip()
        if not cleaned:
            continue
        samples.append(cleaned)
        if len(samples) >= limit:
            break
    return samples

In [ ]:

Copied!





def metrics_from_doc(doc) -> dict[str, Any]:
    """Compute extraction-quality metrics from a loaded document object.

    Args:
        doc: Object exposing ``iterate_items()`` and ``export_to_markdown()``
            and, optionally, ``tables`` / ``pictures`` collections.

    Returns:
        A dict with these keys:

        * ``page_count``: number of distinct page numbers found.
        * ``section_headers``: items whose ``label.name`` is
          ``"SECTION_HEADER"``.
        * ``text_items`` / ``total_text_chars``: count and summed length of
          items with truthy ``text``.
        * ``chars_per_page``: ``total_text_chars / page_count`` rounded to two
          decimals; equals the total character count when no pages are known.
        * ``tables`` / ``pictures``: sizes of the respective collections.
        * ``markdown_chars``: length of the markdown export (0 if it raised).
        * ``replacement_chars``: U+FFFD occurrences in the markdown plus in
          the collected text samples.
        * ``most_repeated_text_count``: highest occurrence count among the
          text samples.
        * ``duplicate_heavy``: whether the summed counts of samples seen more
          than twice, divided by the number of distinct samples, exceeds 0.15
          while more than 10 samples were collected.
    """
    table_total = len(getattr(doc, "tables", []) or [])
    picture_total = len(getattr(doc, "pictures", []) or [])

    header_total = 0
    text_lengths: list[int] = []
    for element, _ in doc.iterate_items():
        label_name = getattr(getattr(element, "label", None), "name", None) or ""
        if label_name == "SECTION_HEADER":
            header_total += 1
        text = getattr(element, "text", None)
        if text:
            text_lengths.append(len(str(text)))
    char_total = sum(text_lengths)

    page_set = page_numbers_from_doc(doc)
    page_total = len(page_set) if page_set else 0
    if page_total:
        per_page = char_total / page_total
    else:
        # Without page provenance the density is reported as the raw total.
        per_page = char_total

    samples = collect_text_samples(doc)
    tally = Counter(samples)
    if tally:
        peak_count = max(tally.values())
        repeated = sum(count for count in tally.values() if count > 2)
        dup_ratio = repeated / len(tally)
    else:
        peak_count = 0
        dup_ratio = 0.0

    try:
        markdown = doc.export_to_markdown()
    except Exception:
        # Best-effort: a failed export is reported as an empty markdown body.
        markdown = ""

    bad_char = "\ufffd"
    replacement_total = markdown.count(bad_char) + sum(
        str(sample).count(bad_char) for sample in samples
    )

    return {
        "page_count": page_total,
        "section_headers": header_total,
        "text_items": len(text_lengths),
        "total_text_chars": char_total,
        "chars_per_page": round(per_page, 2),
        "tables": table_total,
        "pictures": picture_total,
        "markdown_chars": len(markdown),
        "replacement_chars": replacement_total,
        "most_repeated_text_count": int(peak_count),
        "duplicate_heavy": dup_ratio > 0.15 and len(samples) > 10,
    }

In [ ]:

Copied!





def heuristic_metrics(data: dict) -> dict[str, Any]:
    """Build partial metrics straight from the raw JSON dict.

    Fallback when DoclingDocument cannot be validated (older export / drift).

    Args:
        data: Decoded JSON export; the ``texts``, ``tables``, ``pictures`` and
            ``body`` keys are read, and missing or falsy values count as
            empty.

    Returns:
        A metrics dict with the same keys as the full metrics, where only
        ``text_items``, ``total_text_chars``, ``tables`` and ``pictures`` are
        populated; the rest are zero / ``False``. Two extra keys are added:
        ``heuristic_only`` (always ``True``) and ``body_children`` (length of
        ``body["children"]`` when that is a list, else 0).
    """
    text_entries = data.get("texts") or []
    table_entries = data.get("tables") or []
    picture_entries = data.get("pictures") or []

    body_children = 0
    body = data.get("body") or {}
    if isinstance(body, dict):
        kids = body.get("children")
        if isinstance(kids, list):
            body_children = len(kids)

    # Only dict-shaped text entries contribute characters.
    char_total = sum(
        len(str(entry.get("text") or ""))
        for entry in text_entries
        if isinstance(entry, dict)
    )

    return {
        "page_count": 0,
        "section_headers": 0,
        "text_items": len(text_entries),
        "total_text_chars": char_total,
        "chars_per_page": 0.0,
        "tables": len(table_entries),
        "pictures": len(picture_entries),
        "markdown_chars": 0,
        "replacement_chars": 0,
        "most_repeated_text_count": 0,
        "duplicate_heavy": False,
        "heuristic_only": True,
        "body_children": body_children,
    }

In [ ]:

Copied!





def evaluate(
    m: dict[str, Any],
    *,
    expect_tables: bool,
    min_chars_per_page: float,
    min_markdown_chars: int,
) -> tuple[str, list[str], list[str]]:
    """Turn a metrics dict into a status, detected issues and suggested retries.

    Args:
        m: Metrics dict; keys that are absent are treated as 0 / falsy.
        expect_tables: Report an issue when no tables were detected.
        min_chars_per_page: Density threshold applied when ``page_count`` is
            at least 2.
        min_markdown_chars: Markdown length threshold applied when the
            markdown is non-empty and ``page_count`` is at least 1.

    Returns:
        ``(status, issues, actions)``. ``status`` is ``"pass"`` when no issue
        was found (both lists are then empty). Otherwise it is ``"fail"`` when
        there are no text items, when a non-empty markdown export is shorter
        than 50 characters for a document with at least one page, or when more
        than 20 replacement characters were counted; in all other cases it is
        ``"warn"``. ``actions`` keeps first-seen order with duplicates removed.
    """
    issues: list[str] = []
    actions: list[str] = []

    page_count = m.get("page_count", 0)
    text_items = m.get("text_items", 0)
    replacement_chars = m.get("replacement_chars", 0)

    if m.get("heuristic_only"):
        issues.append("Could not load full DoclingDocument; metrics are partial.")
        actions.append(
            "Ensure docling-core matches export; re-export with: docling <source> --to json --output <dir>"
        )

    cpp = m.get("chars_per_page") or 0
    if page_count >= 2 and cpp < min_chars_per_page:
        issues.append(
            f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap."
        )
        actions.append(
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr, ocrmac)"
        )
        actions.append("Retry: docling <source> --pipeline vlm")

    if replacement_chars > 5:
        issues.append(
            "Unicode replacement characters detected; OCR may be garbling text."
        )
        actions.append("Retry: docling <source> --ocr-engine tesserocr (or rapidocr)")
        actions.append(
            "Retry: docling <source> --pipeline vlm (use force_backend_text=True via Python API for hybrid)"
        )

    if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8):
        issues.append(
            "Repeated text blocks; possible layout/OCR loop or bad reading order."
        )
        actions.append("Retry: docling <source> --pipeline vlm")
        actions.append(
            "If using VLM: try force_backend_text=True via Python API for text-heavy pages"
        )

    if expect_tables and m.get("tables", 0) == 0:
        issues.append("No tables detected but tables were expected.")
        actions.append(
            "Retry: docling <source> (tables are enabled by default; remove --no-tables if set)"
        )
        actions.append(
            "Retry: docling <source> --pipeline vlm (better for merged-cell or visual tables)"
        )

    mc = m.get("markdown_chars", 0)
    # mc == 0 means "no markdown available", not "short markdown", so it is
    # excluded from the length checks here and below.
    if 0 < mc < min_markdown_chars and page_count >= 1:
        issues.append(f"Markdown export is very short ({mc} chars) for the page count.")
        actions.append(
            "Retry: docling <source> --pipeline vlm (or try different --ocr-engine)"
        )

    if text_items == 0 and page_count == 0:
        issues.append(
            "No text items and no page provenance; export may be empty or invalid."
        )
        actions.append(
            "Verify source file opens correctly; retry with: docling <source> --pipeline standard"
        )

    if not issues:
        return "pass", [], []

    # Several checks recommend the same retry; dict keys preserve insertion
    # order, so this drops repeats while keeping the first occurrence.
    uniq_actions = list(dict.fromkeys(actions))

    severe = text_items == 0 or (page_count >= 1 and 0 < mc < 50)
    status = "fail" if severe or replacement_chars > 20 else "warn"
    return status, issues, uniq_actions

In [ ]:

Copied!





def parse_args():
    """Parse the command-line arguments of the evaluation script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv`` with the
        attributes ``json_path`` (``Path``), ``markdown`` (``Path`` or
        ``None``), ``expect_tables``, ``min_chars_per_page`` (float, default
        120.0), ``min_markdown_chars`` (int, default 200), ``fail_on_warn``
        and ``quiet``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate Docling JSON export quality"
    )
    add = parser.add_argument

    # Inputs.
    add("json_path", type=Path, help="Path to DoclingDocument JSON (export_to_dict)")
    add(
        "--markdown",
        type=Path,
        default=None,
        help="Optional markdown file to cross-check length",
    )

    # Evaluation thresholds and expectations.
    add("--expect-tables", action="store_true")
    add("--min-chars-per-page", type=float, default=120.0)
    add("--min-markdown-chars", type=int, default=200)

    # Exit-code and output behaviour.
    add("--fail-on-warn", action="store_true")
    add("--quiet", action="store_true", help="Only print JSON report to stdout")

    return parser.parse_args()

In [ ]:

Copied!





def main() -> None:
    """Run the evaluation CLI: load the export, score it, print a report.

    The JSON report goes to stdout. Unless ``--quiet`` is given, a short
    human-readable summary is also written to stderr. Input problems (missing
    file, unreadable or unparseable JSON, top-level JSON that is not an
    object) are reported as a one-line JSON ``{"error": ...}`` on stderr.

    Always terminates via ``sys.exit``: 0 on pass (or on warn without
    ``--fail-on-warn``), 1 on fail, on warn with ``--fail-on-warn``, or on an
    input error.
    """
    args = parse_args()
    if not args.json_path.is_file():
        print(json.dumps({"error": f"not found: {args.json_path}"}), file=sys.stderr)
        sys.exit(1)

    try:
        doc, raw = load_document(args.json_path)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses; report them like the not-found case, not as a traceback.
        print(
            json.dumps({"error": f"cannot parse {args.json_path}: {exc}"}),
            file=sys.stderr,
        )
        sys.exit(1)

    if doc is not None:
        m = metrics_from_doc(doc)
    elif isinstance(raw, dict):
        m = heuristic_metrics(raw)
    else:
        # The heuristic fallback reads keys from a JSON object; anything else
        # (list, string, number) cannot be a document export.
        print(
            json.dumps(
                {"error": f"top-level JSON is not an object: {args.json_path}"}
            ),
            file=sys.stderr,
        )
        sys.exit(1)

    if args.markdown and args.markdown.is_file():
        md_len = len(args.markdown.read_text(encoding="utf-8"))
        m["markdown_file_chars"] = md_len
        # Only use the file length when the document itself gave no markdown.
        if m.get("markdown_chars", 0) == 0:
            m["markdown_chars"] = md_len

    status, issues, actions = evaluate(
        m,
        expect_tables=args.expect_tables,
        min_chars_per_page=args.min_chars_per_page,
        min_markdown_chars=args.min_markdown_chars,
    )

    report = {
        "status": status,
        "metrics": m,
        "issues": issues,
        "recommended_actions": actions,
        "next_steps_for_agent": [
            "Re-run docling with flags from recommended_actions.",
            "Re-export JSON and run this script again until status is pass.",
            "Append a row to improvement-log.md (see SKILL.md).",
        ],
    }

    print(json.dumps(report, indent=2, ensure_ascii=False))
    if not args.quiet:
        # The summary goes to stderr so stdout stays a single JSON document.
        print(f"\nstatus={status}", file=sys.stderr)
        if issues:
            print("issues:", file=sys.stderr)
            for i in issues:
                print(f"  - {i}", file=sys.stderr)
        if actions:
            print("recommended_actions:", file=sys.stderr)
            for a in actions:
                print(f"  - {a}", file=sys.stderr)

    if status == "fail" or (status == "warn" and args.fail_on_warn):
        sys.exit(1)
    sys.exit(0)

In [ ]:

Copied!

# Script entry point: run the CLI only when this file is executed directly.
# main() always ends in sys.exit(), so a single guard is all that can ever run.
if __name__ == "__main__":
    main()