def load_document(path: Path):
    """Read a JSON export and try to validate it as a ``DoclingDocument``.

    Args:
        path: Location of the JSON file; it is read as UTF-8 text.

    Returns:
        A ``(document, raw)`` pair. ``raw`` is the parsed JSON payload.
        ``document`` is the validated ``DoclingDocument``, or ``None`` when
        importing ``docling_core`` or validating the payload raises.
    """
    raw = json.loads(path.read_text(encoding="utf-8"))
    document = None
    try:
        # Imported lazily so a missing/incompatible docling-core install
        # degrades to the raw-data fallback instead of failing at import time.
        from docling_core.types.doc.document import DoclingDocument

        document = DoclingDocument.model_validate(raw)
    except Exception:
        # Best effort: callers treat ``None`` as "use the raw payload only".
        document = None
    return document, raw
def page_numbers_from_doc(doc) -> set[int]:
    """Gather the distinct page numbers referenced by the document's items.

    Every item yielded by ``doc.iterate_items()`` is inspected for a ``prov``
    sequence; each provenance entry contributes its ``page_no`` (converted
    with ``int``) when that attribute exists and is not ``None``.

    Args:
        doc: Object exposing ``iterate_items()`` that yields pairs whose
            first element is the item.

    Returns:
        The set of page numbers found; empty when nothing carries one.
    """
    provenance_entries = (
        entry
        for node, _depth in doc.iterate_items()
        # Items without ``prov`` (or with a falsy one) contribute nothing.
        for entry in getattr(node, "prov", None) or []
    )
    raw_numbers = (getattr(entry, "page_no", None) for entry in provenance_entries)
    return {int(number) for number in raw_numbers if number is not None}
def collect_text_samples(doc, limit: int = 200) -> list[str]:
    """Collect up to ``limit`` non-blank, stripped item texts in document order.

    Args:
        doc: Object exposing ``iterate_items()`` that yields pairs whose
            first element is the item.
        limit: Maximum number of samples to return. A value of zero or less
            yields an empty list.

    Returns:
        The stripped ``text`` values of the first ``limit`` items whose text
        is truthy and not whitespace-only.
    """
    samples: list[str] = []
    # Guard up front: the cap below is only checked after an append, so a
    # non-positive limit would otherwise still let one sample through.
    if limit <= 0:
        return samples
    for item, _ in doc.iterate_items():
        raw = getattr(item, "text", None)
        if not raw:
            continue
        cleaned = str(raw).strip()
        if not cleaned:
            continue
        samples.append(cleaned)
        if len(samples) >= limit:
            break
    return samples
def metrics_from_doc(doc) -> dict[str, Any]:
    """Compute quality metrics from a validated document object.

    The document is walked via ``iterate_items()`` to count section headers,
    text-bearing items and their total character length. Page numbers come
    from ``page_numbers_from_doc`` and text samples from
    ``collect_text_samples``; the samples drive the repetition figures.
    Markdown export is attempted and treated as empty if it raises.

    Args:
        doc: Document exposing ``iterate_items()`` and, optionally,
            ``tables``, ``pictures`` and ``export_to_markdown()``.

    Returns:
        A dict with the keys ``page_count``, ``section_headers``,
        ``text_items``, ``total_text_chars``, ``chars_per_page``, ``tables``,
        ``pictures``, ``markdown_chars``, ``replacement_chars``,
        ``most_repeated_text_count`` and ``duplicate_heavy``.
    """
    table_count = len(getattr(doc, "tables", []) or [])
    picture_count = len(getattr(doc, "pictures", []) or [])

    header_count = 0
    text_item_count = 0
    char_total = 0
    for node, _depth in doc.iterate_items():
        label_name = getattr(getattr(node, "label", None), "name", None) or ""
        if label_name == "SECTION_HEADER":
            header_count += 1
        content = getattr(node, "text", None)
        if content:
            text_item_count += 1
            char_total += len(str(content))

    page_total = len(page_numbers_from_doc(doc))
    # Without page provenance the raw character total stands in for density.
    per_page = (char_total / page_total) if page_total else char_total

    samples = collect_text_samples(doc)
    frequency = Counter(samples)
    most_repeated = max(frequency.values(), default=0)
    # Occurrences belonging to texts seen more than twice, relative to the
    # number of distinct sampled texts.
    if frequency:
        heavy_hits = sum(count for count in frequency.values() if count > 2)
        repeat_ratio = heavy_hits / max(len(frequency), 1)
    else:
        repeat_ratio = 0.0

    try:
        markdown = doc.export_to_markdown()
    except Exception:
        # Best effort: a failed export is reported as zero markdown chars.
        markdown = ""

    bad_chars = markdown.count("\ufffd") + sum(
        str(sample).count("\ufffd") for sample in samples
    )

    return {
        "page_count": page_total,
        "section_headers": header_count,
        "text_items": text_item_count,
        "total_text_chars": char_total,
        "chars_per_page": round(per_page, 2),
        "tables": table_count,
        "pictures": picture_count,
        "markdown_chars": len(markdown),
        "replacement_chars": bad_chars,
        "most_repeated_text_count": int(most_repeated),
        "duplicate_heavy": repeat_ratio > 0.15 and len(samples) > 10,
    }
In [ ]:
Copied!
defheuristic_metrics(data:dict)->dict[str,Any]:"""Fallback when DoclingDocument cannot be validated (older export / drift)."""texts=data.get("texts")or[]tables=data.get("tables")or[]body=data.get("body")or{}children=body.get("children")ifisinstance(body,dict)elseNonen_children=len(children)ifisinstance(children,list)else0char_sum=0fortintexts:ifisinstance(t,dict):char_sum+=len(str(t.get("text")or""))return{"page_count":0,"section_headers":0,"text_items":len(texts),"total_text_chars":char_sum,"chars_per_page":0.0,"tables":len(tables),"pictures":len(data.get("pictures")or[]),"markdown_chars":0,"replacement_chars":0,"most_repeated_text_count":0,"duplicate_heavy":False,"heuristic_only":True,"body_children":n_children,}
def heuristic_metrics(data: dict) -> dict[str, Any]:
    """Build partial metrics straight from the raw JSON payload.

    Used as the fallback when the payload could not be validated as a
    DoclingDocument (older export / drift). Only counts that can be read
    directly from the dict are filled in; everything else is zeroed and the
    result is flagged with ``heuristic_only``.

    Args:
        data: Parsed JSON payload. The ``texts``, ``tables``, ``pictures``
            and ``body`` keys are read when present.

    Returns:
        A metrics dict with the same keys as the full metrics plus
        ``heuristic_only`` (always ``True``) and ``body_children``.
    """
    text_entries = data.get("texts") or []
    table_entries = data.get("tables") or []
    body = data.get("body") or {}

    kids = body.get("children") if isinstance(body, dict) else None
    child_count = len(kids) if isinstance(kids, list) else 0

    # Only dict-shaped entries carry a "text" field worth measuring.
    text_length = sum(
        len(str(entry.get("text") or ""))
        for entry in text_entries
        if isinstance(entry, dict)
    )

    return {
        "page_count": 0,
        "section_headers": 0,
        "text_items": len(text_entries),
        "total_text_chars": text_length,
        "chars_per_page": 0.0,
        "tables": len(table_entries),
        "pictures": len(data.get("pictures") or []),
        "markdown_chars": 0,
        "replacement_chars": 0,
        "most_repeated_text_count": 0,
        "duplicate_heavy": False,
        "heuristic_only": True,
        "body_children": child_count,
    }
In [ ]:
Copied!
defevaluate(m:dict[str,Any],*,expect_tables:bool,min_chars_per_page:float,min_markdown_chars:int,)->tuple[str,list[str],list[str]]:issues:list[str]=[]actions:list[str]=[]ifm.get("heuristic_only"):issues.append("Could not load full DoclingDocument; metrics are partial.")actions.append("Ensure docling-core matches export; re-export with: docling <source> --to json --output <dir>")cpp=m.get("chars_per_page")or0ifm.get("page_count",0)>=2andcpp<min_chars_per_page:issues.append(f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap.")actions.append("Retry: docling <source> --ocr-engine tesserocr (or rapidocr, ocrmac)")actions.append("Retry: docling <source> --pipeline vlm")ifm.get("replacement_chars",0)>5:issues.append("Unicode replacement characters detected; OCR may be garbling text.")actions.append("Retry: docling <source> --ocr-engine tesserocr (or rapidocr)")actions.append("Retry: docling <source> --pipeline vlm (use force_backend_text=True via Python API for hybrid)")ifm.get("duplicate_heavy")or(m.get("most_repeated_text_count",0)>8):issues.append("Repeated text blocks; possible layout/OCR loop or bad reading order.")actions.append("Retry: docling <source> --pipeline vlm")actions.append("If using VLM: try force_backend_text=True via Python API for text-heavy pages")ifexpect_tablesandm.get("tables",0)==0:issues.append("No tables detected but tables were expected.")actions.append("Retry: docling <source> (tables are enabled by default; remove --no-tables if set)")actions.append("Retry: docling <source> --pipeline vlm (better for merged-cell or visual tables)")mc=m.get("markdown_chars",0)ifmc>0andmc<min_markdown_charsandm.get("page_count",0)>=1:issues.append(f"Markdown export is very short ({mc} chars) for the page count.")actions.append("Retry: docling <source> --pipeline vlm (or try different --ocr-engine)")ifm.get("text_items",0)==0andm.get("page_count",0)==0:issues.append("No text items and no page provenance; export may be empty or 
invalid.")actions.append("Verify source file opens correctly; retry with: docling <source> --pipeline standard")seen=set()uniq_actions=[]forainactions:ifanotinseen:seen.add(a)uniq_actions.append(a)ifnotissues:return"pass",[],[]severe=m.get("text_items",0)==0or(m.get("page_count",0)>=1andmc<50andmc>0)status="fail"ifsevereorm.get("replacement_chars",0)>20else"warn"returnstatus,issues,uniq_actions
def evaluate(
    m: dict[str, Any],
    *,
    expect_tables: bool,
    min_chars_per_page: float,
    min_markdown_chars: int,
) -> tuple[str, list[str], list[str]]:
    """Turn a metrics dict into a status, a list of issues and retry actions.

    Args:
        m: Metrics dict; missing keys are treated as zero / falsy.
        expect_tables: When true, zero detected tables is reported as an issue.
        min_chars_per_page: Density threshold applied when at least two pages
            are present.
        min_markdown_chars: Markdown exports shorter than this (but non-empty)
            are reported when at least one page is present.

    Returns:
        ``(status, issues, actions)``. ``status`` is ``"pass"`` when no issue
        was found (both lists are then empty), ``"fail"`` when there are no
        text items, the markdown is under 50 characters for a paged document,
        or more than 20 replacement characters were seen, and ``"warn"``
        otherwise. ``actions`` keeps first-seen order without duplicates.
    """
    issues: list[str] = []
    actions: list[str] = []

    # NOTE: "<source>" and "<dir>" are literal placeholders the reader
    # substitutes; without them the suggested commands are incomplete.
    if m.get("heuristic_only"):
        issues.append("Could not load full DoclingDocument; metrics are partial.")
        actions.append(
            "Ensure docling-core matches export; re-export with: docling <source> --to json --output <dir>"
        )

    cpp = m.get("chars_per_page") or 0
    if m.get("page_count", 0) >= 2 and cpp < min_chars_per_page:
        issues.append(
            f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap."
        )
        actions.append(
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr, ocrmac)"
        )
        actions.append("Retry: docling <source> --pipeline vlm")

    if m.get("replacement_chars", 0) > 5:
        issues.append(
            "Unicode replacement characters detected; OCR may be garbling text."
        )
        actions.append("Retry: docling <source> --ocr-engine tesserocr (or rapidocr)")
        actions.append(
            "Retry: docling <source> --pipeline vlm (use force_backend_text=True via Python API for hybrid)"
        )

    if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8):
        issues.append(
            "Repeated text blocks; possible layout/OCR loop or bad reading order."
        )
        actions.append("Retry: docling <source> --pipeline vlm")
        actions.append(
            "If using VLM: try force_backend_text=True via Python API for text-heavy pages"
        )

    if expect_tables and m.get("tables", 0) == 0:
        issues.append("No tables detected but tables were expected.")
        actions.append(
            "Retry: docling <source> (tables are enabled by default; remove --no-tables if set)"
        )
        actions.append(
            "Retry: docling <source> --pipeline vlm (better for merged-cell or visual tables)"
        )

    mc = m.get("markdown_chars", 0)
    if mc > 0 and mc < min_markdown_chars and m.get("page_count", 0) >= 1:
        issues.append(f"Markdown export is very short ({mc} chars) for the page count.")
        actions.append(
            "Retry: docling <source> --pipeline vlm (or try different --ocr-engine)"
        )

    if m.get("text_items", 0) == 0 and m.get("page_count", 0) == 0:
        issues.append(
            "No text items and no page provenance; export may be empty or invalid."
        )
        actions.append(
            "Verify source file opens correctly; retry with: docling <source> --pipeline standard"
        )

    if not issues:
        return "pass", [], []

    # Several checks suggest the same retry; dict keys keep first-seen order.
    uniq_actions = list(dict.fromkeys(actions))

    severe = m.get("text_items", 0) == 0 or (
        m.get("page_count", 0) >= 1 and mc < 50 and mc > 0
    )
    status = "fail" if severe or m.get("replacement_chars", 0) > 20 else "warn"
    return status, issues, uniq_actions
In [ ]:
Copied!
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the export-quality evaluator.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the process arguments, as before.

    Returns:
        Namespace with ``json_path``, ``markdown``, ``expect_tables``,
        ``min_chars_per_page``, ``min_markdown_chars``, ``fail_on_warn`` and
        ``quiet``.
    """
    p = argparse.ArgumentParser(description="Evaluate Docling JSON export quality")
    p.add_argument(
        "json_path",
        type=Path,
        help="Path to DoclingDocument JSON (export_to_dict)",
    )
    p.add_argument(
        "--markdown",
        type=Path,
        default=None,
        help="Optional markdown file to cross-check length",
    )
    p.add_argument("--expect-tables", action="store_true")
    p.add_argument("--min-chars-per-page", type=float, default=120.0)
    p.add_argument("--min-markdown-chars", type=int, default=200)
    p.add_argument("--fail-on-warn", action="store_true")
    p.add_argument(
        "--quiet",
        action="store_true",
        help="Only print JSON report to stdout",
    )
    return p.parse_args(argv)
defmain()->None:args=parse_args()ifnotargs.json_path.is_file():print(json.dumps({"error":f"not found: {args.json_path}"}),file=sys.stderr)sys.exit(1)doc,raw=load_document(args.json_path)ifdocisnotNone:m=metrics_from_doc(doc)else:m=heuristic_metrics(raw)ifargs.markdownandargs.markdown.is_file():md_len=len(args.markdown.read_text(encoding="utf-8"))m["markdown_file_chars"]=md_lenifm.get("markdown_chars",0)==0:m["markdown_chars"]=md_lenstatus,issues,actions=evaluate(m,expect_tables=args.expect_tables,min_chars_per_page=args.min_chars_per_page,min_markdown_chars=args.min_markdown_chars,)report={"status":status,"metrics":m,"issues":issues,"recommended_actions":actions,"next_steps_for_agent":["Re-run docling with flags from recommended_actions.","Re-export JSON and run this script again until status is pass.","Append a row to improvement-log.md (see SKILL.md).",],}print(json.dumps(report,indent=2,ensure_ascii=False))ifnotargs.quiet:print(f"\nstatus={status}",file=sys.stderr)ifissues:print("issues:",file=sys.stderr)foriinissues:print(f" - {i}",file=sys.stderr)ifactions:print("recommended_actions:",file=sys.stderr)forainactions:print(f" - {a}",file=sys.stderr)ifstatus=="fail":sys.exit(1)ifstatus=="warn"andargs.fail_on_warn:sys.exit(1)sys.exit(0)
def main() -> None:
    """Run the evaluator: load the export, score it, report, and exit.

    The JSON report is printed to stdout. Unless ``--quiet`` is set, a short
    human-readable summary is also written to stderr. The process exits with
    status 1 when the input file is missing, when the status is ``"fail"``,
    or when it is ``"warn"`` and ``--fail-on-warn`` was given; otherwise 0.
    """
    args = parse_args()
    source = args.json_path
    if not source.is_file():
        print(json.dumps({"error": f"not found: {source}"}), file=sys.stderr)
        sys.exit(1)

    doc, raw = load_document(source)
    # Fall back to raw-dict heuristics when validation did not succeed.
    m = heuristic_metrics(raw) if doc is None else metrics_from_doc(doc)

    md_path = args.markdown
    if md_path and md_path.is_file():
        md_len = len(md_path.read_text(encoding="utf-8"))
        m["markdown_file_chars"] = md_len
        # The file length only substitutes when no markdown size was measured.
        if m.get("markdown_chars", 0) == 0:
            m["markdown_chars"] = md_len

    status, issues, actions = evaluate(
        m,
        expect_tables=args.expect_tables,
        min_chars_per_page=args.min_chars_per_page,
        min_markdown_chars=args.min_markdown_chars,
    )

    next_steps = [
        "Re-run docling with flags from recommended_actions.",
        "Re-export JSON and run this script again until status is pass.",
        "Append a row to improvement-log.md (see SKILL.md).",
    ]
    report = {
        "status": status,
        "metrics": m,
        "issues": issues,
        "recommended_actions": actions,
        "next_steps_for_agent": next_steps,
    }
    print(json.dumps(report, indent=2, ensure_ascii=False))

    if not args.quiet:
        summary = [f"\nstatus={status}"]
        if issues:
            summary.append("issues:")
            summary.extend(f" - {issue}" for issue in issues)
        if actions:
            summary.append("recommended_actions:")
            summary.extend(f" - {action}" for action in actions)
        for line in summary:
            print(line, file=sys.stderr)

    failed = status == "fail" or (status == "warn" and args.fail_on_warn)
    sys.exit(1 if failed else 0)