py-pdf · bosd · May 23, 2026
diff --git a/benchmark.py b/benchmark.py
@@ -36,6 +36,7 @@
     pypdf_get_text,
     pypdf_image_extraction,
     pypdf_watermarking, tika_get_text, pdfium_image_extraction,
+    unpdf_markdown_get_text,
 )
 from pdf_benchmark.output import write_benchmark_report
 from pdf_benchmark.score import get_text_extraction_score
@@ -218,6 +219,15 @@ def write_single_result(
             last_release_date="-",
             license="GPL",
         ),
+        "unpdf_markdown": Library(
+            "unpdf-markdown",
+            "unpdf_markdown",
+            "https://pypi.org/project/unpdf-markdown/",
+            text_extraction_function=unpdf_markdown_get_text,
+            version="0.6.4",
+            license="MIT",
+            last_release_date="2026-05-20",
+        ),
         # "borb": Library(
         #     "Borb",
         #     "borb",

diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
@@ -225,6 +225,23 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def unpdf_markdown_get_text(data: bytes) -> str:  # Return unpdf's text for the in-memory PDF `data`; "" if unpdf raises RuntimeError.
+    import unpdf  # imported per call rather than at module load. NOTE(review): confirm the `unpdf-markdown` distribution provides the `unpdf` module.
+    new_file, filename = tempfile.mkstemp(suffix=".pdf")  # (open OS-level descriptor, path); to_text() below is given the path, so the bytes go to disk first
+    try:
+        with open(filename, "wb") as fp:  # separate handle on the same path; the mkstemp descriptor stays open until `finally`
+            fp.write(data)
+        try:
+            text = unpdf.to_text(filename)  # called with the temp file's path
+        except RuntimeError as exc:  # only RuntimeError is handled here; anything else propagates once cleanup has run
+            print(f"unpdf text extraction failed: {exc}")
+            text = ""  # a failed extraction is reported to the caller as empty text
+    finally:  # cleanup runs on both the success and the error path
+        os.close(new_file)  # release the descriptor returned by mkstemp
+        os.remove(filename)  # delete the temporary PDF
+    return text
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 

diff --git a/requirements/main.in b/requirements/main.in
@@ -13,3 +13,4 @@ pymupdf
 pypdfium2
 pdfrw
 lxml
+unpdf-markdown
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ pymupdf @@
     pypdfium2
     pdfrw
     lxml
+    unpdf-markdown