def unpdf_markdown_get_text(data: bytes) -> str:
    """Extract the text of a PDF using the unpdf-markdown library.

    ``unpdf.to_text`` is called with a file path, so the PDF bytes are first
    written to a temporary ``.pdf`` file. That file is removed again before
    this function returns, whether or not extraction succeeded.

    Args:
        data: Raw content of the PDF document.

    Returns:
        The value returned by ``unpdf.to_text``, or an empty string when the
        library raises ``RuntimeError``.
    """
    # Function-scope import, in the same way tika_get_text imports tika.
    import unpdf

    fd, filename = tempfile.mkstemp(suffix=".pdf")
    try:
        # Write through the descriptor mkstemp already opened instead of
        # opening the path a second time; leaving the ``with`` block closes
        # it, so no handle of ours is open while the library reads the file.
        with os.fdopen(fd, "wb") as fp:
            fp.write(data)
        try:
            return unpdf.to_text(filename)
        except RuntimeError as exc:
            # Best effort: report the failure and fall back to empty text.
            print(f"unpdf text extraction failed: {exc}")
            return ""
    finally:
        os.remove(filename)