Add PyMuPDF PDF loader (#1426)

Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job of extracting as much content as possible from a document, regardless of the source quality, and it is extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html
2025-09-08 14:31:55 +00:00 · 2023-03-03 20:59:28 -08:00
parent 3d54b05863
commit 23231d65a9
4 changed files with 169 additions and 9 deletions
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+from langchain.document_loaders import (
+    PDFMinerLoader,
+    PyMuPDFLoader,
+    UnstructuredPDFLoader,
+)
+
+
+def test_unstructured_pdf_loader() -> None:
+    """Check that UnstructuredPDFLoader loads hello.pdf as one document."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = UnstructuredPDFLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+
+def test_pdfminer_loader() -> None:
+    """Check that PDFMinerLoader returns a single document per example PDF."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PDFMinerLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PDFMinerLoader(str(file_path))
+
+    docs = loader.load()
+    assert len(docs) == 1
+
+
+def test_pymupdf_loader() -> None:
+    """Check PyMuPDFLoader document counts: 1 for hello.pdf, 16 for the paper."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PyMuPDFLoader(str(file_path))
+
+    docs = loader.load()
+    assert len(docs) == 1
+
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PyMuPDFLoader(str(file_path))
+
+    docs = loader.load()
+    assert len(docs) == 16  # presumably one Document per page - TODO confirm