mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-08 14:31:55 +00:00
Add PyMuPDF PDF loader (#1426)
Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html
This commit is contained in:
46
tests/integration_tests/document_loaders/test_pdf.py
Normal file
46
tests/integration_tests/document_loaders/test_pdf.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import (
|
||||
PDFMinerLoader,
|
||||
PyMuPDFLoader,
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
|
||||
|
||||
def test_unstructured_pdf_loader() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path))
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
def test_pdfminer_loader() -> None:
|
||||
"""Test PDFMiner loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
Reference in New Issue
Block a user