mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
Dev2049/pypdfium2 (#4209)
thanks @jerrytigerxu for the addition! --------- Co-authored-by: Jere Xu <jtxu2008@gmail.com> Co-authored-by: jerrytigerxu <jere.tiger.xu@gmailc.om>
This commit is contained in:
parent
59204a5033
commit
5ca13cc1f0
@ -372,6 +372,44 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 9,
|
||||||
|
"id": "483720b5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "96351714",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Using PyPDFium2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "003fcc1d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import PyPDFium2Loader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "46766e29",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = PyPDFium2Loader(\"example_data/layout-parser-paper.pdf\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
"id": "010d5cdd",
|
"id": "010d5cdd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -662,7 +700,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.11.3"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -61,6 +61,7 @@ from langchain.document_loaders.pdf import (
|
|||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
PyMuPDFLoader,
|
PyMuPDFLoader,
|
||||||
PyPDFDirectoryLoader,
|
PyPDFDirectoryLoader,
|
||||||
|
PyPDFium2Loader,
|
||||||
PyPDFLoader,
|
PyPDFLoader,
|
||||||
UnstructuredPDFLoader,
|
UnstructuredPDFLoader,
|
||||||
)
|
)
|
||||||
@ -161,6 +162,7 @@ __all__ = [
|
|||||||
"PlaywrightURLLoader",
|
"PlaywrightURLLoader",
|
||||||
"PyMuPDFLoader",
|
"PyMuPDFLoader",
|
||||||
"PyPDFLoader",
|
"PyPDFLoader",
|
||||||
|
"PyPDFium2Loader",
|
||||||
"PythonLoader",
|
"PythonLoader",
|
||||||
"ReadTheDocsLoader",
|
"ReadTheDocsLoader",
|
||||||
"RoamLoader",
|
"RoamLoader",
|
||||||
|
@ -115,6 +115,34 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class PyPDFium2Loader(BasePDFLoader):
    """Load a PDF with pypdfium2, producing one Document per page."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PDF; passed through to the base class
                initializer.

        Raises:
            ValueError: If the ``pypdfium2`` package cannot be imported.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError as err:
            # ValueError is kept (rather than ImportError) so existing callers
            # that catch it keep working; the original error is chained.
            raise ValueError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            ) from err
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load given path as pages.

        Returns:
            One ``Document`` per page, in page order. Each document's metadata
            holds ``"source"`` (the file path) and ``"page"`` (0-based index).
        """
        import pypdfium2

        docs = []
        with open(self.file_path, "rb") as f:
            pdf_reader = pypdfium2.PdfDocument(f)
            try:
                for i, page in enumerate(pdf_reader):
                    # pypdfium2 objects wrap native pdfium handles; release
                    # them explicitly, children first (text page, then page,
                    # then the document), instead of relying on finalizers.
                    try:
                        text_page = page.get_textpage()
                        try:
                            content = text_page.get_text_range()
                        finally:
                            text_page.close()
                    finally:
                        page.close()
                    metadata = {"source": self.file_path, "page": i}
                    docs.append(Document(page_content=content, metadata=metadata))
            finally:
                # Close the document before the underlying file is closed.
                pdf_reader.close()
        return docs
|
||||||
|
|
||||||
|
|
||||||
class PyPDFDirectoryLoader(BaseLoader):
|
class PyPDFDirectoryLoader(BaseLoader):
|
||||||
"""Loads a directory with PDF files with pypdf and chunks at character level.
|
"""Loads a directory with PDF files with pypdf and chunks at character level.
|
||||||
|
|
||||||
|
@ -1,12 +1,14 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from langchain.document_loaders import (
|
from langchain.document_loaders import (
|
||||||
|
MathpixPDFLoader,
|
||||||
PDFMinerLoader,
|
PDFMinerLoader,
|
||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
PyMuPDFLoader,
|
PyMuPDFLoader,
|
||||||
|
PyPDFium2Loader,
|
||||||
|
PyPDFLoader,
|
||||||
UnstructuredPDFLoader,
|
UnstructuredPDFLoader,
|
||||||
)
|
)
|
||||||
from langchain.document_loaders.pdf import MathpixPDFLoader
|
|
||||||
|
|
||||||
|
|
||||||
def test_unstructured_pdf_loader() -> None:
|
def test_unstructured_pdf_loader() -> None:
|
||||||
@ -48,6 +50,36 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
|||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_pypdf_loader() -> None:
    """Load both example PDFs with PyPDFLoader and check the document counts."""
    tests_root = Path(__file__).parent.parent

    # hello.pdf is expected to load as a single document.
    hello_pdf = tests_root / "examples/hello.pdf"
    hello_docs = PyPDFLoader(str(hello_pdf)).load()
    assert len(hello_docs) == 1

    # layout-parser-paper.pdf is expected to load as 16 documents.
    paper_pdf = tests_root / "examples/layout-parser-paper.pdf"
    paper_docs = PyPDFLoader(str(paper_pdf)).load()
    assert len(paper_docs) == 16
|
||||||
|
|
||||||
|
|
||||||
|
def test_pypdfium2_loader() -> None:
    """Load both example PDFs with PyPDFium2Loader and check the document counts."""
    tests_root = Path(__file__).parent.parent

    # hello.pdf is expected to load as a single document.
    hello_pdf = tests_root / "examples/hello.pdf"
    hello_docs = PyPDFium2Loader(str(hello_pdf)).load()
    assert len(hello_docs) == 1

    # layout-parser-paper.pdf is expected to load as 16 documents.
    paper_pdf = tests_root / "examples/layout-parser-paper.pdf"
    paper_docs = PyPDFium2Loader(str(paper_pdf)).load()
    assert len(paper_docs) == 16
|
||||||
|
|
||||||
|
|
||||||
def test_pymupdf_loader() -> None:
|
def test_pymupdf_loader() -> None:
|
||||||
"""Test PyMuPDF loader."""
|
"""Test PyMuPDF loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
|
Loading…
Reference in New Issue
Block a user