Compare commits

...

15 Commits

Author SHA1 Message Date
Harrison Chase
3afce9e421 Merge branch 'lesterpjy-lesterpjy/pdfplumber' into harrison/pdfplumber 2023-05-14 19:56:15 -07:00
Harrison Chase
72d4709772 cr 2023-05-14 19:56:07 -07:00
Lester Yang
e7c34a1f9a resolve import typing conflict 2023-05-10 23:48:23 +08:00
Lester Yang
652f0fcb07 expose pdfplumber kwargs in constructor 2023-05-10 23:44:09 +08:00
Lester Yang
57449837c0 update notebook example 2023-05-10 23:43:22 +08:00
Lester Yang
d7696ec341 make format 2023-05-09 19:12:27 +08:00
Lester Yang
4f89a49847 use Any for kwargs type 2023-05-09 19:11:53 +08:00
Lester Yang
39390abc22 better name for saving path of annotate_and_load 2023-05-09 19:10:40 +08:00
Lester Yang
7055471721 use self.source instead of if-else 2023-05-09 19:08:50 +08:00
Lester Yang
a897afb965 fix import line 2023-05-09 18:37:59 +08:00
Lester Yang
fab49f41cf fix page number 2023-05-09 02:05:42 +08:00
Lester Yang
f154c885aa add integration tests for pdfplumberloader 2023-05-09 01:42:52 +08:00
Lester Yang
43a601a3b8 checking run with dev dependencies 2023-05-09 01:41:22 +08:00
Lester Yang
39c5ff56f2 adding example in documentation 2023-05-09 01:20:37 +08:00
Lester Yang
a8c1705c2f adding doc loader with PDFPlumber, allow saved visual debugging 2023-05-09 00:38:43 +08:00
3 changed files with 130 additions and 1 deletions

View File

@@ -60,6 +60,7 @@ from langchain.document_loaders.pdf import (
OnlinePDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PDFPlumberLoader,
PyMuPDFLoader,
PyPDFDirectoryLoader,
PyPDFium2Loader,
@@ -165,6 +166,7 @@ __all__ = [
"PyMuPDFLoader",
"PyPDFDirectoryLoader",
"PyPDFLoader",
"PDFPlumberLoader",
"PyPDFium2Loader",
"PythonLoader",
"ReadTheDocsLoader",

View File

@@ -7,7 +7,7 @@ import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any, Iterator, List, Optional
from typing import Any, Iterator, List, Mapping, Optional
from urllib.parse import urlparse
import requests
@@ -362,3 +362,99 @@ class MathpixPDFLoader(BasePDFLoader):
contents = self.clean_pdf(contents)
metadata = {"source": self.source, "file_path": self.source}
return [Document(page_content=contents, metadata=metadata)]
class PDFPlumberLoader(BasePDFLoader):
    """Loader that uses PDFPlumber to load PDF files.

    Every page of the PDF becomes one ``Document``. Its metadata holds the
    source path, the page number, the total page count and those entries of
    the PDF's own metadata whose values are strings or integers.
    """

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        word_kwargs: Optional[Mapping[str, Any]] = None,
        image_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Initialize with file path.

        Args:
            file_path: Location of the PDF; handed to ``BasePDFLoader.__init__``.
            text_kwargs: Keyword arguments forwarded to ``page.extract_text``.
                Defaults to ``{"x_tolerance": 3, "y_tolerance": 3}``.
            word_kwargs: Keyword arguments forwarded to ``page.extract_words``.
                Defaults to ``{"x_tolerance": 3, "y_tolerance": 3}``.
            image_kwargs: Keyword arguments forwarded to ``page.to_image``.
                Defaults to ``{"resolution": None}``.

        Raises:
            ValueError: If the ``pdfplumber`` package is not installed.
        """
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ValueError(
                "PDFPlumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path)

        # Defaults are created per instance so that mutating one loader's
        # options can never leak into another loader.
        self.text_kwargs = (
            {"x_tolerance": 3, "y_tolerance": 3}
            if text_kwargs is None
            else text_kwargs
        )
        self.word_kwargs = (
            {"x_tolerance": 3, "y_tolerance": 3}
            if word_kwargs is None
            else word_kwargs
        )
        self.image_kwargs = (
            {"resolution": None} if image_kwargs is None else image_kwargs
        )

    def _pages_to_documents(self, doc: Any) -> List[Document]:
        """Build one ``Document`` per page of an open pdfplumber PDF.

        Must be called while ``doc`` is still open, because text extraction
        reads from the underlying file.
        """
        source = self.source
        # Both values are the same for every page, so compute them once.
        total_pages = len(doc.pages)
        pdf_metadata = {
            key: value
            for key, value in doc.metadata.items()
            if isinstance(value, (str, int))
        }

        return [
            Document(
                # Pass text as ``str`` (not encoded bytes); fall back to an
                # empty string in case a page without text yields ``None``.
                page_content=page.extract_text(**self.text_kwargs) or "",
                metadata=dict(
                    {
                        "source": source,
                        "file_path": source,
                        "page_number": page.page_number,
                        "total_pages": total_pages,
                    },
                    **pdf_metadata,
                ),
            )
            for page in doc.pages
        ]

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            One ``Document`` per page of the PDF.
        """
        import pdfplumber

        # The context manager closes the PDF's file handle once all pages
        # have been converted.
        with pdfplumber.open(self.file_path) as doc:
            return self._pages_to_documents(doc)

    def annotate_and_load(self, save_path: str) -> List[Document]:
        """Annotate/save pdf file using pdfplumber's visual debugging and load file.

        For every page the rectangles of the extracted words are drawn onto a
        rendered image of the page; the images are saved together as
        ``<stem>_annotated.pdf`` inside ``save_path``.

        Args:
            save_path: Directory for the annotated PDF. It is created
                (including parents) when missing.

        Returns:
            One ``Document`` per page of the PDF, as ``load`` does.
        """
        import pdfplumber

        out_dir = Path(save_path)
        out_dir.mkdir(exist_ok=True, parents=True)

        with pdfplumber.open(self.file_path) as doc:
            # get annotated PIL.Images
            annotated_imgs = [
                page.to_image(**self.image_kwargs)
                .draw_rects(page.extract_words(**self.word_kwargs))
                .annotated
                for page in doc.pages
            ]

            # save as renamed pdf; a PDF without pages has nothing to save
            if annotated_imgs:
                file_name = Path(self.file_path).stem
                annotated_imgs[0].save(
                    str(out_dir / f"{file_name}_annotated.pdf"),
                    save_all=True,
                    append_images=annotated_imgs[1:],
                )

            return self._pages_to_documents(doc)

View File

@@ -4,6 +4,7 @@ from langchain.document_loaders import (
MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PDFPlumberLoader,
PyMuPDFLoader,
PyPDFium2Loader,
PyPDFLoader,
@@ -118,3 +119,33 @@ def test_mathpix_loader() -> None:
docs = loader.load()
assert len(docs) == 1
print(docs[0].page_content)
def test_pdfplumber_loader() -> None:
    """Test PDFPlumber loader.

    Covers loading a one-page and a multi-page local PDF, writing the
    annotated copy of a PDF, and loading a PDF from a web URL.
    """
    import tempfile

    examples_dir = Path(__file__).parent.parent / "examples"

    loader = PDFPlumberLoader(str(examples_dir / "hello.pdf"))
    docs = loader.load()
    assert len(docs) == 1

    # Write the annotated copy into a throwaway directory so the test run
    # does not leave a generated file inside the checked-in examples folder.
    with tempfile.TemporaryDirectory() as tmp_dir:
        docs = loader.annotate_and_load(tmp_dir)
        assert len(docs) == 1
        assert (Path(tmp_dir) / "hello_annotated.pdf").is_file()

    loader = PDFPlumberLoader(str(examples_dir / "layout-parser-paper.pdf"))
    docs = loader.load()
    assert len(docs) == 16
    assert loader.web_path is None

    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
    loader = PDFPlumberLoader(web_path)
    docs = loader.load()
    assert loader.web_path == web_path
    assert loader.file_path != web_path
    assert len(docs) == 1