mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-19 11:08:55 +00:00
feat: Add page metadata on PDFMinerLoader (#12277)
- **Description:** Implements the suggestion from #12273. Like the other PDF loaders, PDFMinerLoader can now load a PDF one page at a time and attach page metadata to each resulting document. - **Issue:** #12273 - **Twitter handle:** @blue0_0hope --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
7148f3e1fe
commit
b1954aab13
@ -128,18 +128,36 @@ class PyPDFParser(BaseBlobParser):
|
||||
class PDFMinerParser(BaseBlobParser):
|
||||
"""Parse `PDF` using `PDFMiner`."""
|
||||
|
||||
def __init__(self, extract_images: bool = False):
|
||||
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
|
||||
"""Initialize a parser based on PDFMiner.
|
||||
|
||||
Args:
|
||||
extract_images: Whether to extract images from PDF.
|
||||
concatenate_pages: If True, concatenate all PDF pages into a single
|
||||
document. Otherwise, return one document per page.
|
||||
"""
|
||||
self.extract_images = extract_images
|
||||
self.concatenate_pages = concatenate_pages
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
|
||||
if not self.extract_images:
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
with blob.as_bytes_io() as pdf_file_obj:
|
||||
text = extract_text(pdf_file_obj)
|
||||
metadata = {"source": blob.source}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
if self.concatenate_pages:
|
||||
text = extract_text(pdf_file_obj)
|
||||
metadata = {"source": blob.source}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
else:
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
||||
pages = PDFPage.get_pages(pdf_file_obj)
|
||||
for i, _ in enumerate(pages):
|
||||
text = extract_text(pdf_file_obj, page_numbers=[i])
|
||||
metadata = {"source": blob.source, "page": str(i)}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
else:
|
||||
import io
|
||||
|
||||
|
@ -251,8 +251,15 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
extract_images: bool = False,
|
||||
concatenate_pages: bool = True,
|
||||
) -> None:
|
||||
"""Initialize with file path."""
|
||||
"""Initialize with file path.
|
||||
|
||||
Args:
|
||||
extract_images: Whether to extract images from PDF.
|
||||
concatenate_pages: If True, concatenate all PDF pages into a single
|
||||
document. Otherwise, return one document per page.
|
||||
"""
|
||||
try:
|
||||
from pdfminer.high_level import extract_text # noqa:F401
|
||||
except ImportError:
|
||||
@ -262,7 +269,9 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
)
|
||||
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = PDFMinerParser(extract_images=extract_images)
|
||||
self.parser = PDFMinerParser(
|
||||
extract_images=extract_images, concatenate_pages=concatenate_pages
|
||||
)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Eagerly load the content."""
|
||||
|
@ -56,6 +56,19 @@ def test_pdfminer_loader() -> None:
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
# Verify that concatenating pages parameter works
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
|
||||
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
"""Test PDFMinerPDFasHTMLLoader."""
|
||||
|
Loading…
Reference in New Issue
Block a user