mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-19 19:11:33 +00:00
feat: Add page metadata on PDFMinerLoader (#12277)
- **Description:** This PR implements the suggestion in #12273: like the other PDF loaders, PDFMinerLoader can now load a PDF page by page and attach page metadata to each document. - **Issue:** #12273 - **Twitter handle:** @blue0_0hope --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
7148f3e1fe
commit
b1954aab13
@ -128,18 +128,36 @@ class PyPDFParser(BaseBlobParser):
|
|||||||
class PDFMinerParser(BaseBlobParser):
|
class PDFMinerParser(BaseBlobParser):
|
||||||
"""Parse `PDF` using `PDFMiner`."""
|
"""Parse `PDF` using `PDFMiner`."""
|
||||||
|
|
||||||
def __init__(self, extract_images: bool = False):
|
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
|
||||||
|
"""Initialize a parser based on PDFMiner.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extract_images: Whether to extract images from PDF.
|
||||||
|
concatenate_pages: If True, concatenate all PDF pages into a single
|
||||||
|
document. Otherwise, return one document per page.
|
||||||
|
"""
|
||||||
self.extract_images = extract_images
|
self.extract_images = extract_images
|
||||||
|
self.concatenate_pages = concatenate_pages
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||||
"""Lazily parse the blob."""
|
"""Lazily parse the blob."""
|
||||||
|
|
||||||
if not self.extract_images:
|
if not self.extract_images:
|
||||||
from pdfminer.high_level import extract_text
|
from pdfminer.high_level import extract_text
|
||||||
|
|
||||||
with blob.as_bytes_io() as pdf_file_obj:
|
with blob.as_bytes_io() as pdf_file_obj:
|
||||||
text = extract_text(pdf_file_obj)
|
if self.concatenate_pages:
|
||||||
metadata = {"source": blob.source}
|
text = extract_text(pdf_file_obj)
|
||||||
yield Document(page_content=text, metadata=metadata)
|
metadata = {"source": blob.source}
|
||||||
|
yield Document(page_content=text, metadata=metadata)
|
||||||
|
else:
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
||||||
|
pages = PDFPage.get_pages(pdf_file_obj)
|
||||||
|
for i, _ in enumerate(pages):
|
||||||
|
text = extract_text(pdf_file_obj, page_numbers=[i])
|
||||||
|
metadata = {"source": blob.source, "page": str(i)}
|
||||||
|
yield Document(page_content=text, metadata=metadata)
|
||||||
else:
|
else:
|
||||||
import io
|
import io
|
||||||
|
|
||||||
|
@ -251,8 +251,15 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
*,
|
*,
|
||||||
headers: Optional[Dict] = None,
|
headers: Optional[Dict] = None,
|
||||||
extract_images: bool = False,
|
extract_images: bool = False,
|
||||||
|
concatenate_pages: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extract_images: Whether to extract images from PDF.
|
||||||
|
concatenate_pages: If True, concatenate all PDF pages into a single
|
||||||
|
document. Otherwise, return one document per page.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from pdfminer.high_level import extract_text # noqa:F401
|
from pdfminer.high_level import extract_text # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -262,7 +269,9 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
)
|
)
|
||||||
|
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
self.parser = PDFMinerParser(extract_images=extract_images)
|
self.parser = PDFMinerParser(
|
||||||
|
extract_images=extract_images, concatenate_pages=concatenate_pages
|
||||||
|
)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Eagerly load the content."""
|
"""Eagerly load the content."""
|
||||||
|
@ -56,6 +56,19 @@ def test_pdfminer_loader() -> None:
|
|||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
# Verify that the concatenate_pages parameter works
|
||||||
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
|
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||||
|
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
|
||||||
|
|
||||||
|
docs = loader.load()
|
||||||
|
assert len(docs) == 16
|
||||||
|
|
||||||
|
|
||||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||||
"""Test PDFMinerPDFasHTMLLoader."""
|
"""Test PDFMinerPDFasHTMLLoader."""
|
||||||
|
Loading…
Reference in New Issue
Block a user