feat: Add page metadata on PDFMinerLoader (#12277)

- **Description:** Implements the suggestion from #12273.
Like the other PDF loaders, PDFMinerLoader can now load a PDF one page at a
time (`concatenate_pages=False`) and attach page metadata to each document.
  - **Issue:** #12273 
  - **Twitter handle:** @blue0_0hope

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Dave Kwon 2023-11-02 00:25:37 +09:00 committed by GitHub
parent 7148f3e1fe
commit b1954aab13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 6 deletions

View File

@ -128,18 +128,36 @@ class PyPDFParser(BaseBlobParser):
class PDFMinerParser(BaseBlobParser): class PDFMinerParser(BaseBlobParser):
"""Parse `PDF` using `PDFMiner`.""" """Parse `PDF` using `PDFMiner`."""
def __init__(self, extract_images: bool = False): def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
"""Initialize a parser based on PDFMiner.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into a single
document. Otherwise, return one document per page.
"""
self.extract_images = extract_images self.extract_images = extract_images
self.concatenate_pages = concatenate_pages
def lazy_parse(self, blob: Blob) -> Iterator[Document]: def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob.""" """Lazily parse the blob."""
if not self.extract_images: if not self.extract_images:
from pdfminer.high_level import extract_text from pdfminer.high_level import extract_text
with blob.as_bytes_io() as pdf_file_obj: with blob.as_bytes_io() as pdf_file_obj:
text = extract_text(pdf_file_obj) if self.concatenate_pages:
metadata = {"source": blob.source} text = extract_text(pdf_file_obj)
yield Document(page_content=text, metadata=metadata) metadata = {"source": blob.source}
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
pages = PDFPage.get_pages(pdf_file_obj)
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)}
yield Document(page_content=text, metadata=metadata)
else: else:
import io import io

View File

@ -251,8 +251,15 @@ class PDFMinerLoader(BasePDFLoader):
*, *,
headers: Optional[Dict] = None, headers: Optional[Dict] = None,
extract_images: bool = False, extract_images: bool = False,
concatenate_pages: bool = True,
) -> None: ) -> None:
"""Initialize with file path.""" """Initialize with file path.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into a single
document. Otherwise, return one document per page.
"""
try: try:
from pdfminer.high_level import extract_text # noqa:F401 from pdfminer.high_level import extract_text # noqa:F401
except ImportError: except ImportError:
@ -262,7 +269,9 @@ class PDFMinerLoader(BasePDFLoader):
) )
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser(extract_images=extract_images) self.parser = PDFMinerParser(
extract_images=extract_images, concatenate_pages=concatenate_pages
)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Eagerly load the content.""" """Eagerly load the content."""

View File

@ -56,6 +56,19 @@ def test_pdfminer_loader() -> None:
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
# Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
docs = loader.load()
assert len(docs) == 16
def test_pdfminer_pdf_as_html_loader() -> None: def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader.""" """Test PDFMinerPDFasHTMLLoader."""