diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index fb0cfd64a54..ae0c3105292 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -128,18 +128,36 @@ class PyPDFParser(BaseBlobParser): class PDFMinerParser(BaseBlobParser): """Parse `PDF` using `PDFMiner`.""" - def __init__(self, extract_images: bool = False): + def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True): + """Initialize a parser based on PDFMiner. + + Args: + extract_images: Whether to extract images from PDF. + concatenate_pages: If True, concatenate all PDF pages into a single + document. Otherwise, return one document per page. + """ self.extract_images = extract_images + self.concatenate_pages = concatenate_pages def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" + if not self.extract_images: from pdfminer.high_level import extract_text with blob.as_bytes_io() as pdf_file_obj: - text = extract_text(pdf_file_obj) - metadata = {"source": blob.source} - yield Document(page_content=text, metadata=metadata) + if self.concatenate_pages: + text = extract_text(pdf_file_obj) + metadata = {"source": blob.source} + yield Document(page_content=text, metadata=metadata) + else: + from pdfminer.pdfpage import PDFPage + + pages = PDFPage.get_pages(pdf_file_obj) + for i, _ in enumerate(pages): + text = extract_text(pdf_file_obj, page_numbers=[i]) + metadata = {"source": blob.source, "page": str(i)} + yield Document(page_content=text, metadata=metadata) else: import io diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 7463eb0dec4..910075efdff 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -251,8 +251,15 @@ class PDFMinerLoader(BasePDFLoader): *, 
headers: Optional[Dict] = None, extract_images: bool = False, + concatenate_pages: bool = True, ) -> None: - """Initialize with file path.""" + """Initialize with file path. + + Args: + extract_images: Whether to extract images from PDF. + concatenate_pages: If True, concatenate all PDF pages into a single + document. Otherwise, return one document per page. + """ try: from pdfminer.high_level import extract_text # noqa:F401 except ImportError: @@ -262,7 +269,9 @@ class PDFMinerLoader(BasePDFLoader): ) super().__init__(file_path, headers=headers) - self.parser = PDFMinerParser(extract_images=extract_images) + self.parser = PDFMinerParser( + extract_images=extract_images, concatenate_pages=concatenate_pages + ) def load(self) -> List[Document]: """Eagerly load the content.""" diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index 1fac61f3503..d83ba9cf7a1 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -56,6 +56,19 @@ def test_pdfminer_loader() -> None: docs = loader.load() assert len(docs) == 1 + # Verify that the concatenate_pages parameter works + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = PDFMinerLoader(str(file_path), concatenate_pages=True) + docs = loader.load() + + assert len(docs) == 1 + + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = PDFMinerLoader(str(file_path), concatenate_pages=False) + + docs = loader.load() + assert len(docs) == 16 + def test_pdfminer_pdf_as_html_loader() -> None: """Test PDFMinerPDFasHTMLLoader."""