From 034a8c7c1b5bfb0b466f1a396eeb0f26acabe5b7 Mon Sep 17 00:00:00 2001
From: Brice Fotzo <44189336+bricefotzo@users.noreply.github.com>
Date: Wed, 17 Jul 2024 22:47:09 +0200
Subject: [PATCH] community: support advanced text extraction options for pdf
 documents (#20265)

**Description:**
- Updated constructors in PyPDFParser and PyPDFLoader to accept the
keyword-only arguments `extraction_mode` and `extraction_kwargs`, which
are forwarded to `PageObject.extract_text()` from pypdf.

- Added `test_pypdf_loader_with_layout` along with a corresponding
example text file to validate layout extraction from PDFs.

**Issue:** fixes #19735

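**Dependencies:** The new options take effect with pypdf 4.0.0 or newer;
on pypdf 3.x the parser falls back to a plain `extract_text()` call and
ignores them. The extended-testing constraint is widened from
`>=3.4.0,<4` to `>=3.4.0,<5`, and the two templates that depend on pypdf
are bumped to pypdf 4.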

The new test `test_pypdf_loader_with_layout` loads
`layout-parser-paper.pdf` with `extraction_mode="layout"` and compares
the first page, after stripping NUL characters, against the new example
text file to ensure that layout extraction produces the expected output.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
---
 libs/community/extended_testing_deps.txt      |  2 +-
 .../document_loaders/parsers/pdf.py           | 25 +++++++++-
 .../document_loaders/pdf.py                   | 10 +++-
 .../document_loaders/test_pdf.py              | 17 +++++++
 .../examples/layout-parser-paper-page-1.txt   | 49 +++++++++++++++++++
 .../pyproject.toml                            |  2 +-
 templates/nvidia-rag-canonical/pyproject.toml |  2 +-
 7 files changed, 101 insertions(+), 6 deletions(-)
 create mode 100644 libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt

diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
index 6879b6a184d..eca081b0a5b 100644
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@@ -60,7 +60,7 @@ psychicapi>=0.8.0,<0.9
 py-trello>=0.19.0,<0.20
 pyjwt>=2.8.0,<3
 pymupdf>=1.22.3,<2
-pypdf>=3.4.0,<4
+pypdf>=3.4.0,<5
 pypdfium2>=4.10.0,<5
 pyspark>=3.4.0,<4
 rank-bm25>=0.2.2,<0.3
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index c561cd0fe32..2013c948fb8 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -6,6 +6,7 @@ import warnings
 from typing import (
     TYPE_CHECKING,
     Any,
+    Dict,
     Iterable,
     Iterator,
     Mapping,
@@ -27,6 +28,7 @@ if TYPE_CHECKING:
     import pdfplumber.page
     import pypdf._page
     import pypdfium2._helpers.page
+    from pypdf import PageObject
     from textractor.data.text_linearization_config import TextLinearizationConfig
 
 
@@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf`"""
 
     def __init__(
-        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
+        self,
+        password: Optional[Union[str, bytes]] = None,
+        extract_images: bool = False,
+        *,
+        extraction_mode: str = "plain",
+        extraction_kwargs: Optional[Dict[str, Any]] = None,
     ):
         self.password = password
         self.extract_images = extract_images
+        self.extraction_mode = extraction_mode
+        self.extraction_kwargs = extraction_kwargs or {}
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
@@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser):
                 "`pip install pypdf`"
             )
 
+        def _extract_text_from_page(page: "PageObject") -> str:
+            """
+            Extract text from a page, depending on the installed pypdf version.
+            """
+            if pypdf.__version__.startswith("3"):
+                return page.extract_text()
+            else:
+                return page.extract_text(
+                    extraction_mode=self.extraction_mode, **self.extraction_kwargs
+                )
+
         with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
             pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
+
             yield from [
                 Document(
-                    page_content=page.extract_text()
+                    page_content=_extract_text_from_page(page=page)
                     + self._extract_images_from_page(page),
                     metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                 )
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index 17063eed0df..02c416a135d 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -171,6 +171,9 @@ class PyPDFLoader(BasePDFLoader):
         password: Optional[Union[str, bytes]] = None,
         headers: Optional[Dict] = None,
         extract_images: bool = False,
+        *,
+        extraction_mode: str = "plain",
+        extraction_kwargs: Optional[Dict] = None,
     ) -> None:
         """Initialize with a file path."""
         try:
@@ -180,7 +183,12 @@ class PyPDFLoader(BasePDFLoader):
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
         super().__init__(file_path, headers=headers)
-        self.parser = PyPDFParser(password=password, extract_images=extract_images)
+        self.parser = PyPDFParser(
+            password=password,
+            extract_images=extract_images,
+            extraction_mode=extraction_mode,
+            extraction_kwargs=extraction_kwargs,
+        )
 
     def lazy_load(
         self,
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index f9d71ea200e..462e20d3579 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 from typing import Sequence, Union
 
@@ -100,6 +101,22 @@ def test_pypdf_loader() -> None:
     assert len(docs) == 16
 
 
+def test_pypdf_loader_with_layout() -> None:
+    """Test PyPDFLoader with layout mode."""
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PyPDFLoader(str(file_path), extraction_mode="layout")
+
+    docs = loader.load()
+    first_page = docs[0].page_content
+
+    expected = (
+        Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
+    ).read_text(encoding="utf-8")
+    cleaned_first_page = re.sub(r"\x00", "", first_page)
+    cleaned_expected = re.sub(r"\x00", "", expected)
+    assert cleaned_first_page == cleaned_expected
+
+
 def test_pypdfium2_loader() -> None:
     """Test PyPDFium2Loader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
diff --git a/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt b/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt
new file mode 100644
index 00000000000..294fac00a02
--- /dev/null
+++ b/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt
@@ -0,0 +1,49 @@
+             LayoutParser         : A Uniﬁed Toolkit for Deep
+          Learning Based Document Image Analysis
+
+
+Zejiang Shen           1  (     ), Ruochen Zhang                2, Melissa Dell         3, Benjamin Charles Germain
+                                         Lee   4, Jacob Carlson            3, and Weining Li              5
+
+                                                           1  Allen Institute for AI
+                                                           shannons@allenai.org
+                                                               2  Brown University
+                                                        ruochen          zhang@brown.edu
+                                                             3  Harvard University
+                                  {melissadell,jacob                       carlson       }@fas.harvard.edu
+                                                       4  University of Washington
+                                                         bcgl@cs.washington.edu
+                                                          5  University of Waterloo
+                                                            w422li@uwaterloo.ca
+
+
+
+             Abstract.        Recentadvancesindocumentimageanalysis(DIA)havebeen
+             primarily driven by the application of neural networks. Ideally, research
+             outcomes could be easily deployed in production and extended for further
+             investigation. However, various factors like loosely organized codebases
+             and sophisticated model conﬁgurations complicate the easy reuse of im-
+             portant innovations by awide audience. Though there havebeen on-going
+             eﬀorts to improve reusability and simplify deep learning (DL) model
+             development in disciplines like natural language processing and computer
+             vision, none of them are optimized for challenges in the domain of DIA.
+             This represents a major gap in the existing toolkit, as DIA is central to
+             academic research across a wide range of disciplines in the social sciences
+             and humanities. This paper introduces                           LayoutParser           , an open-source
+             library for streamlining the usage of DL in DIA research and applica-
+             tions. The core          LayoutParser            library comes with a set of simple and
+             intuitive interfaces for applying and customizing DL models for layout de-
+             tection,characterrecognition,andmanyotherdocumentprocessingtasks.
+             To promote extensibility,                 LayoutParser            also incorporates a community
+             platform for sharing both pre-trained models and full document digiti-
+             zation pipelines. We demonstrate that                         LayoutParser            is helpful for both
+             lightweight and large-scale digitization pipelines in real-word use cases.
+             The library is publicly available at                    https://layout-parser.github.io                            .
+
+             Keywords:          DocumentImageAnalysis                                    ·DeepLearning                    ·LayoutAnalysis
+             · Character Recognition                              · Open Source library                           · Toolkit.
+
+1   Introduction
+
+Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of
+documentimageanalysis(DIA)tasksincludingdocumentimageclassiﬁcation[                                              11 ,
\ No newline at end of file
diff --git a/templates/mongo-parent-document-retrieval/pyproject.toml b/templates/mongo-parent-document-retrieval/pyproject.toml
index f55bf1aefa4..e1403ed540a 100644
--- a/templates/mongo-parent-document-retrieval/pyproject.toml
+++ b/templates/mongo-parent-document-retrieval/pyproject.toml
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
 langchain = "^0.1"
 openai = "<2"
 pymongo = "^4.6.0"
-pypdf = "^3.17.0"
+pypdf = "^4.0.0"
 tiktoken = "^0.5.1"
 langchain-text-splitters = ">=0.0.1,<0.1"
 
diff --git a/templates/nvidia-rag-canonical/pyproject.toml b/templates/nvidia-rag-canonical/pyproject.toml
index cd0b440fe5f..59ecf00a81f 100644
--- a/templates/nvidia-rag-canonical/pyproject.toml
+++ b/templates/nvidia-rag-canonical/pyproject.toml
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
 langchain = "^0.1"
 pymilvus = ">=2.3.0"
 langchain-nvidia-aiplay = "^0.0.2"
-pypdf = ">=3.1"
+pypdf = ">=4.0.1"
 langchain-text-splitters = ">=0.0.1,<0.1"
 
 [tool.poetry.group.dev.dependencies]