community: support advanced text extraction options for pdf documents (#20265)

**Description:**
- Updated the constructors of `PyPDFParser` and `PyPDFLoader` to accept `extraction_mode` and additional keyword arguments, aligning them with the capabilities of `PageObject.extract_text()` from pypdf.
- Added `test_pypdf_loader_with_layout`, along with a corresponding example text file, to validate layout extraction from PDFs.

**Issue:** fixes #19735

**Dependencies:** This change requires updating the pypdf dependency from version 3.4.0 to at least 4.0.0. Additional changes include a new test, `test_pypdf_loader_with_layout`, and an example text file, which together ensure that layout extraction from PDFs works as expected with the new capabilities.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-13 05:25:07 +00:00 · 2024-07-17 22:47:09 +02:00
parent a402de3dae
commit 034a8c7c1b
7 changed files with 101 additions and 6 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -6,6 +6,7 @@ import warnings
 from typing import (
    TYPE_CHECKING,
    Any,
+    Dict,
    Iterable,
    Iterator,
    Mapping,
@@ -27,6 +28,7 @@ if TYPE_CHECKING:
    import pdfplumber.page
    import pypdf._page
    import pypdfium2._helpers.page
+    from pypdf import PageObject
    from textractor.data.text_linearization_config import TextLinearizationConfig


@@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`"""

    def __init__(
-        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
+        self,
+        password: Optional[Union[str, bytes]] = None,
+        extract_images: bool = False,
+        *,
+        extraction_mode: str = "plain",
+        extraction_kwargs: Optional[Dict[str, Any]] = None,
    ):
        self.password = password
        self.extract_images = extract_images
+        self.extraction_mode = extraction_mode
+        self.extraction_kwargs = extraction_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
@@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser):
                "`pip install pypdf`"
            )

+        def _extract_text_from_page(page: "PageObject") -> str:
+            """
+            Extract text from a page, using the call supported by the installed pypdf version.
+            """
+            if pypdf.__version__.startswith("3"):
+                return page.extract_text()
+            else:
+                return page.extract_text(
+                    extraction_mode=self.extraction_mode, **self.extraction_kwargs
+                )
+
        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
+
            yield from [
                Document(
-                    page_content=page.extract_text()
+                    page_content=_extract_text_from_page(page=page)
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                )