community: support advanced text extraction options for pdf documents (#20265)

**Description:** 
- Updated constructors in PyPDFParser and PyPDFLoader to handle
`extraction_mode` and additional kwargs, aligning with the capabilities
of `PageObject.extract_text()` from pypdf.

- Added `test_pypdf_loader_with_layout` along with a corresponding
example text file to validate layout extraction from PDFs.

**Issue:** fixes #19735 

**Dependencies:** This change requires updating the pypdf dependency
from version 3.4.0 to at least 4.0.0.

In addition, the new `test_pypdf_loader_with_layout` test and its
accompanying example text file verify that layout extraction from PDFs
behaves as expected with the new capabilities.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Brice Fotzo
2024-07-17 22:47:09 +02:00
committed by GitHub
parent a402de3dae
commit 034a8c7c1b
7 changed files with 101 additions and 6 deletions

View File

@@ -6,6 +6,7 @@ import warnings
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
Mapping,
@@ -27,6 +28,7 @@ if TYPE_CHECKING:
import pdfplumber.page
import pypdf._page
import pypdfium2._helpers.page
from pypdf import PageObject
from textractor.data.text_linearization_config import TextLinearizationConfig
@@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser):
"""Load `PDF` using `pypdf`"""
def __init__(
    self,
    password: Optional[Union[str, bytes]] = None,
    extract_images: bool = False,
    *,
    extraction_mode: str = "plain",
    extraction_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """Initialize the parser.

    Args:
        password: Optional password handed to ``pypdf.PdfReader`` when the
            PDF is opened.
        extract_images: Stored on the instance as ``extract_images``.
            NOTE(review): presumably controls whether text is also pulled
            from embedded images -- confirm against
            ``_extract_images_from_page``.
        extraction_mode: Value forwarded as ``extraction_mode`` to
            ``PageObject.extract_text()``. Defaults to ``"plain"``. Not
            forwarded when the installed pypdf is a 3.x release.
        extraction_kwargs: Extra keyword arguments forwarded to
            ``PageObject.extract_text()`` alongside ``extraction_mode``.
            ``None`` is treated as an empty dict.
    """
    self.password = password
    self.extract_images = extract_images
    self.extraction_mode = extraction_mode
    # Normalise None to a fresh dict so it can always be **-unpacked later.
    self.extraction_kwargs = extraction_kwargs or {}
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
@@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser):
"`pip install pypdf`"
)
def _extract_text_from_page(page: "PageObject") -> str:
    """Extract text from a page, adapting the call to the installed pypdf.

    ``extraction_mode`` and the extra extraction keyword arguments are only
    forwarded when the installed pypdf major version is 4 or newer. Older
    releases are called without arguments, so the options are ignored there.

    Args:
        page: The pypdf page to extract text from.

    Returns:
        The text returned by ``page.extract_text``.
    """
    major_version = pypdf.__version__.split(".", 1)[0]
    # Compare the major version numerically rather than by string prefix so
    # that every pre-4 release (not only "3.x") takes the argument-free call.
    # A version string without a numeric major part falls through to the
    # modern call.
    if major_version.isdigit() and int(major_version) < 4:
        return page.extract_text()
    return page.extract_text(
        extraction_mode=self.extraction_mode, **self.extraction_kwargs
    )
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
yield from [
Document(
page_content=page.extract_text()
page_content=_extract_text_from_page(page=page)
+ self._extract_images_from_page(page),
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
)