mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 05:25:07 +00:00
community: support advanced text extraction options for pdf documents (#20265)
**Description:** Updated the constructors of `PyPDFParser` and `PyPDFLoader` to accept `extraction_mode` and additional extraction kwargs, aligning them with the capabilities of `PageObject.extract_text()` from pypdf. Added `test_pypdf_loader_with_layout`, along with a corresponding example text file, to validate layout extraction from PDFs. **Issue:** fixes #19735 **Dependencies:** This change requires updating the pypdf dependency from version 3.4.0 to at least 4.0.0. The change also adds the new test `test_pypdf_loader_with_layout` and an example text file to ensure that layout extraction from PDFs works with the new capabilities. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -6,6 +6,7 @@ import warnings
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Mapping,
|
||||
@@ -27,6 +28,7 @@ if TYPE_CHECKING:
|
||||
import pdfplumber.page
|
||||
import pypdf._page
|
||||
import pypdfium2._helpers.page
|
||||
from pypdf import PageObject
|
||||
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||
|
||||
|
||||
@@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser):
|
||||
"""Load `PDF` using `pypdf`"""
|
||||
|
||||
def __init__(
|
||||
self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
|
||||
self,
|
||||
password: Optional[Union[str, bytes]] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
self.password = password
|
||||
self.extract_images = extract_images
|
||||
self.extraction_mode = extraction_mode
|
||||
self.extraction_kwargs = extraction_kwargs or {}
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||
"""Lazily parse the blob."""
|
||||
@@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser):
|
||||
"`pip install pypdf`"
|
||||
)
|
||||
|
||||
def _extract_text_from_page(page: "PageObject") -> str:
|
||||
"""
|
||||
Extract text from the page, using the call that matches the installed version of pypdf.
|
||||
"""
|
||||
if pypdf.__version__.startswith("3"):
|
||||
return page.extract_text()
|
||||
else:
|
||||
return page.extract_text(
|
||||
extraction_mode=self.extraction_mode, **self.extraction_kwargs
|
||||
)
|
||||
|
||||
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
|
||||
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
|
||||
|
||||
yield from [
|
||||
Document(
|
||||
page_content=page.extract_text()
|
||||
page_content=_extract_text_from_page(page=page)
|
||||
+ self._extract_images_from_page(page),
|
||||
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
|
||||
)
|
||||
|
Reference in New Issue
Block a user