mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 12:07:36 +00:00
community: support advanced text extraction options for pdf documents (#20265)
**Description:**
- Updated constructors in `PyPDFParser` and `PyPDFLoader` to handle `extraction_mode` and additional kwargs, aligning with the capabilities of `PageObject.extract_text()` from pypdf.
- Added `test_pypdf_loader_with_layout` along with a corresponding example text file to validate layout extraction from PDFs.

**Issue:** fixes #19735

**Dependencies:** This change requires updating the pypdf dependency from version 3.4.0 to at least 4.0.0.

Additional changes include a new test, `test_pypdf_loader_with_layout`, and an example text file to ensure that layout extraction from PDFs aligns with the new capabilities.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -171,6 +171,9 @@ class PyPDFLoader(BasePDFLoader):
|
||||
password: Optional[Union[str, bytes]] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict] = None,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
@@ -180,7 +183,12 @@ class PyPDFLoader(BasePDFLoader):
|
||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||
)
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = PyPDFParser(password=password, extract_images=extract_images)
|
||||
self.parser = PyPDFParser(
|
||||
password=password,
|
||||
extract_images=extract_images,
|
||||
extraction_mode=extraction_mode,
|
||||
extraction_kwargs=extraction_kwargs,
|
||||
)
|
||||
|
||||
def lazy_load(
|
||||
self,
|
||||
|
Reference in New Issue
Block a user