Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-01 02:43:37 +00:00)
community: support advanced text extraction options for pdf documents (#20265)
**Description:**
- Updated the constructors of `PyPDFParser` and `PyPDFLoader` to accept `extraction_mode` and additional keyword arguments, aligning them with the capabilities of `PageObject.extract_text()` from pypdf.
- Added `test_pypdf_loader_with_layout`, along with a corresponding example text file, to validate layout extraction from PDFs.

**Issue:** fixes #19735

**Dependencies:** This change requires bumping the pypdf dependency from version 3.4.0 to at least 4.0.0.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
commit 034a8c7c1b
parent a402de3dae
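A minimal usage sketch of the new options, assuming pypdf>=4.0 is installed (the file path is a placeholder):

```python
from langchain_community.document_loaders import PyPDFLoader

# "example.pdf" is a placeholder; extraction_mode="layout" asks pypdf to
# approximate the page's visual layout instead of the default "plain" mode.
loader = PyPDFLoader(
    "example.pdf",
    extraction_mode="layout",
    extraction_kwargs={},  # forwarded as-is to pypdf's PageObject.extract_text()
)
docs = loader.load()
print(docs[0].page_content)
```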
```diff
@@ -60,7 +60,7 @@ psychicapi>=0.8.0,<0.9
 py-trello>=0.19.0,<0.20
 pyjwt>=2.8.0,<3
 pymupdf>=1.22.3,<2
-pypdf>=3.4.0,<4
+pypdf>=3.4.0,<5
 pypdfium2>=4.10.0,<5
 pyspark>=3.4.0,<4
 rank-bm25>=0.2.2,<0.3
```
```diff
@@ -6,6 +6,7 @@ import warnings
 from typing import (
     TYPE_CHECKING,
     Any,
+    Dict,
     Iterable,
     Iterator,
     Mapping,
@@ -27,6 +28,7 @@ if TYPE_CHECKING:
     import pdfplumber.page
     import pypdf._page
     import pypdfium2._helpers.page
+    from pypdf import PageObject
     from textractor.data.text_linearization_config import TextLinearizationConfig
 
 
```
```diff
@@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf`"""
 
     def __init__(
-        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
+        self,
+        password: Optional[Union[str, bytes]] = None,
+        extract_images: bool = False,
+        *,
+        extraction_mode: str = "plain",
+        extraction_kwargs: Optional[Dict[str, Any]] = None,
     ):
         self.password = password
         self.extract_images = extract_images
+        self.extraction_mode = extraction_mode
+        self.extraction_kwargs = extraction_kwargs or {}
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
```
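The parser can also be driven directly on a `Blob`; a minimal sketch with the new constructor arguments, assuming pypdf>=4.0 and a placeholder file path:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PyPDFParser

# extraction_kwargs is handed through to pypdf's PageObject.extract_text();
# leave it empty unless specific pypdf tuning options are needed.
parser = PyPDFParser(extraction_mode="layout", extraction_kwargs={})
blob = Blob.from_path("sample.pdf")  # placeholder path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["page"], len(doc.page_content))
```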
```diff
@@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser):
                 "`pip install pypdf`"
             )
 
+        def _extract_text_from_page(page: "PageObject") -> str:
+            """
+            Extract text from image given the version of pypdf.
+            """
+            if pypdf.__version__.startswith("3"):
+                return page.extract_text()
+            else:
+                return page.extract_text(
+                    extraction_mode=self.extraction_mode, **self.extraction_kwargs
+                )
+
         with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
             pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
+
             yield from [
                 Document(
-                    page_content=page.extract_text()
+                    page_content=_extract_text_from_page(page=page)
                     + self._extract_images_from_page(page),
                     metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                 )
```
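The nested `_extract_text_from_page` helper only branches on the installed pypdf version; with pypdf>=4.0 the forwarded call boils down to plain pypdf usage along these lines (file path is a placeholder):

```python
import pypdf

reader = pypdf.PdfReader("sample.pdf")  # placeholder path
page = reader.pages[0]
# pypdf>=4.0 accepts extraction_mode on PageObject.extract_text();
# "plain" is the default, "layout" approximates the page's visual layout.
text = page.extract_text(extraction_mode="layout")
print(text)
```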
```diff
@@ -171,6 +171,9 @@ class PyPDFLoader(BasePDFLoader):
         password: Optional[Union[str, bytes]] = None,
         headers: Optional[Dict] = None,
         extract_images: bool = False,
+        *,
+        extraction_mode: str = "plain",
+        extraction_kwargs: Optional[Dict] = None,
     ) -> None:
         """Initialize with a file path."""
         try:
@@ -180,7 +183,12 @@ class PyPDFLoader(BasePDFLoader):
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
         super().__init__(file_path, headers=headers)
-        self.parser = PyPDFParser(password=password, extract_images=extract_images)
+        self.parser = PyPDFParser(
+            password=password,
+            extract_images=extract_images,
+            extraction_mode=extraction_mode,
+            extraction_kwargs=extraction_kwargs,
+        )
 
     def lazy_load(
         self,
```
```diff
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 from typing import Sequence, Union
 
@@ -100,6 +101,22 @@ def test_pypdf_loader() -> None:
     assert len(docs) == 16
 
 
+def test_pypdf_loader_with_layout() -> None:
+    """Test PyPDFLoader with layout mode."""
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PyPDFLoader(str(file_path), extraction_mode="layout")
+
+    docs = loader.load()
+    first_page = docs[0].page_content
+
+    expected = (
+        Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
+    ).read_text(encoding="utf-8")
+    cleaned_first_page = re.sub(r"\x00", "", first_page)
+    cleaned_expected = re.sub(r"\x00", "", expected)
+    assert cleaned_first_page == cleaned_expected
+
+
 def test_pypdfium2_loader() -> None:
     """Test PyPDFium2Loader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
```
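Not shown in the diff, but for context: the expected-output fixture used above could in principle be regenerated with a sketch like the following (the `examples` directory and the NUL-stripping step are assumptions mirroring the test):

```python
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader

examples = Path("examples")  # assumed to point at the tests' examples/ directory
docs = PyPDFLoader(
    str(examples / "layout-parser-paper.pdf"), extraction_mode="layout"
).load()
# Strip NUL bytes the same way the test does before persisting the expected text.
(examples / "layout-parser-paper-page-1.txt").write_text(
    docs[0].page_content.replace("\x00", ""), encoding="utf-8"
)
```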
```diff
@@ -0,0 +1,49 @@
+LayoutParser : A Unified Toolkit for Deep
+Learning Based Document Image Analysis
+
+
+Zejiang Shen 1 ( ), Ruochen Zhang 2, Melissa Dell 3, Benjamin Charles Germain
+Lee 4, Jacob Carlson 3, and Weining Li 5
+
+1 Allen Institute for AI
+shannons@allenai.org
+2 Brown University
+ruochen zhang@brown.edu
+3 Harvard University
+{melissadell,jacob carlson }@fas.harvard.edu
+4 University of Washington
+bcgl@cs.washington.edu
+5 University of Waterloo
+w422li@uwaterloo.ca
+
+
+
+Abstract. Recentadvancesindocumentimageanalysis(DIA)havebeen
+primarily driven by the application of neural networks. Ideally, research
+outcomes could be easily deployed in production and extended for further
+investigation. However, various factors like loosely organized codebases
+and sophisticated model configurations complicate the easy reuse of im-
+portant innovations by awide audience. Though there havebeen on-going
+efforts to improve reusability and simplify deep learning (DL) model
+development in disciplines like natural language processing and computer
+vision, none of them are optimized for challenges in the domain of DIA.
+This represents a major gap in the existing toolkit, as DIA is central to
+academic research across a wide range of disciplines in the social sciences
+and humanities. This paper introduces LayoutParser , an open-source
+library for streamlining the usage of DL in DIA research and applica-
+tions. The core LayoutParser library comes with a set of simple and
+intuitive interfaces for applying and customizing DL models for layout de-
+tection,characterrecognition,andmanyotherdocumentprocessingtasks.
+To promote extensibility, LayoutParser also incorporates a community
+platform for sharing both pre-trained models and full document digiti-
+zation pipelines. We demonstrate that LayoutParser is helpful for both
+lightweight and large-scale digitization pipelines in real-word use cases.
+The library is publicly available at https://layout-parser.github.io .
+
+Keywords: DocumentImageAnalysis ·DeepLearning ·LayoutAnalysis
+· Character Recognition · Open Source library · Toolkit.
+
+1 Introduction
+
+Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of
+documentimageanalysis(DIA)tasksincludingdocumentimageclassification[ 11 ,
```
```diff
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
 langchain = "^0.1"
 openai = "<2"
 pymongo = "^4.6.0"
-pypdf = "^3.17.0"
+pypdf = "^4.0.0"
 tiktoken = "^0.5.1"
 langchain-text-splitters = ">=0.0.1,<0.1"
 
```
```diff
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
 langchain = "^0.1"
 pymilvus = ">=2.3.0"
 langchain-nvidia-aiplay = "^0.0.2"
-pypdf = ">=3.1"
+pypdf = ">=4.0.1"
 langchain-text-splitters = ">=0.0.1,<0.1"
 
 [tool.poetry.group.dev.dependencies]
```