From 034a8c7c1b5bfb0b466f1a396eeb0f26acabe5b7 Mon Sep 17 00:00:00 2001 From: Brice Fotzo <44189336+bricefotzo@users.noreply.github.com> Date: Wed, 17 Jul 2024 22:47:09 +0200 Subject: [PATCH] community: support advanced text extraction options for pdf documents (#20265) **Description:** - Updated constructors in PyPDFParser and PyPDFLoader to handle `extraction_mode` and additional kwargs, aligning with the capabilities of `PageObject.extract_text()` from pypdf. - Added `test_pypdf_loader_with_layout` along with a corresponding example text file to validate layout extraction from PDFs. **Issue:** fixes #19735 **Dependencies:** This change requires updating the pypdf dependency from version 3.4.0 to at least 4.0.0. Additional changes include the addition of a new test test_pypdf_loader_with_layout and an example text file to ensure the functionality of layout extraction from PDFs aligns with the new capabilities. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur Co-authored-by: Erick Friis --- libs/community/extended_testing_deps.txt | 2 +- .../document_loaders/parsers/pdf.py | 25 +++++++++- .../document_loaders/pdf.py | 10 +++- .../document_loaders/test_pdf.py | 17 +++++++ .../examples/layout-parser-paper-page-1.txt | 49 +++++++++++++++++++ .../pyproject.toml | 2 +- templates/nvidia-rag-canonical/pyproject.toml | 2 +- 7 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index 6879b6a184d..eca081b0a5b 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -60,7 +60,7 @@ psychicapi>=0.8.0,<0.9 py-trello>=0.19.0,<0.20 pyjwt>=2.8.0,<3 pymupdf>=1.22.3,<2 -pypdf>=3.4.0,<4 +pypdf>=3.4.0,<5 pypdfium2>=4.10.0,<5 pyspark>=3.4.0,<4 rank-bm25>=0.2.2,<0.3 diff --git 
a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index c561cd0fe32..2013c948fb8 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -6,6 +6,7 @@ import warnings from typing import ( TYPE_CHECKING, Any, + Dict, Iterable, Iterator, Mapping, @@ -27,6 +28,7 @@ if TYPE_CHECKING: import pdfplumber.page import pypdf._page import pypdfium2._helpers.page + from pypdf import PageObject from textractor.data.text_linearization_config import TextLinearizationConfig @@ -83,10 +85,17 @@ class PyPDFParser(BaseBlobParser): """Load `PDF` using `pypdf`""" def __init__( - self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False + self, + password: Optional[Union[str, bytes]] = None, + extract_images: bool = False, + *, + extraction_mode: str = "plain", + extraction_kwargs: Optional[Dict[str, Any]] = None, ): self.password = password self.extract_images = extract_images + self.extraction_mode = extraction_mode + self.extraction_kwargs = extraction_kwargs or {} def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" @@ -98,11 +107,23 @@ class PyPDFParser(BaseBlobParser): "`pip install pypdf`" ) + def _extract_text_from_page(page: "PageObject") -> str: + """ + Extract text from page given the version of pypdf. 
+ """ + if pypdf.__version__.startswith("3"): + return page.extract_text() + else: + return page.extract_text( + extraction_mode=self.extraction_mode, **self.extraction_kwargs + ) + with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password) + yield from [ Document( - page_content=page.extract_text() + page_content=_extract_text_from_page(page=page) + self._extract_images_from_page(page), metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined] ) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 17063eed0df..02c416a135d 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -171,6 +171,9 @@ class PyPDFLoader(BasePDFLoader): password: Optional[Union[str, bytes]] = None, headers: Optional[Dict] = None, extract_images: bool = False, + *, + extraction_mode: str = "plain", + extraction_kwargs: Optional[Dict] = None, ) -> None: """Initialize with a file path.""" try: @@ -180,7 +183,12 @@ class PyPDFLoader(BasePDFLoader): "pypdf package not found, please install it with " "`pip install pypdf`" ) super().__init__(file_path, headers=headers) - self.parser = PyPDFParser(password=password, extract_images=extract_images) + self.parser = PyPDFParser( + password=password, + extract_images=extract_images, + extraction_mode=extraction_mode, + extraction_kwargs=extraction_kwargs, + ) def lazy_load( self, diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index f9d71ea200e..462e20d3579 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,3 +1,4 @@ +import re from pathlib import Path from typing import 
Sequence, Union @@ -100,6 +101,22 @@ def test_pypdf_loader() -> None: assert len(docs) == 16 +def test_pypdf_loader_with_layout() -> None: + """Test PyPDFLoader with layout mode.""" + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = PyPDFLoader(str(file_path), extraction_mode="layout") + + docs = loader.load() + first_page = docs[0].page_content + + expected = ( + Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt" + ).read_text(encoding="utf-8") + cleaned_first_page = re.sub(r"\x00", "", first_page) + cleaned_expected = re.sub(r"\x00", "", expected) + assert cleaned_first_page == cleaned_expected + + def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" diff --git a/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt b/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt new file mode 100644 index 00000000000..294fac00a02 --- /dev/null +++ b/libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt @@ -0,0 +1,49 @@ + LayoutParser : A Unified Toolkit for Deep + Learning Based Document Image Analysis + + +Zejiang Shen 1 ( ), Ruochen Zhang 2, Melissa Dell 3, Benjamin Charles Germain + Lee 4, Jacob Carlson 3, and Weining Li 5 + + 1 Allen Institute for AI + shannons@allenai.org + 2 Brown University + ruochen zhang@brown.edu + 3 Harvard University + {melissadell,jacob carlson }@fas.harvard.edu + 4 University of Washington + bcgl@cs.washington.edu + 5 University of Waterloo + w422li@uwaterloo.ca + + + + Abstract. Recentadvancesindocumentimageanalysis(DIA)havebeen + primarily driven by the application of neural networks. Ideally, research + outcomes could be easily deployed in production and extended for further + investigation. 
However, various factors like loosely organized codebases + and sophisticated model configurations complicate the easy reuse of im- + portant innovations by awide audience. Though there havebeen on-going + efforts to improve reusability and simplify deep learning (DL) model + development in disciplines like natural language processing and computer + vision, none of them are optimized for challenges in the domain of DIA. + This represents a major gap in the existing toolkit, as DIA is central to + academic research across a wide range of disciplines in the social sciences + and humanities. This paper introduces LayoutParser , an open-source + library for streamlining the usage of DL in DIA research and applica- + tions. The core LayoutParser library comes with a set of simple and + intuitive interfaces for applying and customizing DL models for layout de- + tection,characterrecognition,andmanyotherdocumentprocessingtasks. + To promote extensibility, LayoutParser also incorporates a community + platform for sharing both pre-trained models and full document digiti- + zation pipelines. We demonstrate that LayoutParser is helpful for both + lightweight and large-scale digitization pipelines in real-word use cases. + The library is publicly available at https://layout-parser.github.io . + + Keywords: DocumentImageAnalysis ·DeepLearning ·LayoutAnalysis + · Character Recognition · Open Source library · Toolkit. 
+ +1 Introduction + +Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of +documentimageanalysis(DIA)tasksincludingdocumentimageclassification[ 11 , \ No newline at end of file diff --git a/templates/mongo-parent-document-retrieval/pyproject.toml b/templates/mongo-parent-document-retrieval/pyproject.toml index f55bf1aefa4..e1403ed540a 100644 --- a/templates/mongo-parent-document-retrieval/pyproject.toml +++ b/templates/mongo-parent-document-retrieval/pyproject.toml @@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0" langchain = "^0.1" openai = "<2" pymongo = "^4.6.0" -pypdf = "^3.17.0" +pypdf = "^4.0.0" tiktoken = "^0.5.1" langchain-text-splitters = ">=0.0.1,<0.1" diff --git a/templates/nvidia-rag-canonical/pyproject.toml b/templates/nvidia-rag-canonical/pyproject.toml index cd0b440fe5f..59ecf00a81f 100644 --- a/templates/nvidia-rag-canonical/pyproject.toml +++ b/templates/nvidia-rag-canonical/pyproject.toml @@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0" langchain = "^0.1" pymilvus = ">=2.3.0" langchain-nvidia-aiplay = "^0.0.2" -pypdf = ">=3.1" +pypdf = ">=4.0.1" langchain-text-splitters = ">=0.0.1,<0.1" [tool.poetry.group.dev.dependencies]