From 67c880af742d316cf76d6112f452b637a6a45db8 Mon Sep 17 00:00:00 2001 From: Luis Antonio Vieira Junior Date: Fri, 8 Mar 2024 17:25:22 -0800 Subject: [PATCH] community[patch]: adding linearization config to AmazonTextractPDFLoader (#17489) - **Description:** Adding an optional parameter `linearization_config` to the `AmazonTextractPDFLoader` so the caller can define how the output will be linearized, instead of forcing a predefined set of linearization configs. It will still have a default configuration as this will be an optional parameter. - **Issue:** #17457 - **Dependencies:** The same ones that already exist for `AmazonTextractPDFLoader` - **Twitter handle:** [@lvieirajr19](https://twitter.com/lvieirajr19) --------- Co-authored-by: Bagatur --- .../document_loaders/amazon_textract.ipynb | 51 ++++++++++++++++--- .../document_loaders/parsers/pdf.py | 24 ++++++--- .../document_loaders/pdf.py | 27 ++++++++-- 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/docs/docs/integrations/document_loaders/amazon_textract.ipynb b/docs/docs/integrations/document_loaders/amazon_textract.ipynb index 968c25b758a..67f0a5d49f5 100644 --- a/docs/docs/integrations/document_loaders/amazon_textract.ipynb +++ b/docs/docs/integrations/document_loaders/amazon_textract.ipynb @@ -206,6 +206,42 @@ "len(documents)" ] }, + { + "cell_type": "markdown", + "id": "a56ba97505c8d140", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## Sample 4\n", + "\n", + "You have the option to pass an additional parameter called `linearization_config` to the AmazonTextractPDFLoader which will determine how the text output will be linearized by the parser after Textract runs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1efbc4b6-f3cb-45c5-bbe8-16e7df060b92", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import AmazonTextractPDFLoader\n", + "from textractor.data.text_linearization_config import TextLinearizationConfig\n", + "\n", + "loader = AmazonTextractPDFLoader(\n", + " \"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf\",\n", + " linearization_config=TextLinearizationConfig(\n", + " hide_header_layout=True,\n", + " hide_footer_layout=True,\n", + " hide_figure_layout=True,\n", + " ),\n", + ")\n", + "documents = loader.load()" + ] + }, { "cell_type": "markdown", "id": "b3e41b4d-b159-4274-89be-80d8159134ef", @@ -276,11 +312,14 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "1a09d18b-ab7b-468e-ae66-f92abf666b9b", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "bd97f1c90aff6a83", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [] } ], @@ -876,7 +915,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 44349f57600..629c1d8f57e 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: import pdfplumber.page import pypdf._page import pypdfium2._helpers.page + from textractor.data.text_linearization_config import TextLinearizationConfig _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] @@ -454,6 +455,8 @@ class AmazonTextractPDFParser(BaseBlobParser): self, textract_features: Optional[Sequence[int]] = None, client: Optional[Any] = None, + *, + linearization_config: 
Optional["TextLinearizationConfig"] = None, ) -> None: """Initializes the parser. @@ -462,6 +465,9 @@ class AmazonTextractPDFParser(BaseBlobParser): should be passed as an int that conforms to the enum `Textract_Features`, see `amazon-textract-caller` pkg client: boto3 textract client + linearization_config: Config to be used for linearization of the output + should be an instance of TextLinearizationConfig from + the `textractor` pkg """ try: @@ -477,6 +483,16 @@ class AmazonTextractPDFParser(BaseBlobParser): ] else: self.textract_features = [] + + if linearization_config is not None: + self.linearization_config = linearization_config + else: + self.linearization_config = self.textractor.TextLinearizationConfig( + hide_figure_layout=True, + title_prefix="# ", + section_header_prefix="## ", + list_element_prefix="*", + ) except ImportError: raise ImportError( "Could not import amazon-textract-caller or " @@ -527,15 +543,9 @@ class AmazonTextractPDFParser(BaseBlobParser): document = self.textractor.Document.open(textract_response_json) - linearizer_config = self.textractor.TextLinearizationConfig( - hide_figure_layout=True, - title_prefix="# ", - section_header_prefix="## ", - list_element_prefix="*", - ) for idx, page in enumerate(document.pages): yield Document( - page_content=page.get_text(config=linearizer_config), + page_content=page.get_text(config=self.linearization_config), metadata={"source": blob.source, "page": idx + 1}, ) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 25abfe80e2d..ddfe283b6c9 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -6,7 +6,17 @@ import time from abc import ABC from io import StringIO from pathlib import Path -from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + 
Iterator, + List, + Mapping, + Optional, + Sequence, + Union, +) from urllib.parse import urlparse import requests @@ -26,6 +36,9 @@ from langchain_community.document_loaders.parsers.pdf import ( ) from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +if TYPE_CHECKING: + from textractor.data.text_linearization_config import TextLinearizationConfig + logger = logging.getLogger(__file__) @@ -586,6 +599,8 @@ class AmazonTextractPDFLoader(BasePDFLoader): region_name: Optional[str] = None, endpoint_url: Optional[str] = None, headers: Optional[Dict] = None, + *, + linearization_config: Optional["TextLinearizationConfig"] = None, ) -> None: """Initialize the loader. @@ -598,7 +613,9 @@ class AmazonTextractPDFLoader(BasePDFLoader): credentials_profile_name: AWS profile name, if not default (Optional) region_name: AWS region, eg us-east-1 (Optional) endpoint_url: endpoint url for the textract service (Optional) - + linearization_config: Config to be used for linearization of the output + should be an instance of TextLinearizationConfig from + the `textractor` pkg """ super().__init__(file_path, headers=headers) @@ -643,7 +660,11 @@ class AmazonTextractPDFLoader(BasePDFLoader): "Please check that credentials in the specified " "profile name are valid." ) from e - self.parser = AmazonTextractPDFParser(textract_features=features, client=client) + self.parser = AmazonTextractPDFParser( + textract_features=features, + client=client, + linearization_config=linearization_config, + ) def load(self) -> List[Document]: """Load given path as pages."""