community[patch]: adding linearization config to AmazonTextractPDFLoader (#17489)

- **Description:** Adding an optional parameter `linearization_config`
to the `AmazonTextractPDFLoader` so the caller can define how the output
will be linearized, instead of forcing a predefined set of linearization
configs. It will still have a default configuration as this will be an
optional parameter.
- **Issue:** #17457
- **Dependencies:** The same ones that already exist for
`AmazonTextractPDFLoader`
- **Twitter handle:** [@lvieirajr19](https://twitter.com/lvieirajr19)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Luis Antonio Vieira Junior 2024-03-08 17:25:22 -08:00 committed by GitHub
parent 37e89ba5b1
commit 67c880af74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 86 additions and 16 deletions

View File

@ -206,6 +206,42 @@
"len(documents)" "len(documents)"
] ]
}, },
{
"cell_type": "markdown",
"id": "a56ba97505c8d140",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Sample 4\n",
"\n",
"You have the option to pass an additional parameter called `linearization_config` to the AmazonTextractPDFLoader, which will determine how the text output will be linearized by the parser after Textract runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1efbc4b6-f3cb-45c5-bbe8-16e7df060b92",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AmazonTextractPDFLoader\n",
"from textractor.data.text_linearization_config import TextLinearizationConfig\n",
"\n",
"loader = AmazonTextractPDFLoader(\n",
" \"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf\",\n",
" linearization_config=TextLinearizationConfig(\n",
" hide_header_layout=True,\n",
" hide_footer_layout=True,\n",
" hide_figure_layout=True,\n",
" ),\n",
")\n",
"documents = loader.load()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "b3e41b4d-b159-4274-89be-80d8159134ef", "id": "b3e41b4d-b159-4274-89be-80d8159134ef",
@ -276,11 +312,14 @@
] ]
}, },
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": null, "id": "bd97f1c90aff6a83",
"id": "1a09d18b-ab7b-468e-ae66-f92abf666b9b", "metadata": {
"metadata": {}, "collapsed": false,
"outputs": [], "jupyter": {
"outputs_hidden": false
}
},
"source": [] "source": []
} }
], ],
@ -876,7 +915,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.12" "version": "3.10.13"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -26,6 +26,7 @@ if TYPE_CHECKING:
import pdfplumber.page import pdfplumber.page
import pypdf._page import pypdf._page
import pypdfium2._helpers.page import pypdfium2._helpers.page
from textractor.data.text_linearization_config import TextLinearizationConfig
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
@ -454,6 +455,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
self, self,
textract_features: Optional[Sequence[int]] = None, textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None, client: Optional[Any] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
) -> None: ) -> None:
"""Initializes the parser. """Initializes the parser.
@ -462,6 +465,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
should be passed as an int that conforms to the enum should be passed as an int that conforms to the enum
`Textract_Features`, see `amazon-textract-caller` pkg `Textract_Features`, see `amazon-textract-caller` pkg
client: boto3 textract client client: boto3 textract client
linearization_config: Config to be used for linearization of the output
should be an instance of TextLinearizationConfig from
the `textractor` pkg
""" """
try: try:
@ -477,6 +483,16 @@ class AmazonTextractPDFParser(BaseBlobParser):
] ]
else: else:
self.textract_features = [] self.textract_features = []
if linearization_config is not None:
self.linearization_config = linearization_config
else:
self.linearization_config = self.textractor.TextLinearizationConfig(
hide_figure_layout=True,
title_prefix="# ",
section_header_prefix="## ",
list_element_prefix="*",
)
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"Could not import amazon-textract-caller or " "Could not import amazon-textract-caller or "
@ -527,15 +543,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
document = self.textractor.Document.open(textract_response_json) document = self.textractor.Document.open(textract_response_json)
linearizer_config = self.textractor.TextLinearizationConfig(
hide_figure_layout=True,
title_prefix="# ",
section_header_prefix="## ",
list_element_prefix="*",
)
for idx, page in enumerate(document.pages): for idx, page in enumerate(document.pages):
yield Document( yield Document(
page_content=page.get_text(config=linearizer_config), page_content=page.get_text(config=self.linearization_config),
metadata={"source": blob.source, "page": idx + 1}, metadata={"source": blob.source, "page": idx + 1},
) )

View File

@ -6,7 +6,17 @@ import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
List,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -26,6 +36,9 @@ from langchain_community.document_loaders.parsers.pdf import (
) )
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
if TYPE_CHECKING:
from textractor.data.text_linearization_config import TextLinearizationConfig
logger = logging.getLogger(__file__) logger = logging.getLogger(__file__)
@ -586,6 +599,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
region_name: Optional[str] = None, region_name: Optional[str] = None,
endpoint_url: Optional[str] = None, endpoint_url: Optional[str] = None,
headers: Optional[Dict] = None, headers: Optional[Dict] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
) -> None: ) -> None:
"""Initialize the loader. """Initialize the loader.
@ -598,7 +613,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
credentials_profile_name: AWS profile name, if not default (Optional) credentials_profile_name: AWS profile name, if not default (Optional)
region_name: AWS region, eg us-east-1 (Optional) region_name: AWS region, eg us-east-1 (Optional)
endpoint_url: endpoint url for the textract service (Optional) endpoint_url: endpoint url for the textract service (Optional)
linearization_config: Config to be used for linearization of the output
should be an instance of TextLinearizationConfig from
the `textractor` pkg
""" """
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
@ -643,7 +660,11 @@ class AmazonTextractPDFLoader(BasePDFLoader):
"Please check that credentials in the specified " "Please check that credentials in the specified "
"profile name are valid." "profile name are valid."
) from e ) from e
self.parser = AmazonTextractPDFParser(textract_features=features, client=client) self.parser = AmazonTextractPDFParser(
textract_features=features,
client=client,
linearization_config=linearization_config,
)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load given path as pages.""" """Load given path as pages."""