community[patch]: adding linearization config to AmazonTextractPDFLoader (#17489)

- **Description:** Adding an optional parameter `linearization_config`
to the `AmazonTextractPDFLoader` so the caller can define how the output
will be linearized, instead of forcing a predefined set of linearization
configs. It will still have a default configuration as this will be an
optional parameter.
- **Issue:** #17457
- **Dependencies:** The same ones that already exist for
`AmazonTextractPDFLoader`
- **Twitter handle:** [@lvieirajr19](https://twitter.com/lvieirajr19)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Luis Antonio Vieira Junior 2024-03-08 17:25:22 -08:00 committed by GitHub
parent 37e89ba5b1
commit 67c880af74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 86 additions and 16 deletions

View File

@ -206,6 +206,42 @@
"len(documents)"
]
},
{
"cell_type": "markdown",
"id": "a56ba97505c8d140",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Sample 4\n",
"\n",
"You have the option to pass an additional parameter called `linearization_config` to the AmazonTextractPDFLoader which will determine how the text output will be linearized by the parser after Textract runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1efbc4b6-f3cb-45c5-bbe8-16e7df060b92",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AmazonTextractPDFLoader\n",
"from textractor.data.text_linearization_config import TextLinearizationConfig\n",
"\n",
"loader = AmazonTextractPDFLoader(\n",
" \"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf\",\n",
" linearization_config=TextLinearizationConfig(\n",
" hide_header_layout=True,\n",
" hide_footer_layout=True,\n",
" hide_figure_layout=True,\n",
" ),\n",
")\n",
"documents = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "b3e41b4d-b159-4274-89be-80d8159134ef",
@ -276,11 +312,14 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a09d18b-ab7b-468e-ae66-f92abf666b9b",
"metadata": {},
"outputs": [],
"cell_type": "markdown",
"id": "bd97f1c90aff6a83",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": []
}
],
@ -876,7 +915,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.13"
}
},
"nbformat": 4,

View File

@ -26,6 +26,7 @@ if TYPE_CHECKING:
import pdfplumber.page
import pypdf._page
import pypdfium2._helpers.page
from textractor.data.text_linearization_config import TextLinearizationConfig
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
@ -454,6 +455,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
self,
textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
) -> None:
"""Initializes the parser.
@ -462,6 +465,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
should be passed as an int that conforms to the enum
`Textract_Features`, see `amazon-textract-caller` pkg
client: boto3 textract client
linearization_config: Config to be used for linearization of the output.
Should be an instance of TextLinearizationConfig from
the `textractor` pkg.
"""
try:
@ -477,6 +483,16 @@ class AmazonTextractPDFParser(BaseBlobParser):
]
else:
self.textract_features = []
if linearization_config is not None:
self.linearization_config = linearization_config
else:
self.linearization_config = self.textractor.TextLinearizationConfig(
hide_figure_layout=True,
title_prefix="# ",
section_header_prefix="## ",
list_element_prefix="*",
)
except ImportError:
raise ImportError(
"Could not import amazon-textract-caller or "
@ -527,15 +543,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
document = self.textractor.Document.open(textract_response_json)
linearizer_config = self.textractor.TextLinearizationConfig(
hide_figure_layout=True,
title_prefix="# ",
section_header_prefix="## ",
list_element_prefix="*",
)
for idx, page in enumerate(document.pages):
yield Document(
page_content=page.get_text(config=linearizer_config),
page_content=page.get_text(config=self.linearization_config),
metadata={"source": blob.source, "page": idx + 1},
)

View File

@ -6,7 +6,17 @@ import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
List,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse
import requests
@ -26,6 +36,9 @@ from langchain_community.document_loaders.parsers.pdf import (
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
if TYPE_CHECKING:
from textractor.data.text_linearization_config import TextLinearizationConfig
logger = logging.getLogger(__file__)
@ -586,6 +599,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
region_name: Optional[str] = None,
endpoint_url: Optional[str] = None,
headers: Optional[Dict] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
) -> None:
"""Initialize the loader.
@ -598,7 +613,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
credentials_profile_name: AWS profile name, if not default (Optional)
region_name: AWS region, eg us-east-1 (Optional)
endpoint_url: endpoint url for the textract service (Optional)
linearization_config: Config to be used for linearization of the output.
Should be an instance of TextLinearizationConfig from
the `textractor` pkg.
"""
super().__init__(file_path, headers=headers)
@ -643,7 +660,11 @@ class AmazonTextractPDFLoader(BasePDFLoader):
"Please check that credentials in the specified "
"profile name are valid."
) from e
self.parser = AmazonTextractPDFParser(textract_features=features, client=client)
self.parser = AmazonTextractPDFParser(
textract_features=features,
client=client,
linearization_config=linearization_config,
)
def load(self) -> List[Document]:
"""Load given path as pages."""