mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 00:48:45 +00:00
community[patch]: adding linearization config to AmazonTextractPDFLoader (#17489)
- **Description:** Adding an optional parameter `linearization_config` to the `AmazonTextractPDFLoader` so the caller can define how the output will be linearized, instead of forcing a predefined set of linearization configs. It will still have a default configuration as this will be an optional parameter. - **Issue:** #17457 - **Dependencies:** The same ones that already exist for `AmazonTextractPDFLoader` - **Twitter handle:** [@lvieirajr19](https://twitter.com/lvieirajr19) --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
37e89ba5b1
commit
67c880af74
@ -206,6 +206,42 @@
|
|||||||
"len(documents)"
|
"len(documents)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a56ba97505c8d140",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## Sample 4\n",
|
||||||
|
"\n",
|
||||||
|
"You have the option to pass an additional parameter called `linearization_config` to the AmazonTextractPDFLoader which will determine how the text output will be linearized by the parser after Textract runs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1efbc4b6-f3cb-45c5-bbe8-16e7df060b92",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_community.document_loaders import AmazonTextractPDFLoader\n",
|
||||||
|
"from textractor.data.text_linearization_config import TextLinearizationConfig\n",
|
||||||
|
"\n",
|
||||||
|
"loader = AmazonTextractPDFLoader(\n",
|
||||||
|
" \"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf\",\n",
|
||||||
|
" linearization_config=TextLinearizationConfig(\n",
|
||||||
|
" hide_header_layout=True,\n",
|
||||||
|
" hide_footer_layout=True,\n",
|
||||||
|
" hide_figure_layout=True,\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"documents = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b3e41b4d-b159-4274-89be-80d8159134ef",
|
"id": "b3e41b4d-b159-4274-89be-80d8159134ef",
|
||||||
@ -276,11 +312,14 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": null,
|
"id": "bd97f1c90aff6a83",
|
||||||
"id": "1a09d18b-ab7b-468e-ae66-f92abf666b9b",
|
"metadata": {
|
||||||
"metadata": {},
|
"collapsed": false,
|
||||||
"outputs": [],
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
|
},
|
||||||
"source": []
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -876,7 +915,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.12"
|
"version": "3.10.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -26,6 +26,7 @@ if TYPE_CHECKING:
|
|||||||
import pdfplumber.page
|
import pdfplumber.page
|
||||||
import pypdf._page
|
import pypdf._page
|
||||||
import pypdfium2._helpers.page
|
import pypdfium2._helpers.page
|
||||||
|
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||||
|
|
||||||
|
|
||||||
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
|
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
|
||||||
@ -454,6 +455,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
self,
|
self,
|
||||||
textract_features: Optional[Sequence[int]] = None,
|
textract_features: Optional[Sequence[int]] = None,
|
||||||
client: Optional[Any] = None,
|
client: Optional[Any] = None,
|
||||||
|
*,
|
||||||
|
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initializes the parser.
|
"""Initializes the parser.
|
||||||
|
|
||||||
@ -462,6 +465,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
should be passed as an int that conforms to the enum
|
should be passed as an int that conforms to the enum
|
||||||
`Textract_Features`, see `amazon-textract-caller` pkg
|
`Textract_Features`, see `amazon-textract-caller` pkg
|
||||||
client: boto3 textract client
|
client: boto3 textract client
|
||||||
|
linearization_config: Config to be used for linearization of the output
|
||||||
|
should be an instance of TextLinearizationConfig from
|
||||||
|
the `textractor` pkg
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -477,6 +483,16 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
self.textract_features = []
|
self.textract_features = []
|
||||||
|
|
||||||
|
if linearization_config is not None:
|
||||||
|
self.linearization_config = linearization_config
|
||||||
|
else:
|
||||||
|
self.linearization_config = self.textractor.TextLinearizationConfig(
|
||||||
|
hide_figure_layout=True,
|
||||||
|
title_prefix="# ",
|
||||||
|
section_header_prefix="## ",
|
||||||
|
list_element_prefix="*",
|
||||||
|
)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Could not import amazon-textract-caller or "
|
"Could not import amazon-textract-caller or "
|
||||||
@ -527,15 +543,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
|
|
||||||
document = self.textractor.Document.open(textract_response_json)
|
document = self.textractor.Document.open(textract_response_json)
|
||||||
|
|
||||||
linearizer_config = self.textractor.TextLinearizationConfig(
|
|
||||||
hide_figure_layout=True,
|
|
||||||
title_prefix="# ",
|
|
||||||
section_header_prefix="## ",
|
|
||||||
list_element_prefix="*",
|
|
||||||
)
|
|
||||||
for idx, page in enumerate(document.pages):
|
for idx, page in enumerate(document.pages):
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=page.get_text(config=linearizer_config),
|
page_content=page.get_text(config=self.linearization_config),
|
||||||
metadata={"source": blob.source, "page": idx + 1},
|
metadata={"source": blob.source, "page": idx + 1},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -6,7 +6,17 @@ import time
|
|||||||
from abc import ABC
|
from abc import ABC
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
Any,
|
||||||
|
Dict,
|
||||||
|
Iterator,
|
||||||
|
List,
|
||||||
|
Mapping,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Union,
|
||||||
|
)
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -26,6 +36,9 @@ from langchain_community.document_loaders.parsers.pdf import (
|
|||||||
)
|
)
|
||||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||||
|
|
||||||
logger = logging.getLogger(__file__)
|
logger = logging.getLogger(__file__)
|
||||||
|
|
||||||
|
|
||||||
@ -586,6 +599,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
|||||||
region_name: Optional[str] = None,
|
region_name: Optional[str] = None,
|
||||||
endpoint_url: Optional[str] = None,
|
endpoint_url: Optional[str] = None,
|
||||||
headers: Optional[Dict] = None,
|
headers: Optional[Dict] = None,
|
||||||
|
*,
|
||||||
|
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the loader.
|
"""Initialize the loader.
|
||||||
|
|
||||||
@ -598,7 +613,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
|||||||
credentials_profile_name: AWS profile name, if not default (Optional)
|
credentials_profile_name: AWS profile name, if not default (Optional)
|
||||||
region_name: AWS region, eg us-east-1 (Optional)
|
region_name: AWS region, eg us-east-1 (Optional)
|
||||||
endpoint_url: endpoint url for the textract service (Optional)
|
endpoint_url: endpoint url for the textract service (Optional)
|
||||||
|
linearization_config: Config to be used for linearization of the output
|
||||||
|
should be an instance of TextLinearizationConfig from
|
||||||
|
the `textractor` pkg
|
||||||
"""
|
"""
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
|
|
||||||
@ -643,7 +660,11 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
|||||||
"Please check that credentials in the specified "
|
"Please check that credentials in the specified "
|
||||||
"profile name are valid."
|
"profile name are valid."
|
||||||
) from e
|
) from e
|
||||||
self.parser = AmazonTextractPDFParser(textract_features=features, client=client)
|
self.parser = AmazonTextractPDFParser(
|
||||||
|
textract_features=features,
|
||||||
|
client=client,
|
||||||
|
linearization_config=linearization_config,
|
||||||
|
)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load given path as pages."""
|
"""Load given path as pages."""
|
||||||
|
Loading…
Reference in New Issue
Block a user