mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-17 07:26:16 +00:00
community[patch]: adding linearization config to AmazonTextractPDFLoader (#17489)
- **Description:** Adds an optional `linearization_config` parameter to `AmazonTextractPDFLoader` so the caller can define how the output is linearized, instead of being forced to use a predefined set of linearization configs. Because the parameter is optional, the existing default configuration is still used when it is omitted.
- **Issue:** #17457
- **Dependencies:** The same ones that already exist for `AmazonTextractPDFLoader`
- **Twitter handle:** [@lvieirajr19](https://twitter.com/lvieirajr19)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
committed by
GitHub
parent
37e89ba5b1
commit
67c880af74
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
|
||||
import pdfplumber.page
|
||||
import pypdf._page
|
||||
import pypdfium2._helpers.page
|
||||
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||
|
||||
|
||||
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
|
||||
@@ -454,6 +455,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
self,
|
||||
textract_features: Optional[Sequence[int]] = None,
|
||||
client: Optional[Any] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
) -> None:
|
||||
"""Initializes the parser.
|
||||
|
||||
@@ -462,6 +465,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
should be passed as an int that conforms to the enum
|
||||
`Textract_Features`, see `amazon-textract-caller` pkg
|
||||
client: boto3 textract client
|
||||
linearization_config: Config to be used for linearization of the output
|
||||
should be an instance of TextLinearizationConfig from
|
||||
the `textractor` pkg
|
||||
"""
|
||||
|
||||
try:
|
||||
@@ -477,6 +483,16 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
]
|
||||
else:
|
||||
self.textract_features = []
|
||||
|
||||
if linearization_config is not None:
|
||||
self.linearization_config = linearization_config
|
||||
else:
|
||||
self.linearization_config = self.textractor.TextLinearizationConfig(
|
||||
hide_figure_layout=True,
|
||||
title_prefix="# ",
|
||||
section_header_prefix="## ",
|
||||
list_element_prefix="*",
|
||||
)
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import amazon-textract-caller or "
|
||||
@@ -527,15 +543,9 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
|
||||
document = self.textractor.Document.open(textract_response_json)
|
||||
|
||||
linearizer_config = self.textractor.TextLinearizationConfig(
|
||||
hide_figure_layout=True,
|
||||
title_prefix="# ",
|
||||
section_header_prefix="## ",
|
||||
list_element_prefix="*",
|
||||
)
|
||||
for idx, page in enumerate(document.pages):
|
||||
yield Document(
|
||||
page_content=page.get_text(config=linearizer_config),
|
||||
page_content=page.get_text(config=self.linearization_config),
|
||||
metadata={"source": blob.source, "page": idx + 1},
|
||||
)
|
||||
|
||||
|
@@ -6,7 +6,17 @@ import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@@ -26,6 +36,9 @@ from langchain_community.document_loaders.parsers.pdf import (
|
||||
)
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
@@ -586,6 +599,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
region_name: Optional[str] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
) -> None:
|
||||
"""Initialize the loader.
|
||||
|
||||
@@ -598,7 +613,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
credentials_profile_name: AWS profile name, if not default (Optional)
|
||||
region_name: AWS region, eg us-east-1 (Optional)
|
||||
endpoint_url: endpoint url for the textract service (Optional)
|
||||
|
||||
linearization_config: Config to be used for linearization of the output
|
||||
should be an instance of TextLinearizationConfig from
|
||||
the `textractor` pkg
|
||||
"""
|
||||
super().__init__(file_path, headers=headers)
|
||||
|
||||
@@ -643,7 +660,11 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
"Please check that credentials in the specified "
|
||||
"profile name are valid."
|
||||
) from e
|
||||
self.parser = AmazonTextractPDFParser(textract_features=features, client=client)
|
||||
self.parser = AmazonTextractPDFParser(
|
||||
textract_features=features,
|
||||
client=client,
|
||||
linearization_config=linearization_config,
|
||||
)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load given path as pages."""
|
||||
|
Reference in New Issue
Block a user