mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-08 06:00:41 +00:00
IMPROVEMENT: Reduce post-processing time for DocAIParser
(#13210)
- Remove `WrappedDocument`, introduced in https://github.com/langchain-ai/langchain/pull/11413.
- Related issue: https://github.com/googleapis/python-documentai-toolbox/issues/198 in Document AI Toolbox, to improve initialization time for the `WrappedDocument` object.
@lkuligin @baskaryan @hwchase17
This commit is contained in:
parent
f3fcdea574
commit
1c08dbfb33
@ -66,10 +66,10 @@ class DocAIParser(BaseBlobParser):
|
||||
"a client."
|
||||
)
|
||||
|
||||
pattern = "projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
|
||||
pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
|
||||
if processor_name and not re.fullmatch(pattern, processor_name):
|
||||
raise ValueError(
|
||||
f"Processor name {processor_name} has a wrong format. If your "
|
||||
f"Processor name {processor_name} has the wrong format. If your "
|
||||
"prediction endpoint looks like https://us-documentai.googleapis.com"
|
||||
"/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process,"
|
||||
" use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID "
|
||||
@ -139,9 +139,7 @@ class DocAIParser(BaseBlobParser):
|
||||
" `pip install google-cloud-documentai`"
|
||||
) from exc
|
||||
try:
|
||||
from google.cloud.documentai_toolbox.wrappers.document import (
|
||||
Document as WrappedDocument,
|
||||
)
|
||||
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
|
||||
except ImportError as exc:
|
||||
raise ImportError(
|
||||
"documentai_toolbox package not found, please install it with"
|
||||
@ -171,16 +169,15 @@ class DocAIParser(BaseBlobParser):
|
||||
field_mask=field_mask,
|
||||
)
|
||||
)
|
||||
wrapped_document = WrappedDocument.from_documentai_document(response.document)
|
||||
yield from (
|
||||
Document(
|
||||
page_content=page.text,
|
||||
page_content=_text_from_layout(page.layout, response.document.text),
|
||||
metadata={
|
||||
"page": page.page_number,
|
||||
"source": wrapped_document.gcs_input_uri,
|
||||
"source": blob.path,
|
||||
},
|
||||
)
|
||||
for page in wrapped_document.pages
|
||||
for page in response.document.pages
|
||||
)
|
||||
|
||||
def batch_parse(
|
||||
@ -239,9 +236,8 @@ class DocAIParser(BaseBlobParser):
|
||||
from google.cloud.documentai_toolbox.utilities.gcs_utilities import (
|
||||
split_gcs_uri,
|
||||
)
|
||||
from google.cloud.documentai_toolbox.wrappers.document import (
|
||||
Document as WrappedDocument,
|
||||
)
|
||||
from google.cloud.documentai_toolbox.wrappers.document import _get_shards
|
||||
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
|
||||
except ImportError as exc:
|
||||
raise ImportError(
|
||||
"documentai_toolbox package not found, please install it with"
|
||||
@ -249,18 +245,14 @@ class DocAIParser(BaseBlobParser):
|
||||
) from exc
|
||||
for result in results:
|
||||
gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path)
|
||||
wrapped_document = WrappedDocument.from_gcs(
|
||||
gcs_bucket_name, gcs_prefix, gcs_input_uri=result.source_path
|
||||
)
|
||||
shards = _get_shards(gcs_bucket_name, gcs_prefix)
|
||||
yield from (
|
||||
Document(
|
||||
page_content=page.text,
|
||||
metadata={
|
||||
"page": page.page_number,
|
||||
"source": wrapped_document.gcs_input_uri,
|
||||
},
|
||||
page_content=_text_from_layout(page.layout, shard.text),
|
||||
metadata={"page": page.page_number, "source": result.source_path},
|
||||
)
|
||||
for page in wrapped_document.pages
|
||||
for shard in shards
|
||||
for page in shard.pages
|
||||
)
|
||||
|
||||
def operations_from_names(self, operation_names: List[str]) -> List["Operation"]:
|
||||
|
Loading…
Reference in New Issue
Block a user