IMPROVEMENT: Reduce post-processing time for DocAIParser (#13210)

- Remove `WrappedDocument` introduced in
https://github.com/langchain-ai/langchain/pull/11413
- Filed https://github.com/googleapis/python-documentai-toolbox/issues/198 in
Document AI Toolbox to improve the initialization time of the
`WrappedDocument` object.

@lkuligin

@baskaryan

@hwchase17
This commit is contained in:
Holt Skinner 2023-11-20 11:41:44 -06:00 committed by GitHub
parent f3fcdea574
commit 1c08dbfb33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -66,10 +66,10 @@ class DocAIParser(BaseBlobParser):
"a client." "a client."
) )
pattern = "projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+" pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
if processor_name and not re.fullmatch(pattern, processor_name): if processor_name and not re.fullmatch(pattern, processor_name):
raise ValueError( raise ValueError(
f"Processor name {processor_name} has a wrong format. If your " f"Processor name {processor_name} has the wrong format. If your "
"prediction endpoint looks like https://us-documentai.googleapis.com" "prediction endpoint looks like https://us-documentai.googleapis.com"
"/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process," "/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process,"
" use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID " " use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID "
@ -139,9 +139,7 @@ class DocAIParser(BaseBlobParser):
" `pip install google-cloud-documentai`" " `pip install google-cloud-documentai`"
) from exc ) from exc
try: try:
from google.cloud.documentai_toolbox.wrappers.document import ( from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
Document as WrappedDocument,
)
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"documentai_toolbox package not found, please install it with" "documentai_toolbox package not found, please install it with"
@ -171,16 +169,15 @@ class DocAIParser(BaseBlobParser):
field_mask=field_mask, field_mask=field_mask,
) )
) )
wrapped_document = WrappedDocument.from_documentai_document(response.document)
yield from ( yield from (
Document( Document(
page_content=page.text, page_content=_text_from_layout(page.layout, response.document.text),
metadata={ metadata={
"page": page.page_number, "page": page.page_number,
"source": wrapped_document.gcs_input_uri, "source": blob.path,
}, },
) )
for page in wrapped_document.pages for page in response.document.pages
) )
def batch_parse( def batch_parse(
@ -239,9 +236,8 @@ class DocAIParser(BaseBlobParser):
from google.cloud.documentai_toolbox.utilities.gcs_utilities import ( from google.cloud.documentai_toolbox.utilities.gcs_utilities import (
split_gcs_uri, split_gcs_uri,
) )
from google.cloud.documentai_toolbox.wrappers.document import ( from google.cloud.documentai_toolbox.wrappers.document import _get_shards
Document as WrappedDocument, from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
)
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"documentai_toolbox package not found, please install it with" "documentai_toolbox package not found, please install it with"
@ -249,18 +245,14 @@ class DocAIParser(BaseBlobParser):
) from exc ) from exc
for result in results: for result in results:
gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path) gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path)
wrapped_document = WrappedDocument.from_gcs( shards = _get_shards(gcs_bucket_name, gcs_prefix)
gcs_bucket_name, gcs_prefix, gcs_input_uri=result.source_path
)
yield from ( yield from (
Document( Document(
page_content=page.text, page_content=_text_from_layout(page.layout, shard.text),
metadata={ metadata={"page": page.page_number, "source": result.source_path},
"page": page.page_number,
"source": wrapped_document.gcs_input_uri,
},
) )
for page in wrapped_document.pages for shard in shards
for page in shard.pages
) )
def operations_from_names(self, operation_names: List[str]) -> List["Operation"]: def operations_from_names(self, operation_names: List[str]) -> List["Operation"]: