IMPROVEMENT: Reduce post-processing time for DocAIParser (#13210)

- Remove the `WrappedDocument` wrapper introduced in https://github.com/langchain-ai/langchain/pull/11413; https://github.com/googleapis/python-documentai-toolbox/issues/198 tracks improving initialization time for the `WrappedDocument` object in Document AI Toolbox. @lkuligin @baskaryan @hwchase17
parent f3fcdea574
commit 1c08dbfb33
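In short: every parse previously wrapped the raw Document AI response in the toolbox's `WrappedDocument`, whose initialization is slow; this commit reads page text directly off the response instead. A minimal before/after sketch, assuming `response` is a Document AI process response as in the diff below:

```python
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout

# Before: eager wrapping of the whole response, slow to initialize
# (see googleapis/python-documentai-toolbox#198).
#   wrapped_document = WrappedDocument.from_documentai_document(response.document)
#   texts = [page.text for page in wrapped_document.pages]

# After: extract each page's text straight from its layout.
texts = [
    _text_from_layout(page.layout, response.document.text)
    for page in response.document.pages
]
```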
@@ -66,10 +66,10 @@ class DocAIParser(BaseBlobParser):
                 "a client."
             )

-        pattern = "projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
+        pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
         if processor_name and not re.fullmatch(pattern, processor_name):
             raise ValueError(
-                f"Processor name {processor_name} has a wrong format. If your "
+                f"Processor name {processor_name} has the wrong format. If your "
                 "prediction endpoint looks like https://us-documentai.googleapis.com"
                 "/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process,"
                 " use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID "
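The only functional change in this hunk is the `r` prefix on the pattern: in a non-raw string, sequences like `\/` are invalid escapes that newer Python versions warn about. A quick sketch of the validation, using a hypothetical processor name:

```python
import re

# Raw string keeps the backslash escapes intact for the regex engine.
pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"

# Hypothetical processor name in the accepted format: matches.
name = "projects/123456789/locations/us/processors/abc123def456"
assert re.fullmatch(pattern, name) is not None

# A full prediction endpoint URL does not match and would raise ValueError.
url = f"https://us-documentai.googleapis.com/v1/{name}:process"
assert re.fullmatch(pattern, url) is None
```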
@@ -139,9 +139,7 @@ class DocAIParser(BaseBlobParser):
                 " `pip install google-cloud-documentai`"
             ) from exc
         try:
-            from google.cloud.documentai_toolbox.wrappers.document import (
-                Document as WrappedDocument,
-            )
+            from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
         except ImportError as exc:
             raise ImportError(
                 "documentai_toolbox package not found, please install it with"
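The import stays inside a `try`/`except ImportError` so `documentai_toolbox` remains an optional dependency, only required once parsing actually runs. A generic sketch of that guarded-import pattern (the pip package name is my assumption; it is not shown in the diff):

```python
def _load_text_helper():
    # Lazy import: the dependency is only needed at parse time.
    try:
        from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
    except ImportError as exc:
        raise ImportError(
            "documentai_toolbox package not found, please install it with"
            " `pip install google-cloud-documentai-toolbox`"  # assumed package name
        ) from exc
    return _text_from_layout
```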
@@ -171,16 +169,15 @@ class DocAIParser(BaseBlobParser):
                     field_mask=field_mask,
                 )
             )
-        wrapped_document = WrappedDocument.from_documentai_document(response.document)
         yield from (
             Document(
-                page_content=page.text,
+                page_content=_text_from_layout(page.layout, response.document.text),
                 metadata={
                     "page": page.page_number,
-                    "source": wrapped_document.gcs_input_uri,
+                    "source": blob.path,
                 },
             )
-            for page in wrapped_document.pages
+            for page in response.document.pages
         )

     def batch_parse(
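Taken together, the online parse path now yields one LangChain `Document` per page directly from the response, sourcing the metadata from the input blob instead of the wrapper. A hedged sketch of the resulting generator; `_docs_from_response` is a hypothetical helper, and `response` and `blob` stand in for the method's locals:

```python
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
from langchain.schema import Document


def _docs_from_response(response, blob):
    # One Document per page: text comes from the page layout resolved
    # against the full response text; source points at the input blob.
    for page in response.document.pages:
        yield Document(
            page_content=_text_from_layout(page.layout, response.document.text),
            metadata={"page": page.page_number, "source": blob.path},
        )
```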
@@ -239,9 +236,8 @@ class DocAIParser(BaseBlobParser):
             from google.cloud.documentai_toolbox.utilities.gcs_utilities import (
                 split_gcs_uri,
             )
-            from google.cloud.documentai_toolbox.wrappers.document import (
-                Document as WrappedDocument,
-            )
+            from google.cloud.documentai_toolbox.wrappers.document import _get_shards
+            from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
         except ImportError as exc:
             raise ImportError(
                 "documentai_toolbox package not found, please install it with"
@@ -249,18 +245,14 @@ class DocAIParser(BaseBlobParser):
             ) from exc
         for result in results:
             gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path)
-            wrapped_document = WrappedDocument.from_gcs(
-                gcs_bucket_name, gcs_prefix, gcs_input_uri=result.source_path
-            )
+            shards = _get_shards(gcs_bucket_name, gcs_prefix)
             yield from (
                 Document(
-                    page_content=page.text,
-                    metadata={
-                        "page": page.page_number,
-                        "source": wrapped_document.gcs_input_uri,
-                    },
+                    page_content=_text_from_layout(page.layout, shard.text),
+                    metadata={"page": page.page_number, "source": result.source_path},
                 )
-                for page in wrapped_document.pages
+                for shard in shards
+                for page in shard.pages
             )

     def operations_from_names(self, operation_names: List[str]) -> List["Operation"]:
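The batch path changes the same way, with one twist: a batch-processed document on GCS may be split across shards, so the comprehension gains a second loop. A sketch under the same assumptions (`_docs_from_batch` is hypothetical; each `result` carries `parsed_path` and `source_path` as in the diff):

```python
from google.cloud.documentai_toolbox.utilities.gcs_utilities import split_gcs_uri
from google.cloud.documentai_toolbox.wrappers.document import _get_shards
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
from langchain.schema import Document


def _docs_from_batch(results):
    for result in results:
        bucket, prefix = split_gcs_uri(result.parsed_path)
        # Each shard is one Document proto written to GCS by batch processing.
        for shard in _get_shards(bucket, prefix):
            for page in shard.pages:
                yield Document(
                    page_content=_text_from_layout(page.layout, shard.text),
                    metadata={"page": page.page_number, "source": result.source_path},
                )
```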