IMPROVEMENT: Reduce post-processing time for DocAIParser (#13210)

- Remove `WrappedDocument` introduced in
https://github.com/langchain-ai/langchain/pull/11413
- Filed https://github.com/googleapis/python-documentai-toolbox/issues/198 in
Document AI Toolbox to improve the initialization time of the
`WrappedDocument` object.

@lkuligin

@baskaryan

@hwchase17
This commit is contained in:
Holt Skinner 2023-11-20 11:41:44 -06:00 committed by GitHub
parent f3fcdea574
commit 1c08dbfb33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -66,10 +66,10 @@ class DocAIParser(BaseBlobParser):
"a client." "a client."
) )
pattern = "projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+" pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
if processor_name and not re.fullmatch(pattern, processor_name): if processor_name and not re.fullmatch(pattern, processor_name):
raise ValueError( raise ValueError(
f"Processor name {processor_name} has a wrong format. If your " f"Processor name {processor_name} has the wrong format. If your "
"prediction endpoint looks like https://us-documentai.googleapis.com" "prediction endpoint looks like https://us-documentai.googleapis.com"
"/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process," "/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process,"
" use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID " " use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID "
@ -139,9 +139,7 @@ class DocAIParser(BaseBlobParser):
" `pip install google-cloud-documentai`" " `pip install google-cloud-documentai`"
) from exc ) from exc
try: try:
from google.cloud.documentai_toolbox.wrappers.document import ( from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
Document as WrappedDocument,
)
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"documentai_toolbox package not found, please install it with" "documentai_toolbox package not found, please install it with"
@ -171,16 +169,15 @@ class DocAIParser(BaseBlobParser):
field_mask=field_mask, field_mask=field_mask,
) )
) )
wrapped_document = WrappedDocument.from_documentai_document(response.document)
yield from ( yield from (
Document( Document(
page_content=page.text, page_content=_text_from_layout(page.layout, response.document.text),
metadata={ metadata={
"page": page.page_number, "page": page.page_number,
"source": wrapped_document.gcs_input_uri, "source": blob.path,
}, },
) )
for page in wrapped_document.pages for page in response.document.pages
) )
def batch_parse( def batch_parse(
@ -239,9 +236,8 @@ class DocAIParser(BaseBlobParser):
from google.cloud.documentai_toolbox.utilities.gcs_utilities import ( from google.cloud.documentai_toolbox.utilities.gcs_utilities import (
split_gcs_uri, split_gcs_uri,
) )
from google.cloud.documentai_toolbox.wrappers.document import ( from google.cloud.documentai_toolbox.wrappers.document import _get_shards
Document as WrappedDocument, from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
)
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"documentai_toolbox package not found, please install it with" "documentai_toolbox package not found, please install it with"
@ -249,18 +245,14 @@ class DocAIParser(BaseBlobParser):
) from exc ) from exc
for result in results: for result in results:
gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path) gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path)
wrapped_document = WrappedDocument.from_gcs( shards = _get_shards(gcs_bucket_name, gcs_prefix)
gcs_bucket_name, gcs_prefix, gcs_input_uri=result.source_path
)
yield from ( yield from (
Document( Document(
page_content=page.text, page_content=_text_from_layout(page.layout, shard.text),
metadata={ metadata={"page": page.page_number, "source": result.source_path},
"page": page.page_number,
"source": wrapped_document.gcs_input_uri,
},
) )
for page in wrapped_document.pages for shard in shards
for page in shard.pages
) )
def operations_from_names(self, operation_names: List[str]) -> List["Operation"]: def operations_from_names(self, operation_names: List[str]) -> List["Operation"]: