community[patch]: Microsoft Azure Document Intelligence updates (#16932)

- **Description:** Update the Azure Document Intelligence implementation by the Microsoft team and the RAG cookbook with Azure AI Search.

---------

Co-authored-by: Lu Zhang (AI) <luzhan@microsoft.com>
Co-authored-by: Yateng Hong <yatengh@microsoft.com>
Co-authored-by: teethache <hongyateng2006@126.com>
Co-authored-by: Lu Zhang <44625949+luzhang06@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
2025-09-16 15:04:13 +00:00 · 2024-03-27 06:36:59 +00:00
parent cd79305eb9
commit f12cb0bea4
12 changed files with 708 additions and 71 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -1,10 +1,13 @@
-from typing import Any, Iterator, Optional
+import logging
+from typing import Any, Iterator, List, Optional

 from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob

+logger = logging.getLogger(__name__)
+

 class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
@@ -17,22 +20,43 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
+        analysis_features: Optional[List[str]] = None,
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.ai.documentintelligence.models import DocumentAnalysisFeature
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        if api_version is not None:
            kwargs["api_version"] = api_version
+
+        if analysis_features is not None:
+            _SUPPORTED_FEATURES = [
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,
+            ]
+
+            analysis_features = [
+                DocumentAnalysisFeature(feature) for feature in analysis_features
+            ]
+            if any(
+                [feature not in _SUPPORTED_FEATURES for feature in analysis_features]
+            ):
+                logger.warning(
+                    f"The current supported features are: "
+                    f"{[f.value for f in _SUPPORTED_FEATURES]}. "
+                    "Using other features may result in unexpected behavior."
+                )
+
        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
+            features=analysis_features,
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
-        assert self.mode in ["single", "page", "object", "markdown"]
+        assert self.mode in ["single", "page", "markdown"]

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        for p in result.pages:
@@ -49,41 +73,6 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        yield Document(page_content=result.content, metadata={})

-    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
-        # record relationship between page id and span offset
-        page_offset = []
-        for page in result.pages:
-            # assume that spans only contain 1 element, to double check
-            page_offset.append(page.spans[0]["offset"])
-
-        # paragraph
-        # warning: paragraph content is overlapping with table content
-        for para in result.paragraphs:
-            yield Document(
-                page_content=para.content,
-                metadata={
-                    "role": para.role,
-                    "page": para.bounding_regions[0].page_number,
-                    "bounding_box": para.bounding_regions[0].polygon,
-                    "type": "paragraph",
-                },
-            )
-
-        # table
-        for table in result.tables:
-            yield Document(
-                page_content=table.cells,  # json object
-                metadata={
-                    "footnote": table.footnotes,
-                    "caption": table.caption,
-                    "page": para.bounding_regions[0].page_number,
-                    "bounding_box": para.bounding_regions[0].polygon,
-                    "row_count": table.row_count,
-                    "column_count": table.column_count,
-                    "type": "table",
-                },
-            )
-
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

@@ -101,7 +90,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
            elif self.mode in ["page"]:
                yield from self._generate_docs_page(result)
            else:
-                yield from self._generate_docs_object(result)
+                raise ValueError(f"Invalid mode: {self.mode}")

    def parse_url(self, url: str) -> Iterator[Document]:
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
@@ -119,4 +108,4 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
-            yield from self._generate_docs_object(result)
+            raise ValueError(f"Invalid mode: {self.mode}")