community[minor]: Azure DocumentIntelligenceLoader/Parser support update with latest SDK (#14389)

- **Description:** Add DocumentIntelligenceLoader & DocumentIntelligenceParser implementation using the latest Azure Document Intelligence SDK with markdown support. The core logic resides in DocumentIntelligenceParser, and DocumentIntelligenceLoader is a mere wrapper around the parser. The parser takes api_endpoint and api_key and creates a DocumentIntelligenceClient for the user. 4 parsing modes are supported: 1. Markdown (default) 2. Single 3. Page 4. Object. Unit tests (UT) and the notebook are also updated accordingly. - **Dependencies:** Azure Document Intelligence SDK: azure-ai-documentintelligence [azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python (github.com)](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FAzure%2Fazure-sdk-for-python%2Ftree%2F7c42462ac662522a6fd21b17d2a20f4cd40d0356%2Fsdk%2Fdocumentintelligence%2Fazure-ai-documentintelligence&data=05%7C01%7CZifei.Qian%40microsoft.com%7C298225aa3e31468a863108dbf07374ff%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638368150928704292%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=oE0Sl4HERnMKdbkV9KgBV46Z2xytcQAShdTWf7ZNl%2Bs%3D&reserved=0). --------- Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-16 06:53:16 +00:00 · 2023-12-22 08:40:27 +08:00
parent 129a929d69
commit 2460f977c5
11 changed files with 367 additions and 45 deletions
--- a/libs/community/langchain_community/document_loaders/init.py
+++ b/libs/community/langchain_community/document_loaders/init.py
@@ -77,6 +77,9 @@ from langchain_community.document_loaders.dataframe import DataFrameLoader
 from langchain_community.document_loaders.diffbot import DiffbotLoader
 from langchain_community.document_loaders.directory import DirectoryLoader
 from langchain_community.document_loaders.discord import DiscordChatLoader
+from langchain_community.document_loaders.doc_intelligence import (
+    AzureAIDocumentIntelligenceLoader,
+)
 from langchain_community.document_loaders.docugami import DocugamiLoader
 from langchain_community.document_loaders.docusaurus import DocusaurusLoader
 from langchain_community.document_loaders.dropbox import DropboxLoader
@@ -247,6 +250,7 @@ __all__ = [
    "AssemblyAIAudioTranscriptLoader",
    "AsyncHtmlLoader",
    "AzureAIDataLoader",
+    "AzureAIDocumentIntelligenceLoader",
    "AzureBlobStorageContainerLoader",
    "AzureBlobStorageFileLoader",
    "BSHTMLLoader",
--- a/libs/community/langchain_community/document_loaders/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py
@@ -0,0 +1,89 @@
+from typing import Iterator, List, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.parsers import (
+    AzureAIDocumentIntelligenceParser,
+)
+
+
+class AzureAIDocumentIntelligenceLoader(BaseLoader):
+    """Load a file with Azure Document Intelligence (formerly Form Recognizer).
+
+    This loader is a thin wrapper: it resolves the input (a local file path or
+    a URL) and delegates the analysis to AzureAIDocumentIntelligenceParser.
+    """
+
+    def __init__(
+        self,
+        api_endpoint: str,
+        api_key: str,
+        file_path: Optional[str] = None,
+        url_path: Optional[str] = None,
+        api_version: Optional[str] = None,
+        api_model: str = "prebuilt-layout",
+        mode: str = "markdown",
+    ) -> None:
+        """
+        Initialize the object for file processing with Azure Document Intelligence
+        (formerly Form Recognizer).
+
+        This constructor initializes a AzureAIDocumentIntelligenceParser object to be
+        used for parsing files using the Azure Document Intelligence API. The load
+        method generates Documents whose content representations are determined by the
+        mode parameter.
+
+        Parameters:
+        -----------
+        api_endpoint: str
+            The API endpoint to use for DocumentIntelligenceClient construction.
+        api_key: str
+            The API key to use for DocumentIntelligenceClient construction.
+        file_path : Optional[str]
+            The path to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+            When both are given, file_path takes precedence.
+        url_path : Optional[str]
+            The URL to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+        api_version: Optional[str]
+            The API version for DocumentIntelligenceClient. Setting None to use
+            the default value from SDK.
+        api_model: str
+            The model name or ID to be used for form recognition in Azure.
+        mode: str
+            The content representation of the generated Documents. It is
+            forwarded unchanged to AzureAIDocumentIntelligenceParser, which
+            accepts "single", "page", "object" or "markdown".
+
+        Examples:
+        ---------
+        >>> obj = AzureAIDocumentIntelligenceLoader(
+        ...     file_path="path/to/file",
+        ...     api_endpoint="https://endpoint.azure.com",
+        ...     api_key="APIKEY",
+        ...     api_version="2023-10-31-preview",
+        ...     api_model="prebuilt-document"
+        ... )
+        """
+
+        assert (
+            file_path is not None or url_path is not None
+        ), "file_path or url_path must be provided"
+        self.file_path = file_path
+        self.url_path = url_path
+
+        self.parser = AzureAIDocumentIntelligenceParser(
+            api_endpoint=api_endpoint,
+            api_key=api_key,
+            api_version=api_version,
+            api_model=api_model,
+            mode=mode,
+        )
+
+    def load(self) -> List[Document]:
+        """Eagerly load the configured file or URL into a list of Documents."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily yield Documents for the configured file or URL."""
+        if self.file_path is not None:
+            blob = Blob.from_path(self.file_path)
+            # Use lazy_parse rather than parse so Documents are streamed
+            # instead of being collected into an intermediate list first.
+            yield from self.parser.lazy_parse(blob)
+        else:
+            # __init__ requires url_path to be set whenever file_path is not.
+            yield from self.parser.parse_url(self.url_path)
--- a/libs/community/langchain_community/document_loaders/parsers/init.py
+++ b/libs/community/langchain_community/document_loaders/parsers/init.py
@@ -1,4 +1,7 @@
 from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
+from langchain_community.document_loaders.parsers.doc_intelligence import (
+    AzureAIDocumentIntelligenceParser,
+)
 from langchain_community.document_loaders.parsers.docai import DocAIParser
 from langchain_community.document_loaders.parsers.grobid import GrobidParser
 from langchain_community.document_loaders.parsers.html import BS4HTMLParser
@@ -12,6 +15,7 @@ from langchain_community.document_loaders.parsers.pdf import (
 )

 __all__ = [
+    "AzureAIDocumentIntelligenceParser",
    "BS4HTMLParser",
    "DocAIParser",
    "GrobidParser",
--- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -0,0 +1,122 @@
+from typing import Any, Iterator, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+
+
+class AzureAIDocumentIntelligenceParser(BaseBlobParser):
+    """Parse a file with Azure Document Intelligence
+    (formerly Form Recognizer).
+
+    The mode chosen at construction time selects how an analysis result is
+    turned into Documents:
+
+    * "markdown" / "single": one Document holding the whole content, requested
+      from the service as markdown or as plain text respectively.
+    * "page": one Document per page, built from the page's lines.
+    * "object": one Document per paragraph, followed by one per table.
+    """
+
+    def __init__(
+        self,
+        api_endpoint: str,
+        api_key: str,
+        api_version: Optional[str] = None,
+        api_model: str = "prebuilt-layout",
+        mode: str = "markdown",
+    ):
+        """
+        Create the DocumentIntelligenceClient and store the parsing options.
+
+        Parameters:
+        -----------
+        api_endpoint: str
+            The API endpoint to use for DocumentIntelligenceClient construction.
+        api_key: str
+            The API key to use for DocumentIntelligenceClient construction.
+        api_version: Optional[str]
+            The API version for DocumentIntelligenceClient. When None, the
+            argument is not passed to the client at all.
+        api_model: str
+            The model name or ID passed to begin_analyze_document.
+        mode: str
+            One of "single", "page", "object" or "markdown".
+        """
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.core.credentials import AzureKeyCredential
+
+        # Only forward api_version when the caller set it explicitly.
+        kwargs = {}
+        if api_version is not None:
+            kwargs["api_version"] = api_version
+        self.client = DocumentIntelligenceClient(
+            endpoint=api_endpoint,
+            credential=AzureKeyCredential(api_key),
+            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
+            **kwargs,
+        )
+        self.api_model = api_model
+        self.mode = mode
+        assert self.mode in ["single", "page", "object", "markdown"]
+
+    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
+        """Yield one Document per page, joining the page's lines with spaces."""
+        for p in result.pages:
+            # A page without recognised lines yields an empty-content Document.
+            content = " ".join(line.content for line in (p.lines or []))
+
+            yield Document(
+                page_content=content,
+                metadata={
+                    "page": p.page_number,
+                },
+            )
+
+    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
+        """Yield a single Document holding the whole analysed content."""
+        yield Document(page_content=result.content, metadata={})
+
+    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
+        """Yield one Document per paragraph, then one Document per table."""
+        # paragraph
+        # warning: paragraph content is overlapping with table content
+        for para in result.paragraphs or []:
+            region = para.bounding_regions[0]
+            yield Document(
+                page_content=para.content,
+                metadata={
+                    "role": para.role,
+                    "page": region.page_number,
+                    "bounding_box": region.polygon,
+                    "type": "paragraph",
+                },
+            )
+
+        # table
+        for table in result.tables or []:
+            # Locate the table by its own bounding region, not by the last
+            # paragraph's region.
+            region = table.bounding_regions[0]
+            yield Document(
+                # NOTE(review): table.cells is passed through unchanged; if
+                # Document.page_content must be a string the cells presumably
+                # need serialising first - confirm the intended format.
+                page_content=table.cells,  # json object
+                metadata={
+                    "footnote": table.footnotes,
+                    "caption": table.caption,
+                    "page": region.page_number,
+                    "bounding_box": region.polygon,
+                    "row_count": table.row_count,
+                    "column_count": table.column_count,
+                    "type": "table",
+                },
+            )
+
+    def _generate_docs(self, result: Any) -> Iterator[Document]:
+        """Dispatch an analysis result to the generator matching self.mode."""
+        if self.mode in ("single", "markdown"):
+            yield from self._generate_docs_single(result)
+        elif self.mode == "page":
+            yield from self._generate_docs_page(result)
+        else:
+            yield from self._generate_docs_object(result)
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob by uploading its bytes for analysis."""
+
+        with blob.as_bytes_io() as file_obj:
+            poller = self.client.begin_analyze_document(
+                self.api_model,
+                file_obj,
+                content_type="application/octet-stream",
+                output_content_format="markdown" if self.mode == "markdown" else "text",
+            )
+            result = poller.result()
+
+        # The result is fully retrieved above, so the byte stream can be
+        # released before Documents are generated.
+        yield from self._generate_docs(result)
+
+    def parse_url(self, url: str) -> Iterator[Document]:
+        """Lazily parse the document located at url.
+
+        The URL is sent to the service as the document source; the file is not
+        downloaded by this method.
+        """
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            AnalyzeDocumentRequest(url_source=url),
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        yield from self._generate_docs(result)
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -542,9 +542,17 @@ class AmazonTextractPDFParser(BaseBlobParser):

 class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
-    (formerly Forms Recognizer) and chunks at character level."""
+    (formerly Form Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
+        warnings.warn(
+            "langchain.document_loaders.parsers.pdf.DocumentIntelligenceParser"
+            "and langchain.document_loaders.pdf.DocumentIntelligenceLoader"
+            " are deprecated. Please upgrade to "
+            "langchain.document_loaders.DocumentIntelligenceLoader "
+            "for any file parsing purpose using Azure Document Intelligence "
+            "service."
+        )
        self.client = client
        self.model = model