From 2460f977c5c20073b41803c41fd08945be34cd60 Mon Sep 17 00:00:00 2001
From: QIAN Zifei
Date: Fri, 22 Dec 2023 08:40:27 +0800
Subject: [PATCH] community[minor]: Azure DocumentIntelligenceLoader/Parser
 support update with latest SDK (#14389)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- **Description:** Add DocumentIntelligenceLoader & DocumentIntelligenceParser
  implementations using the latest Azure Document Intelligence SDK, with
  markdown support. The core logic resides in DocumentIntelligenceParser;
  DocumentIntelligenceLoader is a thin wrapper around the parser. The parser
  takes api_endpoint and api_key and creates a DocumentIntelligenceClient for
  the user. 4 parsing modes are supported:
  1. Markdown (default)
  2. Single
  3. Page
  4. Object

  Unit tests and the notebook are updated accordingly.
- **Dependencies:** Azure Document Intelligence SDK:
  [azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python (github.com)](https://github.com/Azure/azure-sdk-for-python/tree/7c42462ac662522a6fd21b17d2a20f4cd40d0356/sdk/documentintelligence/azure-ai-documentintelligence).
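A minimal usage sketch of the new loader (the endpoint, key, and file path
below are placeholders; see the updated notebook for the full walkthrough):

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Placeholder values -- substitute a real Document Intelligence endpoint,
# key, and input file.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<path/to/file>",
    api_model="prebuilt-layout",
    mode="markdown",  # one of "markdown", "single", "page", "object"
)
documents = loader.load()
```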
---------

Co-authored-by: Erick Friis
---
 .../azure_document_intelligence.ipynb         |  97 ++++++------
 .../document_loaders/__init__.py              |   4 +
 .../document_loaders/doc_intelligence.py      |  89 +++++++++++++
 .../document_loaders/parsers/__init__.py      |   4 +
 .../parsers/doc_intelligence.py               | 122 ++++++++++++++++++
 .../document_loaders/parsers/pdf.py           |  10 +-
 libs/community/poetry.lock                    |  55 +++++++-
 libs/community/pyproject.toml                 |   2 +
 .../parsers/test_doc_intelligence.py          |  27 ++++
 .../parsers/test_public_api.py                |   1 +
 .../document_loaders/test_imports.py          |   1 +
 11 files changed, 367 insertions(+), 45 deletions(-)
 create mode 100644 libs/community/langchain_community/document_loaders/doc_intelligence.py
 create mode 100644 libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
 create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py

diff --git a/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb b/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
index 390db4b5abb..dc68783d3d9 100644
--- a/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
+++ b/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
@@ -5,7 +5,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Azure Document Intelligence"
+    "# Azure AI Document Intelligence"
    ]
   },
   {
@@ -13,7 +13,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Azure Document Intelligence (formerly known as Azure Forms Recognizer) is machine-learning \n",
+    "Azure AI Document Intelligence (formerly known as Azure Form Recognizer) is a machine-learning \n",
     "based service that extracts text (including handwriting), tables or key-value-pairs from\n",
     "scanned documents or images.\n",
     "\n",
@@ -21,7 +21,7 @@
     "\n",
     "Document Intelligence supports PDF, JPEG, PNG, BMP, or TIFF.\n",
     "\n",
-    "Further documentation is available at https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-3.1.0.\n"
+    "Further documentation is available at https://aka.ms/doc-intelligence.\n"
    ]
   },
   {
@@ -30,7 +30,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install langchain azure-ai-formrecognizer -q"
+    "%pip install langchain langchain-community azure-ai-documentintelligence -q"
    ]
   },
   {
@@ -46,23 +46,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The first example uses a local file which will be sent to Azure Document Intelligence.\n",
-    "\n",
-    "First, an instance of a DocumentAnalysisClient is created with endpoint and key for the Azure service. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from azure.ai.formrecognizer import DocumentAnalysisClient\n",
-    "from azure.core.credentials import AzureKeyCredential\n",
-    "\n",
-    "document_analysis_client = DocumentAnalysisClient(\n",
-    "    endpoint=\"\", credential=AzureKeyCredential(\"\")\n",
-    ")"
+    "The first example uses a local file which will be sent to Azure AI Document Intelligence."
    ]
   },
   {
@@ -75,15 +59,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.document_loaders.pdf import DocumentIntelligenceLoader\n",
+    "from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
     "\n",
-    "loader = DocumentIntelligenceLoader(\n",
-    "    \"\", client=document_analysis_client, model=\"\"\n",
-    ") # e.g. prebuilt-document\n",
+    "file_path = \"\"\n",
+    "endpoint = \"\"\n",
+    "key = \"\"\n",
+    "loader = AzureAIDocumentIntelligenceLoader(\n",
+    "    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
+    ")\n",
     "\n",
     "documents = loader.load()"
    ]
   },
@@ -93,25 +80,45 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The output contains each page of the source document as a LangChain document: "
+    "The default output contains one LangChain document with markdown-formatted content: "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[Document(page_content='...', metadata={'source': '...', 'page': 1})]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "documents"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 2\n",
+    "The input file can also be a URL path."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url_path = \"\"\n", + "loader = AzureAIDocumentIntelligenceLoader(\n", + " api_endpoint=endpoint, api_key=key, url_path=url_path, api_model=\"prebuilt-layout\"\n", + ")\n", + "\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "documents" ] @@ -124,8 +131,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.9.5" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" }, "vscode": { "interpreter": { diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 48143a30b66..ca295e538eb 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -77,6 +77,9 @@ from langchain_community.document_loaders.dataframe import DataFrameLoader from langchain_community.document_loaders.diffbot import DiffbotLoader from langchain_community.document_loaders.directory import DirectoryLoader from langchain_community.document_loaders.discord import DiscordChatLoader +from langchain_community.document_loaders.doc_intelligence import ( + AzureAIDocumentIntelligenceLoader, +) from langchain_community.document_loaders.docugami import DocugamiLoader from langchain_community.document_loaders.docusaurus import DocusaurusLoader from langchain_community.document_loaders.dropbox import DropboxLoader @@ -247,6 +250,7 @@ __all__ = [ "AssemblyAIAudioTranscriptLoader", "AsyncHtmlLoader", "AzureAIDataLoader", + "AzureAIDocumentIntelligenceLoader", "AzureBlobStorageContainerLoader", "AzureBlobStorageFileLoader", "BSHTMLLoader", diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py new file mode 100644 index 00000000000..bad2130cecd --- /dev/null +++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py @@ -0,0 +1,89 @@ +from typing import Iterator, List, Optional + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader +from langchain_community.document_loaders.blob_loaders import Blob +from langchain_community.document_loaders.parsers import ( + AzureAIDocumentIntelligenceParser, +) + + +class AzureAIDocumentIntelligenceLoader(BaseLoader): + """Loads a PDF with Azure Document Intelligence""" + + def __init__( + self, + api_endpoint: str, + api_key: str, + file_path: Optional[str] = None, + url_path: Optional[str] = None, + api_version: Optional[str] = None, + api_model: str = "prebuilt-layout", + mode: str = "markdown", + ) -> None: + """ + Initialize the object for file processing with Azure Document Intelligence + (formerly Form Recognizer). + + This constructor initializes a AzureAIDocumentIntelligenceParser object to be + used for parsing files using the Azure Document Intelligence API. The load + method generates Documents whose content representations are determined by the + mode parameter. + + Parameters: + ----------- + api_endpoint: str + The API endpoint to use for DocumentIntelligenceClient construction. 
+        api_key: str
+            The API key to use for DocumentIntelligenceClient construction.
+        file_path : Optional[str]
+            The path to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+        url_path : Optional[str]
+            The URL to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+        api_version: Optional[str]
+            The API version for DocumentIntelligenceClient. Leave as None to use
+            the default value from the SDK.
+        api_model: str
+            The model name or ID to be used for form recognition in Azure.
+        mode: str
+            The content representation of the returned Documents. One of
+            "markdown" (default), "single", "page", or "object".
+
+        Examples:
+        ---------
+        >>> obj = AzureAIDocumentIntelligenceLoader(
+        ...     file_path="path/to/file",
+        ...     api_endpoint="https://endpoint.azure.com",
+        ...     api_key="APIKEY",
+        ...     api_version="2023-10-31-preview",
+        ...     api_model="prebuilt-document"
+        ... )
+        """
+
+        assert (
+            file_path is not None or url_path is not None
+        ), "file_path or url_path must be provided"
+        self.file_path = file_path
+        self.url_path = url_path
+
+        self.parser = AzureAIDocumentIntelligenceParser(
+            api_endpoint=api_endpoint,
+            api_key=api_key,
+            api_version=api_version,
+            api_model=api_model,
+            mode=mode,
+        )
+
+    def load(self) -> List[Document]:
+        """Load given path as pages."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load given path as pages."""
+        if self.file_path is not None:
+            blob = Blob.from_path(self.file_path)
+            yield from self.parser.parse(blob)
+        else:
+            yield from self.parser.parse_url(self.url_path)
diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
index c7bd6d73dff..9d01c3df2f5 100644
--- a/libs/community/langchain_community/document_loaders/parsers/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -1,4 +1,7 @@
 from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
+from langchain_community.document_loaders.parsers.doc_intelligence import (
+    AzureAIDocumentIntelligenceParser,
+)
 from langchain_community.document_loaders.parsers.docai import DocAIParser
 from langchain_community.document_loaders.parsers.grobid import GrobidParser
 from langchain_community.document_loaders.parsers.html import BS4HTMLParser
@@ -12,6 +15,7 @@ from langchain_community.document_loaders.parsers.pdf import (
 )
 
 __all__ = [
+    "AzureAIDocumentIntelligenceParser",
     "BS4HTMLParser",
     "DocAIParser",
     "GrobidParser",
diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
new file mode 100644
index 00000000000..40645cea7d7
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -0,0 +1,122 @@
+from typing import Any, Iterator, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+
+
+class AzureAIDocumentIntelligenceParser(BaseBlobParser):
+    """Loads a PDF with Azure Document Intelligence
+    (formerly Form Recognizer)."""
+
+    def __init__(
+        self,
+        api_endpoint: str,
+        api_key: str,
+        api_version: Optional[str] = None,
+        api_model: str = "prebuilt-layout",
+        mode: str = "markdown",
+    ):
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.core.credentials import AzureKeyCredential
+
+        kwargs = {}
+        if api_version is not None:
+            kwargs["api_version"] = api_version
+        self.client = DocumentIntelligenceClient(
+            endpoint=api_endpoint,
+            credential=AzureKeyCredential(api_key),
+            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
+            **kwargs,
+        )
+        self.api_model = api_model
+        self.mode = mode
+        assert self.mode in ["single", "page", "object", "markdown"]
+
+    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
+        for p in result.pages:
+            content = " ".join([line.content for line in p.lines])
+
+            d = Document(
+                page_content=content,
+                metadata={
+                    "page": p.page_number,
+                },
+            )
+            yield d
+
+    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
+        yield Document(page_content=result.content, metadata={})
+
+    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
+        # record relationship between page id and span offset
+        page_offset = []
+        for page in result.pages:
+            # assume that spans only contain one element (to be verified)
+            page_offset.append(page.spans[0]["offset"])
+
+        # paragraph
+        # warning: paragraph content is overlapping with table content
+        for para in result.paragraphs:
+            yield Document(
+                page_content=para.content,
+                metadata={
+                    "role": para.role,
+                    "page": para.bounding_regions[0].page_number,
+                    "bounding_box": para.bounding_regions[0].polygon,
+                    "type": "paragraph",
+                },
+            )
+
+        # table
+        for table in result.tables:
+            yield Document(
+                page_content=table.cells,  # json object
+                metadata={
+                    "footnote": table.footnotes,
+                    "caption": table.caption,
+                    "page": table.bounding_regions[0].page_number,
+                    "bounding_box": table.bounding_regions[0].polygon,
+                    "row_count": table.row_count,
+                    "column_count": table.column_count,
+                    "type": "table",
+                },
+            )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+
+        with blob.as_bytes_io() as file_obj:
+            poller = self.client.begin_analyze_document(
+                self.api_model,
+                file_obj,
+                content_type="application/octet-stream",
+                output_content_format="markdown" if self.mode == "markdown" else "text",
+            )
+            result = poller.result()
+
+            if self.mode in ["single", "markdown"]:
+                yield from self._generate_docs_single(result)
+            elif self.mode == "page":
+                yield from self._generate_docs_page(result)
+            else:
+                yield from self._generate_docs_object(result)
+
+    def parse_url(self, url: str) -> Iterator[Document]:
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            AnalyzeDocumentRequest(url_source=url),
+            # content_type="application/octet-stream",
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        if self.mode in ["single", "markdown"]:
+            yield from self._generate_docs_single(result)
+        elif self.mode == "page":
+            yield from self._generate_docs_page(result)
+        else:
+            yield from self._generate_docs_object(result)
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 93ba53527b9..5ef03f3f696 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -542,9 +542,17 @@ class AmazonTextractPDFParser(BaseBlobParser):
 
 class DocumentIntelligenceParser(BaseBlobParser):
     """Loads a PDF with Azure Document Intelligence
-    (formerly Forms Recognizer) and chunks at character level."""
+    (formerly Form Recognizer) and chunks at character level."""
 
     def __init__(self, client: Any, model: str):
+        warnings.warn(
+            "langchain.document_loaders.parsers.pdf.DocumentIntelligenceParser "
+            "and langchain.document_loaders.pdf.DocumentIntelligenceLoader"
+            " are deprecated. Please upgrade to "
+            "langchain.document_loaders.AzureAIDocumentIntelligenceLoader "
+            "for any file parsing purpose using Azure Document Intelligence "
+            "service."
+        )
         self.client = client
         self.model = model
diff --git a/libs/community/poetry.lock b/libs/community/poetry.lock
index 0f41b5941be..84df7d0c9dc 100644
--- a/libs/community/poetry.lock
+++ b/libs/community/poetry.lock
@@ -531,6 +531,41 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-
 tests = ["attrs[tests-no-zope]", "zope-interface"]
 tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
 
+[[package]]
+name = "azure-ai-documentintelligence"
+version = "1.0.0b1"
+description = "Microsoft Azure AI Document Intelligence Client Library for Python"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "azure-ai-documentintelligence-1.0.0b1.tar.gz", hash = "sha256:b0acedc50489cc63aac44190e32a3a04e5c50c98a1e4ed39bcb910f51fbf5207"},
+    {file = "azure_ai_documentintelligence-1.0.0b1-py3-none-any.whl", hash = "sha256:db81ea7c8c30e070b5b424a45f9c43c4111159ab6b3c2994c1346b3d3b01f682"},
+]
+
+[package.dependencies]
+azure-core = ">=1.28.0,<2.0.0"
+isodate = ">=0.6.1,<1.0.0"
+
+[[package]]
+name = "azure-core"
+version = "1.29.6"
+description = "Microsoft Azure Core Library for Python"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "azure-core-1.29.6.tar.gz", hash = "sha256:13b485252ecd9384ae624894fe51cfa6220966207264c360beada239f88b738a"},
+    {file = "azure_core-1.29.6-py3-none-any.whl", hash = "sha256:604a005bce6a49ba661bb7b2be84a9b169047e52fcfcd0a4e4770affab4178f7"},
+]
+
+[package.dependencies]
+requests = ">=2.21.0"
+six = ">=1.11.0"
+typing-extensions = ">=4.6.0"
+
+[package.extras]
+aio = ["aiohttp (>=3.0)"]
+
 [[package]]
 name = "babel"
 version = "2.13.1"
@@ -3167,6 +3202,20 @@ widgetsnbextension = ">=4.0.9,<4.1.0"
 [package.extras]
 test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"]
 
+[[package]]
+name = "isodate"
+version = "0.6.1"
+description = "An ISO 8601 date/time/duration parser and formatter"
+optional = true
+python-versions = "*"
+files = [
+    {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"},
+    {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "isoduration"
 version = "20.11.0"
@@ -3821,7 +3870,7 @@ files = [
 
 [[package]]
 name = "langchain-core"
-version = "0.1.1"
+version = "0.1.3"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.8.1,<4.0"
@@ -9062,9 +9111,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 
 [extras]
 cli = ["typer"]
-extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
"jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "00b69a8316c2748362f1f135e229950230be0401e7c307c0ce27a8309f947816" +content-hash = "9094149705a405904c268b09c7dddae98fa466f67b2606defb5c6e3661b36602" diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml index b668fb96c88..bc75d721876 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -84,6 +84,7 @@ msal = {version = "^1.25.0", optional = true} databricks-vectorsearch = {version = "^0.21", optional = true} dgml-utils = {version = "^0.3.0", optional = true} datasets = {version = "^2.15.0", optional = true} +azure-ai-documentintelligence = {version = "^1.0.0b1", optional = true} oracle-ads = {version = "^2.9.1", optional = true} [tool.poetry.group.test] @@ -244,6 +245,7 @@ extended_testing = [ "databricks-vectorsearch", "dgml-utils", "cohere", + "azure-ai-documentintelligence", "oracle-ads", ] diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py new file mode 100644 index 00000000000..1f1651e3063 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py @@ -0,0 +1,27 @@ +"""Tests for the Google Cloud DocAI parser.""" +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_community.document_loaders.parsers import ( + AzureAIDocumentIntelligenceParser, +) + + +@pytest.mark.requires("azure", "azure.ai", "azure.ai.documentintelligence") +@patch("azure.ai.documentintelligence.DocumentIntelligenceClient") +@patch("azure.core.credentials.AzureKeyCredential") +def test_doc_intelligence(mock_credential: MagicMock, mock_client: MagicMock) -> None: + endpoint = "endpoint" + key = "key" + + parser = AzureAIDocumentIntelligenceParser(api_endpoint=endpoint, api_key=key) + mock_credential.assert_called_once_with(key) + mock_client.assert_called_once_with( + endpoint=endpoint, + credential=mock_credential(), + headers={"x-ms-useragent": "langchain-parser/1.0.0"}, + ) 
+    assert parser.client == mock_client()
+    assert parser.api_model == "prebuilt-layout"
+    assert parser.mode == "markdown"
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
index a0b62f8531f..3dc19adb310 100644
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@ from langchain_community.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for breaking changes."""
     assert set(__all__) == {
+        "AzureAIDocumentIntelligenceParser",
         "BS4HTMLParser",
         "DocAIParser",
         "GrobidParser",
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index 69f20546d04..a2101c8830d 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -23,6 +23,7 @@ EXPECTED_ALL = [
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
     "AzureAIDataLoader",
+    "AzureAIDocumentIntelligenceLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",