From c1d348e95d0f89f8a7322d3458e5a39a73de63bf Mon Sep 17 00:00:00 2001
From: Johannes Mohren <60135409+jmohren@users.noreply.github.com>
Date: Tue, 10 Dec 2024 17:22:58 +0100
Subject: [PATCH] doc-loader: retain Azure Doc Intelligence API metadata in
 Document parser (#28382)

**Description**:
This PR modifies the doc_intelligence.py parser in the community package
to include all metadata returned by the Azure Doc Intelligence API in
the Document object. Previously, only the parsed content (markdown) was
retained, while other important metadata such as bounding boxes (bboxes)
for images and tables was discarded. These image bboxes are crucial for
supporting use cases like multi-modal RAG workflows when using Azure Doc
Intelligence.

The change ensures that all information returned by the Azure Doc
Intelligence API is preserved by setting the metadata attribute of the
Document object to the entire result returned by the API, rather than an
empty dictionary. This extends the parser's utility for complex use
cases without breaking existing functionality.

**Issue**:
This change does not address a specific issue number, but it resolves a
critical limitation in supporting multimodal workflows when using the
LangChain wrapper for the Azure API.

**Dependencies**:
No additional dependencies are required for this change.

---------

Co-authored-by: jmohren <johannes.mohren@aol.de>
---
 .../document_loaders/parsers/doc_intelligence.py                | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
index 2d77fcd1f87..107e569339f 100644
--- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -71,7 +71,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
             yield d
 
     def _generate_docs_single(self, result: Any) -> Iterator[Document]:
-        yield Document(page_content=result.content, metadata={})
+        yield Document(page_content=result.content, metadata=result.as_dict())
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""