From c1d348e95d0f89f8a7322d3458e5a39a73de63bf Mon Sep 17 00:00:00 2001
From: Johannes Mohren <60135409+jmohren@users.noreply.github.com>
Date: Tue, 10 Dec 2024 17:22:58 +0100
Subject: [PATCH] doc-loader: retain Azure Doc Intelligence API metadata in Document parser (#28382)

**Description**: This PR modifies the doc_intelligence.py parser in the community package to include all metadata returned by the Azure Doc Intelligence API in the Document object. Previously, only the parsed content (markdown) was retained, while other important metadata, such as bounding boxes (bboxes) for images and tables, was discarded. These image bboxes are crucial for supporting use cases like multi-modal RAG workflows when using Azure Doc Intelligence.

The change ensures that all information returned by the Azure Doc Intelligence API is preserved by setting the metadata attribute of the Document object to the entire result returned by the API (`result.as_dict()`), rather than an empty dictionary. This extends the parser's utility for complex use cases without breaking existing functionality.

**Issue**: This change is not tied to a specific issue number, but it resolves a critical limitation in supporting multimodal workflows when using the LangChain wrapper for the Azure API.

**Dependencies**: No additional dependencies are required for this change.

--------- Co-authored-by: jmohren --- .../document_loaders/parsers/doc_intelligence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py index 2d77fcd1f87..107e569339f 100644 --- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py +++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py @@ -71,7 +71,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): yield d def _generate_docs_single(self, result: Any) -> Iterator[Document]: - yield Document(page_content=result.content, metadata={}) + yield Document(page_content=result.content, metadata=result.as_dict()) def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob."""