From 0b532a4ed01320b236a90a628a8e54d4162b88e1 Mon Sep 17 00:00:00 2001 From: Louis Auneau Date: Wed, 26 Mar 2025 14:40:14 -0400 Subject: [PATCH] community: Azure Document Intelligence parser features not available fixed (#30370) Thank you for contributing to LangChain! - **Description:** The Azure Document Intelligence OCR solution has a *features* parameter that enables optional capabilities such as high-resolution document analysis, key-value pair extraction, etc. In the LangChain parser, these could be provided as an `analysis_features` parameter to the constructor, which was passed on to the `DocumentIntelligenceClient` constructor. However, according to the `DocumentIntelligenceClient` [API Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.documentintelligenceclient?view=azure-python), this is not a valid constructor parameter. It was therefore removed from that call and is instead stored as a parser attribute that is passed as the `features` parameter of `begin_analyze_document` (see [API Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document)). I also removed the check for "Supported features" since all features are supported out-of-the-box. Also, I did not check whether each provided `str` actually corresponds to a member of the Azure package's enumeration of features, since the `ValueError` raised when creating the enumeration object is pretty explicit. One last caveat: some features are not supported for some kinds of documents. This is documented in the Microsoft documentation, and the resulting exceptions are also explicit. 
- **Issue:** N/A - **Dependencies:** No - **Twitter handle:** @Louis___A --------- Co-authored-by: Louis Auneau --- .../parsers/doc_intelligence.py | 26 ++++++------------- .../parsers/test_doc_intelligence.py | 2 -- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py index 3bcbec6d9a4..f2c9d6b4a14 100644 --- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py +++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py @@ -45,32 +45,19 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): if api_version is not None: kwargs["api_version"] = api_version - if analysis_features is not None: - _SUPPORTED_FEATURES = [ - DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, - ] - - analysis_features = [ - DocumentAnalysisFeature(feature) for feature in analysis_features - ] - if any( - [feature not in _SUPPORTED_FEATURES for feature in analysis_features] - ): - logger.warning( - f"The current supported features are: " - f"{[f.value for f in _SUPPORTED_FEATURES]}. " - "Using other features may result in unexpected behavior." 
- ) - self.client = DocumentIntelligenceClient( endpoint=api_endpoint, credential=azure_credential or AzureKeyCredential(api_key), headers={"x-ms-useragent": "langchain-parser/1.0.0"}, - features=analysis_features, **kwargs, ) self.api_model = api_model self.mode = mode + self.features: Optional[List[DocumentAnalysisFeature]] = None + if analysis_features is not None: + self.features = [ + DocumentAnalysisFeature(feature) for feature in analysis_features + ] assert self.mode in ["single", "page", "markdown"] def _generate_docs_page(self, result: Any) -> Iterator[Document]: @@ -97,6 +84,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): body=file_obj, content_type="application/octet-stream", output_content_format="markdown" if self.mode == "markdown" else "text", + features=self.features, ) result = poller.result() @@ -114,6 +102,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): self.api_model, body=AnalyzeDocumentRequest(url_source=url), output_content_format="markdown" if self.mode == "markdown" else "text", + features=self.features, ) result = poller.result() @@ -131,6 +120,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): self.api_model, body=AnalyzeDocumentRequest(bytes_source=bytes_source), output_content_format="markdown" if self.mode == "markdown" else "text", + features=self.features, ) result = poller.result() diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py index 820d1e56ba1..b6affb2e13f 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py @@ -24,7 +24,6 @@ def test_doc_intelligence(mock_credential: MagicMock, mock_client: MagicMock) -> headers={ "x-ms-useragent": "langchain-parser/1.0.0", }, - features=None, ) assert parser.client == mock_client() assert 
parser.api_model == "prebuilt-layout" @@ -51,7 +50,6 @@ def test_doc_intelligence_with_analysis_features( headers={ "x-ms-useragent": "langchain-parser/1.0.0", }, - features=analysis_features, ) assert parser.client == mock_client() assert parser.api_model == "prebuilt-layout"