From 9fb09c1c300777b141e2d9994d01ff5e11f74340 Mon Sep 17 00:00:00 2001 From: Edwin Wenink Date: Fri, 12 Jan 2024 20:01:28 +0100 Subject: [PATCH] community: fix the "page" mode in the AzureAIDocumentIntelligenceParser (bug) (#15958) **Description**: the "page" mode in the AzureAIDocumentIntelligenceParser is not accessible due to a wrong membership test. The mode argument can only be a string (also see the assertion in the `__init__`: `assert self.mode in ["single", "page", "object", "markdown"]`), so the check `elif self.mode == ["page"]:` always fails. As a result, effectively the "object" mode is used when selecting the "page" mode, which may lead to errors. The docstring of the `AzureAIDocumentIntelligenceLoader` also omitted the `mode` parameter altogether, so I added it. **Issue**: I could not find a related issue (this class is only 3 weeks old anyway). **Dependencies**: this PR does not introduce or affect dependencies. The current demo notebook and examples are not affected because they all use the default markdown mode. --- .../langchain_community/document_loaders/doc_intelligence.py | 5 ++++- .../document_loaders/parsers/doc_intelligence.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py index bad2130cecd..d1326afdbe3 100644 --- a/libs/community/langchain_community/document_loaders/doc_intelligence.py +++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py @@ -48,6 +48,8 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader): the default value from SDK. api_model: str The model name or ID to be used for form recognition in Azure. + mode: Optional[str] + The type of content representation of the generated Documents. Examples: --------- @@ -56,7 +58,8 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader): ... api_endpoint="https://endpoint.azure.com", ... 
api_key="APIKEY", ... api_version="2023-10-31-preview", - ... model="prebuilt-document" + ... model="prebuilt-document", + ... mode="markdown" ... ) """ diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py index 40645cea7d7..f23214aea91 100644 --- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py +++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py @@ -98,7 +98,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): if self.mode in ["single", "markdown"]: yield from self._generate_docs_single(result) - elif self.mode == ["page"]: + elif self.mode in ["page"]: yield from self._generate_docs_page(result) else: yield from self._generate_docs_object(result) @@ -116,7 +116,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser): if self.mode in ["single", "markdown"]: yield from self._generate_docs_single(result) - elif self.mode == ["page"]: + elif self.mode in ["page"]: yield from self._generate_docs_page(result) else: yield from self._generate_docs_object(result)