langchain/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
Louis Auneau 0b532a4ed0
community: fix Azure Document Intelligence parser features not being available (#30370)

- **Description:** Azure Document Intelligence, the Azure OCR solution, has a
*features* parameter that enables optional capabilities such as
high-resolution document analysis, key-value pair extraction, and so on. In
the LangChain parser, these could be provided through an `analysis_features`
parameter of the constructor, which was then passed on to the
`DocumentIntelligenceClient` constructor. However, according to the
`DocumentIntelligenceClient` [API
Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.documentintelligenceclient?view=azure-python),
this is not a valid constructor parameter. It was therefore removed from the
client constructor call and is instead stored as a parser attribute, which is
passed to `begin_analyze_document`'s `features` parameter (see [API
Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document)).
I also removed the check for "supported features", since all features are
supported out of the box. I did not check whether each provided `str`
actually corresponds to a member of the Azure package's feature enumeration,
since the `ValueError` raised when constructing the enumeration object is
already explicit. One last caveat: some features are not supported for some
kinds of documents. This is covered in Microsoft's documentation, and the
exceptions raised are likewise explicit. A short usage sketch follows this
list.
- **Issue:** N/A
- **Dependencies:** No
- **Twitter handle:** @Louis___A
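
For illustration, a minimal sketch of constructing the parser after this
change. The endpoint and key are placeholders, and the feature names assume
the `DocumentAnalysisFeature` string values shipped in recent
`azure-ai-documentintelligence` releases:

from langchain_community.document_loaders.parsers import (
    AzureAIDocumentIntelligenceParser,
)

# Placeholders: substitute a real endpoint and key.
parser = AzureAIDocumentIntelligenceParser(
    api_endpoint="https://<resource>.cognitiveservices.azure.com/",
    api_key="<key>",
    api_model="prebuilt-layout",
    analysis_features=["ocrHighResolution", "keyValuePairs"],
)
# The feature strings are coerced to DocumentAnalysisFeature and forwarded
# to each begin_analyze_document call, not to the client constructor.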

---------

Co-authored-by: Louis Auneau <louis@handshakehealth.co>
2025-03-26 14:40:14 -04:00


from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    from azure.core.credentials import TokenCredential

logger = logging.getLogger(__name__)

class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer)."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: Optional[str] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None,
        azure_credential: Optional["TokenCredential"] = None,
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import DocumentAnalysisFeature
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        # Exactly one of api_key / azure_credential must be supplied.
        if api_key is None and azure_credential is None:
            raise ValueError("Either api_key or azure_credential must be provided.")
        if api_key and azure_credential:
            raise ValueError(
                "Only one of api_key or azure_credential should be provided."
            )
        if api_version is not None:
            kwargs["api_version"] = api_version
        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=azure_credential or AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
        # Features are stored on the parser and passed to each
        # begin_analyze_document call; DocumentAnalysisFeature raises a
        # ValueError for unknown feature names.
        self.features: Optional[List[DocumentAnalysisFeature]] = None
        if analysis_features is not None:
            self.features = [
                DocumentAnalysisFeature(feature) for feature in analysis_features
            ]
        assert self.mode in ["single", "page", "markdown"]
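
    # Keyless auth sketch (assumes the azure-identity package, whose
    # DefaultAzureCredential implements TokenCredential):
    #   from azure.identity import DefaultAzureCredential
    #   parser = AzureAIDocumentIntelligenceParser(
    #       api_endpoint="https://<resource>.cognitiveservices.azure.com/",
    #       azure_credential=DefaultAzureCredential(),
    #   )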

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])
            d = Document(
                page_content=content,
                metadata={
                    "page": p.page_number,
                },
            )
            yield d

    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        yield Document(page_content=result.content, metadata=result.as_dict())

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(
                self.api_model,
                body=file_obj,
                content_type="application/octet-stream",
                output_content_format="markdown" if self.mode == "markdown" else "text",
                features=self.features,
            )
            result = poller.result()

            if self.mode in ["single", "markdown"]:
                yield from self._generate_docs_single(result)
            elif self.mode in ["page"]:
                yield from self._generate_docs_page(result)
            else:
                raise ValueError(f"Invalid mode: {self.mode}")
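
    # Typical use sketch (illustrative file name; Blob.from_path is the
    # standard LangChain helper for wrapping a local file):
    #   blob = Blob.from_path("invoice.pdf")
    #   for doc in parser.lazy_parse(blob):
    #       print(doc.metadata, doc.page_content[:80])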

    def parse_url(self, url: str) -> Iterator[Document]:
        """Parse a document that the service fetches from a URL."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            body=AnalyzeDocumentRequest(url_source=url),
            output_content_format="markdown" if self.mode == "markdown" else "text",
            features=self.features,
        )
        result = poller.result()
        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
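
    # Sketch (illustrative URL; the service downloads the document itself,
    # so no local bytes are needed):
    #   docs = list(parser.parse_url("https://example.com/sample.pdf"))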

    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
        """Parse a document passed in as raw bytes."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            body=AnalyzeDocumentRequest(bytes_source=bytes_source),
            output_content_format="markdown" if self.mode == "markdown" else "text",
            features=self.features,
        )
        result = poller.result()
        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
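
For completeness, a minimal end-to-end sketch of the parser in use. The
endpoint, key, and file names are placeholders, and error handling is
omitted:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    AzureAIDocumentIntelligenceParser,
)

parser = AzureAIDocumentIntelligenceParser(
    api_endpoint="https://<resource>.cognitiveservices.azure.com/",
    api_key="<key>",
    mode="markdown",
)

# Local file: wrap it in a Blob and parse lazily.
blob = Blob.from_path("report.pdf")
for doc in parser.lazy_parse(blob):
    print(doc.page_content[:100])

# Raw bytes also work, via parse_bytes.
with open("report.pdf", "rb") as f:
    docs = list(parser.parse_bytes(f.read()))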