Thank you for contributing to LangChain!

- **Description:** Azure's Document Intelligence OCR solution has a *features* parameter that enables capabilities such as high-resolution document analysis, key-value pair extraction, and more. In the LangChain parser, it could be provided as an `analysis_features` parameter to the constructor, which passed it on to the `DocumentIntelligenceClient`. However, according to the `DocumentIntelligenceClient` [API Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.documentintelligenceclient?view=azure-python), this is not a valid constructor parameter. It was therefore removed and is instead stored as a parser property that is passed to `begin_analyze_document`'s `features` parameter (see [API Reference](https://learn.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.documentanalysisclient?view=azure-python#azure-ai-formrecognizer-documentanalysisclient-begin-analyze-document)). I also removed the check for "supported features", since all features are supported out of the box. I also did not check whether each provided `str` corresponds to a member of the Azure package's feature enumeration, since the `ValueError` raised when constructing the enumeration object is explicit enough. One last caveat: some features are not supported for certain kinds of documents. This is covered in the Microsoft documentation, and the resulting exceptions are also explicit.
- **Issue:** N/A
- **Dependencies:** No
- **Twitter handle:** @Louis___A

---------

Co-authored-by: Louis Auneau <louis@handshakehealth.co>
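For illustration, a minimal usage sketch of the reworked parameter. It assumes the parser is exported from `langchain_community.document_loaders.parsers`; the feature names shown (`"ocrHighResolution"`, `"keyValuePairs"`) are examples of `DocumentAnalysisFeature` enumeration values, and the endpoint and key are placeholders:

```python
from langchain_community.document_loaders.parsers import (
    AzureAIDocumentIntelligenceParser,
)

# `analysis_features` takes plain strings; the parser converts each into a
# DocumentAnalysisFeature member and forwards the list to
# begin_analyze_document(features=...) rather than to the client constructor.
parser = AzureAIDocumentIntelligenceParser(
    api_endpoint="https://<resource>.cognitiveservices.azure.com/",  # placeholder
    api_key="<api-key>",  # placeholder
    api_model="prebuilt-layout",
    mode="markdown",
    analysis_features=["ocrHighResolution", "keyValuePairs"],  # example names
)
```

An unknown feature string fails fast: `DocumentAnalysisFeature("notAFeature")` raises a `ValueError` listing the valid members, which is why the parser does no validation of its own.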
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    from azure.core.credentials import TokenCredential

logger = logging.getLogger(__name__)


class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer)."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: Optional[str] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None,
        azure_credential: Optional["TokenCredential"] = None,
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import DocumentAnalysisFeature
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}

        # Exactly one authentication method must be supplied.
        if api_key is None and azure_credential is None:
            raise ValueError("Either api_key or azure_credential must be provided.")

        if api_key and azure_credential:
            raise ValueError(
                "Only one of api_key or azure_credential should be provided."
            )

        if api_version is not None:
            kwargs["api_version"] = api_version

        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=azure_credential or AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
        # Analysis features are not a valid client-constructor argument; they
        # are kept on the parser and passed to begin_analyze_document's
        # `features` parameter. DocumentAnalysisFeature(feature) raises an
        # explicit ValueError for unknown names, so no extra validation is
        # done here.
        self.features: Optional[List[DocumentAnalysisFeature]] = None
        if analysis_features is not None:
            self.features = [
                DocumentAnalysisFeature(feature) for feature in analysis_features
            ]
        assert self.mode in ["single", "page", "markdown"]

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        """Yield one Document per analyzed page."""
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "page": p.page_number,
                },
            )
            yield d

    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        """Yield a single Document containing the full analysis result."""
        yield Document(page_content=result.content, metadata=result.as_dict())

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(
                self.api_model,
                body=file_obj,
                content_type="application/octet-stream",
                output_content_format="markdown" if self.mode == "markdown" else "text",
                features=self.features,
            )
            result = poller.result()

            if self.mode in ["single", "markdown"]:
                yield from self._generate_docs_single(result)
            elif self.mode in ["page"]:
                yield from self._generate_docs_page(result)
            else:
                raise ValueError(f"Invalid mode: {self.mode}")

    def parse_url(self, url: str) -> Iterator[Document]:
        """Parse a document that the service fetches from a public URL."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            body=AnalyzeDocumentRequest(url_source=url),
            output_content_format="markdown" if self.mode == "markdown" else "text",
            features=self.features,
        )
        result = poller.result()

        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")

    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
        """Parse a document provided as in-memory bytes."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            body=AnalyzeDocumentRequest(bytes_source=bytes_source),
            output_content_format="markdown" if self.mode == "markdown" else "text",
            features=self.features,
        )
        result = poller.result()

        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
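For reference, a hypothetical end-to-end sketch of the three entry points above; the file path and URL are placeholders, and `parser` is assumed to be configured as in the snippet under the PR description:

```python
from langchain_community.document_loaders.blob_loaders import Blob

# Local file: wrap it in a Blob and stream it through lazy_parse(), which
# uploads the bytes with content_type="application/octet-stream".
docs = list(parser.lazy_parse(Blob.from_path("example.pdf")))  # placeholder path

# Remote file: the service downloads the document itself via
# AnalyzeDocumentRequest(url_source=...).
docs = list(parser.parse_url("https://example.com/example.pdf"))  # placeholder URL

# In-memory bytes, via AnalyzeDocumentRequest(bytes_source=...).
with open("example.pdf", "rb") as f:  # placeholder path
    docs = list(parser.parse_bytes(f.read()))
```

In all three cases the configured `self.features` list rides along on the `features` parameter of `begin_analyze_document`, which is the point of this change.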