mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 15:04:13 +00:00
community[patch]: Microsoft Azure Document Intelligence updates (#16932)
- **Description:** Update Azure Document Intelligence implementation by Microsoft team and RAG cookbook with Azure AI Search

---------

Co-authored-by: Lu Zhang (AI) <luzhan@microsoft.com>
Co-authored-by: Yateng Hong <yatengh@microsoft.com>
Co-authored-by: teethache <hongyateng2006@126.com>
Co-authored-by: Lu Zhang <44625949+luzhang06@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
from typing import Any, Iterator, Optional
|
||||
import logging
|
||||
from typing import Any, Iterator, List, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
||||
"""Loads a PDF with Azure Document Intelligence
|
||||
@@ -17,22 +20,43 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
||||
api_version: Optional[str] = None,
|
||||
api_model: str = "prebuilt-layout",
|
||||
mode: str = "markdown",
|
||||
analysis_features: Optional[List[str]] = None,
|
||||
):
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import DocumentAnalysisFeature
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
kwargs = {}
|
||||
if api_version is not None:
|
||||
kwargs["api_version"] = api_version
|
||||
|
||||
if analysis_features is not None:
|
||||
_SUPPORTED_FEATURES = [
|
||||
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,
|
||||
]
|
||||
|
||||
analysis_features = [
|
||||
DocumentAnalysisFeature(feature) for feature in analysis_features
|
||||
]
|
||||
if any(
|
||||
[feature not in _SUPPORTED_FEATURES for feature in analysis_features]
|
||||
):
|
||||
logger.warning(
|
||||
f"The current supported features are: "
|
||||
f"{[f.value for f in _SUPPORTED_FEATURES]}. "
|
||||
"Using other features may result in unexpected behavior."
|
||||
)
|
||||
|
||||
self.client = DocumentIntelligenceClient(
|
||||
endpoint=api_endpoint,
|
||||
credential=AzureKeyCredential(api_key),
|
||||
headers={"x-ms-useragent": "langchain-parser/1.0.0"},
|
||||
features=analysis_features,
|
||||
**kwargs,
|
||||
)
|
||||
self.api_model = api_model
|
||||
self.mode = mode
|
||||
assert self.mode in ["single", "page", "object", "markdown"]
|
||||
assert self.mode in ["single", "page", "markdown"]
|
||||
|
||||
def _generate_docs_page(self, result: Any) -> Iterator[Document]:
|
||||
for p in result.pages:
|
||||
@@ -49,41 +73,6 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
||||
def _generate_docs_single(self, result: Any) -> Iterator[Document]:
    """Yield a single Document holding the full content of ``result``.

    Args:
        result: Analysis result whose ``content`` attribute is used as the
            page content.

    Yields:
        Exactly one Document, with empty metadata.
    """
    whole_document = Document(page_content=result.content, metadata={})
    yield whole_document
|
||||
|
||||
def _generate_docs_object(self, result: Any) -> Iterator[Document]:
    """Yield one Document per paragraph, then one per table, of ``result``.

    Args:
        result: Analysis result exposing ``paragraphs`` and ``tables``
            collections; either may be ``None`` or empty, in which case
            nothing is yielded for it.

    Yields:
        Paragraph documents (``"type": "paragraph"``) carrying the paragraph
        role, page number and bounding polygon, followed by table documents
        (``"type": "table"``) carrying footnotes, caption, page number,
        bounding polygon and row/column counts.
    """
    # paragraph
    # warning: paragraph content is overlapping with table content
    for para in result.paragraphs or []:
        para_region = para.bounding_regions[0]
        yield Document(
            page_content=para.content,
            metadata={
                "role": para.role,
                "page": para_region.page_number,
                "bounding_box": para_region.polygon,
                "type": "paragraph",
            },
        )

    # table
    for table in result.tables or []:
        # Locate the table by its own first bounding region rather than a
        # paragraph's, so the page and polygon describe this table and the
        # loop also works when the result has no paragraphs.
        table_region = table.bounding_regions[0]
        yield Document(
            # NOTE(review): the cells are passed through unchanged; confirm
            # that Document accepts a non-string page_content here.
            page_content=table.cells,  # json object
            metadata={
                "footnote": table.footnotes,
                "caption": table.caption,
                "page": table_region.page_number,
                "bounding_box": table_region.polygon,
                "row_count": table.row_count,
                "column_count": table.column_count,
                "type": "table",
            },
        )
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
|
||||
@@ -101,7 +90,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
||||
elif self.mode in ["page"]:
|
||||
yield from self._generate_docs_page(result)
|
||||
else:
|
||||
yield from self._generate_docs_object(result)
|
||||
raise ValueError(f"Invalid mode: {self.mode}")
|
||||
|
||||
def parse_url(self, url: str) -> Iterator[Document]:
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
@@ -119,4 +108,4 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
||||
elif self.mode in ["page"]:
|
||||
yield from self._generate_docs_page(result)
|
||||
else:
|
||||
yield from self._generate_docs_object(result)
|
||||
raise ValueError(f"Invalid mode: {self.mode}")
|
||||
|
Reference in New Issue
Block a user