mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-15 22:44:36 +00:00
community[minor]: Azure DocumentIntelligenceLoader/Parser support update with latest SDK (#14389)
- **Description:** Add DocumentIntelligenceLoader & DocumentIntelligenceParser implementation using the latest Azure Document Intelligence SDK with markdown support. The core logic resides in DocumentIntelligenceParser; DocumentIntelligenceLoader is a thin wrapper around the parser. The parser takes api_endpoint and api_key and creates a DocumentIntelligenceClient for the user. Four parsing modes are supported: 1. Markdown (default) 2. Single 3. Page 4. Object. The unit tests and notebook are also updated accordingly. - **Dependencies:** Azure Document Intelligence SDK: azure-ai-documentintelligence [azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python (github.com)](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FAzure%2Fazure-sdk-for-python%2Ftree%2F7c42462ac662522a6fd21b17d2a20f4cd40d0356%2Fsdk%2Fdocumentintelligence%2Fazure-ai-documentintelligence&data=05%7C01%7CZifei.Qian%40microsoft.com%7C298225aa3e31468a863108dbf07374ff%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638368150928704292%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=oE0Sl4HERnMKdbkV9KgBV46Z2xytcQAShdTWf7ZNl%2Bs%3D&reserved=0). --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
|
||||
from langchain_community.document_loaders.parsers.doc_intelligence import (
|
||||
AzureAIDocumentIntelligenceParser,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.docai import DocAIParser
|
||||
from langchain_community.document_loaders.parsers.grobid import GrobidParser
|
||||
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
|
||||
@@ -12,6 +15,7 @@ from langchain_community.document_loaders.parsers.pdf import (
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AzureAIDocumentIntelligenceParser",
|
||||
"BS4HTMLParser",
|
||||
"DocAIParser",
|
||||
"GrobidParser",
|
||||
|
@@ -0,0 +1,122 @@
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
|
||||
|
||||
class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Parse a document with Azure AI Document Intelligence
    (formerly Form Recognizer).

    The analysis result is converted to ``Document`` objects according to
    ``mode``:

    * ``"markdown"`` (default): one document holding the whole content,
      requested from the service in markdown format.
    * ``"single"``: one document holding the whole content as plain text.
    * ``"page"``: one document per page, built by joining the page's lines.
    * ``"object"``: one document per paragraph and one per table.
    """

    # Closed set of values accepted for ``mode``.
    _SUPPORTED_MODES = ("single", "page", "object", "markdown")

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
    ):
        """Create the Document Intelligence client and store the settings.

        Args:
            api_endpoint: Endpoint passed to ``DocumentIntelligenceClient``.
            api_key: Key wrapped in an ``AzureKeyCredential``.
            api_version: Optional API version; only forwarded to the client
                when it is not ``None``.
            api_model: Model id passed to ``begin_analyze_document``.
            mode: One of ``"single"``, ``"page"``, ``"object"``,
                ``"markdown"``.

        Raises:
            ValueError: If ``mode`` is not one of the supported values.
        """
        # Validate first so a bad mode fails before any client is built.
        # A bare ``assert`` would be stripped under ``python -O`` and an
        # invalid mode would then silently behave like "object".
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f"Invalid mode {mode!r}; expected one of "
                f"{list(self._SUPPORTED_MODES)}"
            )

        # Imported lazily so the Azure SDK is only required when this parser
        # is actually instantiated.
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        if api_version is not None:
            kwargs["api_version"] = api_version
        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode

    def _output_content_format(self) -> str:
        """Return the content format requested from the service."""
        return "markdown" if self.mode == "markdown" else "text"

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        """Yield one document per page, joining the page's lines with spaces."""
        for page in result.pages:
            # ``or []`` guards against a page that carries no lines.
            content = " ".join(line.content for line in (page.lines or []))
            yield Document(
                page_content=content,
                metadata={
                    "page": page.page_number,
                },
            )

    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        """Yield a single document holding the whole analysed content."""
        yield Document(page_content=result.content, metadata={})

    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
        """Yield one document per paragraph, then one per table.

        Warning: paragraph content overlaps with table content, so text that
        sits inside a table is emitted by both loops.
        """
        # ``or []`` guards against results without paragraphs / tables.
        for para in result.paragraphs or []:
            region = para.bounding_regions[0]
            yield Document(
                page_content=para.content,
                metadata={
                    "role": para.role,
                    "page": region.page_number,
                    "bounding_box": region.polygon,
                    "type": "paragraph",
                },
            )

        for table in result.tables or []:
            # Use the table's own region; reading the leaked ``para`` loop
            # variable here reported the last paragraph's page/polygon (or
            # raised NameError when there were no paragraphs).
            region = table.bounding_regions[0]
            yield Document(
                # NOTE(review): ``table.cells`` is a list of cell objects, not
                # a string -- confirm that ``Document`` accepts it or
                # serialise it before use.
                page_content=table.cells,  # json object
                metadata={
                    "footnote": table.footnotes,
                    "caption": table.caption,
                    "page": region.page_number,
                    "bounding_box": region.polygon,
                    "row_count": table.row_count,
                    "column_count": table.column_count,
                    "type": "table",
                },
            )

    def _generate_docs(self, result: Any) -> Iterator[Document]:
        """Dispatch an analysis result to the generator matching ``self.mode``."""
        if self.mode in ("single", "markdown"):
            yield from self._generate_docs_single(result)
        elif self.mode == "page":
            # Must compare against the string; comparing against ``["page"]``
            # is always False and made page mode fall through to object mode.
            yield from self._generate_docs_page(result)
        else:
            yield from self._generate_docs_object(result)

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob.

        The blob's bytes are uploaded to the service, the long-running
        operation is awaited, and the result is converted according to
        ``self.mode``.
        """
        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(
                self.api_model,
                file_obj,
                content_type="application/octet-stream",
                output_content_format=self._output_content_format(),
            )
            result = poller.result()

        yield from self._generate_docs(result)

    def parse_url(self, url: str) -> Iterator[Document]:
        """Analyse the document located at ``url`` and yield documents.

        The URL is handed to the service via ``AnalyzeDocumentRequest``;
        no file content is uploaded, so no ``content_type`` is passed.
        """
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            AnalyzeDocumentRequest(url_source=url),
            output_content_format=self._output_content_format(),
        )
        result = poller.result()

        yield from self._generate_docs(result)
|
@@ -542,9 +542,17 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
|
||||
class DocumentIntelligenceParser(BaseBlobParser):
|
||||
"""Loads a PDF with Azure Document Intelligence
|
||||
(formerly Forms Recognizer) and chunks at character level."""
|
||||
(formerly Form Recognizer) and chunks at character level."""
|
||||
|
||||
def __init__(self, client: Any, model: str):
    """Store the client and model, warning that this parser is deprecated.

    Args:
        client: Client object stored on ``self.client``.
        model: Model identifier stored on ``self.model``.
    """
    warnings.warn(
        # Trailing space added: adjacent literals are concatenated, and the
        # message previously read "...DocumentIntelligenceParserand ...".
        "langchain.document_loaders.parsers.pdf.DocumentIntelligenceParser "
        "and langchain.document_loaders.pdf.DocumentIntelligenceLoader"
        " are deprecated. Please upgrade to "
        "langchain.document_loaders.DocumentIntelligenceLoader "
        "for any file parsing purpose using Azure Document Intelligence "
        "service."
    )
    self.client = client
    self.model = model
|
||||
|
||||
|
Reference in New Issue
Block a user