community[minor]: Azure DocumentIntelligenceLoader/Parser support update with latest SDK (#14389)

- **Description:**
Add DocumentIntelligenceLoader & DocumentIntelligenceParser
implementation using the latest Azure Document Intelligence SDK with
markdown support.
The core logic resides in DocumentIntelligenceParser and
DocumentIntelligenceLoader is a mere wrapper of the parser.
The parser will takes api_endpoint and api_key and creates
DocumentIntelligenceClient for the user. 4 parsing modes are supported:
1. Markdown (default)
2. Single
3. Page 
4. Object

UT and notebook are also updated accordingly.

- **Dependencies:** Azure Document Intelligence SDK:
azure-ai-documentintelligence
[azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence
at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python
(github.com)](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FAzure%2Fazure-sdk-for-python%2Ftree%2F7c42462ac662522a6fd21b17d2a20f4cd40d0356%2Fsdk%2Fdocumentintelligence%2Fazure-ai-documentintelligence&data=05%7C01%7CZifei.Qian%40microsoft.com%7C298225aa3e31468a863108dbf07374ff%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638368150928704292%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=oE0Sl4HERnMKdbkV9KgBV46Z2xytcQAShdTWf7ZNl%2Bs%3D&reserved=0).

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
QIAN Zifei
2023-12-22 08:40:27 +08:00
committed by GitHub
parent 129a929d69
commit 2460f977c5
11 changed files with 367 additions and 45 deletions

View File

@@ -77,6 +77,9 @@ from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.document_loaders.diffbot import DiffbotLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.discord import DiscordChatLoader
from langchain_community.document_loaders.doc_intelligence import (
AzureAIDocumentIntelligenceLoader,
)
from langchain_community.document_loaders.docugami import DocugamiLoader
from langchain_community.document_loaders.docusaurus import DocusaurusLoader
from langchain_community.document_loaders.dropbox import DropboxLoader
@@ -247,6 +250,7 @@ __all__ = [
"AssemblyAIAudioTranscriptLoader",
"AsyncHtmlLoader",
"AzureAIDataLoader",
"AzureAIDocumentIntelligenceLoader",
"AzureBlobStorageContainerLoader",
"AzureBlobStorageFileLoader",
"BSHTMLLoader",

View File

@@ -0,0 +1,89 @@
from typing import Iterator, List, Optional
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
AzureAIDocumentIntelligenceParser,
)
class AzureAIDocumentIntelligenceLoader(BaseLoader):
"""Loads a PDF with Azure Document Intelligence"""
def __init__(
self,
api_endpoint: str,
api_key: str,
file_path: Optional[str] = None,
url_path: Optional[str] = None,
api_version: Optional[str] = None,
api_model: str = "prebuilt-layout",
mode: str = "markdown",
) -> None:
"""
Initialize the object for file processing with Azure Document Intelligence
(formerly Form Recognizer).
This constructor initializes a AzureAIDocumentIntelligenceParser object to be
used for parsing files using the Azure Document Intelligence API. The load
method generates Documents whose content representations are determined by the
mode parameter.
Parameters:
-----------
api_endpoint: str
The API endpoint to use for DocumentIntelligenceClient construction.
api_key: str
The API key to use for DocumentIntelligenceClient construction.
file_path : Optional[str]
The path to the file that needs to be loaded.
Either file_path or url_path must be specified.
url_path : Optional[str]
The URL to the file that needs to be loaded.
Either file_path or url_path must be specified.
api_version: Optional[str]
The API version for DocumentIntelligenceClient. Setting None to use
the default value from SDK.
api_model: str
The model name or ID to be used for form recognition in Azure.
Examples:
---------
>>> obj = AzureAIDocumentIntelligenceLoader(
... file_path="path/to/file",
... api_endpoint="https://endpoint.azure.com",
... api_key="APIKEY",
... api_version="2023-10-31-preview",
... model="prebuilt-document"
... )
"""
assert (
file_path is not None or url_path is not None
), "file_path or url_path must be provided"
self.file_path = file_path
self.url_path = url_path
self.parser = AzureAIDocumentIntelligenceParser(
api_endpoint=api_endpoint,
api_key=api_key,
api_version=api_version,
api_model=api_model,
mode=mode,
)
def load(self) -> List[Document]:
"""Load given path as pages."""
return list(self.lazy_load())
def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
if self.file_path is not None:
blob = Blob.from_path(self.file_path)
yield from self.parser.parse(blob)
else:
yield from self.parser.parse_url(self.url_path)

View File

@@ -1,4 +1,7 @@
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from langchain_community.document_loaders.parsers.doc_intelligence import (
AzureAIDocumentIntelligenceParser,
)
from langchain_community.document_loaders.parsers.docai import DocAIParser
from langchain_community.document_loaders.parsers.grobid import GrobidParser
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
@@ -12,6 +15,7 @@ from langchain_community.document_loaders.parsers.pdf import (
)
__all__ = [
"AzureAIDocumentIntelligenceParser",
"BS4HTMLParser",
"DocAIParser",
"GrobidParser",

View File

@@ -0,0 +1,122 @@
from typing import Any, Iterator, Optional
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
class AzureAIDocumentIntelligenceParser(BaseBlobParser):
"""Loads a PDF with Azure Document Intelligence
(formerly Forms Recognizer)."""
def __init__(
self,
api_endpoint: str,
api_key: str,
api_version: Optional[str] = None,
api_model: str = "prebuilt-layout",
mode: str = "markdown",
):
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
kwargs = {}
if api_version is not None:
kwargs["api_version"] = api_version
self.client = DocumentIntelligenceClient(
endpoint=api_endpoint,
credential=AzureKeyCredential(api_key),
headers={"x-ms-useragent": "langchain-parser/1.0.0"},
**kwargs,
)
self.api_model = api_model
self.mode = mode
assert self.mode in ["single", "page", "object", "markdown"]
def _generate_docs_page(self, result: Any) -> Iterator[Document]:
for p in result.pages:
content = " ".join([line.content for line in p.lines])
d = Document(
page_content=content,
metadata={
"page": p.page_number,
},
)
yield d
def _generate_docs_single(self, result: Any) -> Iterator[Document]:
yield Document(page_content=result.content, metadata={})
def _generate_docs_object(self, result: Any) -> Iterator[Document]:
# record relationship between page id and span offset
page_offset = []
for page in result.pages:
# assume that spans only contain 1 element, to double check
page_offset.append(page.spans[0]["offset"])
# paragraph
# warning: paragraph content is overlapping with table content
for para in result.paragraphs:
yield Document(
page_content=para.content,
metadata={
"role": para.role,
"page": para.bounding_regions[0].page_number,
"bounding_box": para.bounding_regions[0].polygon,
"type": "paragraph",
},
)
# table
for table in result.tables:
yield Document(
page_content=table.cells, # json object
metadata={
"footnote": table.footnotes,
"caption": table.caption,
"page": para.bounding_regions[0].page_number,
"bounding_box": para.bounding_regions[0].polygon,
"row_count": table.row_count,
"column_count": table.column_count,
"type": "table",
},
)
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
with blob.as_bytes_io() as file_obj:
poller = self.client.begin_analyze_document(
self.api_model,
file_obj,
content_type="application/octet-stream",
output_content_format="markdown" if self.mode == "markdown" else "text",
)
result = poller.result()
if self.mode in ["single", "markdown"]:
yield from self._generate_docs_single(result)
elif self.mode == ["page"]:
yield from self._generate_docs_page(result)
else:
yield from self._generate_docs_object(result)
def parse_url(self, url: str) -> Iterator[Document]:
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
poller = self.client.begin_analyze_document(
self.api_model,
AnalyzeDocumentRequest(url_source=url),
# content_type="application/octet-stream",
output_content_format="markdown" if self.mode == "markdown" else "text",
)
result = poller.result()
if self.mode in ["single", "markdown"]:
yield from self._generate_docs_single(result)
elif self.mode == ["page"]:
yield from self._generate_docs_page(result)
else:
yield from self._generate_docs_object(result)

View File

@@ -542,9 +542,17 @@ class AmazonTextractPDFParser(BaseBlobParser):
class DocumentIntelligenceParser(BaseBlobParser):
"""Loads a PDF with Azure Document Intelligence
(formerly Forms Recognizer) and chunks at character level."""
(formerly Form Recognizer) and chunks at character level."""
def __init__(self, client: Any, model: str):
warnings.warn(
"langchain.document_loaders.parsers.pdf.DocumentIntelligenceParser"
"and langchain.document_loaders.pdf.DocumentIntelligenceLoader"
" are deprecated. Please upgrade to "
"langchain.document_loaders.DocumentIntelligenceLoader "
"for any file parsing purpose using Azure Document Intelligence "
"service."
)
self.client = client
self.model = model