Add parser and loader for Azure document intelligence service. (#10136)

Hi,

this PR contains loader / parser for Azure Document intelligence which
is a ML-based service to ingest arbitrary PDFs / images, even if
scanned. The loader generates Documents by pages of the original
document. This is my first contribution to LangChain.

Unfortunately I could not find the correct place for test cases. Happy
to add one if you can point me to the location, but as this is a
cloud-based service, a test would require network access and credentials
- so might be of limited help.

Dependencies: The needed dependency was already part of pyproject.toml,
no change.
Twitter: feel free to mention @LarsAC on the announcement
This commit is contained in:
Lars von Wedel
2023-09-03 23:25:39 +02:00
committed by GitHub
parent 4abe85be57
commit 6d82503eb1
3 changed files with 220 additions and 0 deletions

View File

@@ -244,3 +244,36 @@ class AmazonTextractPDFParser(BaseBlobParser):
page_content=current_text,
metadata={"source": blob.source, "page": current_page},
)
class DocumentIntelligenceParser(BaseBlobParser):
"""Loads a PDF with Azure Document Intelligence
(formerly Forms Recognizer) and chunks at character level."""
def __init__(self, client: Any, model: str):
self.client = client
self.model = model
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
for p in result.pages:
content = " ".join([line.content for line in p.lines])
d = Document(
page_content=content,
metadata={
"source": blob.source,
"page": p.page_number,
},
)
yield d
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
with blob.as_bytes_io() as file_obj:
poller = self.client.begin_analyze_document(self.model, file_obj)
result = poller.result()
docs = self._generate_docs(blob, result)
yield from docs

View File

@@ -16,6 +16,7 @@ from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
@@ -597,3 +598,51 @@ class AmazonTextractPDFLoader(BasePDFLoader):
return 1
else:
raise ValueError(f"unsupported mime type: {blob.mimetype}")
class DocumentIntelligenceLoader(BasePDFLoader):
"""Loads a PDF with Azure Document Intelligence"""
def __init__(
self, file_path: str, client: Any, model: str = "prebuilt-document"
) -> None:
"""
Initialize the object for file processing with Azure Document Intelligence
(formerly Form Recognizer).
This constructor initializes a DocumentIntelligenceParser object to be used
for parsing files using the Azure Document Intelligence API. The load method
generates a Document node including metadata (source blob and page number)
for each page.
Parameters:
-----------
file_path : str
The path to the file that needs to be parsed.
client: Any
A DocumentAnalysisClient to perform the analysis of the blob
model : str
The model name or ID to be used for form recognition in Azure.
Examples:
---------
>>> obj = DocumentIntelligenceLoader(
... file_path="path/to/file",
... client=client,
... model="prebuilt-document"
... )
"""
self.parser = DocumentIntelligenceParser(client=client, model=model)
super().__init__(file_path)
def load(self) -> List[Document]:
"""Load given path as pages."""
return list(self.lazy_load())
def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
blob = Blob.from_path(self.file_path)
yield from self.parser.parse(blob)