Add parser and loader for Azure Document Intelligence service. (#10136)
Hi, this PR contains a loader / parser for Azure Document Intelligence, an ML-based service that ingests arbitrary PDFs / images, even scanned ones. The loader generates Documents by page of the original document.

This is my first contribution to LangChain. Unfortunately I could not find the correct place for test cases. Happy to add one if you can point me to the location, but as this is a cloud-based service, a test would require network access and credentials, so it might be of limited help.

Dependencies: the needed dependency was already part of pyproject.toml, no change.

Twitter: feel free to mention @LarsAC in the announcement.
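For anyone trying the change out locally: both the parser and the loader below expect an already-constructed DocumentAnalysisClient from the azure-ai-formrecognizer package. A minimal sketch of creating one follows; the endpoint and key are placeholders for your own Azure resource and are not part of this PR.

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Placeholder values: substitute the endpoint and key of your own
# Azure Document Intelligence (Form Recognizer) resource.
endpoint = "https://<your-resource>.cognitiveservices.azure.com/"
key = "<your-api-key>"

client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))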
@@ -244,3 +244,36 @@ class AmazonTextractPDFParser(BaseBlobParser):
                    page_content=current_text,
                    metadata={"source": blob.source, "page": current_page},
                )


class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Forms Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs
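For reference, a minimal sketch of driving the new parser directly on a blob, assuming the client created in the snippet above and a placeholder file path; lazy_parse yields one Document per analyzed page.

from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import DocumentIntelligenceParser

parser = DocumentIntelligenceParser(client=client, model="prebuilt-document")
blob = Blob.from_path("path/to/scanned.pdf")  # placeholder path

# Each yielded Document carries the source path and page number in its metadata.
for doc in parser.lazy_parse(blob):
    print(doc.metadata["page"], doc.page_content[:80])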
@@ -16,6 +16,7 @@ from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import (
    AmazonTextractPDFParser,
    DocumentIntelligenceParser,
    PDFMinerParser,
    PDFPlumberParser,
    PyMuPDFParser,
@@ -597,3 +598,51 @@ class AmazonTextractPDFLoader(BasePDFLoader):
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")


class DocumentIntelligenceLoader(BasePDFLoader):
    """Loads a PDF with Azure Document Intelligence"""

    def __init__(
        self, file_path: str, client: Any, model: str = "prebuilt-document"
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be used
        for parsing files using the Azure Document Intelligence API. The load method
        generates a Document node including metadata (source blob and page number)
        for each page.

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """

        self.parser = DocumentIntelligenceParser(client=client, model=model)
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        blob = Blob.from_path(self.file_path)
        yield from self.parser.parse(blob)
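And a matching sketch of the loader-level API added in this hunk, again assuming the client from the first snippet and a placeholder file path; the import path assumes the loader sits in langchain.document_loaders.pdf next to the other PDF loaders shown here.

from langchain.document_loaders.pdf import DocumentIntelligenceLoader

loader = DocumentIntelligenceLoader(
    "path/to/scanned.pdf",  # placeholder path
    client=client,
    model="prebuilt-document",
)

# load() materializes all per-page Documents; lazy_load() yields them one at a time.
docs = loader.load()
for doc in docs:
    print(doc.metadata["source"], doc.metadata["page"])

Since load() simply wraps lazy_load(), large documents can also be consumed page by page without materializing the full list.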