mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-29 04:16:02 +00:00
127 lines
5.0 KiB
Python
127 lines
5.0 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, Iterator, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.document_loaders.blob_loaders import Blob
|
|
from langchain_community.document_loaders.parsers import (
|
|
AzureAIDocumentIntelligenceParser,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from azure.core.credentials import TokenCredential
|
|
|
|
|
|
class AzureAIDocumentIntelligenceLoader(BaseLoader):
    """Load a PDF with Azure Document Intelligence.

    The loader stores exactly the data source arguments it was given
    (``file_path``, ``url_path``, ``bytes_source``) and delegates all parsing
    to an ``AzureAIDocumentIntelligenceParser`` built in the constructor.
    """

    def __init__(
        self,
        api_endpoint: str,
        api_key: Optional[str] = None,
        file_path: Optional[str] = None,
        url_path: Optional[str] = None,
        bytes_source: Optional[bytes] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        *,
        analysis_features: Optional[List[str]] = None,
        azure_credential: Optional["TokenCredential"] = None,
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes a AzureAIDocumentIntelligenceParser object to be
        used for parsing files using the Azure Document Intelligence API. The load
        method generates Documents whose content representations are determined by the
        mode parameter.

        Parameters:
        -----------
        api_endpoint: str
            The API endpoint to use for DocumentIntelligenceClient construction.
        api_key: Optional[str]
            The API key to use for DocumentIntelligenceClient construction.
            Exactly one of api_key or azure_credential must be specified.
        file_path : Optional[str]
            The path to the file that needs to be loaded.
            Either file_path, url_path or bytes_source must be specified.
        url_path : Optional[str]
            The URL to the file that needs to be loaded.
            Either file_path, url_path or bytes_source must be specified.
        bytes_source : Optional[bytes]
            The bytes array of the file that needs to be loaded.
            Either file_path, url_path or bytes_source must be specified.
        api_version: Optional[str]
            The API version for DocumentIntelligenceClient. Setting None to use
            the default value from `azure-ai-documentintelligence` package.
        api_model: str
            Unique document model name. Default value is "prebuilt-layout".
            Note that overriding this default value may result in unsupported
            behavior.
        mode: str
            The type of content representation of the generated Documents.
            Use either "single", "page", or "markdown". Default value is "markdown".
        analysis_features: Optional[List[str]]
            List of optional analysis features, each feature should be passed
            as a str that conforms to the enum `DocumentAnalysisFeature` in
            `azure-ai-documentintelligence` package. Default value is None.
        azure_credential: Optional[TokenCredential]
            The credentials to use for DocumentIntelligenceClient construction, when
            using credentials other than api_key (like AD).
            Exactly one of api_key or azure_credential must be specified.

        Raises:
        -------
        AssertionError
            If none of file_path, url_path and bytes_source is given, if neither
            api_key nor azure_credential is given, or if both api_key and
            azure_credential are given.

        Notes:
        ------
        When more than one data source is given, lazy_load uses the first one
        that is not None, in the order file_path, url_path, bytes_source.

        Examples:
        ---------
        >>> obj = AzureAIDocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     api_endpoint="https://endpoint.azure.com",
        ...     api_key="APIKEY",
        ...     api_version="2023-10-31-preview",
        ...     api_model="prebuilt-layout",
        ...     mode="markdown"
        ... )
        """
        # The checks below are raised explicitly rather than written as
        # ``assert`` statements: asserts are removed when Python runs with -O,
        # which would let an unusable loader be constructed silently. The
        # exception type (AssertionError) and the messages are kept as they
        # were so existing callers observe the same failures.
        if file_path is None and url_path is None and bytes_source is None:
            raise AssertionError(
                "file_path, url_path or bytes_source must be provided"
            )

        if api_key is None and azure_credential is None:
            raise AssertionError(
                "Either api_key or azure_credential must be provided."
            )

        if api_key is not None and azure_credential is not None:
            raise AssertionError(
                "Only one of api_key or azure_credential should be provided."
            )

        self.file_path = file_path
        self.url_path = url_path
        self.bytes_source = bytes_source

        self.parser = AzureAIDocumentIntelligenceParser(
            api_endpoint=api_endpoint,
            api_key=api_key,
            api_version=api_version,
            api_model=api_model,
            mode=mode,
            analysis_features=analysis_features,
            azure_credential=azure_credential,
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load the document as pages.

        Yields the Documents produced by the parser for the first data source
        that is not None, checked in the order file_path, url_path,
        bytes_source.

        Raises:
            ValueError: If all three data source attributes are None, which
                can only happen if they were cleared after construction.
        """
        if self.file_path is not None:
            # Local files are wrapped in a Blob before being handed to the parser.
            blob = Blob.from_path(self.file_path)
            yield from self.parser.parse(blob)
        elif self.url_path is not None:
            yield from self.parser.parse_url(self.url_path)
        elif self.bytes_source is not None:
            yield from self.parser.parse_bytes(self.bytes_source)
        else:
            raise ValueError("No data source provided.")
|