mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-04 10:42:55 +00:00
community: bytes as a source to AzureAIDocumentIntelligenceLoader (#26618)
- **Description:** This PR adds functionality to pass in in-memory bytes as a source to `AzureAIDocumentIntelligenceLoader`.
- **Issue:** I needed the functionality, so I added it.
- **Dependencies:** NA
- **Twitter handle:** @akseljoonas if this is a big enough change :)

---------

Co-authored-by: Aksel Joonas Reedi <aksel@klippa.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
7a9149f5dd
commit
2cb39270ec
@ -18,6 +18,7 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
|
|||||||
api_key: str,
|
api_key: str,
|
||||||
file_path: Optional[str] = None,
|
file_path: Optional[str] = None,
|
||||||
url_path: Optional[str] = None,
|
url_path: Optional[str] = None,
|
||||||
|
bytes_source: Optional[bytes] = None,
|
||||||
api_version: Optional[str] = None,
|
api_version: Optional[str] = None,
|
||||||
api_model: str = "prebuilt-layout",
|
api_model: str = "prebuilt-layout",
|
||||||
mode: str = "markdown",
|
mode: str = "markdown",
|
||||||
@ -41,10 +42,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
|
|||||||
The API key to use for DocumentIntelligenceClient construction.
|
The API key to use for DocumentIntelligenceClient construction.
|
||||||
file_path : Optional[str]
|
file_path : Optional[str]
|
||||||
The path to the file that needs to be loaded.
|
The path to the file that needs to be loaded.
|
||||||
Either file_path or url_path must be specified.
|
Either file_path, url_path or bytes_source must be specified.
|
||||||
url_path : Optional[str]
|
url_path : Optional[str]
|
||||||
The URL to the file that needs to be loaded.
|
The URL to the file that needs to be loaded.
|
||||||
Either file_path or url_path must be specified.
|
Either file_path, url_path or bytes_source must be specified.
|
||||||
|
bytes_source : Optional[bytes]
|
||||||
|
The bytes array of the file that needs to be loaded.
|
||||||
|
Either file_path, url_path or bytes_source must be specified.
|
||||||
api_version: Optional[str]
|
api_version: Optional[str]
|
||||||
The API version for DocumentIntelligenceClient. Setting None to use
|
The API version for DocumentIntelligenceClient. Setting None to use
|
||||||
the default value from `azure-ai-documentintelligence` package.
|
the default value from `azure-ai-documentintelligence` package.
|
||||||
@ -73,10 +77,11 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
file_path is not None or url_path is not None
|
file_path is not None or url_path is not None or bytes_source is not None
|
||||||
), "file_path or url_path must be provided"
|
), "file_path, url_path or bytes_source must be provided"
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.url_path = url_path
|
self.url_path = url_path
|
||||||
|
self.bytes_source = bytes_source
|
||||||
|
|
||||||
self.parser = AzureAIDocumentIntelligenceParser( # type: ignore[misc]
|
self.parser = AzureAIDocumentIntelligenceParser( # type: ignore[misc]
|
||||||
api_endpoint=api_endpoint,
|
api_endpoint=api_endpoint,
|
||||||
@ -90,9 +95,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
|
|||||||
def lazy_load(
|
def lazy_load(
|
||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
"""Lazy load given path as pages."""
|
"""Lazy load the document as pages."""
|
||||||
if self.file_path is not None:
|
if self.file_path is not None:
|
||||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||||
yield from self.parser.parse(blob)
|
yield from self.parser.parse(blob)
|
||||||
else:
|
elif self.url_path is not None:
|
||||||
yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]
|
yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]
|
||||||
|
elif self.bytes_source is not None:
|
||||||
|
yield from self.parser.parse_bytes(self.bytes_source)
|
||||||
|
else:
|
||||||
|
raise ValueError("No data source provided.")
|
||||||
|
@ -109,3 +109,21 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
|
|||||||
yield from self._generate_docs_page(result)
|
yield from self._generate_docs_page(result)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid mode: {self.mode}")
|
raise ValueError(f"Invalid mode: {self.mode}")
|
||||||
|
|
||||||
|
def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
|
||||||
|
poller = self.client.begin_analyze_document(
|
||||||
|
self.api_model,
|
||||||
|
analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
|
||||||
|
# content_type="application/octet-stream",
|
||||||
|
output_content_format="markdown" if self.mode == "markdown" else "text",
|
||||||
|
)
|
||||||
|
result = poller.result()
|
||||||
|
|
||||||
|
if self.mode in ["single", "markdown"]:
|
||||||
|
yield from self._generate_docs_single(result)
|
||||||
|
elif self.mode in ["page"]:
|
||||||
|
yield from self._generate_docs_page(result)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid mode: {self.mode}")
|
||||||
|
Loading…
Reference in New Issue
Block a user