diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py
index 68a3eb44bd3..d51fa575604 100644
--- a/libs/community/langchain_community/document_loaders/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py
@@ -18,6 +18,7 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
         api_key: str,
         file_path: Optional[str] = None,
         url_path: Optional[str] = None,
+        bytes_source: Optional[bytes] = None,
         api_version: Optional[str] = None,
         api_model: str = "prebuilt-layout",
         mode: str = "markdown",
@@ -41,10 +42,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
             The API key to use for DocumentIntelligenceClient construction.
         file_path : Optional[str]
             The path to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
         url_path : Optional[str]
             The URL to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
+        bytes_source : Optional[bytes]
+            The bytes array of the file that needs to be loaded.
+            Either file_path, url_path or bytes_source must be specified.
         api_version: Optional[str]
             The API version for DocumentIntelligenceClient. Setting None to use
             the default value from `azure-ai-documentintelligence` package.
@@ -73,10 +77,11 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
         """
         assert (
-            file_path is not None or url_path is not None
-        ), "file_path or url_path must be provided"
+            file_path is not None or url_path is not None or bytes_source is not None
+        ), "file_path, url_path or bytes_source must be provided"
         self.file_path = file_path
         self.url_path = url_path
+        self.bytes_source = bytes_source

         self.parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
             api_endpoint=api_endpoint,
@@ -90,9 +95,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
     def lazy_load(
         self,
     ) -> Iterator[Document]:
-        """Lazy load given path as pages."""
+        """Lazy load the document as pages."""
         if self.file_path is not None:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
             yield from self.parser.parse(blob)
-        else:
+        elif self.url_path is not None:
             yield from self.parser.parse_url(self.url_path)  # type: ignore[arg-type]
+        elif self.bytes_source is not None:
+            yield from self.parser.parse_bytes(self.bytes_source)
+        else:
+            raise ValueError("No data source provided.")
diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
index 0be8e7583e7..2d77fcd1f87 100644
--- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -109,3 +109,21 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
             yield from self._generate_docs_page(result)
         else:
             raise ValueError(f"Invalid mode: {self.mode}")
+
+    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
+            # content_type="application/octet-stream",
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        if self.mode in ["single", "markdown"]:
+            yield from self._generate_docs_single(result)
+        elif self.mode in ["page"]:
+            yield from self._generate_docs_page(result)
+        else:
+            raise ValueError(f"Invalid mode: {self.mode}")
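For context, a minimal usage sketch of the new `bytes_source` path, based on the loader signature in this diff. The endpoint, key, and `example.pdf` file name are placeholders, not values from the change itself:

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Read a local PDF into memory; any in-memory bytes (e.g. from an upload) work the same way.
with open("example.pdf", "rb") as f:
    pdf_bytes = f.read()

# Placeholder endpoint/key -- substitute real Azure Document Intelligence credentials.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="https://<resource>.cognitiveservices.azure.com/",
    api_key="<key>",
    bytes_source=pdf_bytes,
    api_model="prebuilt-layout",
    mode="markdown",
)

docs = loader.load()  # or loader.lazy_load() for an iterator of Documents
```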