community: bytes as a source to AzureAIDocumentIntelligenceLoader (#26618)

- **Description:** This PR adds support for passing in-memory bytes as a source to `AzureAIDocumentIntelligenceLoader`.
- **Issue:** I needed the functionality, so I added it.
- **Dependencies:** N/A
- **Twitter handle:** @akseljoonas, if this is a big enough change :)

---------

Co-authored-by: Aksel Joonas Reedi <aksel@klippa.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2025-08-02 01:23:07 +00:00 · 2024-11-07 04:40:21 +01:00 · 2024-11-07 04:40:21 +01:00 · 2cb39270ec
commit 2cb39270ec
parent 7a9149f5dd
2 changed files with 33 additions and 6 deletions
--- a/libs/community/langchain_community/document_loaders/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py
@ -18,6 +18,7 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
        api_key: str,
        file_path: Optional[str] = None,
        url_path: Optional[str] = None,
+        bytes_source: Optional[bytes] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
@ -41,10 +42,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
            The API key to use for DocumentIntelligenceClient construction.
        file_path : Optional[str]
            The path to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
        url_path : Optional[str]
            The URL to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
+        bytes_source : Optional[bytes]
+            The bytes array of the file that needs to be loaded.
+            Either file_path, url_path or bytes_source must be specified.
        api_version: Optional[str]
            The API version for DocumentIntelligenceClient. Setting None to use
            the default value from `azure-ai-documentintelligence` package.
@ -73,10 +77,11 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
        """

        assert (
-            file_path is not None or url_path is not None
-        ), "file_path or url_path must be provided"
+            file_path is not None or url_path is not None or bytes_source is not None
+        ), "file_path, url_path or bytes_source must be provided"
        self.file_path = file_path
        self.url_path = url_path
+        self.bytes_source = bytes_source

        self.parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
            api_endpoint=api_endpoint,
@ -90,9 +95,13 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
    def lazy_load(
        self,
    ) -> Iterator[Document]:
-        """Lazy load given path as pages."""
+        """Lazy load the document as pages."""
        if self.file_path is not None:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
            yield from self.parser.parse(blob)
-        else:
+        elif self.url_path is not None:
            yield from self.parser.parse_url(self.url_path)  # type: ignore[arg-type]
+        elif self.bytes_source is not None:
+            yield from self.parser.parse_bytes(self.bytes_source)
+        else:
+            raise ValueError("No data source provided.")
--- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@ -109,3 +109,21 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
+
+    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
+            # content_type="application/octet-stream",
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        if self.mode in ["single", "markdown"]:
+            yield from self._generate_docs_single(result)
+        elif self.mode in ["page"]:
+            yield from self._generate_docs_page(result)
+        else:
+            raise ValueError(f"Invalid mode: {self.mode}")