community: add AzureOpenAIWhisperParser (#27796)

Commandeered from https://github.com/langchain-ai/langchain/pull/26757.

---------

Co-authored-by: Sheepsta300 <128811766+Sheepsta300@users.noreply.github.com>
ccurme 2024-10-31 12:37:41 -04:00 committed by GitHub
parent b631b0a596
commit 0172d938b4
4 changed files with 513 additions and 10 deletions


@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Azure OpenAI Whisper Parser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
">[Azure OpenAI Whisper Parser](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/whisper-overview) is a wrapper around the Azure OpenAI Whisper API, which uses machine learning to transcribe audio files to English text.\n",
">\n",
">The Parser supports `.mp3`, `.mp4`, `.mpeg`, `.mpga`, `.m4a`, `.wav`, and `.webm`.\n",
"\n",
"The current implementation follows LangChain core principles and can be used with other loaders to handle both audio downloading and parsing. As a result, the parser yields an `Iterator[Document]`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The service requires Azure credentials, an Azure endpoint, and a Whisper model deployment, which can be set up by following the guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new%2Cjavascript&pivots=programming-language-python). Furthermore, the required dependencies must be installed.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -Uq langchain langchain-community openai"
]
},
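{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of passing credentials to the constructor, you can export them as environment variables, which the parser falls back to by default. A minimal sketch; the variable names match the parser's defaults, and the values shown are placeholders:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# Prompt for the key so it is not stored in the notebook\n",
"if \"AZURE_OPENAI_API_KEY\" not in os.environ:\n",
"    os.environ[\"AZURE_OPENAI_API_KEY\"] = getpass.getpass(\"Azure OpenAI API key: \")\n",
"os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"https://<your-endpoint>.openai.azure.com/\"\n",
"os.environ[\"OPENAI_API_VERSION\"] = \"<your_api_version>\""
]
},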
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `AzureOpenAIWhisperParser`'s `lazy_parse` method accepts a `Blob` object containing the path of the file to be transcribed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents.base import Blob\n",
"\n",
"audio_path = \"path/to/your/audio/file\"\n",
"audio_blob = Blob(path=audio_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser\n",
"\n",
"endpoint = \"<your_endpoint>\"\n",
"key = \"<your_api_key\"\n",
"version = \"<your_api_version>\"\n",
"name = \"<your_deployment_name>\"\n",
"\n",
"parser = AzureOpenAIWhisperParser(\n",
" api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name\n",
")"
]
},
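{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you prefer Microsoft Entra ID (Azure AD) authentication over API keys, the parser also accepts an `azure_ad_token_provider` callable. A minimal sketch, assuming the `azure-identity` package is installed; `parser_ad` and `token_provider` are illustrative names:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
"\n",
"# Callable that returns a fresh Entra ID token for the Cognitive Services scope\n",
"token_provider = get_bearer_token_provider(\n",
"    DefaultAzureCredential(), \"https://cognitiveservices.azure.com/.default\"\n",
")\n",
"\n",
"parser_ad = AzureOpenAIWhisperParser(\n",
"    azure_endpoint=endpoint,\n",
"    api_version=version,\n",
"    deployment_name=name,\n",
"    azure_ad_token_provider=token_provider,\n",
")"
]
},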
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documents = parser.lazy_parse(blob=audio_blob)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in documents:\n",
" print(doc.page_content)"
]
},
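{
"cell_type": "markdown",
"metadata": {},
"source": [
"`lazy_parse` transcribes on demand as you iterate. If you would rather have all results up front, the parser inherits an eager `parse` method from `BaseBlobParser` that returns a list of `Document`s:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Eagerly materialize the transcription as a list of Documents\n",
"parsed_docs = parser.parse(blob=audio_blob)\n",
"print(parsed_docs[0].page_content)"
]
},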
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `AzureOpenAIWhisperParser` can also be used in conjunction with audio loaders, such as the `YoutubeAudioLoader`, via a `GenericLoader`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders.blob_loaders.youtube_audio import (\n",
" YoutubeAudioLoader,\n",
")\n",
"from langchain_community.document_loaders.generic import GenericLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Must be a list\n",
"url = [\"www.youtube.url.com\"]\n",
"\n",
"save_dir = \"save/directory/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"name = \"<your_deployment_name>\"\n",
"\n",
"loader = GenericLoader(\n",
" YoutubeAudioLoader(url, save_dir), AzureOpenAIWhisperParser(deployment_name=name)\n",
")\n",
"\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in docs:\n",
" print(doc.page_content)"
]
},
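{
"cell_type": "markdown",
"metadata": {},
"source": [
"For long videos or playlists, `GenericLoader` also provides `lazy_load`, which downloads and transcribes one document at a time instead of materializing everything at once. A short sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stream documents one at a time rather than loading them all up front\n",
"for doc in loader.lazy_load():\n",
"    print(doc.page_content)"
]
}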
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -1,7 +1,8 @@
import io
import logging
import os
import time
from typing import Any, Dict, Iterator, Literal, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union
from langchain_core.documents import Document
@ -12,6 +13,218 @@ from langchain_community.utils.openai import is_openai_v1
logger = logging.getLogger(__name__)
class AzureOpenAIWhisperParser(BaseBlobParser):
"""
Transcribe and parse audio files using Azure OpenAI Whisper.
This parser integrates with the Azure OpenAI Whisper model to transcribe
audio files. It differs from the standard OpenAI Whisper parser in that it
requires an Azure endpoint and credentials. The parser is limited to files
under 25 MB.
**Note**:
This parser uses the Azure OpenAI API, providing integration with the Azure
ecosystem and making it suitable for workflows involving other Azure services.
For files larger than 25 MB, consider using Azure AI Speech batch transcription:
https://learn.microsoft.com/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model
Setup:
1. Follow the instructions here to deploy Azure Whisper:
https://learn.microsoft.com/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python
2. Install ``langchain-community`` and ``openai``, and set the following
environment variables:
.. code-block:: bash
pip install -U langchain-community openai
export AZURE_OPENAI_API_KEY="your-api-key"
export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/"
export OPENAI_API_VERSION="your-api-version"
Example Usage:
.. code-block:: python
from langchain_community.document_loaders.parsers.audio import (
AzureOpenAIWhisperParser,
)
from langchain_core.documents.base import Blob
whisper_parser = AzureOpenAIWhisperParser(
deployment_name="your-whisper-deployment",
api_version="2024-06-01",
api_key="your-api-key",
# other params...
)
audio_blob = Blob(path="your-audio-file-path")
response = whisper_parser.lazy_parse(audio_blob)
for document in response:
print(document.page_content)
Integration with Other Loaders:
The AzureOpenAIWhisperParser can be used with video/audio loaders and
`GenericLoader` to automate retrieval and parsing.
YoutubeAudioLoader Example:
.. code-block:: python
from langchain_community.document_loaders.blob_loaders import (
YoutubeAudioLoader
)
from langchain_community.document_loaders.generic import GenericLoader
# Must be a list
youtube_url = ["https://your-youtube-url"]
save_dir = "directory-to-download-videos"
loader = GenericLoader(
YoutubeAudioLoader(youtube_url, save_dir),
AzureOpenAIWhisperParser(deployment_name="your-deployment-name")
)
docs = loader.load()
"""
def __init__(
self,
*,
api_key: Optional[str] = None,
azure_endpoint: Optional[str] = None,
api_version: Optional[str] = None,
azure_ad_token_provider: Union[Callable[[], str], None] = None,
language: Optional[str] = None,
prompt: Optional[str] = None,
response_format: Union[
Literal["json", "text", "srt", "verbose_json", "vtt"], None
] = None,
temperature: Optional[float] = None,
deployment_name: str,
max_retries: int = 3,
):
"""
Initialize the AzureOpenAIWhisperParser.
Args:
api_key (Optional[str]):
Azure OpenAI API key. If not provided, defaults to the
`AZURE_OPENAI_API_KEY` environment variable.
azure_endpoint (Optional[str]):
Azure OpenAI service endpoint. Defaults to `AZURE_OPENAI_ENDPOINT`
environment variable if not set.
api_version (Optional[str]):
API version to use. Defaults to the `OPENAI_API_VERSION`
environment variable.
azure_ad_token_provider (Union[Callable[[], str], None]):
Callable that returns an Azure Active Directory token, for
token-based authentication (if applicable).
language (Optional[str]):
Language in which the request should be processed.
prompt (Optional[str]):
Custom instructions or prompt for the Whisper model.
response_format (Union[str, None]):
The desired output format. Options: "json", "text", "srt",
"verbose_json", "vtt".
temperature (Optional[float]):
Controls the randomness of the model's output.
deployment_name (str):
The deployment name of the Whisper model.
max_retries (int):
Maximum number of retries for failed API requests.
Raises:
ImportError:
If the required package `openai` is not installed.
"""
self.api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
self.azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
self.api_version = api_version or os.environ.get("OPENAI_API_VERSION")
self.azure_ad_token_provider = azure_ad_token_provider
self.language = language
self.prompt = prompt
self.response_format = response_format
self.temperature = temperature
self.deployment_name = deployment_name
self.max_retries = max_retries
try:
import openai
except ImportError:
raise ImportError(
"openai package not found, please install it with "
"`pip install openai`"
)
if is_openai_v1():
self._client = openai.AzureOpenAI(
api_key=self.api_key,
azure_endpoint=self.azure_endpoint,
api_version=self.api_version,
max_retries=self.max_retries,
azure_ad_token_provider=self.azure_ad_token_provider,
)
else:
if self.api_key:
openai.api_key = self.api_key
if self.azure_endpoint:
openai.api_base = self.azure_endpoint
if self.api_version:
openai.api_version = self.api_version
openai.api_type = "azure"
self._client = openai
@property
def _create_params(self) -> Dict[str, Any]:
params = {
"language": self.language,
"prompt": self.prompt,
"response_format": self.response_format,
"temperature": self.temperature,
}
# Drop unset options so the API's defaults apply
return {k: v for k, v in params.items() if v is not None}
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""
Lazily parse the provided audio blob for transcription.
Args:
blob (Blob):
The audio file in Blob format to be transcribed.
Yields:
Document:
Parsed transcription from the audio file.
Raises:
Exception:
If an error occurs during transcription.
"""
# Transcribe the audio, closing the file handle when done
with open(str(blob.path), "rb") as file_obj:
if is_openai_v1():
transcript = self._client.audio.transcriptions.create(
model=self.deployment_name,
file=file_obj,
**self._create_params,
)
else:
transcript = self._client.Audio.transcribe(
model=self.deployment_name,
deployment_id=self.deployment_name,
file=file_obj,
**self._create_params,
)
yield Document(
page_content=transcript.text
if not isinstance(transcript, str)
else transcript,
metadata={"source": blob.source},
)
class OpenAIWhisperParser(BaseBlobParser):
"""Transcribe and parse audio files.
@ -19,7 +232,7 @@ class OpenAIWhisperParser(BaseBlobParser):
Args:
api_key: OpenAI API key
chunk_duration_threshold: minimum duration of a chunk in seconds
chunk_duration_threshold: Minimum duration of a chunk in seconds
NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
seconds. If the chunk duration is less than or equal to the threshold,
it will be skipped.
@ -61,8 +274,6 @@ class OpenAIWhisperParser(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
import openai
except ImportError:
@ -85,11 +296,11 @@ class OpenAIWhisperParser(BaseBlobParser):
if self.api_key:
openai.api_key = self.api_key
if self.base_url:
openai.base_url = self.base_url
openai.api_base = self.base_url
# Audio file from disk
audio = AudioSegment.from_file(blob.path)
audio = AudioSegment.from_file(blob.path)
# Define the duration of each chunk in minutes
# Need to meet 25MB size limit for Whisper API
chunk_duration = 20
@ -240,8 +451,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
from pydub import AudioSegment
except ImportError:
@ -436,8 +645,6 @@ class FasterWhisperParser(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
from pydub import AudioSegment
except ImportError:

Binary file not shown.


@ -0,0 +1,104 @@
"""Tests for the Azure OpenAI Whisper parser."""
from pathlib import Path
from typing import Any
from unittest.mock import Mock, patch
import pytest
from langchain_core.documents import Document
from langchain_core.documents.base import Blob
from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser
_THIS_DIR = Path(__file__).parents[3]
_EXAMPLES_DIR = _THIS_DIR / "examples"
AUDIO_M4A = _EXAMPLES_DIR / "hello_world.m4a"
@pytest.mark.requires("openai")
@patch("openai.AzureOpenAI")
def test_azure_openai_whisper(mock_client: Mock) -> None:
endpoint = "endpoint"
key = "key"
version = "115"
name = "model"
parser = AzureOpenAIWhisperParser(
api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name
)
mock_client.assert_called_once_with(
api_key=key,
azure_endpoint=endpoint,
api_version=version,
max_retries=3,
azure_ad_token_provider=None,
)
assert parser._client == mock_client()
@pytest.mark.requires("openai")
def test_is_openai_v1_lazy_parse(mocker: Any) -> None:
endpoint = "endpoint"
key = "key"
version = "115"
name = "model"
mock_blob = mocker.Mock(spec=Blob)
mock_blob.path = AUDIO_M4A
mock_blob.source = "test_source"
mock_openai_client = mocker.Mock()
mock_openai_client.audio.transcriptions.create.return_value = mocker.Mock()
mock_openai_client.audio.transcriptions.create.return_value.text = (
"Transcribed text"
)
mocker.patch("langchain_community.utils.openai.is_openai_v1", return_value=True)
parser = AzureOpenAIWhisperParser(
api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name
)
parser._client = mock_openai_client
result = list(parser.lazy_parse(mock_blob))
assert len(result) == 1
assert isinstance(result[0], Document)
assert result[0].page_content == "Transcribed text"
assert result[0].metadata["source"] == "test_source"
@pytest.mark.requires("openai")
def test_is_not_openai_v1_lazy_parse(mocker: Any) -> None:
endpoint = "endpoint"
key = "key"
version = "115"
name = "model"
mock_blob = mocker.Mock(spec=Blob)
mock_blob.path = AUDIO_M4A
mock_blob.source = "test_source"
mock_openai_client = mocker.Mock()
mock_openai_client.audio.transcriptions.create.return_value = mocker.Mock()
mock_openai_client.audio.transcriptions.create.return_value.text = (
"Transcribed text"
)
mocker.patch("langchain_community.utils.openai.is_openai_v1", return_value=False)
parser = AzureOpenAIWhisperParser(
api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name
)
parser._client = mock_openai_client
result = list(parser.lazy_parse(mock_blob))
assert len(result) == 1
assert isinstance(result[0], Document)
assert result[0].page_content == "Transcribed text"
assert result[0].metadata["source"] == "test_source"