community: add AzureOpenAIWhisperParser (#27796)

Commandeered from https://github.com/langchain-ai/langchain/pull/26757. --------- Co-authored-by: Sheepsta300 <128811766+Sheepsta300@users.noreply.github.com>
2025-09-03 03:59:42 +00:00 · 2024-10-31 12:37:41 -04:00
parent b631b0a596
commit 0172d938b4
4 changed files with 513 additions and 10 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@@ -1,7 +1,8 @@
+import io
 import logging
 import os
 import time
-from typing import Any, Dict, Iterator, Literal, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union

 from langchain_core.documents import Document

@@ -12,6 +13,218 @@ from langchain_community.utils.openai import is_openai_v1
 logger = logging.getLogger(__name__)


+class AzureOpenAIWhisperParser(BaseBlobParser):
+    """
+    Transcribe and parse audio files using Azure OpenAI Whisper.
+
+    This parser integrates with the Azure OpenAI Whisper model to transcribe
+    audio files. It differs from the standard OpenAI Whisper parser, requiring
+    an Azure endpoint and credentials. The parser is limited to files under 25 MB.
+
+    **Note**:
+    This parser uses the Azure OpenAI API, providing integration with the Azure
+    ecosystem, and making it suitable for workflows involving other Azure services.
+
+    For files larger than 25 MB, consider using Azure AI Speech batch transcription:
+    https://learn.microsoft.com/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model
+
+    Setup:
+        1. Follow the instructions here to deploy Azure Whisper:
+           https://learn.microsoft.com/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python
+        2. Install ``langchain`` and set the following environment variables:
+
+        .. code-block:: bash
+
+            pip install -U langchain langchain-community
+
+            export AZURE_OPENAI_API_KEY="your-api-key"
+            export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/"
+            export OPENAI_API_VERSION="your-api-version"
+
+    Example Usage:
+        .. code-block:: python
+
+            from langchain_community.document_loaders.blob_loaders import Blob
+            from langchain_community.document_loaders.parsers.audio import (
+                AzureOpenAIWhisperParser,
+            )
+
+            whisper_parser = AzureOpenAIWhisperParser(
+                deployment_name="your-whisper-deployment",
+                api_version="2024-06-01",
+                api_key="your-api-key",
+                # other params...
+            )
+
+            audio_blob = Blob(path="your-audio-file-path")
+            response = whisper_parser.lazy_parse(audio_blob)
+
+            for document in response:
+                print(document.page_content)
+
+    Integration with Other Loaders:
+        The AzureOpenAIWhisperParser can be used with video/audio loaders and
+        `GenericLoader` to automate retrieval and parsing.
+
+    YoutubeAudioLoader Example:
+        .. code-block:: python
+
+            from langchain_community.document_loaders.blob_loaders import (
+                YoutubeAudioLoader,
+            )
+            from langchain_community.document_loaders.generic import GenericLoader
+
+            # Must be a list
+            youtube_url = ["https://your-youtube-url"]
+            save_dir = "directory-to-download-videos"
+
+            loader = GenericLoader(
+                YoutubeAudioLoader(youtube_url, save_dir),
+                AzureOpenAIWhisperParser(deployment_name="your-deployment-name")
+            )
+
+            docs = loader.load()
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        azure_endpoint: Optional[str] = None,
+        api_version: Optional[str] = None,
+        azure_ad_token_provider: Union[Callable[[], str], None] = None,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: Union[
+            Literal["json", "text", "srt", "verbose_json", "vtt"], None
+        ] = None,
+        temperature: Optional[float] = None,
+        deployment_name: str,
+        max_retries: int = 3,
+    ):
+        """
+        Initialize the AzureOpenAIWhisperParser.
+
+        Args:
+            api_key (Optional[str]):
+                Azure OpenAI API key. If not provided, defaults to the
+                `AZURE_OPENAI_API_KEY` environment variable.
+            azure_endpoint (Optional[str]):
+                Azure OpenAI service endpoint. Defaults to `AZURE_OPENAI_ENDPOINT`
+                environment variable if not set.
+            api_version (Optional[str]):
+                API version to use,
+                defaults to the `OPENAI_API_VERSION` environment variable.
+            azure_ad_token_provider (Union[Callable[[], str], None]):
+                Callable returning an Azure Active Directory token (if applicable).
+            language (Optional[str]):
+                Language in which the request should be processed.
+            prompt (Optional[str]):
+                Custom instructions or prompt for the Whisper model.
+            response_format (Union[str, None]):
+                The desired output format. Options: "json", "text", "srt",
+                "verbose_json", "vtt".
+            temperature (Optional[float]):
+                Controls the randomness of the model's output.
+            deployment_name (str):
+                The deployment name of the Whisper model.
+            max_retries (int):
+                Maximum number of retries for failed API requests.
+        Raises:
+            ImportError:
+                If the required package `openai` is not installed.
+        """
+        self.api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
+        self.azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
+        self.api_version = api_version or os.environ.get("OPENAI_API_VERSION")
+        self.azure_ad_token_provider = azure_ad_token_provider
+
+        self.language = language
+        self.prompt = prompt
+        self.response_format = response_format
+        self.temperature = temperature
+
+        self.deployment_name = deployment_name
+        self.max_retries = max_retries
+
+        try:
+            import openai
+        except ImportError:
+            raise ImportError(
+                "openai package not found, please install it with "
+                "`pip install openai`"
+            )
+
+        if is_openai_v1():
+            self._client = openai.AzureOpenAI(
+                api_key=self.api_key,
+                azure_endpoint=self.azure_endpoint,
+                api_version=self.api_version,
+                max_retries=self.max_retries,
+                azure_ad_token_provider=self.azure_ad_token_provider,
+            )
+        else:  # pre-v1 SDK: settings are assigned on the `openai` module itself
+            if self.api_key:
+                openai.api_key = self.api_key
+            if self.azure_endpoint:
+                openai.api_base = self.azure_endpoint
+            if self.api_version:
+                openai.api_version = self.api_version
+            openai.api_type = "azure"
+            self._client = openai
+
+    @property
+    def _create_params(self) -> Dict[str, Any]:
+        """Optional transcription kwargs, omitting those left as ``None``."""
+        params = {
+            "language": self.language,
+            "prompt": self.prompt,
+            "response_format": self.response_format,
+            "temperature": self.temperature,
+        }
+        return {k: v for k, v in params.items() if v is not None}
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """
+        Lazily parse the provided audio blob for transcription.
+
+        Args:
+            blob (Blob):
+                The audio file in Blob format to be transcribed.
+
+        Yields:
+            Document:
+                Parsed transcription from the audio file.
+
+        Raises:
+            Exception:
+                If an error occurs during transcription.
+        """
+        # The context manager closes the file handle even if the API call fails.
+        with open(str(blob.path), "rb") as file_obj:
+            # Transcribe
+            if is_openai_v1():
+                transcript = self._client.audio.transcriptions.create(
+                    model=self.deployment_name,
+                    file=file_obj,
+                    **self._create_params,
+                )
+            else:
+                transcript = self._client.Audio.transcribe(
+                    model=self.deployment_name,
+                    deployment_id=self.deployment_name,
+                    file=file_obj,
+                    **self._create_params,
+                )
+
+        yield Document(
+            page_content=transcript.text
+            if not isinstance(transcript, str)
+            else transcript,
+            metadata={"source": blob.source},
+        )
+
+
 class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.

@@ -19,7 +232,7 @@ class OpenAIWhisperParser(BaseBlobParser):

    Args:
        api_key: OpenAI API key
-        chunk_duration_threshold: minimum duration of a chunk in seconds
+        chunk_duration_threshold: Minimum duration of a chunk in seconds
            NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
            seconds. If the chunk duration is less or equal than the threshold,
            it will be skipped.
@@ -61,8 +274,6 @@ class OpenAIWhisperParser(BaseBlobParser):
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

-        import io
-
        try:
            import openai
        except ImportError:
@@ -85,11 +296,11 @@ class OpenAIWhisperParser(BaseBlobParser):
            if self.api_key:
                openai.api_key = self.api_key
            if self.base_url:
-                openai.base_url = self.base_url
+                openai.api_base = self.base_url

        # Audio file from disk
-        audio = AudioSegment.from_file(blob.path)

+        audio = AudioSegment.from_file(blob.path)
        # Define the duration of each chunk in minutes
        # Need to meet 25MB size limit for Whisper API
        chunk_duration = 20
@@ -240,8 +451,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

-        import io
-
        try:
            from pydub import AudioSegment
        except ImportError:
@@ -436,8 +645,6 @@ class FasterWhisperParser(BaseBlobParser):
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

-        import io
-
        try:
            from pydub import AudioSegment
        except ImportError: