diff --git a/docs/docs/integrations/document_loaders/parsers/azure_openai_whisper_parser.ipynb b/docs/docs/integrations/document_loaders/parsers/azure_openai_whisper_parser.ipynb
new file mode 100644
index 00000000000..b3dadb1f0ad
--- /dev/null
+++ b/docs/docs/integrations/document_loaders/parsers/azure_openai_whisper_parser.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Azure OpenAI Whisper Parser"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    ">[Azure OpenAI Whisper Parser](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/whisper-overview) is a wrapper around the Azure OpenAI Whisper API, which uses machine learning to transcribe audio files to English text.\n",
+    ">\n",
+    ">The parser supports `.mp3`, `.mp4`, `.mpeg`, `.mpga`, `.m4a`, `.wav`, and `.webm`.\n",
+    "\n",
+    "The current implementation follows LangChain core principles and can be combined with other loaders to handle both audio downloading and parsing. As a result, the parser `yield`s an `Iterator[Document]`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The service requires Azure credentials, an Azure endpoint, and a Whisper model deployment, which can be set up by following the guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new%2Cjavascript&pivots=programming-language-python). Furthermore, the required dependencies must be installed.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -Uq langchain langchain-community openai"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `AzureOpenAIWhisperParser`'s `.lazy_parse` method accepts a `Blob` object containing the file path of the audio file to be transcribed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.documents.base import Blob\n",
+    "\n",
+    "audio_path = \"path/to/your/audio/file\"\n",
+    "audio_blob = Blob(path=audio_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser\n",
+    "\n",
+    "endpoint = \"\"\n",
+    "key = \"\"\n",
+    "version = \"\"\n",
+    "name = \"\"\n",
+    "\n",
+    "parser = AzureOpenAIWhisperParser(\n",
+    "    api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "documents = parser.lazy_parse(blob=audio_blob)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for doc in documents:\n",
+    "    print(doc.page_content)"
+   ]
+  },
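+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Instead of an API key, credentials can also be supplied through Microsoft Entra ID by passing a token provider as `azure_ad_token_provider`. The following is a sketch that assumes the `azure-identity` package is installed and that the signed-in identity has access to the Azure OpenAI resource:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
+    "\n",
+    "token_provider = get_bearer_token_provider(\n",
+    "    DefaultAzureCredential(), \"https://cognitiveservices.azure.com/.default\"\n",
+    ")\n",
+    "\n",
+    "entra_parser = AzureOpenAIWhisperParser(\n",
+    "    azure_endpoint=endpoint,\n",
+    "    api_version=version,\n",
+    "    deployment_name=name,\n",
+    "    azure_ad_token_provider=token_provider,\n",
+    ")"
+   ]
+  },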
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `AzureOpenAIWhisperParser` can also be used in conjunction with audio loaders, such as the `YoutubeAudioLoader`, via a `GenericLoader`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders.blob_loaders.youtube_audio import (\n",
+    "    YoutubeAudioLoader,\n",
+    ")\n",
+    "from langchain_community.document_loaders.generic import GenericLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Must be a list\n",
+    "url = [\"www.youtube.url.com\"]\n",
+    "\n",
+    "save_dir = \"save/directory/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "name = \"\"\n",
+    "\n",
+    "loader = GenericLoader(\n",
+    "    YoutubeAudioLoader(url, save_dir), AzureOpenAIWhisperParser(deployment_name=name)\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for doc in docs:\n",
+    "    print(doc.page_content)"
+   ]
+  },
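+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A local directory of audio files can be handled the same way with a `FileSystemBlobLoader`, which yields a `Blob` for each matching file. The sketch below assumes a folder of `.m4a` files and that the Azure credentials are set as environment variables:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader\n",
+    "\n",
+    "local_loader = GenericLoader(\n",
+    "    FileSystemBlobLoader(\"path/to/audio/directory\", glob=\"*.m4a\"),\n",
+    "    AzureOpenAIWhisperParser(deployment_name=name),\n",
+    ")\n",
+    "\n",
+    "local_docs = local_loader.load()"
+   ]
+  }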
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py
index abc08e561a5..32ced082601 100644
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@@ -1,7 +1,8 @@
+import io
 import logging
 import os
 import time
-from typing import Any, Dict, Iterator, Literal, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union
 
 from langchain_core.documents import Document
 
@@ -12,6 +13,218 @@ from langchain_community.utils.openai import is_openai_v1
 logger = logging.getLogger(__name__)
 
 
+class AzureOpenAIWhisperParser(BaseBlobParser):
+    """
+    Transcribe and parse audio files using Azure OpenAI Whisper.
+
+    This parser integrates with the Azure OpenAI Whisper model to transcribe
+    audio files. It differs from the standard OpenAI Whisper parser in that it
+    requires an Azure endpoint and credentials. The parser is limited to files
+    under 25 MB.
+
+    **Note**:
+    This parser uses the Azure OpenAI API, providing integration with the Azure
+    ecosystem and making it suitable for workflows involving other Azure services.
+
+    For files larger than 25 MB, consider using Azure AI Speech batch transcription:
+    https://learn.microsoft.com/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model
+
+    Setup:
+        1. Follow the instructions here to deploy Azure Whisper:
+           https://learn.microsoft.com/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python
+        2. Install the required packages and set the following environment
+           variables:
+
+        .. code-block:: bash
+
+            pip install -U langchain langchain-community openai
+
+            export AZURE_OPENAI_API_KEY="your-api-key"
+            export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/"
+            export OPENAI_API_VERSION="your-api-version"
+
+    Example Usage:
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            from langchain_community.document_loaders.parsers.audio import (
+                AzureOpenAIWhisperParser,
+            )
+
+            whisper_parser = AzureOpenAIWhisperParser(
+                deployment_name="your-whisper-deployment",
+                api_version="2024-06-01",
+                api_key="your-api-key",
+                # other params...
+            )
+
+            audio_blob = Blob(path="your-audio-file-path")
+            response = whisper_parser.lazy_parse(audio_blob)
+
+            for document in response:
+                print(document.page_content)
+
+    Integration with Other Loaders:
+        The AzureOpenAIWhisperParser can be used with video/audio loaders and
+        `GenericLoader` to automate retrieval and parsing.
+
+    YoutubeAudioLoader Example:
+        .. code-block:: python
+
+            from langchain_community.document_loaders.blob_loaders import (
+                YoutubeAudioLoader
+            )
+            from langchain_community.document_loaders.generic import GenericLoader
+
+            # Must be a list
+            youtube_url = ["https://your-youtube-url"]
+            save_dir = "directory-to-download-videos"
+
+            loader = GenericLoader(
+                YoutubeAudioLoader(youtube_url, save_dir),
+                AzureOpenAIWhisperParser(deployment_name="your-deployment-name")
+            )
+
+            docs = loader.load()
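+
+    Environment-Variable Example:
+        If ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_ENDPOINT``, and
+        ``OPENAI_API_VERSION`` are exported as shown in the setup above, the
+        credentials are picked up automatically and only the deployment name
+        has to be supplied (a sketch under that assumption):
+
+        .. code-block:: python
+
+            whisper_parser = AzureOpenAIWhisperParser(
+                deployment_name="your-whisper-deployment",
+            )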
+ """ + self.api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY") + self.azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT") + self.api_version = api_version or os.environ.get("OPENAI_API_VERSION") + self.azure_ad_token_provider = azure_ad_token_provider + + self.language = language + self.prompt = prompt + self.response_format = response_format + self.temperature = temperature + + self.deployment_name = deployment_name + self.max_retries = max_retries + + try: + import openai + except ImportError: + raise ImportError( + "openai package not found, please install it with " + "`pip install openai`" + ) + + if is_openai_v1(): + self._client = openai.AzureOpenAI( + api_key=self.api_key, + azure_endpoint=self.azure_endpoint, + api_version=self.api_version, + max_retries=self.max_retries, + azure_ad_token=self.azure_ad_token_provider, + ) + else: + if self.api_key: + openai.api_key = self.api_key + if self.azure_endpoint: + openai.api_base = self.azure_endpoint + if self.api_version: + openai.api_version = self.api_version + openai.api_type = "azure" + self._client = openai + + @property + def _create_params(self) -> Dict[str, Any]: + params = { + "language": self.language, + "prompt": self.prompt, + "response_format": self.response_format, + "temperature": self.temperature, + } + return {k: v for k, v in params.items() if v is not None} + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """ + Lazily parse the provided audio blob for transcription. + + Args: + blob (Blob): + The audio file in Blob format to be transcribed. + + Yields: + Document: + Parsed transcription from the audio file. + + Raises: + Exception: + If an error occurs during transcription. + """ + + file_obj = open(str(blob.path), "rb") + + # Transcribe + try: + if is_openai_v1(): + transcript = self._client.audio.transcriptions.create( + model=self.deployment_name, + file=file_obj, + **self._create_params, + ) + else: + transcript = self._client.Audio.transcribe( + model=self.deployment_name, + deployment_id=self.deployment_name, + file=file_obj, + **self._create_params, + ) + except Exception: + raise + + yield Document( + page_content=transcript.text + if not isinstance(transcript, str) + else transcript, + metadata={"source": blob.source}, + ) + + class OpenAIWhisperParser(BaseBlobParser): """Transcribe and parse audio files. @@ -19,7 +232,7 @@ class OpenAIWhisperParser(BaseBlobParser): Args: api_key: OpenAI API key - chunk_duration_threshold: minimum duration of a chunk in seconds + chunk_duration_threshold: Minimum duration of a chunk in seconds NOTE: According to the OpenAI API, the chunk duration should be at least 0.1 seconds. If the chunk duration is less or equal than the threshold, it will be skipped. 
@@ -61,8 +274,6 @@ class OpenAIWhisperParser(BaseBlobParser):
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
-        import io
-
         try:
             import openai
         except ImportError:
@@ -85,11 +296,11 @@ class OpenAIWhisperParser(BaseBlobParser):
             if self.api_key:
                 openai.api_key = self.api_key
             if self.base_url:
-                openai.base_url = self.base_url
+                openai.api_base = self.base_url
 
         # Audio file from disk
-        audio = AudioSegment.from_file(blob.path)
+        audio = AudioSegment.from_file(blob.path)
 
         # Define the duration of each chunk in minutes
         # Need to meet 25MB size limit for Whisper API
         chunk_duration = 20
@@ -240,8 +451,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
-        import io
-
         try:
             from pydub import AudioSegment
         except ImportError:
@@ -436,8 +645,6 @@ class FasterWhisperParser(BaseBlobParser):
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
-        import io
-
         try:
             from pydub import AudioSegment
         except ImportError:
diff --git a/libs/community/tests/examples/hello_world.m4a b/libs/community/tests/examples/hello_world.m4a
new file mode 100644
index 00000000000..959ddd19aa6
Binary files /dev/null and b/libs/community/tests/examples/hello_world.m4a differ
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_azure_whisper_parser.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_azure_whisper_parser.py
new file mode 100644
index 00000000000..d48970534c1
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_azure_whisper_parser.py
@@ -0,0 +1,104 @@
+"""Tests for the Azure OpenAI Whisper parser."""
+
+from pathlib import Path
+from typing import Any
+from unittest.mock import Mock, patch
+
+import pytest
+from langchain_core.documents import Document
+from langchain_core.documents.base import Blob
+
+from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser
+
+_THIS_DIR = Path(__file__).parents[3]
+
+_EXAMPLES_DIR = _THIS_DIR / "examples"
+AUDIO_M4A = _EXAMPLES_DIR / "hello_world.m4a"
+
+
+@pytest.mark.requires("openai")
+@patch("openai.AzureOpenAI")
+def test_azure_openai_whisper(mock_client: Mock) -> None:
+    endpoint = "endpoint"
+    key = "key"
+    version = "115"
+    name = "model"
+
+    parser = AzureOpenAIWhisperParser(
+        api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name
+    )
+    mock_client.assert_called_once_with(
+        api_key=key,
+        azure_endpoint=endpoint,
+        api_version=version,
+        max_retries=3,
+        azure_ad_token_provider=None,
+    )
+    assert parser._client == mock_client()
+
+
+@pytest.mark.requires("openai")
+def test_is_openai_v1_lazy_parse(mocker: Any) -> None:
+    endpoint = "endpoint"
+    key = "key"
+    version = "115"
+    name = "model"
+
+    mock_blob = mocker.Mock(spec=Blob)
+    mock_blob.path = AUDIO_M4A
+    mock_blob.source = "test_source"
+
+    mock_openai_client = mocker.Mock()
+
+    mock_openai_client.audio.transcriptions.create.return_value = mocker.Mock()
+    mock_openai_client.audio.transcriptions.create.return_value.text = (
+        "Transcribed text"
+    )
+
+    # Patch the name the parser module actually uses, so the v1 code path runs.
+    mocker.patch(
+        "langchain_community.document_loaders.parsers.audio.is_openai_v1",
+        return_value=True,
+    )
+
+    parser = AzureOpenAIWhisperParser(
+        api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name
+    )
+
+    parser._client = mock_openai_client
+
+    result = list(parser.lazy_parse(mock_blob))
+
+    assert len(result) == 1
+    assert isinstance(result[0], Document)
+    assert result[0].page_content == "Transcribed text"
+    assert result[0].metadata["source"] == "test_source"
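+
+
+@pytest.mark.requires("openai")
+def test_create_params_omits_unset_options() -> None:
+    # Supplementary sketch of a test (grounded in `_create_params`): options
+    # left as None are dropped, so only explicitly set values are forwarded
+    # to the transcription API.
+    parser = AzureOpenAIWhisperParser(
+        api_key="key",
+        azure_endpoint="endpoint",
+        api_version="115",
+        deployment_name="model",
+        language="en",
+    )
+    assert parser._create_params == {"language": "en"}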
text" + assert result[0].metadata["source"] == "test_source" + + +@pytest.mark.requires("openai") +def test_is_not_openai_v1_lazy_parse(mocker: Any) -> None: + endpoint = "endpoint" + key = "key" + version = "115" + name = "model" + + mock_blob = mocker.Mock(spec=Blob) + mock_blob.path = AUDIO_M4A + mock_blob.source = "test_source" + + mock_openai_client = mocker.Mock() + + mock_openai_client.audio.transcriptions.create.return_value = mocker.Mock() + mock_openai_client.audio.transcriptions.create.return_value.text = ( + "Transcribed text" + ) + + mocker.patch("langchain_community.utils.openai.is_openai_v1", return_value=False) + + parser = AzureOpenAIWhisperParser( + api_key=key, azure_endpoint=endpoint, api_version=version, deployment_name=name + ) + parser._client = mock_openai_client + + result = list(parser.lazy_parse(mock_blob)) + + assert len(result) == 1 + assert isinstance(result[0], Document) + assert result[0].page_content == "Transcribed text" + assert result[0].metadata["source"] == "test_source"