community: add AzureOpenAIWhisperParser (#27796)

Commandeered from https://github.com/langchain-ai/langchain/pull/26757.

---------

Co-authored-by: Sheepsta300 <128811766+Sheepsta300@users.noreply.github.com>
This commit is contained in:
ccurme
2024-10-31 12:37:41 -04:00
committed by GitHub
parent b631b0a596
commit 0172d938b4
4 changed files with 513 additions and 10 deletions

View File

@@ -1,7 +1,8 @@
import io
import logging
import os
import time
from typing import Any, Dict, Iterator, Literal, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterator, Literal, Optional, Tuple, Union
from langchain_core.documents import Document
@@ -12,6 +13,218 @@ from langchain_community.utils.openai import is_openai_v1
logger = logging.getLogger(__name__)
class AzureOpenAIWhisperParser(BaseBlobParser):
"""
Transcribe and parse audio files using Azure OpenAI Whisper.
This parser integrates with the Azure OpenAI Whisper model to transcribe
audio files. It differs from the standard OpenAI Whisper parser, requiring
an Azure endpoint and credentials. The parser is limited to files under 25 MB.
**Note**:
This parser uses the Azure OpenAI API, providing integration with the Azure
ecosystem, and making it suitable for workflows involving other Azure services.
For files larger than 25 MB, consider using Azure AI Speech batch transcription:
https://learn.microsoft.com/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model
Setup:
1. Follow the instructions here to deploy Azure Whisper:
https://learn.microsoft.com/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python
2. Install ``langchain`` and set the following environment variables:
.. code-block:: bash
pip install -U langchain langchain-community
export AZURE_OPENAI_API_KEY="your-api-key"
export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/"
export OPENAI_API_VERSION="your-api-version"
Example Usage:
.. code-block:: python
from langchain.community import AzureOpenAIWhisperParser
whisper_parser = AzureOpenAIWhisperParser(
deployment_name="your-whisper-deployment",
api_version="2024-06-01",
api_key="your-api-key",
# other params...
)
audio_blob = Blob(path="your-audio-file-path")
response = whisper_parser.lazy_parse(audio_blob)
for document in response:
print(document.page_content)
Integration with Other Loaders:
The AzureOpenAIWhisperParser can be used with video/audio loaders and
`GenericLoader` to automate retrieval and parsing.
YoutubeAudioLoader Example:
.. code-block:: python
from langchain_community.document_loaders.blob_loaders import (
YoutubeAudioLoader
)
from langchain_community.document_loaders.generic import GenericLoader
# Must be a list
youtube_url = ["https://your-youtube-url"]
save_dir = "directory-to-download-videos"
loader = GenericLoader(
YoutubeAudioLoader(youtube_url, save_dir),
AzureOpenAIWhisperParser(deployment_name="your-deployment-name")
)
docs = loader.load()
"""
def __init__(
self,
*,
api_key: Optional[str] = None,
azure_endpoint: Optional[str] = None,
api_version: Optional[str] = None,
azure_ad_token_provider: Union[Callable[[], str], None] = None,
language: Optional[str] = None,
prompt: Optional[str] = None,
response_format: Union[
Literal["json", "text", "srt", "verbose_json", "vtt"], None
] = None,
temperature: Optional[float] = None,
deployment_name: str,
max_retries: int = 3,
):
"""
Initialize the AzureOpenAIWhisperParser.
Args:
api_key (Optional[str]):
Azure OpenAI API key. If not provided, defaults to the
`AZURE_OPENAI_API_KEY` environment variable.
azure_endpoint (Optional[str]):
Azure OpenAI service endpoint. Defaults to `AZURE_OPENAI_ENDPOINT`
environment variable if not set.
api_version (Optional[str]):
API version to use,
defaults to the `OPENAI_API_VERSION` environment variable.
azure_ad_token_provider (Union[Callable[[], str], None]):
Azure Active Directory token for authentication (if applicable).
language (Optional[str]):
Language in which the request should be processed.
prompt (Optional[str]):
Custom instructions or prompt for the Whisper model.
response_format (Union[str, None]):
The desired output format. Options: "json", "text", "srt",
"verbose_json", "vtt".
temperature (Optional[float]):
Controls the randomness of the model's output.
deployment_name (str):
The deployment name of the Whisper model.
max_retries (int):
Maximum number of retries for failed API requests.
Raises:
ImportError:
If the required package `openai` is not installed.
"""
self.api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
self.azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
self.api_version = api_version or os.environ.get("OPENAI_API_VERSION")
self.azure_ad_token_provider = azure_ad_token_provider
self.language = language
self.prompt = prompt
self.response_format = response_format
self.temperature = temperature
self.deployment_name = deployment_name
self.max_retries = max_retries
try:
import openai
except ImportError:
raise ImportError(
"openai package not found, please install it with "
"`pip install openai`"
)
if is_openai_v1():
self._client = openai.AzureOpenAI(
api_key=self.api_key,
azure_endpoint=self.azure_endpoint,
api_version=self.api_version,
max_retries=self.max_retries,
azure_ad_token=self.azure_ad_token_provider,
)
else:
if self.api_key:
openai.api_key = self.api_key
if self.azure_endpoint:
openai.api_base = self.azure_endpoint
if self.api_version:
openai.api_version = self.api_version
openai.api_type = "azure"
self._client = openai
@property
def _create_params(self) -> Dict[str, Any]:
params = {
"language": self.language,
"prompt": self.prompt,
"response_format": self.response_format,
"temperature": self.temperature,
}
return {k: v for k, v in params.items() if v is not None}
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""
Lazily parse the provided audio blob for transcription.
Args:
blob (Blob):
The audio file in Blob format to be transcribed.
Yields:
Document:
Parsed transcription from the audio file.
Raises:
Exception:
If an error occurs during transcription.
"""
file_obj = open(str(blob.path), "rb")
# Transcribe
try:
if is_openai_v1():
transcript = self._client.audio.transcriptions.create(
model=self.deployment_name,
file=file_obj,
**self._create_params,
)
else:
transcript = self._client.Audio.transcribe(
model=self.deployment_name,
deployment_id=self.deployment_name,
file=file_obj,
**self._create_params,
)
except Exception:
raise
yield Document(
page_content=transcript.text
if not isinstance(transcript, str)
else transcript,
metadata={"source": blob.source},
)
class OpenAIWhisperParser(BaseBlobParser):
"""Transcribe and parse audio files.
@@ -19,7 +232,7 @@ class OpenAIWhisperParser(BaseBlobParser):
Args:
api_key: OpenAI API key
chunk_duration_threshold: minimum duration of a chunk in seconds
chunk_duration_threshold: Minimum duration of a chunk in seconds
NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
seconds. If the chunk duration is less or equal than the threshold,
it will be skipped.
@@ -61,8 +274,6 @@ class OpenAIWhisperParser(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
import openai
except ImportError:
@@ -85,11 +296,11 @@ class OpenAIWhisperParser(BaseBlobParser):
if self.api_key:
openai.api_key = self.api_key
if self.base_url:
openai.base_url = self.base_url
openai.api_base = self.base_url
# Audio file from disk
audio = AudioSegment.from_file(blob.path)
audio = AudioSegment.from_file(blob.path)
# Define the duration of each chunk in minutes
# Need to meet 25MB size limit for Whisper API
chunk_duration = 20
@@ -240,8 +451,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
from pydub import AudioSegment
except ImportError:
@@ -436,8 +645,6 @@ class FasterWhisperParser(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import io
try:
from pydub import AudioSegment
except ImportError: