community: support in-memory data (Blob.from_data) in all audio parsers (#30262)

OpenAIWhisperParser, OpenAIWhisperParserLocal, and YandexSTTParser do not
handle in-memory audio data (loaded via Blob.from_data) correctly: they
require Blob.path to be set and always read the AudioSegment from the
file system. So far, only FasterWhisperParser handles in-memory data
correctly. This change updates OpenAIWhisperParser,
OpenAIWhisperParserLocal, and YandexSTTParser to match
FasterWhisperParser.
Thanks for reviewing the PR!

Co-authored-by: qonnop <qonnop@users.noreply.github.com>
This commit is contained in:
qonnop 2025-03-18 00:52:33 +01:00 committed by GitHub
parent 98a9ef19ec
commit 036f00dc92
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -281,12 +281,8 @@ class OpenAIWhisperParser(BaseBlobParser):
raise ImportError(
"openai package not found, please install it with `pip install openai`"
)
try:
from pydub import AudioSegment
except ImportError:
raise ImportError(
"pydub package not found, please install it with `pip install pydub`"
)
audio = _get_audio_from_blob(blob)
if is_openai_v1():
# api_key optional, defaults to `os.environ['OPENAI_API_KEY']`
@ -298,9 +294,6 @@ class OpenAIWhisperParser(BaseBlobParser):
if self.base_url:
openai.api_base = self.base_url
# Audio file from disk
audio = AudioSegment.from_file(blob.path)
# Define the duration of each chunk in minutes
# Need to meet 25MB size limit for Whisper API
chunk_duration = 20
@ -451,13 +444,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
try:
from pydub import AudioSegment
except ImportError:
raise ImportError(
"pydub package not found, please install it with `pip install pydub`"
)
try:
import librosa
except ImportError:
@ -466,8 +452,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
"`pip install librosa`"
)
# Audio file from disk
audio = AudioSegment.from_file(blob.path)
audio = _get_audio_from_blob(blob)
file_obj = io.BytesIO(audio.export(format="mp3").read())
@ -529,12 +514,8 @@ class YandexSTTParser(BaseBlobParser):
"yandex-speechkit package not found, please install it with "
"`pip install yandex-speechkit`"
)
try:
from pydub import AudioSegment
except ImportError:
raise ImportError(
"pydub package not found, please install it with `pip install pydub`"
)
audio = _get_audio_from_blob(blob)
if self.api_key:
configure_credentials(
@ -545,8 +526,6 @@ class YandexSTTParser(BaseBlobParser):
yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
)
audio = AudioSegment.from_file(blob.path)
model = model_repository.recognition_model()
model.model = self.model
@ -645,13 +624,6 @@ class FasterWhisperParser(BaseBlobParser):
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
try:
from pydub import AudioSegment
except ImportError:
raise ImportError(
"pydub package not found, please install it with `pip install pydub`"
)
try:
from faster_whisper import WhisperModel
except ImportError:
@ -660,15 +632,7 @@ class FasterWhisperParser(BaseBlobParser):
"`pip install faster-whisper`"
)
# get the audio
if isinstance(blob.data, bytes):
# blob contains the audio
audio = AudioSegment.from_file(io.BytesIO(blob.data))
elif blob.data is None and blob.path:
# Audio file from disk
audio = AudioSegment.from_file(blob.path)
else:
raise ValueError("Unable to get audio from blob")
audio = _get_audio_from_blob(blob)
file_obj = io.BytesIO(audio.export(format="mp3").read())
@ -688,3 +652,33 @@ class FasterWhisperParser(BaseBlobParser):
**blob.metadata,
},
)
def _get_audio_from_blob(blob: Blob) -> Any:
    """Load the audio carried by a blob into a pydub ``AudioSegment``.

    In-memory bytes are preferred; the file system is consulted only when
    the blob holds no data at all but does provide a path.

    Args:
        blob: Blob object containing the audio data, either in memory
            (``blob.data`` as bytes) or as a file reference (``blob.path``).

    Returns:
        AudioSegment: Audio data decoded from the blob.

    Raises:
        ImportError: If the required package `pydub` is not installed.
        ValueError: If the blob has neither bytes data nor (with ``data``
            being ``None``) a usable path.
    """
    # pydub is an optional dependency, so it is imported lazily here.
    try:
        from pydub import AudioSegment
    except ImportError:
        raise ImportError(
            "pydub package not found, please install it with `pip install pydub`"
        )

    payload = blob.data

    # Case 1: the blob itself contains the raw audio bytes.
    if isinstance(payload, bytes):
        return AudioSegment.from_file(io.BytesIO(payload))

    # Case 2: no in-memory data, but the blob points at a file on disk.
    if payload is None and blob.path:
        return AudioSegment.from_file(blob.path)

    # Anything else (e.g. non-bytes data, or no data and no path) is unusable.
    raise ValueError("Unable to get audio from blob")