community: support in-memory data (Blob.from_data) in all audio parsers (#30262)

OpenAIWhisperParser, OpenAIWhisperParserLocal, and YandexSTTParser do not correctly handle in-memory audio data (loaded via Blob.from_data): they require Blob.path to be set and always read the AudioSegment from the file system. So far, only FasterWhisperParser handles in-memory data correctly. This change updates OpenAIWhisperParser, OpenAIWhisperParserLocal, and YandexSTTParser to match FasterWhisperParser. Thanks for reviewing the PR! Co-authored-by: qonnop <qonnop@users.noreply.github.com>
2025-08-17 16:39:52 +00:00 · 2025-03-18 00:52:33 +01:00 · 2025-03-18 00:52:33 +01:00 · 036f00dc92
commit 036f00dc92
parent 98a9ef19ec
1 changed files with 36 additions and 42 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@ -281,12 +281,8 @@ class OpenAIWhisperParser(BaseBlobParser):
            raise ImportError(
                "openai package not found, please install it with `pip install openai`"
            )
-        try:
+
-            from pydub import AudioSegment
+        audio = _get_audio_from_blob(blob)
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        if is_openai_v1():
            # api_key optional, defaults to `os.environ['OPENAI_API_KEY']`
@ -298,9 +294,6 @@ class OpenAIWhisperParser(BaseBlobParser):
            if self.base_url:
                openai.api_base = self.base_url
        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)
        # Define the duration of each chunk in minutes
        # Need to meet 25MB size limit for Whisper API
        chunk_duration = 20
@ -451,13 +444,6 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            import librosa
        except ImportError:
@ -466,8 +452,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
                "`pip install librosa`"
            )
-        # Audio file from disk
+        audio = _get_audio_from_blob(blob)
        audio = AudioSegment.from_file(blob.path)
        file_obj = io.BytesIO(audio.export(format="mp3").read())
@ -529,12 +514,8 @@ class YandexSTTParser(BaseBlobParser):
                "yandex-speechkit package not found, please install it with "
                "`pip install yandex-speechkit`"
            )
-        try:
+
-            from pydub import AudioSegment
+        audio = _get_audio_from_blob(blob)
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        if self.api_key:
            configure_credentials(
@ -545,8 +526,6 @@ class YandexSTTParser(BaseBlobParser):
                yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
            )
        audio = AudioSegment.from_file(blob.path)
        model = model_repository.recognition_model()
        model.model = self.model
@ -645,13 +624,6 @@ class FasterWhisperParser(BaseBlobParser):
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            from faster_whisper import WhisperModel
        except ImportError:
@ -660,15 +632,7 @@ class FasterWhisperParser(BaseBlobParser):
                "`pip install faster-whisper`"
            )
-        # get the audio
+        audio = _get_audio_from_blob(blob)
        if isinstance(blob.data, bytes):
            # blob contains the audio
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            # Audio file from disk
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")
        file_obj = io.BytesIO(audio.export(format="mp3").read())
@ -688,3 +652,33 @@ class FasterWhisperParser(BaseBlobParser):
                    **blob.metadata,
                },
            )
 def _get_audio_from_blob(blob: Blob) -> Any:
     """Get audio data from blob.

     In-memory bytes in ``blob.data`` are used when present; the file at
     ``blob.path`` is read only when ``blob.data`` is ``None``.

     Args:
         blob: Blob object containing the audio data.

     Returns:
         AudioSegment: Audio data from the blob.

     Raises:
         ImportError: If the required package `pydub` is not installed.
         ValueError: If the audio data is not found in the blob.
     """
     try:
         from pydub import AudioSegment
     except ImportError as err:
         # Chain explicitly so the underlying import failure is reported as
         # the direct cause of this friendlier error.
         raise ImportError(
             "pydub package not found, please install it with `pip install pydub`"
         ) from err

     if isinstance(blob.data, bytes):
         # In-memory audio: hand pydub a file-like wrapper around the bytes.
         return AudioSegment.from_file(io.BytesIO(blob.data))
     if blob.data is None and blob.path:
         # No in-memory data: read the audio from blob.path.
         return AudioSegment.from_file(blob.path)
     # Non-bytes data, or neither data nor path available.
     raise ValueError("Unable to get audio from blob")