From b15fccbb99f261ebcb0776ac4b79c72b8a8a7f35 Mon Sep 17 00:00:00 2001 From: Leo Diegues Date: Thu, 22 Feb 2024 22:02:43 -0300 Subject: [PATCH] community[patch]: Skip `OpenAIWhisperParser` extremely small audio chunks to avoid api error (#11450) **Description** This PR addresses a rare issue in `OpenAIWhisperParser` that causes it to crash when processing an audio file with a duration very close to the class's chunk size threshold of 20 minutes. **Issue** #11449 **Dependencies** None **Tag maintainer** @agola11 @eyurtsev **Twitter handle** leonardodiegues --------- Co-authored-by: Leonardo Diegues Co-authored-by: Bagatur --- .../document_loaders/parsers/audio.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index 65674e3d1da..77d1b2b8e12 100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -13,10 +13,22 @@ logger = logging.getLogger(__name__) class OpenAIWhisperParser(BaseBlobParser): """Transcribe and parse audio files. - Audio transcription is with OpenAI Whisper model.""" - def __init__(self, api_key: Optional[str] = None): + Audio transcription is with OpenAI Whisper model. + + Args: + api_key: OpenAI API key + chunk_duration_threshold: minimum duration of a chunk in seconds + NOTE: According to the OpenAI API, the chunk duration should be at least 0.1 + seconds. If the chunk duration is less than or equal to the threshold, + it will be skipped. 
+ """ + + def __init__( + self, api_key: Optional[str] = None, *, chunk_duration_threshold: float = 0.1 + ): self.api_key = api_key + self.chunk_duration_threshold = chunk_duration_threshold def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" @@ -57,6 +69,9 @@ class OpenAIWhisperParser(BaseBlobParser): for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)): # Audio chunk chunk = audio[i : i + chunk_duration_ms] + # Skip chunks that are too short to transcribe + if chunk.duration_seconds <= self.chunk_duration_threshold: + continue file_obj = io.BytesIO(chunk.export(format="mp3").read()) if blob.source is not None: file_obj.name = blob.source + f"_part_{split_number}.mp3"