diff --git a/libs/langchain/langchain/document_loaders/parsers/audio.py b/libs/langchain/langchain/document_loaders/parsers/audio.py
index 91c6870f7e2..344e48a98ca 100644
--- a/libs/langchain/langchain/document_loaders/parsers/audio.py
+++ b/libs/langchain/langchain/document_loaders/parsers/audio.py
@@ -219,3 +219,81 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             page_content=prediction,
             metadata={"source": blob.source},
         )
+
+
+class YandexSTTParser(BaseBlobParser):
+    """Transcribe and parse audio files.
+    Audio transcription is done with the Yandex SpeechKit API."""
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        iam_token: Optional[str] = None,
+        model: str = "general",
+        language: str = "auto",
+    ):
+        """Initialize the parser.
+
+        Args:
+            api_key: API key for a service account
+                with the `ai.speechkit-stt.user` role.
+            iam_token: IAM token for a service account
+                with the `ai.speechkit-stt.user` role.
+            model: Recognition model name.
+                Defaults to general.
+            language: The language in ISO 639-1 format.
+                Defaults to automatic language recognition.
+        Either `api_key` or `iam_token` must be provided, but not both.
+        """
+        if (api_key is None) == (iam_token is None):
+            raise ValueError(
+                "Either 'api_key' or 'iam_token' must be provided, but not both."
+            )
+        self.api_key = api_key
+        self.iam_token = iam_token
+        self.model = model
+        self.language = language
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+
+        try:
+            from speechkit import configure_credentials, creds, model_repository
+            from speechkit.stt import AudioProcessingType
+        except ImportError:
+            raise ImportError(
+                "yandex-speechkit package not found, please install it with "
+                "`pip install yandex-speechkit`"
+            )
+        try:
+            from pydub import AudioSegment
+        except ImportError:
+            raise ImportError(
+                "pydub package not found, please install it with " "`pip install pydub`"
+            )
+
+        if self.api_key:
+            configure_credentials(
+                yandex_credentials=creds.YandexCredentials(api_key=self.api_key)
+            )
+        else:
+            configure_credentials(
+                yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
+            )
+
+        audio = AudioSegment.from_file(blob.path)
+
+        model = model_repository.recognition_model()
+
+        model.model = self.model
+        model.language = self.language
+        model.audio_processing_type = AudioProcessingType.Full
+
+        result = model.transcribe(audio)
+
+        for res in result:
+            yield Document(
+                page_content=res.normalized_text,
+                metadata={"source": blob.source},
+            )
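
A minimal usage sketch of the new parser (not part of the diff): it feeds a local audio file to `YandexSTTParser.lazy_parse` via langchain's `Blob` helper. The API key value and the `meeting.mp3` path below are placeholders, not values from this PR.

```python
# Example usage of YandexSTTParser on a single audio file.
# The api_key and the file path are placeholders; substitute your own.
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.audio import YandexSTTParser

parser = YandexSTTParser(api_key="<your-speechkit-api-key>", language="en")
blob = Blob.from_path("meeting.mp3")  # hypothetical local audio file

# lazy_parse yields one Document per recognized utterance.
for doc in parser.lazy_parse(blob):
    print(doc.page_content)
```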