Compare commits


36 Commits

Author SHA1 Message Date
Eugene Yurtsev
a59739e1d2 Merge branch 'master' into fork/feature_audio_loader_auzre_speech 2024-03-01 17:45:30 -05:00
Eugene Yurtsev
02b4c7ff67 x 2024-03-01 16:59:55 -05:00
Eugene Yurtsev
d11273c5da Merge branch 'master' into fork/feature_audio_loader_auzre_speech 2024-03-01 15:06:19 -05:00
Eugene Yurtsev
368abe1106 x 2024-02-28 17:27:29 -05:00
Eugene Yurtsev
e9768c2ffc x 2024-02-28 17:06:06 -05:00
Eugene Yurtsev
c120d02d8c x 2024-02-28 16:52:03 -05:00
Eugene Yurtsev
002943c8d7 x 2024-02-28 16:44:08 -05:00
Eugene Yurtsev
ddac748fee x 2024-02-28 16:24:20 -05:00
Eugene Yurtsev
f8c628a8b8 fix 2024-02-28 16:18:59 -05:00
Eugene Yurtsev
87583b3217 commit 2024-02-28 16:09:41 -05:00
Eugene Yurtsev
f56f7e1895 x 2024-02-28 16:07:42 -05:00
Eugene Yurtsev
fd3a3a0ad1 Merge branch 'master' into feature_audio_loader_auzre_speech 2024-02-28 15:46:16 -05:00
kzmain
6d58293c0e Merge branch 'master' into feature_audio_loader_auzre_speech 2024-02-01 01:31:06 +08:00
kzmain
f84fcef962 Merge branch 'master' into feature_audio_loader_auzre_speech 2024-01-01 00:03:43 +08:00
kzmain
ae2d85b171 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-30 13:51:01 +08:00
kzmain
b4c6272ce2 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-28 17:58:42 +08:00
kzmain
5b830b3727 Linting and formatting 2023-12-25 22:16:33 +08:00
kzmain
a6b49b2de5 move loader to the langchain_community 2023-12-25 22:09:37 +08:00
kzmain
edbb79574f Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-25 20:35:30 +08:00
kzmain
a99f35039e Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-11 16:04:30 +08:00
kzmain
61786f77ba Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-07 16:52:54 +08:00
kzmain
46b5ef3310 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-06 20:23:29 +08:00
kzmain
1c41d348ac Update unit test, parser kwarg's type and change loader from GenericLoader to BaseLoader 2023-12-06 20:14:43 +08:00
Eugene Yurtsev
4c639b0064 x 2023-12-05 14:44:11 -05:00
Eugene Yurtsev
ec5bdb6d60 x 2023-12-05 14:44:03 -05:00
kzmain
b1f7f3caee Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-05 20:36:45 +08:00
kzmain
f407c4786d Fix lint issue 2023-12-05 20:34:44 +08:00
kzmain
ba1d82d314 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-05 07:52:00 +08:00
kzmain
6c7ca557c3 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-03 17:14:49 +08:00
实验台边吃早饭
7b8372db69 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-12-03 03:26:45 +08:00
kzmain
eea86b599d GenericLoader Fix and optimize AzureSpeechServiceParser's kwargs
1. Fix misuse of GenericLoader in the AzureSpeechServiceLoader class
2.1. Add Azure Speech service official documents and code pages.
2.2. Expand and document the kwargs meanings.
2.3. Add a parameter to control the transcribe job's polling interval
2023-12-03 03:26:23 +08:00
实验台边吃早饭
f1c7eefebc Merge branch 'master' into feature_audio_loader_auzre_speech 2023-11-30 13:39:53 +08:00
Bagatur
add93468c9 cr 2023-11-28 15:23:57 -08:00
Bagatur
3e2c139cf4 Merge branch 'master' into feature_audio_loader_auzre_speech 2023-11-28 14:51:18 -08:00
实验台边吃早饭
c3a9f6f52a Merge branch 'master' into feature_audio_loader_auzre_speech 2023-11-29 05:50:55 +08:00
kzmain
e80074ab53 Audio file loader implemented with Azure speech service 2023-11-29 05:34:45 +08:00
3 changed files with 316 additions and 6 deletions

View File

@@ -1,8 +1,11 @@
 from __future__ import annotations
+import logging
 import time
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Sequence, Tuple
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_env
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
+
+logger = logging.getLogger(__name__)
@@ -79,7 +82,7 @@ class OpenAIWhisperParser(BaseBlobParser):
                 file_obj.name = f"part_{split_number}.mp3"
             # Transcribe
-            print(f"Transcribing part {split_number + 1}!")  # noqa: T201
+            logger.info(f"Transcribing part {split_number + 1}!")
             attempts = 0
             while attempts < 3:
                 try:
@@ -92,10 +95,10 @@ class OpenAIWhisperParser(BaseBlobParser):
                     break
                 except Exception as e:
                     attempts += 1
-                    print(f"Attempt {attempts} failed. Exception: {str(e)}")  # noqa: T201
+                    logger.error(f"Attempt {attempts} failed. Exception: {str(e)}")
                     time.sleep(5)
             else:
-                print("Failed to transcribe after 3 attempts.")  # noqa: T201
+                logger.info("Failed to transcribe after 3 attempts.")
                 continue
             yield Document(
@@ -184,7 +187,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             rec_model = "openai/whisper-large"
         self.lang_model = lang_model if lang_model else rec_model
-        print("Using the following model: ", self.lang_model)  # noqa: T201
+        logger.info("Using the following model: %s", self.lang_model)
         self.batch_size = batch_size
@@ -231,7 +234,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             file_obj = io.BytesIO(audio.export(format="mp3").read())
             # Transcribe
-            print(f"Transcribing part {blob.path}!")  # noqa: T201
+            logger.info(f"Transcribing part {blob.path}!")
             y, sr = librosa.load(file_obj, sr=16000)
@@ -319,3 +322,219 @@ class YandexSTTParser(BaseBlobParser):
                 page_content=res.normalized_text,
                 metadata={"source": blob.source},
             )
+
+
+class AzureAISpeechParser(BaseBlobParser):
+    """Transcribe audio files with the Microsoft Azure AI Speech service.
+
+    This parser uses the speech service's transcription module to convert an
+    audio file to text.
+
+    Official transcription SDK documentation:
+    https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.transcription?view=azure-python
+    Official transcription SDK sample:
+    https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/2e39515446ec261bf9fd8d42902147c51c5f72cd/samples/python/console/transcription_sample.py
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        log_path: Optional[str] = None,
+        polling_interval_seconds: float = 0.5,
+        speech_recognition_language: Optional[str] = None,
+        auto_detect_languages: Optional[Sequence[str]] = None,
+        speech_config_kwargs: Optional[dict] = None,
+    ) -> None:
+        """Initialize the parser.
+
+        See ``speechsdk.SpeechConfig()`` for more information about how these
+        parameters are used.
+
+        Args:
+            api_key: The Azure AI Speech service subscription key.
+            region: The Azure AI Speech service region; either this argument
+                or ``endpoint`` is required.
+            endpoint: The Azure AI Speech service endpoint (wss protocol);
+                useful when running against a cloud other than Azure Global,
+                such as Azure China or Azure Germany.
+            log_path: Set this when a transcription job log is required.
+            polling_interval_seconds: How often to check the transcription
+                job's status, in seconds.
+            speech_recognition_language: The source language of the audio,
+                if known in advance.
+            auto_detect_languages: A list of candidate source languages for
+                automatic language detection during recognition.
+            speech_config_kwargs: Extra keyword arguments passed through to
+                ``speechsdk.SpeechConfig``.
+        """
+        self.api_key = api_key or get_from_env("api_key", "AZURE_SPEECH_SERVICE_KEY")
+        # Pass default="" so a missing environment variable does not raise
+        # inside get_from_env; the explicit check below reports the problem.
+        self.region = (
+            region or get_from_env("region", "AZURE_SPEECH_REGION", default="") or None
+        )
+        self.endpoint = (
+            endpoint
+            or get_from_env("endpoint", "AZURE_SPEECH_ENDPOINT", default="")
+            or None
+        )
+        if not self.region and not self.endpoint:
+            raise ValueError(
+                "You need to provide either the region or the endpoint argument."
+            )
+        self.log_path = (
+            log_path
+            or get_from_env("log_path", "AZURE_SPEECH_LOG_PATH", default="")
+            or None
+        )
+        self.polling_interval_seconds = polling_interval_seconds
+        self.speech_recognition_language = speech_recognition_language
+        self.auto_detect_languages = auto_detect_languages
+        self.speech_config_kwargs = (
+            speech_config_kwargs if speech_config_kwargs is not None else {}
+        )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import json
+
+        try:
+            import azure.cognitiveservices.speech as speechsdk
+        except ImportError:
+            raise ImportError(
+                "azure.cognitiveservices.speech package not found, please install "
+                "it with `pip install azure-cognitiveservices-speech`."
+            )
+
+        raw_json_list: List[dict] = []
+        document_list: List[Document] = []
+
+        def conversation_transcriber_recognition_canceled_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # Canceled event
+            pass
+
+        def conversation_transcriber_session_stopped_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStopped event
+            pass
+
+        def conversation_transcriber_transcribed_cb(
+            evt: speechsdk.SpeechRecognitionEventArgs,
+        ) -> None:
+            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
+                evt_dict = json.loads(evt.result.json)
+                content = evt_dict["DisplayText"]
+                if self.speech_recognition_language is not None:
+                    language = self.speech_recognition_language
+                elif self.auto_detect_languages is not None:
+                    temp_dict = evt_dict["PrimaryLanguage"]
+                    language = temp_dict.get("Language", "Unknown")
+                else:
+                    language = "Unknown"
+                speaker_id = evt_dict.get("SpeakerId", "Unknown")
+                # Offset and Duration are reported in 100-nanosecond ticks,
+                # so divide by 10**7 to convert them to seconds.
+                offset_second = evt_dict["Offset"]
+                duration_second = evt_dict["Duration"]
+                _doc = Document(
+                    page_content=content,
+                    metadata={
+                        "offset_second": int(offset_second) / 10**7,
+                        "duration_second": int(duration_second) / 10**7,
+                        "language": language,
+                        "speaker_id": speaker_id,
+                    },
+                )
+                logger.info(f"TRANSCRIBED: {evt_dict}")
+                raw_json_list.append(evt_dict)
+                document_list.append(_doc)
+            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
+                logger.warning(
+                    "NOMATCH: Speech could not be transcribed: {}".format(
+                        evt.result.no_match_details
+                    )
+                )
+
+        def conversation_transcriber_session_started_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStarted event
+            pass
+
+        def recognize_from_file() -> Iterator[Document]:
+            # Speech service speech config
+            speech_config = speechsdk.SpeechConfig(
+                subscription=self.api_key,
+                region=self.region,
+                endpoint=self.endpoint,
+                speech_recognition_language=self.speech_recognition_language,
+                **self.speech_config_kwargs,
+            )
+            speech_config.output_format = speechsdk.OutputFormat.Detailed
+            if self.log_path is not None:
+                speech_config.set_property(
+                    speechsdk.PropertyId.Speech_LogFilename, self.log_path
+                )
+
+            # Speech service audio config
+            audio_config = speechsdk.audio.AudioConfig(filename=blob.path)
+
+            # Speech service source-language auto-detection config
+            if self.auto_detect_languages is not None:
+                auto_detect_source_language_config = (
+                    speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
+                        languages=list(self.auto_detect_languages)
+                    )
+                )
+            else:
+                auto_detect_source_language_config = None
+
+            conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
+                speech_config=speech_config,
+                audio_config=audio_config,
+                auto_detect_source_language_config=auto_detect_source_language_config,
+            )
+
+            transcribing_stop = False
+
+            def stop_cb(evt: speechsdk.SessionEventArgs) -> None:
+                # Callback that signals to stop continuous recognition
+                # upon receiving an event `evt`
+                logger.info("CLOSING on {}".format(evt))
+                nonlocal transcribing_stop
+                transcribing_stop = True
+
+            # Connect callbacks to the events fired by the conversation transcriber
+            conversation_transcriber.transcribed.connect(
+                conversation_transcriber_transcribed_cb
+            )
+            conversation_transcriber.session_started.connect(
+                conversation_transcriber_session_started_cb
+            )
+            conversation_transcriber.session_stopped.connect(
+                conversation_transcriber_session_stopped_cb
+            )
+            conversation_transcriber.canceled.connect(
+                conversation_transcriber_recognition_canceled_cb
+            )
+            # Stop transcribing on either session-stopped or canceled events
+            conversation_transcriber.session_stopped.connect(stop_cb)
+            conversation_transcriber.canceled.connect(stop_cb)
+
+            conversation_transcriber.start_transcribing_async()
+            # Poll until one of the stop callbacks fires.
+            while not transcribing_stop:
+                time.sleep(self.polling_interval_seconds)
+            conversation_transcriber.stop_transcribing_async()
+            return iter(document_list)
+
+        try:
+            return recognize_from_file()
+        except Exception as err:
+            logger.error("Encountered exception. {}".format(err))
+            raise
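
For reviewers who want to try the new parser, here is a minimal usage sketch, not part of the diff. It assumes the import path used by the tests below (langchain_community.document_loaders.parsers.audio) and that `azure-cognitiveservices-speech` is installed; the key, region, and audio path are placeholders.

    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

    # Placeholder credentials and audio path, for illustration only.
    parser = AzureAISpeechParser(
        api_key="speech-api-key-from-azure",
        region="eastasia",
        speech_recognition_language="en-US",
    )

    blob = Blob.from_path("example.wav")
    for doc in parser.lazy_parse(blob):
        # Offset/duration metadata has been converted from 100 ns ticks to seconds.
        print(doc.metadata["offset_second"], doc.page_content)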

View File

@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from typing import Any, Iterator, List
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser
+
+SPEECH_SERVICE_REGION = "eastasia"
+SPEECH_SERVICE_KEY = "someservicekey"
+
+
+# Loader for testing purposes only
+class _AzureAISpeechLoader(BaseLoader):
+    """Azure AI Speech Service Document Loader.
+
+    A document loader that can load an audio file from the local file system
+    and transcribe it using the Azure AI Speech service.
+
+    Examples:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import AzureAISpeechLoader
+
+            loader = AzureAISpeechLoader(
+                file_path="path/to/directory/example.wav",
+                api_key="speech-api-key-from-azure",
+                region="speech-api-region-from-azure",
+            )
+            loader.lazy_load()
+    """
+
+    def __init__(self, file_path: str, **kwargs: Any) -> None:
+        """
+        Args:
+            file_path: The path to the audio file.
+            **kwargs: Keyword arguments forwarded to ``AzureAISpeechParser``.
+        """
+        self.file_path = file_path
+        self.parser = AzureAISpeechParser(**kwargs)  # type: ignore
+
+    def load(self) -> List[Document]:
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        blob = Blob.from_path(self.file_path)
+        return self.parser.lazy_parse(blob)
+
+
+def _get_audio_file_path() -> str:
+    return "../test_audio/whatstheweatherlike.wav"
+
+
+def test_azure_speech_load_key_region_auto_detect_languages() -> None:
+    loader = _AzureAISpeechLoader(
+        _get_audio_file_path(),
+        api_key=SPEECH_SERVICE_KEY,
+        region=SPEECH_SERVICE_REGION,
+        auto_detect_languages=["zh-CN", "en-US"],
+    )
+    documents = loader.load()
+    assert "what" in documents[0].page_content.lower()
+
+
+def test_azure_speech_load_key_region_language() -> None:
+    loader = _AzureAISpeechLoader(
+        _get_audio_file_path(),
+        api_key=SPEECH_SERVICE_KEY,
+        region=SPEECH_SERVICE_REGION,
+        speech_recognition_language="en-US",
+    )
+    documents = loader.load()
+    assert "what" in documents[0].page_content.lower()
+
+
+def test_azure_speech_load_key_region() -> None:
+    loader = _AzureAISpeechLoader(
+        _get_audio_file_path(), api_key=SPEECH_SERVICE_KEY, region=SPEECH_SERVICE_REGION
+    )
+    documents = loader.load()
+    assert "what" in documents[0].page_content.lower()
+
+
+def test_azure_speech_load_key_endpoint() -> None:
+    loader = _AzureAISpeechLoader(
+        _get_audio_file_path(),
+        api_key=SPEECH_SERVICE_KEY,
+        endpoint=f"wss://{SPEECH_SERVICE_REGION}.stt.speech.microsoft.com/speech/recognition"
+        "/conversation/cognitiveservices/v1",
+    )
+    documents = loader.load()
+    assert "what" in documents[0].page_content.lower()