@@ -1,8 +1,11 @@
 from __future__ import annotations

+import logging
 import time
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Sequence, Tuple

 from langchain_core.documents import Document
+from langchain_core.utils import get_from_env

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
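
The hunks below replace print calls with calls to a module-level logger that is not itself visible in this diff. A minimal sketch of the setup these calls assume (standard library logging, with the module name) would be:

import logging

# Module-level logger assumed by the logger.info()/logger.error() calls below.
logger = logging.getLogger(__name__)
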
@@ -79,7 +82,7 @@ class OpenAIWhisperParser(BaseBlobParser):
                 file_obj.name = f"part_{split_number}.mp3"

             # Transcribe
-            print(f"Transcribing part {split_number + 1}!") # noqa: T201
+            logger.info(f"Transcribing part {split_number + 1}!") # noqa: T201
             attempts = 0
             while attempts < 3:
                 try:
@@ -92,10 +95,10 @@ class OpenAIWhisperParser(BaseBlobParser):
                     break
                 except Exception as e:
                     attempts += 1
-                    print(f"Attempt {attempts} failed. Exception: {str(e)}") # noqa: T201
+                    logger.error(f"Attempt {attempts} failed. Exception: {str(e)}") # noqa: T201
                     time.sleep(5)
             else:
-                print("Failed to transcribe after 3 attempts.") # noqa: T201
+                logger.info("Failed to transcribe after 3 attempts.") # noqa: T201
                 continue

             yield Document(
@@ -184,7 +187,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             rec_model = "openai/whisper-large"
         self.lang_model = lang_model if lang_model else rec_model

-        print("Using the following model: ", self.lang_model) # noqa: T201
+        logger.info(f"Using the following model: {self.lang_model}")

         self.batch_size = batch_size
@@ -231,7 +234,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
         file_obj = io.BytesIO(audio.export(format="mp3").read())

         # Transcribe
-        print(f"Transcribing part {blob.path}!") # noqa: T201
+        logger.info(f"Transcribing part {blob.path}!") # noqa: T201

         y, sr = librosa.load(file_obj, sr=16000)
@@ -319,3 +322,219 @@ class YandexSTTParser(BaseBlobParser):
                page_content=res.normalized_text,
                metadata={"source": blob.source},
            )


class AzureAISpeechParser(BaseBlobParser):
    """Transcribe audio blobs with the Microsoft Azure AI Speech service.

    This parser uses the transcription module of the Azure Cognitive Services
    Speech SDK to convert an audio file to text.

    Official transcription SDK documentation:
    https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.transcription?view=azure-python

    Official transcription SDK sample:
    https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/2e39515446ec261bf9fd8d42902147c51c5f72cd/samples/python/console/transcription_sample.py
    """

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        endpoint: Optional[str] = None,
        log_path: Optional[str] = None,
        polling_interval_seconds: float = 0.5,
        speech_recognition_language: Optional[str] = None,
        auto_detect_languages: Optional[Sequence[str]] = None,
        speech_config_kwargs: Optional[dict] = None,
    ) -> None:
        """Initialize the parser.

        See ``speechsdk.SpeechConfig()`` for more information about how these
        parameters are used.

        Args:
            api_key: The Azure AI Speech service authentication (subscription) key.
            region: The Azure AI Speech service region. Either this argument or
                ``endpoint`` is required.
            endpoint: The Azure AI Speech service endpoint (wss protocol). Useful
                when running against an Azure cloud other than the Azure Global
                Cloud, such as Azure China or Azure Germany.
            log_path: Path of the file to write the transcription job log to,
                if a log is required.
            polling_interval_seconds: How often to poll the transcription job
                status, in seconds.
            speech_recognition_language: The source language of the audio, if it
                is known in advance.
            auto_detect_languages: A list of candidate source languages used for
                automatic source-language detection during recognition.
            speech_config_kwargs: Additional keyword arguments passed through to
                ``speechsdk.SpeechConfig``.
        """

        self.api_key = api_key or get_from_env("api_key", "AZURE_SPEECH_SERVICE_KEY")
        # Region, endpoint and log_path are optional; use an empty default (mapped
        # back to None) so a missing environment variable does not raise here.
        self.region = (
            region or get_from_env("region", "AZURE_SPEECH_REGION", default="") or None
        )
        self.endpoint = (
            endpoint
            or get_from_env("endpoint", "AZURE_SPEECH_ENDPOINT", default="")
            or None
        )

        if not self.region and not self.endpoint:
            raise ValueError(
                "You need to provide either the region or the endpoint argument."
            )

        self.log_path = (
            log_path
            or get_from_env("log_path", "AZURE_SPEECH_LOG_PATH", default="")
            or None
        )
        self.polling_interval_seconds = polling_interval_seconds

        self.speech_recognition_language = speech_recognition_language
        self.auto_detect_languages = auto_detect_languages
        self.speech_config_kwargs = (
            speech_config_kwargs if speech_config_kwargs is not None else {}
        )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import json

        raw_json_list: List[dict] = []
        document_list: List[Document] = []

        try:
            import azure.cognitiveservices.speech as speechsdk
        except ImportError:
            raise ImportError(
                "azure.cognitiveservices.speech package not found, please install "
                "it with `pip install azure-cognitiveservices-speech`."
            )

        def conversation_transcriber_recognition_canceled_cb(
            evt: speechsdk.SessionEventArgs,
        ) -> None:
            # Canceled event
            pass

        def conversation_transcriber_session_stopped_cb(
            evt: speechsdk.SessionEventArgs,
        ) -> None:
            # SessionStopped event
            pass

        def conversation_transcriber_transcribed_cb(
            evt: speechsdk.SpeechRecognitionEventArgs,
        ) -> None:
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                evt_dict = json.loads(evt.result.json)

                content = evt_dict["DisplayText"]

                if self.speech_recognition_language is not None:
                    language = self.speech_recognition_language
                elif self.auto_detect_languages is not None:
                    temp_dict = evt_dict["PrimaryLanguage"]
                    language = (
                        temp_dict["Language"] if "Language" in temp_dict else "Unknown"
                    )
                else:
                    language = "Unknown"

                speaker_id = (
                    evt_dict["SpeakerId"] if "SpeakerId" in evt_dict else "Unknown"
                )
                # Offset and Duration are reported by the service in
                # 100-nanosecond ticks; they are converted to seconds below.
                offset_second = evt_dict["Offset"]
                duration_second = evt_dict["Duration"]

                _doc = Document(
                    page_content=content,
                    metadata={
                        "offset_second": int(offset_second) / 10**7,
                        "duration_second": int(duration_second) / 10**7,
                        "language": language,
                        "speaker_id": speaker_id,
                    },
                )
                logger.info(f"TRANSCRIBED:{evt_dict}")
                raw_json_list.append(evt_dict)
                document_list.append(_doc)
            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
                logger.warning(
                    "\tNOMATCH: Speech could not be TRANSCRIBED: {}".format(
                        evt.result.no_match_details
                    )
                )

        def conversation_transcriber_session_started_cb(
            evt: speechsdk.SessionEventArgs,
        ) -> None:
            # SessionStarted event
            pass

        def recognize_from_file() -> Iterator[Document]:
            # Speech service speech config
            speech_config = speechsdk.SpeechConfig(
                subscription=self.api_key,
                region=self.region,
                endpoint=self.endpoint,
                speech_recognition_language=self.speech_recognition_language,
                **self.speech_config_kwargs,
            )
            speech_config.output_format = speechsdk.OutputFormat.Detailed

            if self.log_path is not None:
                speech_config.set_property(
                    speechsdk.PropertyId.Speech_LogFilename, self.log_path
                )

            # Speech service audio config
            audio_config = speechsdk.audio.AudioConfig(filename=blob.path)

            # Speech service auto_detect_source_language_config config
            if self.auto_detect_languages is not None:
                auto_detect_source_language_config = (
                    speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                        languages=self.auto_detect_languages
                    )
                )
            else:
                auto_detect_source_language_config = None

            conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
                speech_config=speech_config,
                audio_config=audio_config,
                auto_detect_source_language_config=auto_detect_source_language_config,
            )

            transcribing_stop = False

            def stop_cb(evt: speechsdk.SessionEventArgs) -> None:
                # callback that signals to stop continuous recognition
                # upon receiving an event `evt`
                logger.info("CLOSING on {}".format(evt))
                nonlocal transcribing_stop
                transcribing_stop = True

            # Connect callbacks to the events fired by the conversation transcriber
            conversation_transcriber.transcribed.connect(
                conversation_transcriber_transcribed_cb
            )
            conversation_transcriber.session_started.connect(
                conversation_transcriber_session_started_cb
            )
            conversation_transcriber.session_stopped.connect(
                conversation_transcriber_session_stopped_cb
            )
            conversation_transcriber.canceled.connect(
                conversation_transcriber_recognition_canceled_cb
            )
            # stop transcribing on either session stopped or canceled events
            conversation_transcriber.session_stopped.connect(stop_cb)
            conversation_transcriber.canceled.connect(stop_cb)

            conversation_transcriber.start_transcribing_async()

            # Waits for completion.
            while not transcribing_stop:
                time.sleep(self.polling_interval_seconds)

            conversation_transcriber.stop_transcribing_async()
            return iter(document_list)

        try:
            return recognize_from_file()
        except Exception as err:
            logger.error("Encountered exception. {}".format(err))
            raise err
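
For orientation, a rough usage sketch of the new parser follows; the key, region, and audio path are placeholders rather than values taken from the diff:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

# Placeholder credentials and audio path, for illustration only.
parser = AzureAISpeechParser(
    api_key="<azure-speech-key>",
    region="eastasia",
    speech_recognition_language="en-US",
)
blob = Blob.from_path("path/to/example.wav")

# lazy_parse blocks until the conversation-transcription session stops, then
# returns an iterator of Documents, one per recognized utterance, each carrying
# offset_second, duration_second, language, and speaker_id metadata.
transcripts = [
    (doc.metadata["offset_second"], doc.metadata["speaker_id"], doc.page_content)
    for doc in parser.lazy_parse(blob)
]

The integration test added below drives the same flow through a small document-loader wrapper.
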
@@ -0,0 +1,91 @@
from __future__ import annotations

from typing import Any, Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

SPEECH_SERVICE_REGION = "eastasia"
SPEECH_SERVICE_KEY = "someservicekey"


# Loader for testing purposes only
class _AzureAISpeechLoader(BaseLoader):
    """Azure AI Speech Service document loader (for testing purposes only).

    A document loader that loads an audio file from the local file system
    and transcribes it using the Azure AI Speech service.

    Examples:

        .. code-block:: python

            loader = _AzureAISpeechLoader(
                file_path="path/to/directory/example.wav",
                api_key="speech-api-key-from-azure",
                region="speech-api-region-from-azure",
            )
            loader.lazy_load()
    """

    def __init__(self, file_path: str, **kwargs: Any) -> None:
        """
        Args:
            file_path: The path to the audio file.
            kwargs: Additional keyword arguments passed to ``AzureAISpeechParser``.
        """
        self.file_path = file_path
        self.parser = AzureAISpeechParser(**kwargs)  # type: ignore

    def load(self) -> List[Document]:
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        blob = Blob.from_path(self.file_path)
        return self.parser.lazy_parse(blob)


def _get_audio_file_path() -> str:
    return "../test_audio/whatstheweatherlike.wav"


def test_azure_speech_load_key_region_auto_detect_languages() -> None:
    loader = _AzureAISpeechLoader(
        _get_audio_file_path(),
        api_key=SPEECH_SERVICE_KEY,
        region=SPEECH_SERVICE_REGION,
        auto_detect_languages=["zh-CN", "en-US"],
    )
    documents = loader.load()
    assert "what" in documents[0].page_content.lower()


def test_azure_speech_load_key_region_language() -> None:
    loader = _AzureAISpeechLoader(
        _get_audio_file_path(),
        api_key=SPEECH_SERVICE_KEY,
        region=SPEECH_SERVICE_REGION,
        speech_recognition_language="en-US",
    )
    documents = loader.load()
    assert "what" in documents[0].page_content.lower()


def test_azure_speech_load_key_region() -> None:
    loader = _AzureAISpeechLoader(
        _get_audio_file_path(), api_key=SPEECH_SERVICE_KEY, region=SPEECH_SERVICE_REGION
    )
    documents = loader.load()
    assert "what" in documents[0].page_content.lower()


def test_azure_speech_load_key_endpoint() -> None:
    loader = _AzureAISpeechLoader(
        _get_audio_file_path(),
        api_key=SPEECH_SERVICE_KEY,
        endpoint=f"wss://{SPEECH_SERVICE_REGION}.stt.speech.microsoft.com/speech/recognition"
        "/conversation/cognitiveservices/v1",
    )
    documents = loader.load()
    assert "what" in documents[0].page_content.lower()
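
The tests above pass credentials explicitly; because the parser constructor falls back to get_from_env, the same configuration can also come from environment variables. A brief sketch, with placeholder values:

import os

from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

# Placeholder credentials; the variable names mirror the get_from_env calls
# in AzureAISpeechParser.__init__.
os.environ["AZURE_SPEECH_SERVICE_KEY"] = "<azure-speech-key>"
os.environ["AZURE_SPEECH_REGION"] = "eastasia"

parser = AzureAISpeechParser()  # api_key and region are read from the environment
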
Binary file not shown.