move loader to the langchain_community package

kzmain
2023-12-25 22:09:37 +08:00
parent edbb79574f
commit a6b49b2de5
5 changed files with 256 additions and 31 deletions

View File

@@ -26,6 +26,7 @@ from langchain_community.document_loaders.airbyte import (
     AirbyteTypeformLoader,
     AirbyteZendeskSupportLoader,
 )
+from langchain_community.document_loaders.azure_ai_speech import AzureAISpeechLoader
 from langchain_community.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain_community.document_loaders.airtable import AirtableLoader
 from langchain_community.document_loaders.apify_dataset import ApifyDatasetLoader
@@ -53,7 +54,7 @@ from langchain_community.document_loaders.blob_loaders import (
     Blob,
     BlobLoader,
     FileSystemBlobLoader,
-    YoutubeAudioLoader,
+    YoutubeAudioLoader
 )
 from langchain_community.document_loaders.blockchain import BlockchainDocumentLoader
 from langchain_community.document_loaders.brave_search import BraveSearchLoader
@@ -250,6 +251,7 @@ __all__ = [
"AssemblyAIAudioTranscriptLoader",
"AsyncHtmlLoader",
"AzureAIDataLoader",
"AzureAISpeechLoader",
"AzureAIDocumentIntelligenceLoader",
"AzureBlobStorageContainerLoader",
"AzureBlobStorageFileLoader",

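With the import and the `__all__` entry above, the new loader is re-exported from the package root. A minimal sanity-check sketch, assuming the langchain_community build from this branch is installed:

    # Sanity-check sketch: this commit should make the import path below work
    # (assumes the langchain_community build from this branch is installed).
    from langchain_community import document_loaders

    assert "AzureAISpeechLoader" in document_loaders.__all__
    loader_cls = document_loaders.AzureAISpeechLoader
    print(loader_cls.__doc__)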
View File

@@ -4,24 +4,24 @@ from typing import List, Optional
 from langchain_core.documents import Document
-from langchain.document_loaders import Blob
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.parsers.audio import AzureSpeechServiceParser
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser


-class AzureSpeechServiceLoader(BaseLoader):
-    """Azure Speech Service Document Loader.
+class AzureAISpeechLoader(BaseLoader):
+    """Azure AI Speech Service Document Loader.

     A document loader that can load an audio file from the local file system
-    and transcribe it using Azure Speech Service.
+    and transcribe it using Azure AI Speech Service.

     Examples:

         .. code-block:: python

-            from langchain.document_loaders import AzureSpeechServiceLoader
+            from langchain_community.document_loaders import AzureAISpeechLoader

-            loader = AzureSpeechServiceLoader(
+            loader = AzureAISpeechLoader(
                 file_path="path/to/directory/example.wav",
                 api_key="speech-api-key-from-azure",
                 region="speech-api-region-from-azure"
@@ -43,4 +43,4 @@ class AzureSpeechServiceLoader(BaseLoader):
             file_path: The path to the audio file.
         """
         self.file_path = file_path
-        self.parser = AzureSpeechServiceParser(**kwargs)
+        self.parser = AzureAISpeechParser(**kwargs)

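For context, a hedged usage sketch of the renamed loader. The file path and credentials are placeholders, and it assumes the loader exposes the standard `load()` method via `BaseLoader`, as the docstring example suggests:

    # Usage sketch for AzureAISpeechLoader (placeholder path and credentials).
    # Extra kwargs are forwarded to AzureAISpeechParser.
    from langchain_community.document_loaders import AzureAISpeechLoader

    loader = AzureAISpeechLoader(
        file_path="path/to/directory/example.wav",  # local audio file
        api_key="<speech-api-key-from-azure>",      # placeholder
        region="<speech-api-region-from-azure>",    # or endpoint="wss://..."
    )
    docs = loader.load()  # transcribed segments as Documents
    for doc in docs:
        print(doc.metadata.get("speaker_id"), doc.page_content)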
View File

@@ -1,8 +1,9 @@
 import logging
 import time
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Tuple

 from langchain_core.documents import Document
+from langchain_core.utils import get_from_env

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -56,7 +57,7 @@ class OpenAIWhisperParser(BaseBlobParser):
         # Split the audio into chunk_duration_ms chunks
         for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
             # Audio chunk
-            chunk = audio[i : i + chunk_duration_ms]
+            chunk = audio[i: i + chunk_duration_ms]
             file_obj = io.BytesIO(chunk.export(format="mp3").read())
             if blob.source is not None:
                 file_obj.name = blob.source + f"_part_{split_number}.mp3"
@@ -64,7 +65,7 @@ class OpenAIWhisperParser(BaseBlobParser):
                 file_obj.name = f"part_{split_number}.mp3"
             # Transcribe
-            print(f"Transcribing part {split_number+1}!")
+            print(f"Transcribing part {split_number + 1}!")
             attempts = 0
             while attempts < 3:
                 try:
@@ -113,10 +114,10 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
"""
def __init__(
self,
device: str = "0",
lang_model: Optional[str] = None,
forced_decoder_ids: Optional[Tuple[Dict]] = None,
self,
device: str = "0",
lang_model: Optional[str] = None,
forced_decoder_ids: Optional[Tuple[Dict]] = None,
):
"""Initialize the parser.
@@ -155,7 +156,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             self.device = "cuda:0"
             # check GPU memory and select automatically the model
             mem = torch.cuda.get_device_properties(self.device).total_memory / (
-                1024**2
+                1024 ** 2
             )
             if mem < 5000:
                 rec_model = "openai/whisper-base"
@@ -237,12 +238,12 @@ class YandexSTTParser(BaseBlobParser):
     Audio transcription is with OpenAI Whisper model."""

     def __init__(
-        self,
-        *,
-        api_key: Optional[str] = None,
-        iam_token: Optional[str] = None,
-        model: str = "general",
-        language: str = "auto",
+            self,
+            *,
+            api_key: Optional[str] = None,
+            iam_token: Optional[str] = None,
+            model: str = "general",
+            language: str = "auto",
     ):
         """Initialize the parser.
@@ -308,3 +309,225 @@ class YandexSTTParser(BaseBlobParser):
                 page_content=res.normalized_text,
                 metadata={"source": blob.source},
             )
+
+
+class AzureAISpeechParser(BaseBlobParser):
+    """Transcribe audio with the Microsoft Azure Cognitive Speech service.
+
+    This AzureAISpeechParser class makes use of the service's transcription
+    module to convert an audio file to text.
+
+    Official transcription SDK documentation:
+    https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.transcription?view=azure-python
+
+    Official transcription SDK sample:
+    https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/2e39515446ec261bf9fd8d42902147c51c5f72cd/samples/python/console/transcription_sample.py
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        log_path: Optional[str] = None,
+        polling_interval_seconds: float = 0.5,
+        speech_recognition_language: Optional[str] = None,
+        auto_detect_languages: Optional[List[str]] = None,
+        speech_config_kwargs: Optional[dict] = None,
+    ) -> None:
+        """Initialize the parser.
+
+        See ``speechsdk.SpeechConfig()`` for more information about how these
+        parameters are used.
+
+        Args:
+            api_key: The Azure Cognitive Speech service subscription key.
+            region: The Azure region of the Speech resource; either this or
+                the ``endpoint`` argument is required.
+            endpoint: The Speech service endpoint (wss protocol). Useful when
+                using an Azure cloud other than the global one, such as
+                Azure China or Azure Germany.
+            log_path: Path to write the transcription job log, if required.
+            polling_interval_seconds: How often to check the transcription
+                job status.
+            speech_recognition_language: The source language of the audio,
+                if known in advance.
+            auto_detect_languages: Candidate source languages for automatic
+                language detection during recognition.
+        """
+        self.api_key = api_key or get_from_env("api_key", "AZURE_SPEECH_SERVICE_KEY")
+        # Fall back to environment variables; treat an empty value as unset.
+        self.region = (
+            region or get_from_env("region", "AZURE_SPEECH_REGION", default="") or None
+        )
+        self.endpoint = (
+            endpoint
+            or get_from_env("endpoint", "AZURE_SPEECH_ENDPOINT", default="")
+            or None
+        )
+        if not self.region and not self.endpoint:
+            raise ValueError(
+                "You need to provide either the region or the endpoint argument."
+            )
+        self.log_path = (
+            log_path
+            or get_from_env("log_path", "AZURE_SPEECH_LOG_PATH", default="")
+            or None
+        )
+        self.polling_interval_seconds = polling_interval_seconds
+        self.speech_recognition_language = speech_recognition_language
+        self.auto_detect_languages = auto_detect_languages
+        self.speech_config_kwargs = (
+            speech_config_kwargs if speech_config_kwargs is not None else {}
+        )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import json
+
+        raw_json_list: List[dict] = []
+        document_list: List[Document] = []
+
+        try:
+            import azure.cognitiveservices.speech as speechsdk
+        except ImportError:
+            raise ImportError(
+                "azure.cognitiveservices.speech package not found, please install "
+                "it with `pip install azure-cognitiveservices-speech`."
+            )
+
+        def conversation_transcriber_recognition_canceled_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # Canceled event
+            pass
+
+        def conversation_transcriber_session_stopped_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStopped event
+            pass
+
+        def conversation_transcriber_transcribed_cb(
+            evt: speechsdk.SpeechRecognitionEventArgs,
+        ) -> None:
+            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
+                evt_dict = json.loads(evt.result.json)
+                content = evt_dict["DisplayText"]
+                if self.speech_recognition_language is not None:
+                    language = self.speech_recognition_language
+                elif self.auto_detect_languages is not None:
+                    temp_dict = evt_dict["PrimaryLanguage"]
+                    language = temp_dict.get("Language", "Unknown")
+                else:
+                    language = "Unknown"
+                speaker_id = evt_dict.get("SpeakerId", "Unknown")
+                # Offset and Duration are reported in 100-nanosecond ticks.
+                offset_second = evt_dict["Offset"]
+                duration_second = evt_dict["Duration"]
+                _doc = Document(
+                    page_content=content,
+                    metadata={
+                        "offset_second": int(offset_second) / 10**7,
+                        "duration_second": int(duration_second) / 10**7,
+                        "language": language,
+                        "speaker_id": speaker_id,
+                    },
+                )
+                print(f"TRANSCRIBED: {evt_dict}")
+                raw_json_list.append(evt_dict)
+                document_list.append(_doc)
+            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
+                print(
+                    "\tNOMATCH: Speech could not be TRANSCRIBED: {}".format(
+                        evt.result.no_match_details
+                    )
+                )
+
+        def conversation_transcriber_session_started_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStarted event
+            pass
+
+        def recognize_from_file() -> Iterator[Document]:
+            # Speech service speech config
+            speech_config = speechsdk.SpeechConfig(
+                subscription=self.api_key,
+                region=self.region,
+                endpoint=self.endpoint,
+                speech_recognition_language=self.speech_recognition_language,
+                **self.speech_config_kwargs,
+            )
+            speech_config.output_format = speechsdk.OutputFormat.Detailed
+            if self.log_path is not None:
+                speech_config.set_property(
+                    speechsdk.PropertyId.Speech_LogFilename, self.log_path
+                )
+
+            # Speech service audio config
+            audio_config = speechsdk.audio.AudioConfig(filename=str(blob.path))
+
+            # Optional automatic source-language detection
+            if self.auto_detect_languages is not None:
+                auto_detect_source_language_config = (
+                    speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
+                        languages=self.auto_detect_languages
+                    )
+                )
+            else:
+                auto_detect_source_language_config = None
+
+            conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
+                speech_config=speech_config,
+                audio_config=audio_config,
+                auto_detect_source_language_config=auto_detect_source_language_config,
+            )
+
+            transcribing_stop = False
+
+            def stop_cb(evt: speechsdk.SessionEventArgs) -> None:
+                # Callback that signals to stop continuous recognition
+                # upon receiving an event `evt`
+                print("CLOSING on {}".format(evt))
+                nonlocal transcribing_stop
+                transcribing_stop = True
+
+            # Connect callbacks to the events fired by the conversation transcriber
+            conversation_transcriber.transcribed.connect(
+                conversation_transcriber_transcribed_cb
+            )
+            conversation_transcriber.session_started.connect(
+                conversation_transcriber_session_started_cb
+            )
+            conversation_transcriber.session_stopped.connect(
+                conversation_transcriber_session_stopped_cb
+            )
+            conversation_transcriber.canceled.connect(
+                conversation_transcriber_recognition_canceled_cb
+            )
+            # Stop transcribing on either session stopped or canceled events
+            conversation_transcriber.session_stopped.connect(stop_cb)
+            conversation_transcriber.canceled.connect(stop_cb)
+
+            conversation_transcriber.start_transcribing_async()
+
+            # Wait for completion.
+            while not transcribing_stop:
+                time.sleep(self.polling_interval_seconds)
+
+            conversation_transcriber.stop_transcribing_async()
+            return iter(document_list)
+
+        try:
+            return recognize_from_file()
+        except Exception as err:
+            print("Encountered exception. {}".format(err))
+            raise

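Because `AzureAISpeechParser` is a `BaseBlobParser`, it can also be driven directly from a `Blob` without going through the loader. A minimal sketch; the credentials and audio path are placeholders, not values from this commit:

    # Minimal sketch: use AzureAISpeechParser directly on a Blob.
    # Credentials and the audio path are placeholders.
    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

    parser = AzureAISpeechParser(
        api_key="<azure-speech-key>",
        region="<azure-speech-region>",
        speech_recognition_language="en-US",  # skip auto-detection when known
    )
    blob = Blob.from_path("path/to/example.wav")
    for doc in parser.lazy_parse(blob):
        # Each Document carries offset/duration (seconds), language, speaker_id.
        print(doc.metadata["offset_second"], doc.page_content)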
View File

@@ -1,7 +1,7 @@
-from langchain.document_loaders.audio import AzureSpeechServiceLoader
+from langchain_community.document_loaders import AzureAISpeechLoader

-SPEECH_SERVICE_REGION = ""
-SPEECH_SERVICE_KEY = ""
+SPEECH_SERVICE_REGION = "eastasia"
+SPEECH_SERVICE_KEY = ""  # set locally before running; never commit a real key

 def _get_audio_file_path() -> str:
@@ -9,7 +9,7 @@ def _get_audio_file_path() -> str:
 def test_azure_speech_load_key_region_auto_detect_languages() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -20,7 +20,7 @@ def test_azure_speech_load_key_region_auto_detect_languages() -> None:
 def test_azure_speech_load_key_region_language() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -31,7 +31,7 @@ def test_azure_speech_load_key_region_language() -> None:
 def test_azure_speech_load_key_region() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION
@@ -41,7 +41,7 @@ def test_azure_speech_load_key_region() -> None:
 def test_azure_speech_load_key_endpoint() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         endpoint=f"wss://{SPEECH_SERVICE_REGION}.stt.speech.microsoft.com/speech/recognition"