move loader to the langchain_community package

kzmain
2023-12-25 22:09:37 +08:00
parent edbb79574f
commit a6b49b2de5
5 changed files with 256 additions and 31 deletions

View File

@@ -26,6 +26,7 @@ from langchain_community.document_loaders.airbyte import (
     AirbyteTypeformLoader,
     AirbyteZendeskSupportLoader,
 )
+from langchain_community.document_loaders.azure_ai_speech import AzureAISpeechLoader
 from langchain_community.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain_community.document_loaders.airtable import AirtableLoader
 from langchain_community.document_loaders.apify_dataset import ApifyDatasetLoader
@@ -53,7 +54,7 @@ from langchain_community.document_loaders.blob_loaders import (
     Blob,
     BlobLoader,
     FileSystemBlobLoader,
-    YoutubeAudioLoader,
+    YoutubeAudioLoader
 )
 from langchain_community.document_loaders.blockchain import BlockchainDocumentLoader
 from langchain_community.document_loaders.brave_search import BraveSearchLoader
@@ -250,6 +251,7 @@ __all__ = [
"AssemblyAIAudioTranscriptLoader",
"AsyncHtmlLoader",
"AzureAIDataLoader",
"AzureAISpeechLoader",
"AzureAIDocumentIntelligenceLoader",
"AzureBlobStorageContainerLoader",
"AzureBlobStorageFileLoader",

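With the import and the `__all__` entry above, the new loader is re-exported from the package root. A minimal sanity-check sketch, assuming the langchain_community build from this branch is installed:

    # Sanity-check sketch: this commit should make the import path below work
    # (assumes the langchain_community build from this branch is installed).
    from langchain_community import document_loaders

    assert "AzureAISpeechLoader" in document_loaders.__all__
    loader_cls = document_loaders.AzureAISpeechLoader
    print(loader_cls.__doc__)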
View File

@@ -4,24 +4,24 @@ from typing import List, Optional
 from langchain_core.documents import Document
-from langchain.document_loaders import Blob
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.parsers.audio import AzureSpeechServiceParser
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser


-class AzureSpeechServiceLoader(BaseLoader):
-    """Azure Speech Service Document Loader.
+class AzureAISpeechLoader(BaseLoader):
+    """Azure AI Speech Service Document Loader.

     A document loader that can load an audio file from the local file system
-    and transcribe it using Azure Speech Service.
+    and transcribe it using Azure AI Speech Service.

     Examples:

         .. code-block:: python

-            from langchain.document_loaders import AzureSpeechServiceLoader
+            from langchain_community.document_loaders import AzureAISpeechLoader

-            loader = AzureSpeechServiceLoader(
+            loader = AzureAISpeechLoader(
                 file_path="path/to/directory/example.wav",
                 api_key="speech-api-key-from-azure",
                 region="speech-api-region-from-azure"
@@ -43,4 +43,4 @@ class AzureSpeechServiceLoader(BaseLoader):
             file_path: The path to the audio file.
         """
         self.file_path = file_path
-        self.parser = AzureSpeechServiceParser(**kwargs)
+        self.parser = AzureAISpeechParser(**kwargs)

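For context, a hedged usage sketch of the renamed loader. The file path and credentials are placeholders, and it assumes the loader exposes the standard `load()` method via `BaseLoader`, as the docstring example suggests:

    # Usage sketch for AzureAISpeechLoader (placeholder path and credentials).
    # Extra kwargs are forwarded to AzureAISpeechParser.
    from langchain_community.document_loaders import AzureAISpeechLoader

    loader = AzureAISpeechLoader(
        file_path="path/to/directory/example.wav",  # local audio file
        api_key="<speech-api-key-from-azure>",      # placeholder
        region="<speech-api-region-from-azure>",    # or endpoint="wss://..."
    )
    docs = loader.load()  # transcribed segments as Documents
    for doc in docs:
        print(doc.metadata.get("speaker_id"), doc.page_content)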
View File

@@ -1,8 +1,9 @@
 import logging
 import time
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Tuple

 from langchain_core.documents import Document
+from langchain_core.utils import get_from_env

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -56,7 +57,7 @@ class OpenAIWhisperParser(BaseBlobParser):
         # Split the audio into chunk_duration_ms chunks
         for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
             # Audio chunk
-            chunk = audio[i : i + chunk_duration_ms]
+            chunk = audio[i: i + chunk_duration_ms]
             file_obj = io.BytesIO(chunk.export(format="mp3").read())
             if blob.source is not None:
                 file_obj.name = blob.source + f"_part_{split_number}.mp3"
@@ -64,7 +65,7 @@ class OpenAIWhisperParser(BaseBlobParser):
                 file_obj.name = f"part_{split_number}.mp3"
             # Transcribe
-            print(f"Transcribing part {split_number+1}!")
+            print(f"Transcribing part {split_number + 1}!")
             attempts = 0
             while attempts < 3:
                 try:
@@ -113,10 +114,10 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
"""
def __init__(
self,
device: str = "0",
lang_model: Optional[str] = None,
forced_decoder_ids: Optional[Tuple[Dict]] = None,
self,
device: str = "0",
lang_model: Optional[str] = None,
forced_decoder_ids: Optional[Tuple[Dict]] = None,
):
"""Initialize the parser.
@@ -155,7 +156,7 @@ class OpenAIWhisperParserLocal(BaseBlobParser):
             self.device = "cuda:0"
             # check GPU memory and select automatically the model
             mem = torch.cuda.get_device_properties(self.device).total_memory / (
-                1024**2
+                1024 ** 2
             )
             if mem < 5000:
                 rec_model = "openai/whisper-base"
@@ -237,12 +238,12 @@ class YandexSTTParser(BaseBlobParser):
     Audio transcription is with OpenAI Whisper model."""

     def __init__(
-        self,
-        *,
-        api_key: Optional[str] = None,
-        iam_token: Optional[str] = None,
-        model: str = "general",
-        language: str = "auto",
+            self,
+            *,
+            api_key: Optional[str] = None,
+            iam_token: Optional[str] = None,
+            model: str = "general",
+            language: str = "auto",
     ):
         """Initialize the parser.
@@ -308,3 +309,225 @@ class YandexSTTParser(BaseBlobParser):
                 page_content=res.normalized_text,
                 metadata={"source": blob.source},
             )
+
+
+class AzureAISpeechParser(BaseBlobParser):
+    """Transcribe audio with the Microsoft Azure Cognitive Speech service.
+
+    This AzureAISpeechParser class makes use of the service's transcription
+    module to convert an audio file to text.
+
+    Official transcription SDK documentation:
+    https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.transcription?view=azure-python
+
+    Official transcription SDK sample:
+    https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/2e39515446ec261bf9fd8d42902147c51c5f72cd/samples/python/console/transcription_sample.py
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        log_path: Optional[str] = None,
+        polling_interval_seconds: float = 0.5,
+        speech_recognition_language: Optional[str] = None,
+        auto_detect_languages: Optional[List[str]] = None,
+        speech_config_kwargs: Optional[dict] = None,
+    ) -> None:
+        """Initialize the parser.
+
+        See ``speechsdk.SpeechConfig()`` for more information about how these
+        parameters are used.
+
+        Args:
+            api_key: The Azure Cognitive Speech service subscription key.
+            region: The Azure region of the Speech resource; either this or
+                the ``endpoint`` argument is required.
+            endpoint: The Speech service endpoint (wss protocol). Useful when
+                using an Azure cloud other than the global one, such as
+                Azure China or Azure Germany.
+            log_path: Path to write the transcription job log, if required.
+            polling_interval_seconds: How often to check the transcription
+                job status.
+            speech_recognition_language: The source language of the audio,
+                if known in advance.
+            auto_detect_languages: Candidate source languages for automatic
+                language detection during recognition.
+        """
+        self.api_key = api_key or get_from_env("api_key", "AZURE_SPEECH_SERVICE_KEY")
+        # Fall back to environment variables; treat an empty value as unset.
+        self.region = (
+            region or get_from_env("region", "AZURE_SPEECH_REGION", default="") or None
+        )
+        self.endpoint = (
+            endpoint
+            or get_from_env("endpoint", "AZURE_SPEECH_ENDPOINT", default="")
+            or None
+        )
+        if not self.region and not self.endpoint:
+            raise ValueError(
+                "You need to provide either the region or the endpoint argument."
+            )
+        self.log_path = (
+            log_path
+            or get_from_env("log_path", "AZURE_SPEECH_LOG_PATH", default="")
+            or None
+        )
+        self.polling_interval_seconds = polling_interval_seconds
+        self.speech_recognition_language = speech_recognition_language
+        self.auto_detect_languages = auto_detect_languages
+        self.speech_config_kwargs = (
+            speech_config_kwargs if speech_config_kwargs is not None else {}
+        )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import json
+
+        raw_json_list: List[dict] = []
+        document_list: List[Document] = []
+
+        try:
+            import azure.cognitiveservices.speech as speechsdk
+        except ImportError:
+            raise ImportError(
+                "azure.cognitiveservices.speech package not found, please install "
+                "it with `pip install azure-cognitiveservices-speech`."
+            )
+
+        def conversation_transcriber_recognition_canceled_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # Canceled event
+            pass
+
+        def conversation_transcriber_session_stopped_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStopped event
+            pass
+
+        def conversation_transcriber_transcribed_cb(
+            evt: speechsdk.SpeechRecognitionEventArgs,
+        ) -> None:
+            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
+                evt_dict = json.loads(evt.result.json)
+                content = evt_dict["DisplayText"]
+                if self.speech_recognition_language is not None:
+                    language = self.speech_recognition_language
+                elif self.auto_detect_languages is not None:
+                    temp_dict = evt_dict["PrimaryLanguage"]
+                    language = temp_dict.get("Language", "Unknown")
+                else:
+                    language = "Unknown"
+                speaker_id = evt_dict.get("SpeakerId", "Unknown")
+                # Offset and Duration are reported in 100-nanosecond ticks.
+                offset_second = evt_dict["Offset"]
+                duration_second = evt_dict["Duration"]
+                _doc = Document(
+                    page_content=content,
+                    metadata={
+                        "offset_second": int(offset_second) / 10**7,
+                        "duration_second": int(duration_second) / 10**7,
+                        "language": language,
+                        "speaker_id": speaker_id,
+                    },
+                )
+                print(f"TRANSCRIBED: {evt_dict}")
+                raw_json_list.append(evt_dict)
+                document_list.append(_doc)
+            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
+                print(
+                    "\tNOMATCH: Speech could not be TRANSCRIBED: {}".format(
+                        evt.result.no_match_details
+                    )
+                )
+
+        def conversation_transcriber_session_started_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStarted event
+            pass
+
+        def recognize_from_file() -> Iterator[Document]:
+            # Speech service speech config
+            speech_config = speechsdk.SpeechConfig(
+                subscription=self.api_key,
+                region=self.region,
+                endpoint=self.endpoint,
+                speech_recognition_language=self.speech_recognition_language,
+                **self.speech_config_kwargs,
+            )
+            speech_config.output_format = speechsdk.OutputFormat.Detailed
+            if self.log_path is not None:
+                speech_config.set_property(
+                    speechsdk.PropertyId.Speech_LogFilename, self.log_path
+                )
+
+            # Speech service audio config
+            audio_config = speechsdk.audio.AudioConfig(filename=str(blob.path))
+
+            # Optional automatic source-language detection
+            if self.auto_detect_languages is not None:
+                auto_detect_source_language_config = (
+                    speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
+                        languages=self.auto_detect_languages
+                    )
+                )
+            else:
+                auto_detect_source_language_config = None
+
+            conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
+                speech_config=speech_config,
+                audio_config=audio_config,
+                auto_detect_source_language_config=auto_detect_source_language_config,
+            )
+
+            transcribing_stop = False
+
+            def stop_cb(evt: speechsdk.SessionEventArgs) -> None:
+                # Callback that signals to stop continuous recognition
+                # upon receiving an event `evt`
+                print("CLOSING on {}".format(evt))
+                nonlocal transcribing_stop
+                transcribing_stop = True
+
+            # Connect callbacks to the events fired by the conversation transcriber
+            conversation_transcriber.transcribed.connect(
+                conversation_transcriber_transcribed_cb
+            )
+            conversation_transcriber.session_started.connect(
+                conversation_transcriber_session_started_cb
+            )
+            conversation_transcriber.session_stopped.connect(
+                conversation_transcriber_session_stopped_cb
+            )
+            conversation_transcriber.canceled.connect(
+                conversation_transcriber_recognition_canceled_cb
+            )
+            # Stop transcribing on either session stopped or canceled events
+            conversation_transcriber.session_stopped.connect(stop_cb)
+            conversation_transcriber.canceled.connect(stop_cb)
+
+            conversation_transcriber.start_transcribing_async()
+
+            # Wait for completion.
+            while not transcribing_stop:
+                time.sleep(self.polling_interval_seconds)
+
+            conversation_transcriber.stop_transcribing_async()
+            return iter(document_list)
+
+        try:
+            return recognize_from_file()
+        except Exception as err:
+            print("Encountered exception. {}".format(err))
+            raise

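Because `AzureAISpeechParser` is a `BaseBlobParser`, it can also be driven directly from a `Blob` without going through the loader. A minimal sketch; the credentials and audio path are placeholders, not values from this commit:

    # Minimal sketch: use AzureAISpeechParser directly on a Blob.
    # Credentials and the audio path are placeholders.
    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser

    parser = AzureAISpeechParser(
        api_key="<azure-speech-key>",
        region="<azure-speech-region>",
        speech_recognition_language="en-US",  # skip auto-detection when known
    )
    blob = Blob.from_path("path/to/example.wav")
    for doc in parser.lazy_parse(blob):
        # Each Document carries offset/duration (seconds), language, speaker_id.
        print(doc.metadata["offset_second"], doc.page_content)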
View File

@@ -1,7 +1,7 @@
-from langchain.document_loaders.audio import AzureSpeechServiceLoader
+from langchain_community.document_loaders import AzureAISpeechLoader

-SPEECH_SERVICE_REGION = ""
-SPEECH_SERVICE_KEY = ""
+SPEECH_SERVICE_REGION = "eastasia"
+SPEECH_SERVICE_KEY = ""  # set locally before running; never commit a real key

 def _get_audio_file_path() -> str:
@@ -9,7 +9,7 @@ def _get_audio_file_path() -> str:
 def test_azure_speech_load_key_region_auto_detect_languages() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -20,7 +20,7 @@ def test_azure_speech_load_key_region_auto_detect_languages() -> None:
 def test_azure_speech_load_key_region_language() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -31,7 +31,7 @@ def test_azure_speech_load_key_region_language() -> None:
 def test_azure_speech_load_key_region() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION
@@ -41,7 +41,7 @@ def test_azure_speech_load_key_region() -> None:
 def test_azure_speech_load_key_endpoint() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         endpoint=f"wss://{SPEECH_SERVICE_REGION}.stt.speech.microsoft.com/speech/recognition"