From a6b49b2de50d7b9901214c610d4448cea75a0220 Mon Sep 17 00:00:00 2001
From: kzmain
Date: Mon, 25 Dec 2023 22:09:37 +0800
Subject: [PATCH] community: move Azure AI Speech loader to langchain_community

---
 .../document_loaders/__init__.py              |   2 ++
 .../document_loaders/azure_ai_speech.py}      |  18 +-
 .../document_loaders/parsers/audio.py         | 225 +++++++++++++++++-
 .../unit_tests/document_loaders/test_audio.py |  10 +-
 .../test_audio/whatstheweatherlike.wav        | Bin
 5 files changed, 240 insertions(+), 15 deletions(-)
 rename libs/{langchain/langchain/document_loaders/audio.py => community/langchain_community/document_loaders/azure_ai_speech.py} (62%)
 rename libs/{langchain => community}/tests/unit_tests/document_loaders/test_audio.py (81%)
 rename libs/{langchain => community}/tests/unit_tests/test_audio/whatstheweatherlike.wav (100%)

diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index ca295e538eb..b7d24acf64c 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -26,6 +26,7 @@ from langchain_community.document_loaders.airbyte import (
     AirbyteTypeformLoader,
     AirbyteZendeskSupportLoader,
 )
+from langchain_community.document_loaders.azure_ai_speech import AzureAISpeechLoader
 from langchain_community.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain_community.document_loaders.airtable import AirtableLoader
 from langchain_community.document_loaders.apify_dataset import ApifyDatasetLoader
@@ -250,6 +251,7 @@ __all__ = [
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
     "AzureAIDataLoader",
     "AzureAIDocumentIntelligenceLoader",
+    "AzureAISpeechLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
diff --git a/libs/langchain/langchain/document_loaders/audio.py b/libs/community/langchain_community/document_loaders/azure_ai_speech.py
similarity index 62%
rename from libs/langchain/langchain/document_loaders/audio.py
rename to libs/community/langchain_community/document_loaders/azure_ai_speech.py
index 283d4f13c03..8053a190737 100644
--- a/libs/langchain/langchain/document_loaders/audio.py
+++ b/libs/community/langchain_community/document_loaders/azure_ai_speech.py
@@ -4,24 +4,24 @@ from typing import List, Optional
 
 from langchain_core.documents import Document
 
-from langchain.document_loaders import Blob
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.parsers.audio import AzureSpeechServiceParser
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.document_loaders.parsers.audio import AzureAISpeechParser
 
 
-class AzureSpeechServiceLoader(BaseLoader):
-    """Azure Speech Service Document Loader.
+class AzureAISpeechLoader(BaseLoader):
+    """Azure AI Speech Service Document Loader.
 
     A document loader that can load an audio file from the local file system
-    and transcribe it using Azure Speech Service.
+    and transcribe it using Azure AI Speech Service.
 
     Examples:
 
         .. code-block:: python
 
-            from langchain.document_loaders import AzureSpeechServiceLoader
+            from langchain_community.document_loaders import AzureAISpeechLoader
 
-            loader = AzureSpeechServiceLoader(
+            loader = AzureAISpeechLoader(
                 file_path="path/to/directory/example.wav",
                 api_key="speech-api-key-from-azure",
                 region="speech-api-region-from-azure"
@@ -43,4 +43,4 @@
         file_path: The path to the audio file.
         """
         self.file_path = file_path
-        self.parser = AzureSpeechServiceParser(**kwargs)
+        self.parser = AzureAISpeechParser(**kwargs)
diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py
index ab54c67ed37..85be627cb3b 100644
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@@ -1,8 +1,9 @@
 import logging
 import time
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Tuple
 
 from langchain_core.documents import Document
+from langchain_core.utils import get_from_env
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -308,3 +309,225 @@
             page_content=res.normalized_text,
             metadata={"source": blob.source},
         )
+
+
+class AzureAISpeechParser(BaseBlobParser):
+    """Transcribe audio blobs using the Azure AI Speech service.
+
+    This parser uses the Azure AI Speech transcription module to convert
+    an audio file to text.
+
+    Official SDK documentation:
+    https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.transcription?view=azure-python
+
+    Official transcription sample:
+    https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/2e39515446ec261bf9fd8d42902147c51c5f72cd/samples/python/console/transcription_sample.py
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        log_path: Optional[str] = None,
+        polling_interval_seconds: float = 0.5,
+        speech_recognition_language: Optional[str] = None,
+        auto_detect_languages: Optional[List[str]] = None,
+        speech_config_kwargs: Optional[Dict] = None,
+    ) -> None:
+        """Initialize the parser.
+
+        See ``speechsdk.SpeechConfig()`` for more information about how these
+        parameters are used.
+
+        Args:
+            api_key: The Azure AI Speech service subscription key.
+            region: The Azure AI Speech service region. Either this argument
+                or the endpoint argument is required.
+            endpoint: The Azure AI Speech service endpoint (wss protocol).
+                Useful when using an Azure cloud other than the global one,
+                such as Azure China or Azure Germany.
+            log_path: Where to write the transcription job log, if a log is
+                required.
+            polling_interval_seconds: How often to check the transcription
+                job status.
+            speech_recognition_language: The source language of the audio.
+            auto_detect_languages: A list of candidate source languages for
+                automatic language detection during recognition.
+            speech_config_kwargs: Extra keyword arguments passed through to
+                ``speechsdk.SpeechConfig``.
+        """
+        self.api_key = (
+            api_key
+            if api_key is not None
+            else get_from_env("api_key", "AZURE_SPEECH_SERVICE_KEY")
+        )
+        self.region = (
+            region
+            if region is not None
+            else get_from_env("region", "AZURE_SPEECH_REGION", "") or None
+        )
+        self.endpoint = (
+            endpoint
+            if endpoint is not None
+            else get_from_env("endpoint", "AZURE_SPEECH_ENDPOINT", "") or None
+        )
+        if not self.region and not self.endpoint:
+            raise ValueError(
+                "You need to provide either the region or the endpoint argument."
+            )
+
+        self.log_path = (
+            log_path
+            if log_path is not None
+            else get_from_env("log_path", "AZURE_SPEECH_LOG_PATH", "") or None
+        )
+        self.polling_interval_seconds = polling_interval_seconds
+        self.speech_recognition_language = speech_recognition_language
+        self.auto_detect_languages = auto_detect_languages
+        self.speech_config_kwargs = (
+            speech_config_kwargs if speech_config_kwargs is not None else {}
+        )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import json
+
+        document_list: List[Document] = []
+
+        try:
+            import azure.cognitiveservices.speech as speechsdk
+        except ImportError:
+            raise ImportError(
+                "azure.cognitiveservices.speech package not found, please install "
+                "it with `pip install azure-cognitiveservices-speech`."
+            )
+
+        def conversation_transcriber_recognition_canceled_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # Canceled event
+            pass
+
+        def conversation_transcriber_session_stopped_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStopped event
+            pass
+
+        def conversation_transcriber_transcribed_cb(
+            evt: speechsdk.SpeechRecognitionEventArgs,
+        ) -> None:
+            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
+                evt_dict = json.loads(evt.result.json)
+
+                content = evt_dict["DisplayText"]
+
+                if self.speech_recognition_language is not None:
+                    language = self.speech_recognition_language
+                elif self.auto_detect_languages is not None:
+                    language = evt_dict.get("PrimaryLanguage", {}).get(
+                        "Language", "Unknown"
+                    )
+                else:
+                    language = "Unknown"
+
+                speaker_id = evt_dict.get("SpeakerId", "Unknown")
+                # Offset and Duration are reported in 100-nanosecond ticks;
+                # dividing by 10**7 converts them to seconds.
+                document_list.append(
+                    Document(
+                        page_content=content,
+                        metadata={
+                            "offset_second": int(evt_dict["Offset"]) / 10**7,
+                            "duration_second": int(evt_dict["Duration"]) / 10**7,
+                            "language": language,
+                            "speaker_id": speaker_id,
+                        },
+                    )
+                )
+            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
+                print(
+                    "NOMATCH: Speech could not be transcribed: {}".format(
+                        evt.result.no_match_details
+                    )
+                )
+
+        def conversation_transcriber_session_started_cb(
+            evt: speechsdk.SessionEventArgs,
+        ) -> None:
+            # SessionStarted event
+            pass
+
+        def recognize_from_file() -> Iterator[Document]:
+            # Speech service speech config
+            speech_config = speechsdk.SpeechConfig(
+                subscription=self.api_key,
+                region=self.region,
+                endpoint=self.endpoint,
+                speech_recognition_language=self.speech_recognition_language,
+                **self.speech_config_kwargs,
+            )
+            speech_config.output_format = speechsdk.OutputFormat.Detailed
+
+            if self.log_path is not None:
+                speech_config.set_property(
+                    speechsdk.PropertyId.Speech_LogFilename, self.log_path
+                )
+
+            # Speech service audio config
+            audio_config = speechsdk.audio.AudioConfig(filename=str(blob.path))
+
+            # Optional automatic source-language detection
+            if self.auto_detect_languages is not None:
+                auto_detect_source_language_config = (
+                    speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
+                        languages=self.auto_detect_languages
+                    )
+                )
+            else:
+                auto_detect_source_language_config = None
+
+            conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
+                speech_config=speech_config,
+                audio_config=audio_config,
+                auto_detect_source_language_config=auto_detect_source_language_config,
+            )
+
+            transcribing_stop = False
+
+            def stop_cb(evt: speechsdk.SessionEventArgs) -> None:
+                # Callback that signals to stop continuous transcription
+                # upon receiving an event `evt`
+                print("CLOSING on {}".format(evt))
+                nonlocal transcribing_stop
+                transcribing_stop = True
+
+            # Connect callbacks to the events fired by the conversation transcriber
+            conversation_transcriber.transcribed.connect(
+                conversation_transcriber_transcribed_cb
+            )
+            conversation_transcriber.session_started.connect(
+                conversation_transcriber_session_started_cb
+            )
+            conversation_transcriber.session_stopped.connect(
+                conversation_transcriber_session_stopped_cb
+            )
+            conversation_transcriber.canceled.connect(
+                conversation_transcriber_recognition_canceled_cb
+            )
+            # Stop transcribing on either session stopped or canceled events
+            conversation_transcriber.session_stopped.connect(stop_cb)
+            conversation_transcriber.canceled.connect(stop_cb)
+
+            conversation_transcriber.start_transcribing_async()
+
+            # Wait for completion.
+            while not transcribing_stop:
+                time.sleep(self.polling_interval_seconds)
+
+            conversation_transcriber.stop_transcribing_async()
+            return iter(document_list)
+
+        return recognize_from_file()
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_audio.py b/libs/community/tests/unit_tests/document_loaders/test_audio.py
similarity index 81%
rename from libs/langchain/tests/unit_tests/document_loaders/test_audio.py
rename to libs/community/tests/unit_tests/document_loaders/test_audio.py
index 0f0b582b6fa..5bb0ff196b8 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_audio.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_audio.py
@@ -1,7 +1,7 @@
-from langchain.document_loaders.audio import AzureSpeechServiceLoader
+from langchain_community.document_loaders import AzureAISpeechLoader
 
 SPEECH_SERVICE_REGION = ""
 SPEECH_SERVICE_KEY = ""
 
 
 def _get_audio_file_path() -> str:
@@ -9,7 +9,7 @@
 
 
 def test_azure_speech_load_key_region_auto_detect_languages() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -20,7 +20,7 @@
 
 
 def test_azure_speech_load_key_region_language() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION,
@@ -31,7 +31,7 @@
 
 
 def test_azure_speech_load_key_region() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         region=SPEECH_SERVICE_REGION
@@ -41,7 +41,7 @@
 
 
 def test_azure_speech_load_key_endpoint() -> None:
-    loader = AzureSpeechServiceLoader(
+    loader = AzureAISpeechLoader(
         _get_audio_file_path(),
         api_key=SPEECH_SERVICE_KEY,
         endpoint=f"wss://{SPEECH_SERVICE_REGION}.stt.speech.microsoft.com/speech/recognition"
diff --git a/libs/langchain/tests/unit_tests/test_audio/whatstheweatherlike.wav b/libs/community/tests/unit_tests/test_audio/whatstheweatherlike.wav
similarity index 100%
rename from libs/langchain/tests/unit_tests/test_audio/whatstheweatherlike.wav
rename to libs/community/tests/unit_tests/test_audio/whatstheweatherlike.wav
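
Usage sketch (illustrative, not part of the diff): a minimal end-to-end example
of the loader this patch adds, assuming the azure-cognitiveservices-speech SDK
is installed, that valid credentials are available in the
AZURE_SPEECH_SERVICE_KEY and AZURE_SPEECH_REGION environment variables (the
names the parser falls back to), and that the loader exposes BaseLoader's
standard load() interface. The metadata keys match those emitted by
AzureAISpeechParser.

    import os

    from langchain_community.document_loaders import AzureAISpeechLoader

    loader = AzureAISpeechLoader(
        "libs/community/tests/unit_tests/test_audio/whatstheweatherlike.wav",
        api_key=os.environ["AZURE_SPEECH_SERVICE_KEY"],
        region=os.environ["AZURE_SPEECH_REGION"],
        speech_recognition_language="en-US",
    )
    for doc in loader.load():
        # offset_second and duration_second are in seconds; the parser
        # converts them from the service's 100-nanosecond ticks.
        print(doc.metadata["speaker_id"], doc.metadata["offset_second"])
        print(doc.page_content)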
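Endpoint sketch: for a sovereign or non-global Azure cloud, the parser accepts
a wss endpoint instead of a region, mirroring test_azure_speech_load_key_endpoint
above. The host below is the public-cloud speech-to-text endpoint shape for a
given region; sovereign clouds use their own domains, so treat the URL as an
assumption to adapt.

    from langchain_community.document_loaders import AzureAISpeechLoader

    loader = AzureAISpeechLoader(
        "example.wav",
        api_key="<speech-api-key>",
        endpoint="wss://eastasia.stt.speech.microsoft.com/speech/recognition",
    )
    docs = loader.load()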