community[minor]: Add audio-parser "faster-whisper" in audio.py (#20012)
faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is up to 4 times faster than openai/whisper for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU. It can automatically detect the following 14 languages and transcribe the text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th, it, pt, vi, ar, tr. The GitHub repository for faster-whisper is: https://github.com/SYSTRAN/faster-whisper

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
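For illustration, here is a minimal sketch of calling faster-whisper directly, showing the speed/quantization trade-off described above. The model size, file path, and compute type are illustrative assumptions, not part of this change; the API calls themselves (WhisperModel, transcribe, segment/info fields) match the ones used in the parser below.

    from faster_whisper import WhisperModel

    # Hypothetical setup: "base" model on CPU with 8-bit quantization.
    model = WhisperModel("base", device="cpu", compute_type="int8")

    # transcribe() returns a lazy generator of segments plus detection info;
    # "sample.mp3" is a placeholder path.
    segments, info = model.transcribe("sample.mp3", beam_size=5)
    print("Detected language:", info.language, info.language_probability)
    for segment in segments:
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))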
@@ -329,3 +329,135 @@ class YandexSTTParser(BaseBlobParser):
                page_content=res.normalized_text,
                metadata={"source": blob.source},
            )


class FasterWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files with faster-whisper.

    faster-whisper is a reimplementation of OpenAI's Whisper model using
    CTranslate2, which is up to 4 times faster than openai/whisper for the
    same accuracy while using less memory. The efficiency can be further
    improved with 8-bit quantization on both CPU and GPU.

    It can automatically detect the following 14 languages and transcribe the
    text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th,
    it, pt, vi, ar, tr.

    The GitHub repository for faster-whisper is:
    https://github.com/SYSTRAN/faster-whisper

    Example: Load a YouTube video and transcribe the video speech into a document.

        .. code-block:: python

            from langchain.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers.audio import (
                FasterWhisperParser,
            )
            from langchain.document_loaders.blob_loaders.youtube_audio import (
                YoutubeAudioLoader,
            )

            url = "https://www.youtube.com/watch?v=your_video"
            save_dir = "your_dir/"
            loader = GenericLoader(
                YoutubeAudioLoader([url], save_dir),
                FasterWhisperParser(),
            )
            docs = loader.load()
    """

    def __init__(
        self,
        *,
        device: Optional[str] = "cuda",
        model_size: Optional[str] = None,
    ):
        """Initialize the parser.

        Args:
            device: It can be "cuda" or "cpu" based on the available device.
            model_size: There are four model sizes to choose from: "base",
                "small", "medium", and "large-v3", based on the available
                GPU memory.
        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Determine the device to use; fall back to CPU if CUDA is unavailable.
        if device == "cpu":
            self.device = "cpu"
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Determine the model_size
        if self.device == "cpu":
            self.model_size = "base"
        else:
            # Set the model_size based on the available GPU memory (in MiB)
            mem = torch.cuda.get_device_properties(self.device).total_memory / (
                1024**2
            )
            if mem < 1000:
                self.model_size = "base"
            elif mem < 3000:
                self.model_size = "small"
            elif mem < 5000:
                self.model_size = "medium"
            else:
                self.model_size = "large-v3"
        # If the user has assigned a model size, use it; reject unknown sizes
        # instead of silently ignoring them.
        if model_size is not None:
            if model_size in ["base", "small", "medium", "large-v3"]:
                self.model_size = model_size
            else:
                raise ValueError(
                    "model_size must be one of 'base', 'small', 'medium', "
                    "or 'large-v3'"
                )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )

        try:
            from faster_whisper import WhisperModel
        except ImportError:
            raise ImportError(
                "faster_whisper package not found, please install it with "
                "`pip install faster-whisper`"
            )

        # Get the audio, either from in-memory bytes or from a file on disk.
        if isinstance(blob.data, bytes):
            # blob contains the audio
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            # Audio file from disk
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe. float16 is only supported on GPU; use int8 on CPU.
        model = WhisperModel(
            self.model_size,
            device=self.device,
            compute_type="float16" if self.device == "cuda" else "int8",
        )

        segments, info = model.transcribe(file_obj, beam_size=5)

        # Yield one Document per transcribed segment, with timing and
        # language-detection metadata.
        for segment in segments:
            yield Document(
                page_content=segment.text,
                metadata={
                    "source": blob.source,
                    "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end),
                    "language": info.language,
                    "probability": "%d%%" % round(info.language_probability * 100),
                    **blob.metadata,
                },
            )
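The docstring above covers the YouTube path; as a complement, here is a minimal sketch of running the parser over a local audio file via a Blob. The file path and device choice are assumptions for illustration; Blob.from_path and the "timestamps" metadata key come from the code above.

    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.audio import FasterWhisperParser

    # Hypothetical local file; Blob.from_path wraps a file on disk, and
    # lazy_parse yields one Document per transcribed segment.
    parser = FasterWhisperParser(device="cpu")
    for doc in parser.lazy_parse(Blob.from_path("speech.mp3")):
        print(doc.metadata["timestamps"], doc.page_content)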