Merge pull request #18421

* Implement lazy_load() for AssemblyAIAudioTranscriptLoader
This commit is contained in:
Christophe Bornet
2024-03-06 19:16:05 +01:00
committed by GitHub
parent bb284eebe4
commit 15b1770326
4 changed files with 62 additions and 26 deletions

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
from enum import Enum
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Iterator, Optional
import requests
from langchain_core.documents import Document
@@ -75,7 +75,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
self.transcript_format = transcript_format
self.transcriber = assemblyai.Transcriber(config=config)
-def load(self) -> List[Document]:
+def lazy_load(self) -> Iterator[Document]:
"""Transcribes the audio file and loads the transcript into documents.
It uses the AssemblyAI API to transcribe the audio file and blocks until
@@ -88,27 +88,21 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
raise ValueError(f"Could not transcribe file: {transcript.error}")
if self.transcript_format == TranscriptFormat.TEXT:
-return [
-Document(
-page_content=transcript.text, metadata=transcript.json_response
-)
-]
+yield Document(
+page_content=transcript.text, metadata=transcript.json_response
+)
elif self.transcript_format == TranscriptFormat.SENTENCES:
sentences = transcript.get_sentences()
-return [
-Document(page_content=s.text, metadata=s.dict(exclude={"text"}))
-for s in sentences
-]
+for s in sentences:
+yield Document(page_content=s.text, metadata=s.dict(exclude={"text"}))
elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
paragraphs = transcript.get_paragraphs()
-return [
-Document(page_content=p.text, metadata=p.dict(exclude={"text"}))
-for p in paragraphs
-]
+for p in paragraphs:
+yield Document(page_content=p.text, metadata=p.dict(exclude={"text"}))
elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
-return [Document(page_content=transcript.export_subtitles_srt())]
+yield Document(page_content=transcript.export_subtitles_srt())
elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
-return [Document(page_content=transcript.export_subtitles_vtt())]
+yield Document(page_content=transcript.export_subtitles_vtt())
else:
raise ValueError("Unknown transcript format.")
@@ -140,7 +134,7 @@ class AssemblyAIAudioLoaderById(BaseLoader):
self.transcript_id = transcript_id
self.transcript_format = transcript_format
-def load(self) -> List[Document]:
+def lazy_load(self) -> Iterator[Document]:
"""Load data into Document objects."""
HEADERS = {"authorization": self.api_key}
@@ -157,9 +151,7 @@ class AssemblyAIAudioLoaderById(BaseLoader):
transcript = transcript_response.json()["text"]
-return [
-Document(page_content=transcript, metadata=transcript_response.json())
-]
+yield Document(page_content=transcript, metadata=transcript_response.json())
elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
try:
paragraphs_response = requests.get(
@@ -173,7 +165,8 @@ class AssemblyAIAudioLoaderById(BaseLoader):
paragraphs = paragraphs_response.json()["paragraphs"]
-return [Document(page_content=p["text"], metadata=p) for p in paragraphs]
+for p in paragraphs:
+yield Document(page_content=p["text"], metadata=p)
elif self.transcript_format == TranscriptFormat.SENTENCES:
try:
@@ -188,7 +181,8 @@ class AssemblyAIAudioLoaderById(BaseLoader):
sentences = sentences_response.json()["sentences"]
-return [Document(page_content=s["text"], metadata=s) for s in sentences]
+for s in sentences:
+yield Document(page_content=s["text"], metadata=s)
elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
try:
@@ -203,7 +197,7 @@ class AssemblyAIAudioLoaderById(BaseLoader):
srt = srt_response.text
-return [Document(page_content=srt)]
+yield Document(page_content=srt)
elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
try:
@@ -218,6 +212,6 @@ class AssemblyAIAudioLoaderById(BaseLoader):
vtt = vtt_response.text
-return [Document(page_content=vtt)]
+yield Document(page_content=vtt)
else:
raise ValueError("Unknown transcript format.")