Harrison/subtitles (#1842)

Co-authored-by: David Ruan <ruanwz@gmail.com>
Co-authored-by: David Ruan <david.ruan@analyticservice.net>
This commit is contained in:
Harrison Chase 2023-03-20 22:53:52 -07:00 committed by GitHub
parent 3674074eb0
commit 6f02286805
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -114,7 +114,7 @@ class YoutubeLoader(BaseLoader):
def load(self) -> List[Document]:
"""Load documents."""
try:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
except ImportError:
raise ImportError(
"Could not import youtube_transcript_api python package. "
@ -129,9 +129,15 @@ class YoutubeLoader(BaseLoader):
video_info = self._get_video_info()
metadata.update(video_info)
transcript_pieces = YouTubeTranscriptApi.get_transcript(
self.video_id, languages=[self.language]
)
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
try:
transcript = transcript_list.find_transcript([self.language])
except NoTranscriptFound:
en_transcript = transcript_list.find_transcript(["en"])
transcript = en_transcript.translate(self.language)
transcript_pieces = transcript.fetch()
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
return [Document(page_content=transcript, metadata=metadata)]
@ -233,9 +239,16 @@ class GoogleApiYoutubeLoader(BaseLoader):
return values
def _get_transcripe_for_video_id(self, video_id: str) -> str:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
transcript_pieces = YouTubeTranscriptApi.get_transcript(video_id)
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids)
try:
transcript = transcript_list.find_transcript([self.captions_language])
except NoTranscriptFound:
en_transcript = transcript_list.find_transcript(["en"])
transcript = en_transcript.translate(self.captions_language)
transcript_pieces = transcript.fetch()
return " ".join([t["text"].strip(" ") for t in transcript_pieces])
def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document: