From 6f02286805338165a4c86c86a47943574c46a54f Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Mon, 20 Mar 2023 22:53:52 -0700
Subject: [PATCH] Harrison/subtitles (#1842)

Co-authored-by: David Ruan
Co-authored-by: David Ruan
---
 langchain/document_loaders/youtube.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py
index 7ba5cb502ae..03c74022a24 100644
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@@ -114,7 +114,7 @@ class YoutubeLoader(BaseLoader):
     def load(self) -> List[Document]:
         """Load documents."""
         try:
-            from youtube_transcript_api import YouTubeTranscriptApi
+            from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
         except ImportError:
             raise ImportError(
                 "Could not import youtube_transcript_api python package. "
@@ -129,9 +129,15 @@ class YoutubeLoader(BaseLoader):
             video_info = self._get_video_info()
             metadata.update(video_info)
 
-        transcript_pieces = YouTubeTranscriptApi.get_transcript(
-            self.video_id, languages=[self.language]
-        )
+        transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
+        try:
+            transcript = transcript_list.find_transcript([self.language])
+        except NoTranscriptFound:
+            en_transcript = transcript_list.find_transcript(["en"])
+            transcript = en_transcript.translate(self.language)
+
+        transcript_pieces = transcript.fetch()
+
         transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
 
         return [Document(page_content=transcript, metadata=metadata)]
@@ -233,9 +239,17 @@ class GoogleApiYoutubeLoader(BaseLoader):
         return values
 
     def _get_transcripe_for_video_id(self, video_id: str) -> str:
-        from youtube_transcript_api import YouTubeTranscriptApi
+        from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
 
-        transcript_pieces = YouTubeTranscriptApi.get_transcript(video_id)
+        # Look up transcripts for the video_id argument passed to this call.
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+        try:
+            transcript = transcript_list.find_transcript([self.captions_language])
+        except NoTranscriptFound:
+            en_transcript = transcript_list.find_transcript(["en"])
+            transcript = en_transcript.translate(self.captions_language)
+
+        transcript_pieces = transcript.fetch()
         return " ".join([t["text"].strip(" ") for t in transcript_pieces])
 
     def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document: