Harrison/subtitles (#1842)

Co-authored-by: David Ruan <ruanwz@gmail.com>
Co-authored-by: David Ruan <david.ruan@analyticservice.net>
This commit is contained in:
Harrison Chase 2023-03-20 22:53:52 -07:00 committed by GitHub
parent 3674074eb0
commit 6f02286805
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -114,7 +114,7 @@ class YoutubeLoader(BaseLoader):
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
try: try:
from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"Could not import youtube_transcript_api python package. " "Could not import youtube_transcript_api python package. "
@ -129,9 +129,15 @@ class YoutubeLoader(BaseLoader):
video_info = self._get_video_info() video_info = self._get_video_info()
metadata.update(video_info) metadata.update(video_info)
transcript_pieces = YouTubeTranscriptApi.get_transcript( transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
self.video_id, languages=[self.language] try:
) transcript = transcript_list.find_transcript([self.language])
except NoTranscriptFound:
en_transcript = transcript_list.find_transcript(["en"])
transcript = en_transcript.translate(self.language)
transcript_pieces = transcript.fetch()
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces]) transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
return [Document(page_content=transcript, metadata=metadata)] return [Document(page_content=transcript, metadata=metadata)]
@ -233,9 +239,16 @@ class GoogleApiYoutubeLoader(BaseLoader):
return values return values
def _get_transcripe_for_video_id(self, video_id: str) -> str:
    """Return the transcript text of a single video as one string.

    A transcript in ``self.captions_language`` is used when one is listed
    for the video. Otherwise the English transcript is looked up and
    translated into ``self.captions_language``.

    Args:
        video_id: ID of the video whose transcript should be fetched.

    Returns:
        The ``"text"`` field of every transcript piece, with surrounding
        spaces stripped, joined by single spaces.

    Raises:
        ImportError: if ``youtube_transcript_api`` is not installed.
        NoTranscriptFound: propagated from the English fallback lookup,
            which is not guarded by the ``try`` block.
    """
    # Imported lazily so the dependency is only needed when transcripts
    # are actually requested.
    from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

    # Look up transcripts for the video passed by the caller; the previous
    # revision passed ``self.video_ids`` here and left ``video_id`` unused.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    try:
        transcript = transcript_list.find_transcript([self.captions_language])
    except NoTranscriptFound:
        # No transcript in the requested language: translate the English one.
        en_transcript = transcript_list.find_transcript(["en"])
        transcript = en_transcript.translate(self.captions_language)

    transcript_pieces = transcript.fetch()
    return " ".join(t["text"].strip(" ") for t in transcript_pieces)
def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document: def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document: