mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 03:28:40 +00:00
Improve youtube loader (#3395)
Small improvements for the YouTube loader: a) use the YouTube API permission scope instead of the Google Drive scope; b) bugfix: allow transcript loading for single videos; c) add a "continue_on_failure" parameter for cases where videos in a playlist do not have transcription enabled; d) support automated translation for all languages, if available. --------- Co-authored-by: Johann-Peter Hartmann <johann-peter.hartmann@mayflower.de>
This commit is contained in:
parent
e5ffbee5eb
commit
199cb855ea
@ -1,6 +1,7 @@
|
||||
"""Loader that loads YouTube transcript."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@ -10,7 +11,9 @@ from pydantic.dataclasses import dataclass
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -98,12 +101,17 @@ class YoutubeLoader(BaseLoader):
|
||||
"""Loader that loads Youtube transcripts."""
|
||||
|
||||
def __init__(
|
||||
self, video_id: str, add_video_info: bool = False, language: str = "en"
|
||||
self,
|
||||
video_id: str,
|
||||
add_video_info: bool = False,
|
||||
language: str = "en",
|
||||
continue_on_failure: bool = False,
|
||||
):
|
||||
"""Initialize with YouTube video ID."""
|
||||
self.video_id = video_id
|
||||
self.add_video_info = add_video_info
|
||||
self.language = language
|
||||
self.continue_on_failure = continue_on_failure
|
||||
|
||||
@classmethod
|
||||
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
||||
@ -217,6 +225,7 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
||||
video_ids: Optional[List[str]] = None
|
||||
add_video_info: bool = True
|
||||
captions_language: str = "en"
|
||||
continue_on_failure: bool = False
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.youtube_client = self._build_youtube_client(self.google_api_client.creds)
|
||||
@ -249,12 +258,13 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
||||
def _get_transcripe_for_video_id(self, video_id: str) -> str:
|
||||
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
|
||||
|
||||
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids)
|
||||
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
||||
try:
|
||||
transcript = transcript_list.find_transcript([self.captions_language])
|
||||
except NoTranscriptFound:
|
||||
en_transcript = transcript_list.find_transcript(["en"])
|
||||
transcript = en_transcript.translate(self.captions_language)
|
||||
for available_transcript in transcript_list:
|
||||
transcript = available_transcript.translate(self.captions_language)
|
||||
continue
|
||||
|
||||
transcript_pieces = transcript.fetch()
|
||||
return " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||
@ -286,6 +296,19 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
||||
return channel_id
|
||||
|
||||
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
|
||||
try:
|
||||
from youtube_transcript_api import (
|
||||
NoTranscriptFound,
|
||||
TranscriptsDisabled,
|
||||
)
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You must run"
|
||||
"`pip install --upgrade "
|
||||
"youtube-transcript-api`"
|
||||
"to use the youtube loader"
|
||||
)
|
||||
|
||||
channel_id = self._get_channel_id(channel)
|
||||
request = self.youtube_client.search().list(
|
||||
part="id,snippet",
|
||||
@ -304,14 +327,25 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
||||
if self.add_video_info:
|
||||
item["snippet"].pop("thumbnails")
|
||||
meta_data.update(item["snippet"])
|
||||
video_ids.append(
|
||||
Document(
|
||||
page_content=self._get_transcripe_for_video_id(
|
||||
item["id"]["videoId"]
|
||||
),
|
||||
metadata=meta_data,
|
||||
try:
|
||||
page_content = self._get_transcripe_for_video_id(
|
||||
item["id"]["videoId"]
|
||||
)
|
||||
)
|
||||
video_ids.append(
|
||||
Document(
|
||||
page_content=page_content,
|
||||
metadata=meta_data,
|
||||
)
|
||||
)
|
||||
except (TranscriptsDisabled, NoTranscriptFound) as e:
|
||||
if self.continue_on_failure:
|
||||
logger.error(
|
||||
"Error fetching transscript "
|
||||
+ f" {item['id']['videoId']}, exception: {e}"
|
||||
)
|
||||
else:
|
||||
raise e
|
||||
pass
|
||||
request = self.youtube_client.search().list_next(request, response)
|
||||
|
||||
return video_ids
|
||||
|
Loading…
Reference in New Issue
Block a user