mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-03 13:43:24 +00:00
Improve youtube loader (#3395)
Small improvements for the YouTube loader: a) use the YouTube API permission scope instead of the Google Drive scope; b) bugfix: allow transcript loading for single videos; c) add a "continue_on_failure" parameter for cases where videos in a playlist do not have transcription enabled; d) support automated translation for all languages, if available. --------- Co-authored-by: Johann-Peter Hartmann <johann-peter.hartmann@mayflower.de>
This commit is contained in:
parent
e5ffbee5eb
commit
199cb855ea
@ -1,6 +1,7 @@
|
|||||||
"""Loader that loads YouTube transcript."""
|
"""Loader that loads YouTube transcript."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
@ -10,7 +11,9 @@ from pydantic.dataclasses import dataclass
|
|||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -98,12 +101,17 @@ class YoutubeLoader(BaseLoader):
|
|||||||
"""Loader that loads Youtube transcripts."""
|
"""Loader that loads Youtube transcripts."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, video_id: str, add_video_info: bool = False, language: str = "en"
|
self,
|
||||||
|
video_id: str,
|
||||||
|
add_video_info: bool = False,
|
||||||
|
language: str = "en",
|
||||||
|
continue_on_failure: bool = False,
|
||||||
):
|
):
|
||||||
"""Initialize with YouTube video ID."""
|
"""Initialize with YouTube video ID."""
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
self.add_video_info = add_video_info
|
self.add_video_info = add_video_info
|
||||||
self.language = language
|
self.language = language
|
||||||
|
self.continue_on_failure = continue_on_failure
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
||||||
@ -217,6 +225,7 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
video_ids: Optional[List[str]] = None
|
video_ids: Optional[List[str]] = None
|
||||||
add_video_info: bool = True
|
add_video_info: bool = True
|
||||||
captions_language: str = "en"
|
captions_language: str = "en"
|
||||||
|
continue_on_failure: bool = False
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
self.youtube_client = self._build_youtube_client(self.google_api_client.creds)
|
self.youtube_client = self._build_youtube_client(self.google_api_client.creds)
|
||||||
@ -249,12 +258,13 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
def _get_transcripe_for_video_id(self, video_id: str) -> str:
|
def _get_transcripe_for_video_id(self, video_id: str) -> str:
|
||||||
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
|
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
|
||||||
|
|
||||||
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids)
|
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
||||||
try:
|
try:
|
||||||
transcript = transcript_list.find_transcript([self.captions_language])
|
transcript = transcript_list.find_transcript([self.captions_language])
|
||||||
except NoTranscriptFound:
|
except NoTranscriptFound:
|
||||||
en_transcript = transcript_list.find_transcript(["en"])
|
for available_transcript in transcript_list:
|
||||||
transcript = en_transcript.translate(self.captions_language)
|
transcript = available_transcript.translate(self.captions_language)
|
||||||
|
continue
|
||||||
|
|
||||||
transcript_pieces = transcript.fetch()
|
transcript_pieces = transcript.fetch()
|
||||||
return " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
return " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||||
@ -286,6 +296,19 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
return channel_id
|
return channel_id
|
||||||
|
|
||||||
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
|
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
|
||||||
|
try:
|
||||||
|
from youtube_transcript_api import (
|
||||||
|
NoTranscriptFound,
|
||||||
|
TranscriptsDisabled,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"You must run"
|
||||||
|
"`pip install --upgrade "
|
||||||
|
"youtube-transcript-api`"
|
||||||
|
"to use the youtube loader"
|
||||||
|
)
|
||||||
|
|
||||||
channel_id = self._get_channel_id(channel)
|
channel_id = self._get_channel_id(channel)
|
||||||
request = self.youtube_client.search().list(
|
request = self.youtube_client.search().list(
|
||||||
part="id,snippet",
|
part="id,snippet",
|
||||||
@ -304,14 +327,25 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
if self.add_video_info:
|
if self.add_video_info:
|
||||||
item["snippet"].pop("thumbnails")
|
item["snippet"].pop("thumbnails")
|
||||||
meta_data.update(item["snippet"])
|
meta_data.update(item["snippet"])
|
||||||
video_ids.append(
|
try:
|
||||||
Document(
|
page_content = self._get_transcripe_for_video_id(
|
||||||
page_content=self._get_transcripe_for_video_id(
|
item["id"]["videoId"]
|
||||||
item["id"]["videoId"]
|
|
||||||
),
|
|
||||||
metadata=meta_data,
|
|
||||||
)
|
)
|
||||||
)
|
video_ids.append(
|
||||||
|
Document(
|
||||||
|
page_content=page_content,
|
||||||
|
metadata=meta_data,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except (TranscriptsDisabled, NoTranscriptFound) as e:
|
||||||
|
if self.continue_on_failure:
|
||||||
|
logger.error(
|
||||||
|
"Error fetching transscript "
|
||||||
|
+ f" {item['id']['videoId']}, exception: {e}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
pass
|
||||||
request = self.youtube_client.search().list_next(request, response)
|
request = self.youtube_client.search().list_next(request, response)
|
||||||
|
|
||||||
return video_ids
|
return video_ids
|
||||||
|
Loading…
Reference in New Issue
Block a user