community[patch]: Load YouTube transcripts (captions) as fixed-duration chunks with start times (#21710)
- **Description:** Add a new format, `CHUNKS`, to `langchain_community.document_loaders.youtube.YoutubeLoader`, which creates multiple `Document` objects from a YouTube video transcript (captions), each covering a fixed duration. The metadata of each chunk `Document` includes its start time and a URL to that time in the video on the YouTube website. I had implemented this for UMich (@umich-its-ai) in a local module, but it makes sense to contribute it to the LangChain community for all to benefit and to simplify maintenance.
- **Issue:** N/A
- **Dependencies:** N/A
- **Twitter:** lsloan_umich
- **Mastodon:** [lsloan@mastodon.social](https://mastodon.social/@lsloan)

With regard to **tests and documentation**: most existing features of the `YoutubeLoader` class are not tested; only the `YoutubeLoader.extract_video_id()` static method had a test. While this PR was awaiting review, I added a test for the chunking feature proposed here. I have also added an example of chunking to the `docs/docs/integrations/document_loaders/youtube_transcript.ipynb` notebook.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Commit 84dc2dd059 (parent 71811e0547), committed by GitHub.
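As a quick orientation for reviewers, here is a minimal usage sketch of the new `CHUNKS` format. It is not part of the change itself: the video URL is an arbitrary placeholder and the 30-second chunk size is chosen only for illustration (the default is 120 seconds).

```python
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat

# Requires `pip install youtube-transcript-api`.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # placeholder video URL
    transcript_format=TranscriptFormat.CHUNKS,
    chunk_size_seconds=30,  # default is 120
)

for doc in loader.load():
    # Each chunk Document's metadata includes its start time ("start_seconds",
    # "start_timestamp") and a "source" URL pointing at that time in the video.
    print(doc.metadata["start_timestamp"], doc.metadata["source"])
    print(doc.page_content[:80])
```

The diff below is against `langchain_community/document_loaders/youtube.py`.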
```diff
@@ -4,7 +4,7 @@ from __future__ import annotations
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, Dict, Generator, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
 
 from langchain_core.documents import Document
@@ -99,8 +99,8 @@ class GoogleApiClient:
         return creds
 
 
-ALLOWED_SCHEMAS = {"http", "https"}
-ALLOWED_NETLOCK = {
+ALLOWED_SCHEMES = {"http", "https"}
+ALLOWED_NETLOCS = {
     "youtu.be",
     "m.youtube.com",
     "youtube.com",
@@ -111,13 +111,13 @@ ALLOWED_NETLOCK = {
 
 
 def _parse_video_id(url: str) -> Optional[str]:
-    """Parse a youtube url and return the video id if valid, otherwise None."""
+    """Parse a YouTube URL and return the video ID if valid, otherwise None."""
     parsed_url = urlparse(url)
 
-    if parsed_url.scheme not in ALLOWED_SCHEMAS:
+    if parsed_url.scheme not in ALLOWED_SCHEMES:
         return None
 
-    if parsed_url.netloc not in ALLOWED_NETLOCK:
+    if parsed_url.netloc not in ALLOWED_NETLOCS:
         return None
 
     path = parsed_url.path
@@ -141,14 +141,15 @@ def _parse_video_id(url: str) -> Optional[str]:
 
 
 class TranscriptFormat(Enum):
-    """Transcript format."""
+    """Output formats of transcripts from `YoutubeLoader`."""
 
     TEXT = "text"
     LINES = "lines"
+    CHUNKS = "chunks"
 
 
 class YoutubeLoader(BaseLoader):
-    """Load `YouTube` transcripts."""
+    """Load `YouTube` video transcripts."""
 
     def __init__(
         self,
@@ -158,9 +159,11 @@ class YoutubeLoader(BaseLoader):
         translation: Optional[str] = None,
         transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
         continue_on_failure: bool = False,
+        chunk_size_seconds: int = 120,
     ):
         """Initialize with YouTube video ID."""
         self.video_id = video_id
+        self._metadata = {"source": video_id}
         self.add_video_info = add_video_info
         self.language = language
         if isinstance(language, str):
@@ -170,25 +173,69 @@ class YoutubeLoader(BaseLoader):
         self.translation = translation
         self.transcript_format = transcript_format
         self.continue_on_failure = continue_on_failure
+        self.chunk_size_seconds = chunk_size_seconds
 
     @staticmethod
     def extract_video_id(youtube_url: str) -> str:
-        """Extract video id from common YT urls."""
+        """Extract video ID from common YouTube URLs."""
         video_id = _parse_video_id(youtube_url)
         if not video_id:
             raise ValueError(
-                f"Could not determine the video ID for the URL {youtube_url}"
+                f'Could not determine the video ID for the URL "{youtube_url}".'
             )
         return video_id
 
     @classmethod
     def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
-        """Given youtube URL, load video."""
+        """Given a YouTube URL, construct a loader.
+        See `YoutubeLoader()` constructor for a list of keyword arguments.
+        """
         video_id = cls.extract_video_id(youtube_url)
         return cls(video_id, **kwargs)
 
+    def _make_chunk_document(
+        self, chunk_pieces: List[Dict], chunk_start_seconds: int
+    ) -> Document:
+        """Create Document from chunk of transcript pieces."""
+        m, s = divmod(chunk_start_seconds, 60)
+        h, m = divmod(m, 60)
+        return Document(
+            page_content=" ".join(
+                map(lambda chunk_piece: chunk_piece["text"].strip(" "), chunk_pieces)
+            ),
+            metadata={
+                **self._metadata,
+                "start_seconds": chunk_start_seconds,
+                "start_timestamp": f"{h:02d}:{m:02d}:{s:02d}",
+                "source":
+                # replace video ID with URL to start time
+                f"https://www.youtube.com/watch?v={self.video_id}"
+                f"&t={chunk_start_seconds}s",
+            },
+        )
+
+    def _get_transcript_chunks(
+        self, transcript_pieces: List[Dict]
+    ) -> Generator[Document, None, None]:
+        chunk_pieces: List[Dict[str, Any]] = []
+        chunk_start_seconds = 0
+        chunk_time_limit = self.chunk_size_seconds
+        for transcript_piece in transcript_pieces:
+            piece_end = transcript_piece["start"] + transcript_piece["duration"]
+            if piece_end > chunk_time_limit:
+                if chunk_pieces:
+                    yield self._make_chunk_document(chunk_pieces, chunk_start_seconds)
+                chunk_pieces = []
+                chunk_start_seconds = chunk_time_limit
+                chunk_time_limit += self.chunk_size_seconds
+
+            chunk_pieces.append(transcript_piece)
+
+        if len(chunk_pieces) > 0:
+            yield self._make_chunk_document(chunk_pieces, chunk_start_seconds)
+
     def load(self) -> List[Document]:
-        """Load documents."""
+        """Load YouTube transcripts into `Document` objects."""
         try:
             from youtube_transcript_api import (
                 NoTranscriptFound,
@@ -197,17 +244,15 @@ class YoutubeLoader(BaseLoader):
             )
         except ImportError:
             raise ImportError(
-                "Could not import youtube_transcript_api python package. "
+                'Could not import "youtube_transcript_api" Python package. '
                 "Please install it with `pip install youtube-transcript-api`."
             )
 
-        metadata = {"source": self.video_id}
-
         if self.add_video_info:
             # Get more video meta info
             # Such as title, description, thumbnail url, publish_date
             video_info = self._get_video_info()
-            metadata.update(video_info)
+            self._metadata.update(video_info)
 
         try:
             transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
@@ -222,31 +267,45 @@ class YoutubeLoader(BaseLoader):
         if self.translation is not None:
             transcript = transcript.translate(self.translation)
 
-        transcript_pieces = transcript.fetch()
+        transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
 
         if self.transcript_format == TranscriptFormat.TEXT:
-            transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
-            return [Document(page_content=transcript, metadata=metadata)]
-        elif self.transcript_format == TranscriptFormat.LINES:
-            return [
-                Document(
-                    page_content=t["text"].strip(" "),
-                    metadata=dict((key, t[key]) for key in t if key != "text"),
+            transcript = " ".join(
+                map(
+                    lambda transcript_piece: transcript_piece["text"].strip(" "),
+                    transcript_pieces,
                 )
-                for t in transcript_pieces
-            ]
+            )
+            return [Document(page_content=transcript, metadata=self._metadata)]
+        elif self.transcript_format == TranscriptFormat.LINES:
+            return list(
+                map(
+                    lambda transcript_piece: Document(
+                        page_content=transcript_piece["text"].strip(" "),
+                        metadata=dict(
+                            filter(
+                                lambda item: item[0] != "text", transcript_piece.items()
+                            )
+                        ),
+                    ),
+                    transcript_pieces,
+                )
+            )
+        elif self.transcript_format == TranscriptFormat.CHUNKS:
+            return list(self._get_transcript_chunks(transcript_pieces))
+
         else:
             raise ValueError("Unknown transcript format.")
 
-    def _get_video_info(self) -> dict:
+    def _get_video_info(self) -> Dict:
         """Get important video information.
 
-        Components are:
+        Components include:
             - title
             - description
-            - thumbnail url,
+            - thumbnail URL,
            - publish_date
-            - channel_author
+            - channel author
             - and more.
         """
         try:
@@ -254,7 +313,7 @@ class YoutubeLoader(BaseLoader):
 
         except ImportError:
             raise ImportError(
-                "Could not import pytube python package. "
+                'Could not import "pytube" Python package. '
                 "Please install it with `pip install pytube`."
             )
         yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
```
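For review context, the grouping rule implemented by `_get_transcript_chunks` can be read in isolation as follows. This is a simplified re-expression, not code from the PR: the function name `chunk_by_duration`, the plain-dict output, and the sample captions are invented for illustration, whereas the loader itself yields `Document` objects with the metadata shown in the diff above.

```python
from typing import Any, Dict, Generator, List


def chunk_by_duration(
    pieces: List[Dict[str, Any]], chunk_size_seconds: int
) -> Generator[Dict[str, Any], None, None]:
    """Mirror the grouping logic of YoutubeLoader._get_transcript_chunks,
    yielding plain dicts instead of Documents."""
    chunk: List[Dict[str, Any]] = []
    chunk_start = 0
    limit = chunk_size_seconds
    for piece in pieces:
        # A piece whose end crosses the current boundary closes the chunk;
        # the piece itself is carried over into the next chunk.
        if piece["start"] + piece["duration"] > limit:
            if chunk:
                yield {"start_seconds": chunk_start,
                       "text": " ".join(p["text"].strip(" ") for p in chunk)}
            chunk = []
            chunk_start = limit
            limit += chunk_size_seconds
        chunk.append(piece)
    if chunk:
        yield {"start_seconds": chunk_start,
               "text": " ".join(p["text"].strip(" ") for p in chunk)}


# Four 50-second captions grouped into 120-second chunks:
pieces = [
    {"start": s, "duration": 50, "text": f"caption {i}"}
    for i, s in enumerate((0, 50, 100, 150))
]
print(list(chunk_by_duration(pieces, 120)))
# -> one chunk starting at 0 s ("caption 0 caption 1")
#    and one starting at 120 s ("caption 2 caption 3")
```

Note the boundary behavior: a caption that straddles a chunk boundary is assigned to the following chunk, so captions are never split across chunks.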