community: Fix attribute access for transcript text in YoutubeLoader (Fixes #30309) (#30582)

**Description:** 
Fixes a bug in the YoutubeLoader where FetchedTranscript objects were
not properly processed. The loader was only extracting the 'text'
attribute from FetchedTranscriptSnippet objects while ignoring the
'start' and 'duration' attributes. This would cause a KeyError when the
code later tried to access these missing keys, particularly when using
the CHUNKS format or any code path that needed timestamp information.

This PR modifies the conversion of FetchedTranscriptSnippet objects to
dictionaries so that each one carries the 'text', 'start', and
'duration' values, ensuring that the loader works correctly with all
transcript formats.

**Issue:** Fixes #30309

**Dependencies:** None

**Testing:**
- Tested the fix with multiple YouTube videos to confirm it resolves the
issue
- Verified that both regular loading and CHUNKS format work correctly
This commit is contained in:
Armaanjeet Singh Sandhu 2025-04-01 16:43:06 +05:30 committed by GitHub
parent ecff055096
commit 4bbc249b13
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 5 deletions

View File

@ -272,7 +272,14 @@ class YoutubeLoader(BaseLoader):
transcript = transcript.translate(self.translation)
transcript_object = transcript.fetch()
if isinstance(transcript_object, FetchedTranscript):
transcript_pieces = [{"text": x.text} for x in transcript_object.snippets]
transcript_pieces = [
{
"text": snippet.text,
"start": snippet.start,
"duration": snippet.duration,
}
for snippet in transcript_object.snippets
]
else:
transcript_pieces: List[Dict[str, Any]] = transcript_object # type: ignore[no-redef]

View File

@ -1492,7 +1492,7 @@ wheels = [
[[package]]
name = "langchain"
version = "0.3.21"
version = "0.3.22"
source = { editable = "../langchain" }
dependencies = [
{ name = "async-timeout", marker = "python_full_version < '3.11'" },
@ -1747,7 +1747,7 @@ typing = [
[[package]]
name = "langchain-core"
version = "0.3.47"
version = "0.3.49"
source = { editable = "../core" }
dependencies = [
{ name = "jsonpatch" },
@ -1777,7 +1777,7 @@ dev = [
{ name = "jupyter", specifier = ">=1.0.0,<2.0.0" },
{ name = "setuptools", specifier = ">=67.6.1,<68.0.0" },
]
lint = [{ name = "ruff", specifier = ">=0.9.2,<1.0.0" }]
lint = [{ name = "ruff", specifier = ">=0.11.2,<0.12.0" }]
test = [
{ name = "blockbuster", specifier = "~=1.5.18" },
{ name = "freezegun", specifier = ">=1.2.2,<2.0.0" },
@ -1805,7 +1805,7 @@ typing = [
[[package]]
name = "langchain-tests"
version = "0.3.15"
version = "0.3.17"
source = { editable = "../standard-tests" }
dependencies = [
{ name = "httpx" },