community[minor]: [GoogleApiYoutubeLoader] Replace API used in _get_document_for_channel from search to playlistItem (#24034)

- **Description:** The `search` endpoint returns at most 500 results, whereas
`playlistItems` has no such limit. Also added `ParseError` to the `except`
clause to catch another common error when fetching transcripts.
- **Issue:** None
- **Dependencies:** None
- **Twitter handle:** @TupleType

---------

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Asi Greenholts 2024-07-19 21:04:34 +03:00 committed by GitHub
parent 6a45bf9554
commit 372c27f2e5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 11 deletions

View File

@ -7,6 +7,7 @@ from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Sequence, Union from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from xml.etree.ElementTree import ParseError # OK: trusted-source
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator from langchain_core.pydantic_v1 import root_validator
@ -28,6 +29,8 @@ class GoogleApiClient:
As the google api expects credentials you need to set up a google account and As the google api expects credentials you need to set up a google account and
register your Service. "https://developers.google.com/docs/api/quickstart/python" register your Service. "https://developers.google.com/docs/api/quickstart/python"
*Security Note*: Note that parsing of the transcripts relies on the standard
xml library but the input is viewed as trusted in this case.
Example: Example:
@ -437,6 +440,14 @@ class GoogleApiYoutubeLoader(BaseLoader):
channel_id = response["items"][0]["id"]["channelId"] channel_id = response["items"][0]["id"]["channelId"]
return channel_id return channel_id
def _get_uploads_playlist_id(self, channel_id: str) -> str:
request = self.youtube_client.channels().list(
part="contentDetails",
id=channel_id,
)
response = request.execute()
return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]: def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
try: try:
from youtube_transcript_api import ( from youtube_transcript_api import (
@ -452,10 +463,11 @@ class GoogleApiYoutubeLoader(BaseLoader):
) )
channel_id = self._get_channel_id(channel) channel_id = self._get_channel_id(channel)
request = self.youtube_client.search().list( uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
request = self.youtube_client.playlistItems().list(
part="id,snippet", part="id,snippet",
channelId=channel_id, playlistId=uploads_playlist_id,
maxResults=50, # adjust this value to retrieve more or fewer videos maxResults=50,
) )
video_ids = [] video_ids = []
while request is not None: while request is not None:
@ -463,23 +475,20 @@ class GoogleApiYoutubeLoader(BaseLoader):
# Add each video ID to the list # Add each video ID to the list
for item in response["items"]: for item in response["items"]:
if not item["id"].get("videoId"): video_id = item["snippet"]["resourceId"]["videoId"]
continue meta_data = {"videoId": video_id}
meta_data = {"videoId": item["id"]["videoId"]}
if self.add_video_info: if self.add_video_info:
item["snippet"].pop("thumbnails") item["snippet"].pop("thumbnails")
meta_data.update(item["snippet"]) meta_data.update(item["snippet"])
try: try:
page_content = self._get_transcripe_for_video_id( page_content = self._get_transcripe_for_video_id(video_id)
item["id"]["videoId"]
)
video_ids.append( video_ids.append(
Document( Document(
page_content=page_content, page_content=page_content,
metadata=meta_data, metadata=meta_data,
) )
) )
except (TranscriptsDisabled, NoTranscriptFound) as e: except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
if self.continue_on_failure: if self.continue_on_failure:
logger.error( logger.error(
"Error fetching transscript " "Error fetching transscript "

View File

@ -29,7 +29,7 @@ fi
# is very nuanced and depends on the user's environment. # is very nuanced and depends on the user's environment.
# https://docs.python.org/3/library/xml.etree.elementtree.html # https://docs.python.org/3/library/xml.etree.elementtree.html
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true) result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true)
if [ -n "$result" ]; then if [ -n "$result" ]; then
echo "ERROR: The following lines need to be updated:" echo "ERROR: The following lines need to be updated:"