mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
community[minor]: [GoogleApiYoutubeLoader] Replace API used in _get_document_for_channel from search to playlistItem (#24034)
- **Description:** The `search` endpoint has a limit of 500 results, whereas `playlistItems` does not, so `_get_document_for_channel` now lists the channel's uploads playlist instead. Also added `ParseError` to the `except` clause to catch another common error.
- **Issue:** None
- **Dependencies:** None
- **Twitter handle:** @TupleType

---------

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
6a45bf9554
commit
372c27f2e5
@ -7,6 +7,7 @@ from enum import Enum
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
|
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
from xml.etree.ElementTree import ParseError # OK: trusted-source
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.pydantic_v1 import root_validator
|
from langchain_core.pydantic_v1 import root_validator
|
||||||
@ -28,6 +29,8 @@ class GoogleApiClient:
|
|||||||
As the google api expects credentials you need to set up a google account and
|
As the google api expects credentials you need to set up a google account and
|
||||||
register your Service. "https://developers.google.com/docs/api/quickstart/python"
|
register your Service. "https://developers.google.com/docs/api/quickstart/python"
|
||||||
|
|
||||||
|
*Security Note*: Note that parsing of the transcripts relies on the standard
|
||||||
|
xml library but the input is viewed as trusted in this case.
|
||||||
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
@ -437,6 +440,14 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
channel_id = response["items"][0]["id"]["channelId"]
|
channel_id = response["items"][0]["id"]["channelId"]
|
||||||
return channel_id
|
return channel_id
|
||||||
|
|
||||||
|
def _get_uploads_playlist_id(self, channel_id: str) -> str:
    """Return the ID of the "uploads" playlist of a channel.

    Issues a ``channels().list`` request for the ``contentDetails`` part of
    ``channel_id`` through ``self.youtube_client`` and reads the playlist ID
    from the first returned item.

    Args:
        channel_id: ID of the channel to look up.

    Returns:
        The value stored under
        ``contentDetails -> relatedPlaylists -> uploads`` of the first item.

    Raises:
        KeyError: If the response lacks one of the expected keys.
        IndexError: If the response's ``items`` list is empty.
    """
    request = self.youtube_client.channels().list(
        part="contentDetails",
        id=channel_id,
    )
    response = request.execute()
    # Only the first item is consulted, mirroring how the channel ID itself
    # is read from the first item of a response elsewhere in this loader.
    return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
|
||||||
|
|
||||||
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
|
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
|
||||||
try:
|
try:
|
||||||
from youtube_transcript_api import (
|
from youtube_transcript_api import (
|
||||||
@ -452,10 +463,11 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
|
|
||||||
channel_id = self._get_channel_id(channel)
|
channel_id = self._get_channel_id(channel)
|
||||||
request = self.youtube_client.search().list(
|
uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
|
||||||
|
request = self.youtube_client.playlistItems().list(
|
||||||
part="id,snippet",
|
part="id,snippet",
|
||||||
channelId=channel_id,
|
playlistId=uploads_playlist_id,
|
||||||
maxResults=50, # adjust this value to retrieve more or fewer videos
|
maxResults=50,
|
||||||
)
|
)
|
||||||
video_ids = []
|
video_ids = []
|
||||||
while request is not None:
|
while request is not None:
|
||||||
@ -463,23 +475,20 @@ class GoogleApiYoutubeLoader(BaseLoader):
|
|||||||
|
|
||||||
# Add each video ID to the list
|
# Add each video ID to the list
|
||||||
for item in response["items"]:
|
for item in response["items"]:
|
||||||
if not item["id"].get("videoId"):
|
video_id = item["snippet"]["resourceId"]["videoId"]
|
||||||
continue
|
meta_data = {"videoId": video_id}
|
||||||
meta_data = {"videoId": item["id"]["videoId"]}
|
|
||||||
if self.add_video_info:
|
if self.add_video_info:
|
||||||
item["snippet"].pop("thumbnails")
|
item["snippet"].pop("thumbnails")
|
||||||
meta_data.update(item["snippet"])
|
meta_data.update(item["snippet"])
|
||||||
try:
|
try:
|
||||||
page_content = self._get_transcripe_for_video_id(
|
page_content = self._get_transcripe_for_video_id(video_id)
|
||||||
item["id"]["videoId"]
|
|
||||||
)
|
|
||||||
video_ids.append(
|
video_ids.append(
|
||||||
Document(
|
Document(
|
||||||
page_content=page_content,
|
page_content=page_content,
|
||||||
metadata=meta_data,
|
metadata=meta_data,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
except (TranscriptsDisabled, NoTranscriptFound) as e:
|
except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
|
||||||
if self.continue_on_failure:
|
if self.continue_on_failure:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Error fetching transscript "
|
"Error fetching transscript "
|
||||||
|
@ -29,7 +29,7 @@ fi
|
|||||||
# is very nuanced and depends on the user's environment.
|
# is very nuanced and depends on the user's environment.
|
||||||
# https://docs.python.org/3/library/xml.etree.elementtree.html
|
# https://docs.python.org/3/library/xml.etree.elementtree.html
|
||||||
|
|
||||||
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true)
|
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true)
|
||||||
|
|
||||||
if [ -n "$result" ]; then
|
if [ -n "$result" ]; then
|
||||||
echo "ERROR: The following lines need to be updated:"
|
echo "ERROR: The following lines need to be updated:"
|
||||||
|
Loading…
Reference in New Issue
Block a user