YouTube Loader: Replace regexp with built-in parsing (#4729)

2025-08-06 11:37:12 +00:00 · 2023-05-15 11:34:41 -04:00 · 2023-05-15 11:34:41 -04:00 · d3300bd799
commit d3300bd799
parent c70ae562b4
2 changed files with 52 additions and 34 deletions
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-import re
+from urllib.parse import parse_qs, urlparse
 from pydantic import root_validator
 from pydantic.dataclasses import dataclass
@@ -97,33 +97,46 @@ class GoogleApiClient:
        return creds
-YT_URL_RE = re.compile(
+
-    r"""(?x)^
+ALLOWED_SCHEMAS = {"http", "https"}
-     (
+ALLOWED_NETLOCK = {
-         (?:https?://|//)                                    # http(s):// or protocol-independent URL
+    "youtu.be",
-         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
+    "m.youtube.com",
-            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
+    "youtube.com",
-         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
+    "www.youtube.com",
-         (?:                                                  # the various things that can precede the ID:
+    "www.youtube-nocookie.com",
-             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
+    "vid.plus",
-             |shorts/
+}
-             |(?:                                             # or the v= param in all its forms
+
-                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+
-                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
+def _parse_video_id(url: str) -> Optional[str]:
-                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
+    """Parse a youtube url and return the video id if valid, otherwise None."""
-                 v=
+    parsed_url = urlparse(url)
-             )
+
-         ))
+    if parsed_url.scheme not in ALLOWED_SCHEMAS:
-         |(?:
+        return None
-            youtu\.be|                                        # just youtu.be/xxxx
+
-            vid\.plus|                                        # or vid.plus/xxxx
+    if parsed_url.netloc not in ALLOWED_NETLOCK:
-         )/
+        return None
-         )
+
-     )?                                                       # all until now is optional -> you can pass the naked ID
+    path = parsed_url.path
-     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
+
-     (?(1).+)?                                                # if we found the ID, everything can follow
+    if path.endswith("/watch"):
-     $"""
+        query = parsed_url.query
-)
+        parsed_query = parse_qs(query)
+        if "v" in parsed_query:
+            ids = parsed_query["v"]
+            video_id = ids if isinstance(ids, str) else ids[0]
+        else:
+            return None
+    else:
+        path = parsed_url.path.lstrip("/")
+        video_id = path.split("/")[-1]
+    if len(video_id) != 11:  # Video IDs are 11 characters long
+        return None
+    return video_id
 class YoutubeLoader(BaseLoader):
@@ -145,10 +158,12 @@ class YoutubeLoader(BaseLoader):
    @staticmethod
    def extract_video_id(youtube_url: str) -> str:
        """Extract video id from common YT urls."""
-        match = YT_URL_RE.match(youtube_url)
+        video_id = _parse_video_id(youtube_url)
-        if not match:
+        if not video_id:
-            raise ValueError(f"Could not determine the video ID for the URL {youtube_url}")
+            raise ValueError(
-        return match.group("id")
+                f"Could not determine the video ID for the URL {youtube_url}"
+            )
+        return video_id
    @classmethod
    def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
--- a/tests/unit_tests/document_loader/test_youtube.py
+++ b/tests/unit_tests/document_loader/test_youtube.py
@@ -1,6 +1,7 @@
 from langchain.document_loaders import YoutubeLoader
 import pytest
 from langchain.document_loaders import YoutubeLoader
@pytest.mark.parametrize(
    "youtube_url, expected_video_id",
@@ -18,7 +19,9 @@ import pytest
        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        ("https://www.youtube.com/shorts/cd0Fy92_w_s", "cd0Fy92_w_s"),
    ],
 )
-def test_video_id_extraction(youtube_url: str, expected_video_id: str):
+def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None:
    """Test that the video id is extracted from a youtube url"""
    assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id