fix(text-splitters): deprecate and use SSRF-safe transport in split_text_from_url (#36821)

2026-06-09 10:17:00 +00:00 · 2026-04-16 10:13:31 -04:00
parent b7447c6969
commit c289bf10e9
2 changed files with 20 additions and 10 deletions
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -15,8 +15,7 @@ from typing import (
    cast,
 )

-import requests
-from langchain_core._api import beta
+from langchain_core._api import beta, deprecated
 from langchain_core.documents import BaseDocumentTransformer, Document
 from typing_extensions import override

@@ -186,8 +185,19 @@ class HTMLHeaderTextSplitter:
        """
        return self.split_text_from_file(StringIO(text))

+    @deprecated(
+        since="1.1.2",
+        removal="2.0.0",
+        message=(
+            "Please fetch the HTML content from the URL yourself and pass it "
+            "to split_text."
+        ),
+    )
    def split_text_from_url(
-        self, url: str, timeout: int = 10, **kwargs: Any
+        self,
+        url: str,
+        timeout: int = 10,
+        **kwargs: Any,  # noqa: ARG002
    ) -> list[Document]:
        """Fetch text content from a URL and split it into documents.

@@ -205,14 +215,14 @@ class HTMLHeaderTextSplitter:
        Raises:
            requests.RequestException: If the HTTP request fails.
        """
-        from langchain_core._security._ssrf_protection import (  # noqa: PLC0415
-            validate_safe_url,
+        from langchain_core._security._transport import (  # noqa: PLC0415
+            ssrf_safe_client,
        )

-        validate_safe_url(url, allow_private=False, allow_http=True)
-        response = requests.get(url, timeout=timeout, **kwargs)
-        response.raise_for_status()
-        return self.split_text(response.text)
+        with ssrf_safe_client() as client:
+            response = client.get(url, timeout=timeout)
+            response.raise_for_status()
+            return self.split_text(response.text)

    def split_text_from_file(self, file: str | IO[str]) -> list[Document]:
        """Split HTML content from a file into a list of `Document` objects.