fix(text-splitters): deprecate and use SSRF-safe transport in split_text_from_url (#36821)

This commit is contained in:
ccurme
2026-04-16 10:13:31 -04:00
committed by GitHub
parent b7447c6969
commit c289bf10e9
2 changed files with 20 additions and 10 deletions

View File

@@ -15,8 +15,7 @@ from typing import (
cast,
)
import requests
from langchain_core._api import beta
from langchain_core._api import beta, deprecated
from langchain_core.documents import BaseDocumentTransformer, Document
from typing_extensions import override
@@ -186,8 +185,19 @@ class HTMLHeaderTextSplitter:
"""
return self.split_text_from_file(StringIO(text))
@deprecated(
since="1.1.2",
removal="2.0.0",
message=(
"Please fetch the HTML content from the URL yourself and pass it "
"to split_text."
),
)
def split_text_from_url(
self, url: str, timeout: int = 10, **kwargs: Any
self,
url: str,
timeout: int = 10,
**kwargs: Any, # noqa: ARG002
) -> list[Document]:
"""Fetch text content from a URL and split it into documents.
@@ -205,14 +215,14 @@ class HTMLHeaderTextSplitter:
Raises:
requests.RequestException: If the HTTP request fails.
"""
from langchain_core._security._ssrf_protection import ( # noqa: PLC0415
validate_safe_url,
from langchain_core._security._transport import ( # noqa: PLC0415
ssrf_safe_client,
)
validate_safe_url(url, allow_private=False, allow_http=True)
response = requests.get(url, timeout=timeout, **kwargs)
response.raise_for_status()
return self.split_text(response.text)
with ssrf_safe_client() as client:
response = client.get(url, timeout=timeout)
response.raise_for_status()
return self.split_text(response.text)
def split_text_from_file(self, file: str | IO[str]) -> list[Document]:
"""Split HTML content from a file into a list of `Document` objects.