core[patch]: deprecate hwchase17/langchain-hub, address path traversal (#18600)

Deprecates the old langchain-hub repository. Does *not* deprecate the
new https://smith.langchain.com/hub

@PinkDraconian has correctly pointed out that if someone passes
unsanitized user input to the `try_load_from_hub` function, that input
can be used to load files from locations on GitHub outside the
hwchase17/langchain-hub repository.

This PR adds some more path checking to that function and deprecates the
functionality in favor of the hub built into LangSmith.
This commit is contained in:
Erick Friis 2024-03-05 12:49:38 -08:00 committed by GitHub
parent 96cd50938a
commit e1924b3e93
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 2 deletions

View File

@ -9,16 +9,28 @@ from urllib.parse import urljoin
import requests
from langchain_core._api.deprecation import deprecated
# Default git ref, overridable through the LANGCHAIN_HUB_DEFAULT_REF
# environment variable. Presumably substituted for "{ref}" in URL_BASE when a
# hub path carries no "@ref" part -- confirm in try_load_from_hub.
DEFAULT_REF = os.environ.get("LANGCHAIN_HUB_DEFAULT_REF", "master")
# Raw-content URL prefix of the hub repository. try_load_from_hub raises
# ValueError for any resolved URL that does not start with this prefix.
LANGCHAINHUB_REPO = "https://raw.githubusercontent.com/hwchase17/langchain-hub/"
# URL template for hub files, overridable through the LANGCHAIN_HUB_URL_BASE
# environment variable. "{ref}" is filled in with URL_BASE.format(ref=ref).
URL_BASE = os.environ.get(
"LANGCHAIN_HUB_URL_BASE",
# NOTE(review): the literal below and the LANGCHAINHUB_REPO expression after
# it both sit in the default-argument position. This looks like the removed
# and added lines of the diff rendered together -- confirm only one is kept.
"https://raw.githubusercontent.com/hwchase17/langchain-hub/{ref}/",
LANGCHAINHUB_REPO + "{ref}/",
)
# Hub path syntax: "lc://<path>" with an optional "@<ref>" between "lc" and
# "://". The "ref" group, when present, includes the leading "@".
HUB_PATH_RE = re.compile(r"lc(?P<ref>@[^:]+)?://(?P<path>.*)")
# Return type of the loader callable passed to try_load_from_hub.
T = TypeVar("T")
@deprecated(
since="0.1.30",
removal="0.2",
message=(
"Using the hwchase17/langchain-hub "
"repo for prompts is deprecated. Please use "
"https://smith.langchain.com/hub instead."
),
)
def try_load_from_hub(
path: Union[str, Path],
loader: Callable[[str], T],
@ -43,6 +55,8 @@ def try_load_from_hub(
# Instead, use PurePosixPath to ensure that forward slashes are used as the
# path separator, regardless of the operating system.
full_url = urljoin(URL_BASE.format(ref=ref), PurePosixPath(remote_path).__str__())
if not full_url.startswith(LANGCHAINHUB_REPO):
raise ValueError(f"Invalid hub path: {path}")
r = requests.get(full_url, timeout=5)
if r.status_code != 200:

21
libs/core/poetry.lock generated
View File

@ -2214,6 +2214,25 @@ urllib3 = ">=1.21.1,<3"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "responses"
version = "0.25.0"
description = "A utility library for mocking out the `requests` Python library."
optional = false
python-versions = ">=3.8"
files = [
{file = "responses-0.25.0-py3-none-any.whl", hash = "sha256:2f0b9c2b6437db4b528619a77e5d565e4ec2a9532162ac1a131a83529db7be1a"},
{file = "responses-0.25.0.tar.gz", hash = "sha256:01ae6a02b4f34e39bffceb0fc6786b67a25eae919c6368d05eabc8d9576c2a66"},
]
[package.dependencies]
pyyaml = "*"
requests = ">=2.30.0,<3.0"
urllib3 = ">=1.25.10,<3.0"
[package.extras]
tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"]
[[package]]
name = "rfc3339-validator"
version = "0.1.4"
@ -2796,4 +2815,4 @@ extended-testing = ["jinja2"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "8fe07123109b62d7210542d8aff20df6df00819e5b0f36bc12f02206c5161c43"
content-hash = "de97591989f083b89c7a7bc6dabba87e29e13fddc812450d5196d564b2c02ce1"

View File

@ -58,6 +58,7 @@ pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1"
grandalf = "^0.8"
pytest-profiling = "^1.7.0"
responses = "^0.25.0"
[tool.poetry.group.test_integration]

View File

@ -9,6 +9,7 @@ from urllib.parse import urljoin
import pytest
import responses
from langchain_core.utils.loading import DEFAULT_REF, URL_BASE, try_load_from_hub
@ -94,3 +95,12 @@ def test_failed_request(mocked_responses: responses.RequestsMock) -> None:
with pytest.raises(ValueError, match=re.compile("Could not find file at .*")):
try_load_from_hub(f"lc://{path}", loader, "chains", {"json"})
loader.assert_not_called()
def test_path_traversal() -> None:
    """Test that a path traversal attack is rejected before anything is loaded.

    The hub path keeps the valid ``chains`` prefix but uses ``..`` segments so
    that the joined URL resolves outside the langchain-hub repository.
    ``try_load_from_hub`` must raise its "Invalid hub path" ``ValueError`` and
    must never invoke the loader.
    """
    path = "lc://chains/../../../../../../../../../it.json"
    loader = Mock()

    # Match on the message so that an unrelated ValueError (for example the
    # "Could not find file" error raised after a failed request) cannot make
    # this test pass without the traversal guard actually firing.
    with pytest.raises(ValueError, match="Invalid hub path"):
        try_load_from_hub(path, loader, "chains", {"json"})

    # Rejection must happen before the loader is handed any content.
    loader.assert_not_called()