fix(core, langchain): harden load() against untrusted manifests (#37197)

2026-06-09 10:17:00 +00:00 · 2026-05-05 14:36:58 -04:00
parent d7031101da
commit c979c6187b
8 changed files with 550 additions and 98 deletions
--- a/libs/core/langchain_core/load/_validation.py
+++ b/libs/core/langchain_core/load/_validation.py
@@ -18,7 +18,7 @@ During deserialization, escaped dicts are unwrapped and returned as plain dicts,
 NOT instantiated as LC objects.
 """

-from typing import Any
+from typing import Any, cast

 from langchain_core.load.serializable import (
    Serializable,
@@ -102,16 +102,25 @@ def _serialize_value(obj: Any) -> Any:
    return to_json_not_implemented(obj)


-def _is_lc_secret(obj: Any) -> bool:
-    """Check if an object is a LangChain secret marker."""
-    expected_num_keys = 3
-    return (
-        isinstance(obj, dict)
-        and obj.get("lc") == 1
-        and obj.get("type") == "secret"
-        and "id" in obj
-        and len(obj) == expected_num_keys
-    )
+def _get_secret_keys(obj: Serializable) -> set[str]:
+    """Return the merged set of constructor kwarg names declared as secrets.
+
+    Mirrors the MRO walk in `Serializable.to_json` so the keys returned here
+    match the keys whose values `_replace_secrets` rewrites into secret
+    markers. Used by `_serialize_lc_object` to decide which kwargs to skip
+    when escaping user data.
+    """
+    secrets: dict[str, str] = {}
+    model_fields = type(obj).model_fields
+    for cls in [None, *obj.__class__.mro()]:
+        if cls is Serializable:
+            break
+        this = cast("Serializable", obj if cls is None else super(cls, obj))
+        secrets.update(this.lc_secrets)
+        for key in list(secrets):
+            if (key in model_fields) and (alias := model_fields[key].alias) is not None:
+                secrets[alias] = secrets[key]
+    return set(secrets)


 def _serialize_lc_object(obj: Any) -> dict[str, Any]:
@@ -124,9 +133,15 @@ def _serialize_lc_object(obj: Any) -> dict[str, Any]:
        The serialized dict with user data in kwargs escaped as needed.

    Note:
-        Kwargs values are processed with `_serialize_value` to escape user data (like
-        metadata) that contains `'lc'` keys. Secret fields (from `lc_secrets`) are
-        skipped because `to_json()` replaces their values with secret markers.
+        Kwargs values are processed with `_serialize_value` to escape user data
+        (like metadata) that contains `'lc'` keys. Secret fields are identified
+        by the class's declared `lc_secrets` and skipped because `to_json()`
+        already converted their values to secret markers.
+
+        The check is key-based rather than shape-based. A shape-based check
+        ("this dict looks like a secret marker") can be forged by user data,
+        letting attacker-controlled free-form dicts bypass escaping and reach
+        the Reviver.
    """
    if not isinstance(obj, Serializable):
        msg = f"Expected Serializable, got {type(obj)}"
@@ -134,11 +149,13 @@ def _serialize_lc_object(obj: Any) -> dict[str, Any]:

    serialized: dict[str, Any] = dict(obj.to_json())

-    # Process kwargs to escape user data that could be confused with LC objects
-    # Skip secret fields - to_json() already converted them to secret markers
+    # Process kwargs to escape user data that could be confused with LC objects.
+    # Skip kwargs declared as secrets - `to_json()` already replaced their
+    # values with secret markers via `_replace_secrets`.
    if serialized.get("type") == "constructor" and "kwargs" in serialized:
+        secret_keys = _get_secret_keys(obj)
        serialized["kwargs"] = {
-            k: v if _is_lc_secret(v) else _serialize_value(v)
+            k: v if k in secret_keys else _serialize_value(v)
            for k, v in serialized["kwargs"].items()
        }

--- a/libs/core/langchain_core/load/load.py
+++ b/libs/core/langchain_core/load/load.py
@@ -11,48 +11,58 @@ is a list of strings representing the module path and class name. For example:
 When deserializing, the class path from the JSON `'id'` field is checked against an
 allowlist. If the class is not in the allowlist, deserialization raises a `ValueError`.

-## Security model
+## Threat model

-!!! warning "Exercise caution with untrusted input"
+A serialized LangChain payload crosses a trust boundary because the manifest
+may contain serialized objects and configuration that affect runtime behavior.
+For example, a payload can configure a chat model with a custom `base_url`,
+custom headers, a different model name, or other constructor arguments. These
+are supported features, but they also mean the payload contents should be
+treated as executable configuration rather than plain text.

-    These functions deserialize by instantiating Python objects, which means
-    constructors (`__init__`) and validators may run and can trigger side effects.
-    With the default settings, deserialization is restricted to a core allowlist
-    of `langchain_core` types (for example: messages, documents, and prompts)
-    defined in `langchain_core.load.mapping`.
+Concretely, deserialization instantiates Python objects, so any constructor
+(`__init__`) or validator on an allowed class can run during `load()`. A
+crafted payload that is allowed to reach an unintended class — or an intended
+class with attacker-controlled kwargs — could cause network calls, file
+operations, or environment-variable access while the object is being built.

-    If you broaden `allowed_objects` (for example, by using `'all'` or adding
-    additional classes), treat the serialized payload as a manifest and only
-    deserialize data that comes from a trusted source. A crafted payload that
-    is allowed to instantiate unintended classes could cause network calls,
-    file operations, or environment variable access during `__init__`.
+!!! warning "Do not use with untrusted input"
+
+    If the source is untrusted, avoid calling `load()` / `loads()` on it. If
+    you must, restrict `allowed_objects` to types that do not execute logic
+    during init — `allowed_objects='messages'` (or an explicit list of
+    message classes) is the safe choice. Keep `secrets_from_env=False`.

 The `allowed_objects` parameter controls which classes can be deserialized:

- **`'core'` (default)**: Allow classes defined in the serialization mappings for
-    langchain_core.
- **`'all'`**: Allow classes defined in the serialization mappings. This
-    includes core LangChain types (messages, prompts, documents, etc.) and trusted
-    partner integrations. See `langchain_core.load.mapping` for the full list.
- **Explicit list of classes**: Only those specific classes are allowed.
-
-For simple data types like messages and documents, the default allowlist is safe to use.
-These classes do not perform side effects during initialization.
+- **Explicit list of classes** (recommended for untrusted input): only those
+    specific classes are allowed.
+- **`'messages'`**: chat-message classes only (e.g. `AIMessage`,
+    `HumanMessage`). Safe for untrusted input.
+- **`'core'` (current default)** — *unsafe with untrusted manifests.*
+    Classes defined in the serialization mappings under `langchain_core`
+    (messages, documents, prompts, etc.).
+- **`'all'`** — *unsafe with untrusted manifests.* Every class in the
+    serialization mappings, including partner chat models and LLMs and their
+    constructor kwargs (endpoint URLs, headers, model names, etc.).

 !!! note "Side effects in allowed classes"

-    Deserialization calls `__init__` on allowed classes. If those classes perform side
-    effects during initialization (network calls, file operations, etc.), those side
-    effects will occur. The allowlist prevents instantiation of classes outside the
-    allowlist, but does not sandbox the allowed classes themselves.
+    Deserialization calls `__init__` on allowed classes. If those classes perform
+    side effects during initialization (network calls, file operations, etc.),
+    those side effects will occur. The allowlist prevents instantiation of
+    classes outside the allowlist, but does not sandbox the allowed classes
+    themselves or constrain their constructor kwargs.

 Import paths are also validated against trusted namespaces before any module is
 imported.

 ### Best practices

- Use the most restrictive `allowed_objects` possible. Prefer an explicit list
-    of classes over `'core'` or `'all'`.
+- Use the most restrictive `allowed_objects` possible. For untrusted input,
+    pass an explicit list of classes or `'messages'`. `'core'` and `'all'`
+    are unsafe with untrusted manifests — only use them when the source
+    serves the entire payload, including its configuration.
 - Keep `secrets_from_env` set to `False` (the default). If you must use it,
    ensure the serialized data comes from a fully trusted source, as a crafted
    payload can read arbitrary environment variables.
@@ -101,6 +111,7 @@ from collections.abc import Callable, Iterable
 from typing import Any, Literal, cast

 from langchain_core._api import beta
+from langchain_core._api.deprecation import warn_deprecated
 from langchain_core.load._validation import _is_escaped_dict, _unescape_value
 from langchain_core.load.mapping import (
    _JS_SERIALIZABLE_MAPPING,
@@ -141,13 +152,31 @@ ALL_SERIALIZABLE_MAPPINGS = {
    **_JS_SERIALIZABLE_MAPPING,
 }

+# Modern message classes admitted by `allowed_objects='messages'`. Legacy types
+# (BaseMessage / BaseMessageChunk, ChatMessage / ChatMessageChunk, FunctionMessage /
+# FunctionMessageChunk) are intentionally excluded — `BaseMessage` is abstract and
+# the chat/function variants are superseded by `ToolMessage` and tool calling.
+_MESSAGES_ALLOWED_CLASS_NAMES = frozenset(
+    {
+        "AIMessage",
+        "AIMessageChunk",
+        "HumanMessage",
+        "HumanMessageChunk",
+        "SystemMessage",
+        "SystemMessageChunk",
+        "ToolMessage",
+        "ToolMessageChunk",
+        "RemoveMessage",
+    }
+)
+
 # Cache for the default allowed class paths computed from mappings
-# Maps mode ("all" or "core") to the cached set of paths
+# Maps mode ("all", "core", or "messages") to the cached set of paths
 _default_class_paths_cache: dict[str, set[tuple[str, ...]]] = {}


 def _get_default_allowed_class_paths(
-    allowed_object_mode: Literal["all", "core"],
+    allowed_object_mode: Literal["all", "core", "messages"],
 ) -> set[tuple[str, ...]]:
    """Get the default allowed class paths from the serialization mappings.

@@ -155,7 +184,7 @@ def _get_default_allowed_class_paths(
    by default. Both the legacy paths (keys) and current paths (values) are included.

    Args:
-        allowed_object_mode: either `'all'` or `'core'`.
+        allowed_object_mode: either `'all'`, `'core'`, or `'messages'`.

    Returns:
        Set of class path tuples that are allowed by default.
@@ -167,6 +196,11 @@ def _get_default_allowed_class_paths(
    for key, value in ALL_SERIALIZABLE_MAPPINGS.items():
        if allowed_object_mode == "core" and value[0] != "langchain_core":
            continue
+        if allowed_object_mode == "messages" and (
+            value[0] != "langchain_core"
+            or value[-1] not in _MESSAGES_ALLOWED_CLASS_NAMES
+        ):
+            continue
        allowed_paths.add(key)
        allowed_paths.add(value)

@@ -301,7 +335,9 @@ class Reviver:

    def __init__(
        self,
-        allowed_objects: Iterable[AllowedObject] | Literal["all", "core"] = "core",
+        allowed_objects: Iterable[AllowedObject]
+        | Literal["all", "core", "messages"]
+        | None = None,
        secrets_map: dict[str, str] | None = None,
        valid_namespaces: list[str] | None = None,
        secrets_from_env: bool = False,  # noqa: FBT001,FBT002
@@ -313,16 +349,24 @@ class Reviver:
    ) -> None:
        """Initialize the reviver.

+        See the module docstring for the threat model around `load()`/`loads()`:
+        a serialized payload may carry constructor configuration that affects
+        runtime behavior (custom `base_url`, headers, model name, etc.). Do not
+        use `'core'` or `'all'` with untrusted manifests.
+
        Args:
            allowed_objects: Allowlist of classes that can be deserialized.
-                - `'core'` (default): Allow classes defined in the serialization
-                    mappings for `langchain_core`.
-                - `'all'`: Allow classes defined in the serialization mappings.
-
-                    This includes core LangChain types (messages, prompts, documents,
-                    etc.) and trusted partner integrations. See
+                - Explicit list of classes (recommended for untrusted input):
+                    only those specific classes are allowed.
+                - `'messages'`: chat-message classes only (e.g. `AIMessage`,
+                    `HumanMessage`). Safe for untrusted input.
+                - `'core'` (current default): unsafe with untrusted manifests.
+                    Classes defined in the serialization mappings under
+                    `langchain_core`.
+                - `'all'`: unsafe with untrusted manifests. Every class in the
+                    serialization mappings, including partner chat models and
+                    LLMs and their constructor kwargs. See
                    `langchain_core.load.mapping` for the full list.
-                - Explicit list of classes: Only those specific classes are allowed.
            secrets_map: A map of secrets to load.

                Only include the specific secrets the serialized object
@@ -352,6 +396,19 @@ class Reviver:

                Defaults to `default_init_validator` which blocks jinja2 templates.
        """
+        if allowed_objects is None:
+            warn_deprecated(
+                since="1.4.0",
+                message=(
+                    "The default value of `allowed_objects` will change in a future "
+                    "version. Pass an explicit value (e.g., "
+                    "allowed_objects='messages' or allowed_objects='core') to suppress "
+                    "this warning."
+                ),
+                pending=True,
+            )
+            allowed_objects = "core"
+
        self.secrets_from_env = secrets_from_env
        self.secrets_map = secrets_map or {}
        # By default, only support langchain, but user can pass in additional namespaces
@@ -372,10 +429,10 @@ class Reviver:
        # Compute allowed class paths:
        # - "all" -> use default paths from mappings (+ additional_import_mappings)
        # - Explicit list -> compute from those classes
-        if allowed_objects in ("all", "core"):
+        if allowed_objects in ("all", "core", "messages"):
            self.allowed_class_paths: set[tuple[str, ...]] | None = (
                _get_default_allowed_class_paths(
-                    cast("Literal['all', 'core']", allowed_objects)
+                    cast("Literal['all', 'core', 'messages']", allowed_objects)
                ).copy()
            )
            # Add paths from additional_import_mappings to the defaults
@@ -512,7 +569,9 @@ class Reviver:
 def loads(
    text: str,
    *,
-    allowed_objects: Iterable[AllowedObject] | Literal["all", "core"] = "core",
+    allowed_objects: Iterable[AllowedObject]
+    | Literal["all", "core", "messages"]
+    | None = None,
    secrets_map: dict[str, str] | None = None,
    valid_namespaces: list[str] | None = None,
    secrets_from_env: bool = False,
@@ -524,30 +583,33 @@ def loads(

    Equivalent to `load(json.loads(text))`.

-    Only classes in the allowlist can be instantiated. The default allowlist includes
-    core LangChain types (messages, prompts, documents, etc.). See
+    Only classes in the allowlist can be instantiated. The default allowlist
+    includes core LangChain types (messages, prompts, documents, etc.). See
    `langchain_core.load.mapping` for the full list.

    !!! warning "Do not use with untrusted input"

-        This function instantiates Python objects and can trigger side effects
-        during deserialization. **Never call `loads()` on data from an untrusted
-        or unauthenticated source.** See the module-level security model
-        documentation for details and best practices.
+        A serialized payload may carry constructor kwargs that affect runtime
+        behavior (custom `base_url`, headers, model name, etc.), so it should be
+        treated as executable configuration rather than plain text. If the
+        source is untrusted, avoid calling `loads()` on it; if you must, pass
+        `allowed_objects='messages'` or an explicit list of message classes.
+        See the module-level threat model for details.

    Args:
        text: The string to load.
        allowed_objects: Allowlist of classes that can be deserialized.

-            - `'core'` (default): Allow classes defined in the serialization mappings
-                for `langchain_core`.
-            - `'all'`: Allow classes defined in the serialization mappings.
-
-                This includes core LangChain types (messages, prompts, documents, etc.)
-                and trusted partner integrations. See `langchain_core.load.mapping` for
-                the full list.
-
-            - Explicit list of classes: Only those specific classes are allowed.
+            - Explicit list of classes (recommended for untrusted input): only
+                those specific classes are allowed.
+            - `'messages'`: chat-message classes only. Safe for untrusted input.
+            - `'core'` (current default): unsafe with untrusted manifests.
+                Classes defined in the serialization mappings under
+                `langchain_core`.
+            - `'all'`: unsafe with untrusted manifests. Every class in the
+                serialization mappings, including partner chat models and LLMs
+                and their constructor kwargs. See `langchain_core.load.mapping`
+                for the full list.
            - `[]`: Disallow all deserialization (will raise on any object).
        secrets_map: A map of secrets to load.

@@ -584,6 +646,19 @@ def loads(
    Raises:
        ValueError: If an object's class path is not in the `allowed_objects` allowlist.
    """
+    if allowed_objects is None:
+        warn_deprecated(
+            since="1.4.0",
+            message=(
+                "The default value of `allowed_objects` will change in a future "
+                "version. Pass an explicit list of allowed classes (or "
+                "'messages' for untrusted input that contains only chat "
+                "messages) to suppress this warning."
+            ),
+            pending=True,
+        )
+        allowed_objects = "core"
+
    # Parse JSON and delegate to load() for proper escape handling
    raw_obj = json.loads(text)
    return load(
@@ -602,7 +677,9 @@ def loads(
 def load(
    obj: Any,
    *,
-    allowed_objects: Iterable[AllowedObject] | Literal["all", "core"] = "core",
+    allowed_objects: Iterable[AllowedObject]
+    | Literal["all", "core", "messages"]
+    | None = None,
    secrets_map: dict[str, str] | None = None,
    valid_namespaces: list[str] | None = None,
    secrets_from_env: bool = False,
@@ -615,30 +692,33 @@ def load(
    Use this if you already have a parsed JSON object, eg. from `json.load` or
    `orjson.loads`.

-    Only classes in the allowlist can be instantiated. The default allowlist includes
-    core LangChain types (messages, prompts, documents, etc.). See
+    Only classes in the allowlist can be instantiated. The default allowlist
+    includes core LangChain types (messages, prompts, documents, etc.). See
    `langchain_core.load.mapping` for the full list.

    !!! warning "Do not use with untrusted input"

-        This function instantiates Python objects and can trigger side effects
-        during deserialization. **Never call `load()` on data from an untrusted
-        or unauthenticated source.** See the module-level security model
-        documentation for details and best practices.
+        A serialized payload may carry constructor kwargs that affect runtime
+        behavior (custom `base_url`, headers, model name, etc.), so it should be
+        treated as executable configuration rather than plain text. If the
+        source is untrusted, avoid calling `load()` on it; if you must, pass
+        `allowed_objects='messages'` or an explicit list of message classes.
+        See the module-level threat model for details.

    Args:
        obj: The object to load.
        allowed_objects: Allowlist of classes that can be deserialized.

-            - `'core'` (default): Allow classes defined in the serialization mappings
-                for `langchain_core`.
-            - `'all'`: Allow classes defined in the serialization mappings.
-
-                This includes core LangChain types (messages, prompts, documents, etc.)
-                and trusted partner integrations. See `langchain_core.load.mapping` for
-                the full list.
-
-            - Explicit list of classes: Only those specific classes are allowed.
+            - Explicit list of classes (recommended for untrusted input): only
+                those specific classes are allowed.
+            - `'messages'`: chat-message classes only. Safe for untrusted input.
+            - `'core'` (current default): unsafe with untrusted manifests.
+                Classes defined in the serialization mappings under
+                `langchain_core`.
+            - `'all'`: unsafe with untrusted manifests. Every class in the
+                serialization mappings, including partner chat models and LLMs
+                and their constructor kwargs. See `langchain_core.load.mapping`
+                for the full list.
            - `[]`: Disallow all deserialization (will raise on any object).
        secrets_map: A map of secrets to load.

@@ -699,6 +779,19 @@ def load(
        )
        ```
    """
+    if allowed_objects is None:
+        warn_deprecated(
+            since="1.4.0",
+            message=(
+                "The default value of `allowed_objects` will change in a future "
+                "version. Pass an explicit list of allowed classes (or "
+                "'messages' for untrusted input that contains only chat "
+                "messages) to suppress this warning."
+            ),
+            pending=True,
+        )
+        allowed_objects = "core"
+
    reviver = Reviver(
        allowed_objects,
        secrets_map,