Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-20 01:49:51 +00:00)
feat(anthropic): support cache_control as a kwarg (#31523)
```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import AIMessage, HumanMessage

llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

caching_llm.invoke(
    [
        HumanMessage("..."),
        AIMessage("..."),
        HumanMessage("..."),  # <-- final message / content block gets cache annotation
    ]
)
```

Potentially useful given Anthropic's [incremental caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation) capabilities:

> During each turn, we mark the final block of the final message with cache_control so the conversation can be incrementally cached. The system will automatically lookup and use the longest previously cached prefix for follow-up messages.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Mason Daugherty <github@mdrxy.com>
This commit is contained in: parent 1167e7458e, commit be83ce74a7
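As a rough illustration of the incremental-caching flow described in the commit message, a bound model can be reused across turns so that each call re-annotates the newest final block. The loop and message bookkeeping below are illustrative assumptions, not part of the change:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

history = []
for user_turn in ["First question ...", "Follow-up question ..."]:
    history.append(HumanMessage(user_turn))
    # The final content block of the final message gets the cache annotation,
    # so Anthropic can reuse the longest previously cached prefix next turn.
    ai_msg = caching_llm.invoke(history)
    history.append(ai_msg)
```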
```diff
@@ -916,8 +916,13 @@ class ChatAnthropic(BaseChatModel):
         or by setting ``stream_usage=False`` when initializing ChatAnthropic.
 
     Prompt caching:
-        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
-        for more detail.
+        Prompt caching reduces processing time and costs for repetitive tasks or prompts
+        with consistent elements.
+
+        .. note::
+            Only certain models support prompt caching.
+            See the `Claude documentation <https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#supported-models>`__
+            for a full list.
 
         .. code-block:: python
```
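The ``.. code-block::`` directive that closes this hunk belongs to a docstring example whose body sits outside the diff context. For orientation, a minimal sketch of the explicit per-block caching pattern the docstring describes; the placeholder document text and exact block layout are assumptions:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage, SystemMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")

messages = [
    SystemMessage(
        content=[
            {
                "type": "text",
                # Placeholder: caching only pays off for long, reusable context.
                "text": "<several thousand tokens of reference material>",
                "cache_control": {"type": "ephemeral"},
            }
        ]
    ),
    HumanMessage("Summarize the reference material."),
]
response = llm.invoke(messages)
# Cache writes/reads are reported in the usage metadata, e.g.
# {'cache_read': 0, 'cache_creation': 1458} on the first call.
print(response.usage_metadata["input_token_details"])
```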
```diff
@@ -953,6 +958,18 @@ class ChatAnthropic(BaseChatModel):
             {'cache_read': 0, 'cache_creation': 1458}
 
+        Alternatively, you may enable prompt caching at invocation time. You may want to
+        conditionally cache based on runtime conditions, such as the length of the
+        context. This is also useful for app-level decisions about what to
+        cache.
+
+        .. code-block:: python
+
+            response = llm.invoke(
+                messages,
+                cache_control={"type": "ephemeral"},
+            )
+
         .. dropdown:: Extended caching
 
             .. versionadded:: 0.3.15
```
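A sketch of the "conditionally cache based on runtime conditions" idea from the new docstring text; the token threshold and the use of ``get_num_tokens_from_messages`` are assumptions, not part of the change:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")
messages = [HumanMessage("<a potentially long conversation>")]

# Only request caching once the prompt is large enough to be worth caching
# (the 2048-token threshold is arbitrary and illustrative).
kwargs = {}
if llm.get_num_tokens_from_messages(messages) > 2048:
    kwargs["cache_control"] = {"type": "ephemeral"}

response = llm.invoke(messages, **kwargs)
```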
```diff
@@ -970,6 +987,10 @@ class ChatAnthropic(BaseChatModel):
             and specifying ``"cache_control": {"type": "ephemeral", "ttl": "1h"}``.
 
+            .. important::
+                Specifying a ``ttl`` key under ``cache_control`` will not work unless the
+                beta header is set!
+
             Details of cached token counts will be included on the ``InputTokenDetails``
             of response's ``usage_metadata``:
```
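Putting the two requirements together, a minimal sketch of enabling the one-hour TTL; the beta flag name comes from the warning added later in this change, while the model choice and placeholder prompt are assumptions:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
# Without this beta flag, the 'ttl' key is ignored (and, after this change, warned about).
llm = ChatAnthropic(
    model="claude-3-5-haiku-latest",
    betas=["extended-cache-ttl-2025-04-11"],
)

response = llm.invoke(
    [HumanMessage("<long, reusable prompt>")],
    cache_control={"type": "ephemeral", "ttl": "1h"},
)
```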
```diff
@@ -1068,7 +1089,7 @@ class ChatAnthropic(BaseChatModel):
             Total tokens: 408
 
     Built-in tools:
-        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/>`__
+        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
         for more detail.
 
         .. dropdown:: Web search
```
```diff
@@ -1368,6 +1389,46 @@ class ChatAnthropic(BaseChatModel):
     ) -> dict:
         messages = self._convert_input(input_).to_messages()
         system, formatted_messages = _format_messages(messages)
 
+        # If cache_control is provided in kwargs, add it to last message
+        # and content block.
+        if "cache_control" in kwargs and formatted_messages:
+            cache_control = kwargs["cache_control"]
+
+            # Validate TTL usage requires extended cache TTL beta header
+            if (
+                isinstance(cache_control, dict)
+                and "ttl" in cache_control
+                and (
+                    not self.betas or "extended-cache-ttl-2025-04-11" not in self.betas
+                )
+            ):
+                msg = (
+                    "Specifying a 'ttl' under 'cache_control' requires enabling "
+                    "the 'extended-cache-ttl-2025-04-11' beta header. "
+                    "Set betas=['extended-cache-ttl-2025-04-11'] when initializing "
+                    "ChatAnthropic."
+                )
+                warnings.warn(msg, stacklevel=2)
+            if isinstance(formatted_messages[-1]["content"], list):
+                formatted_messages[-1]["content"][-1]["cache_control"] = kwargs.pop(
+                    "cache_control"
+                )
+            elif isinstance(formatted_messages[-1]["content"], str):
+                formatted_messages[-1]["content"] = [
+                    {
+                        "type": "text",
+                        "text": formatted_messages[-1]["content"],
+                        "cache_control": kwargs.pop("cache_control"),
+                    }
+                ]
+            else:
+                pass
+
+        # If cache_control remains in kwargs, it would be passed as a top-level param
+        # to the API, but Anthropic expects it nested within a message
+        _ = kwargs.pop("cache_control", None)
+
         payload = {
             "model": self.model,
             "max_tokens": self.max_tokens,
```
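To exercise the new validation in isolation (payload construction only, no request is sent), something like the following should surface the warning; the message content and assertion style are assumptions:

```python
import warnings

from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment; note no `betas` set.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    llm._get_request_payload(
        [HumanMessage("hi")],
        cache_control={"type": "ephemeral", "ttl": "1h"},
    )

assert any("extended-cache-ttl-2025-04-11" in str(w.message) for w in caught)
```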
```diff
@@ -1076,3 +1076,50 @@ def test_mcp_tracing() -> None:
     # Test headers are correctly propagated to request
     payload = llm._get_request_payload([input_message])
     assert payload["mcp_servers"][0]["authorization_token"] == "PLACEHOLDER"  # noqa: S105
+
+
+def test_cache_control_kwarg() -> None:
+    llm = ChatAnthropic(model="claude-3-5-haiku-latest")
+
+    messages = [HumanMessage("foo"), AIMessage("bar"), HumanMessage("baz")]
+    payload = llm._get_request_payload(messages)
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {"role": "user", "content": "baz"},
+    ]
+
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz", "cache_control": {"type": "ephemeral"}}
+            ],
+        },
+    ]
+
+    messages = [
+        HumanMessage("foo"),
+        AIMessage("bar"),
+        HumanMessage(
+            content=[
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux"},
+            ]
+        ),
+    ]
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux", "cache_control": {"type": "ephemeral"}},
+            ],
+        },
+    ]
```