feat(anthropic): support cache_control as a kwarg (#31523)

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import AIMessage, HumanMessage

llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

caching_llm.invoke(
    [
        HumanMessage("..."),
        AIMessage("..."),
        HumanMessage("..."),  # <-- final message / content block gets cache annotation
    ]
)
```
Potentially useful given Anthropic's [incremental
caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation)
capabilities:
> During each turn, we mark the final block of the final message with
cache_control so the conversation can be incrementally cached. The
system will automatically lookup and use the longest previously cached
prefix for follow-up messages.
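
Binding `cache_control` once therefore re-annotates the latest message on every turn, letting the cached prefix grow with the conversation. A minimal sketch of that pattern (the loop structure and message contents are illustrative):

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

history = []
for user_input in ["First question ...", "Follow-up question ..."]:
    # Each call marks the final content block of the final message with
    # cache_control, so every turn extends the previously cached prefix.
    history.append(HumanMessage(user_input))
    response = caching_llm.invoke(history)
    history.append(response)
```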

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Mason Daugherty <github@mdrxy.com>
ccurme authored on 2025-08-12 17:18:24 -03:00, committed by GitHub
parent 1167e7458e
commit be83ce74a7
2 changed files with 111 additions and 3 deletions


@@ -916,8 +916,13 @@ class ChatAnthropic(BaseChatModel):
or by setting ``stream_usage=False`` when initializing ChatAnthropic.
Prompt caching:
See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
for more detail.
Prompt caching reduces processing time and costs for repetitive tasks or prompts
with consistent elements.
.. note::
Only certain models support prompt caching.
See the `Claude documentation <https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#supported-models>`__
for a full list.
.. code-block:: python
@@ -953,6 +958,18 @@ class ChatAnthropic(BaseChatModel):
{'cache_read': 0, 'cache_creation': 1458}
Alternatively, you may enable prompt caching at invocation time. This is useful
when you want to cache conditionally based on runtime factors, such as the
length of the context, or to make app-level decisions about what to cache.
.. code-block:: python
response = llm.invoke(
messages,
cache_control={"type": "ephemeral"},
)
.. dropdown:: Extended caching
.. versionadded:: 0.3.15
@@ -970,6 +987,10 @@ class ChatAnthropic(BaseChatModel):
and specifying ``"cache_control": {"type": "ephemeral", "ttl": "1h"}``.
.. important::
Specifying a ``ttl`` key under ``cache_control`` will not work unless the
beta header is set!
Details of cached token counts will be included on the ``InputTokenDetails``
of response's ``usage_metadata``:
@@ -1068,7 +1089,7 @@ class ChatAnthropic(BaseChatModel):
Total tokens: 408
Built-in tools:
See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/>`__
See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
for more detail.
.. dropdown:: Web search
@@ -1368,6 +1389,46 @@ class ChatAnthropic(BaseChatModel):
) -> dict:
messages = self._convert_input(input_).to_messages()
system, formatted_messages = _format_messages(messages)
# If cache_control is provided in kwargs, add it to last message
# and content block.
if "cache_control" in kwargs and formatted_messages:
cache_control = kwargs["cache_control"]
# Validate TTL usage requires extended cache TTL beta header
if (
isinstance(cache_control, dict)
and "ttl" in cache_control
and (
not self.betas or "extended-cache-ttl-2025-04-11" not in self.betas
)
):
msg = (
"Specifying a 'ttl' under 'cache_control' requires enabling "
"the 'extended-cache-ttl-2025-04-11' beta header. "
"Set betas=['extended-cache-ttl-2025-04-11'] when initializing "
"ChatAnthropic."
)
warnings.warn(msg, stacklevel=2)
if isinstance(formatted_messages[-1]["content"], list):
formatted_messages[-1]["content"][-1]["cache_control"] = kwargs.pop(
"cache_control"
)
elif isinstance(formatted_messages[-1]["content"], str):
formatted_messages[-1]["content"] = [
{
"type": "text",
"text": formatted_messages[-1]["content"],
"cache_control": kwargs.pop("cache_control"),
}
]
else:
pass
# If cache_control remains in kwargs, it would be passed as a top-level param
# to the API, but Anthropic expects it nested within a message
_ = kwargs.pop("cache_control", None)
payload = {
"model": self.model,
"max_tokens": self.max_tokens,

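Taken together with the docstring note above, a `ttl` under `cache_control` is only honored when the extended cache TTL beta header is enabled; otherwise the new check emits a warning. A hedged sketch of the intended combination (model choice and message content are illustrative):

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# The beta header is required for the "ttl" key to take effect; without it,
# _get_request_payload warns and the TTL is ignored.
llm = ChatAnthropic(
    model="claude-3-5-haiku-latest",
    betas=["extended-cache-ttl-2025-04-11"],
)

response = llm.invoke(
    [HumanMessage("...")],
    cache_control={"type": "ephemeral", "ttl": "1h"},
)
```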

@@ -1076,3 +1076,50 @@ def test_mcp_tracing() -> None:
# Test headers are correctly propagated to request
payload = llm._get_request_payload([input_message])
assert payload["mcp_servers"][0]["authorization_token"] == "PLACEHOLDER" # noqa: S105
def test_cache_control_kwarg() -> None:
llm = ChatAnthropic(model="claude-3-5-haiku-latest")
messages = [HumanMessage("foo"), AIMessage("bar"), HumanMessage("baz")]
payload = llm._get_request_payload(messages)
assert payload["messages"] == [
{"role": "user", "content": "foo"},
{"role": "assistant", "content": "bar"},
{"role": "user", "content": "baz"},
]
payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
assert payload["messages"] == [
{"role": "user", "content": "foo"},
{"role": "assistant", "content": "bar"},
{
"role": "user",
"content": [
{"type": "text", "text": "baz", "cache_control": {"type": "ephemeral"}}
],
},
]
messages = [
HumanMessage("foo"),
AIMessage("bar"),
HumanMessage(
content=[
{"type": "text", "text": "baz"},
{"type": "text", "text": "qux"},
]
),
]
payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
assert payload["messages"] == [
{"role": "user", "content": "foo"},
{"role": "assistant", "content": "bar"},
{
"role": "user",
"content": [
{"type": "text", "text": "baz"},
{"type": "text", "text": "qux", "cache_control": {"type": "ephemeral"}},
],
},
]
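
The unit test above only checks the request payload shape. Whether a cache hit actually occurs can be observed at runtime through `usage_metadata`, which the docstring changes reference (`cache_read` / `cache_creation` under `InputTokenDetails`). A hedged sketch, assuming a prompt long enough to meet Anthropic's minimum cacheable length:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

llm = ChatAnthropic(model="claude-3-5-haiku-latest")
messages = [HumanMessage("<a prompt long enough to be cacheable> ...")]

# The first call should create a cache entry; a repeated call should read it.
first = llm.invoke(messages, cache_control={"type": "ephemeral"})
second = llm.invoke(messages, cache_control={"type": "ephemeral"})

print(first.usage_metadata["input_token_details"])   # e.g. {'cache_creation': ..., 'cache_read': 0}
print(second.usage_metadata["input_token_details"])  # 'cache_read' should be nonzero on a hit
```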