Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-20 01:49:51 +00:00)
feat(anthropic): support cache_control as a kwarg (#31523)
```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import AIMessage, HumanMessage

llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

caching_llm.invoke(
    [
        HumanMessage("..."),
        AIMessage("..."),
        HumanMessage("..."),  # <-- final message / content block gets cache annotation
    ]
)
```

Potentially useful given Anthropic's [incremental caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation) capabilities:

> During each turn, we mark the final block of the final message with cache_control so the conversation can be incrementally cached. The system will automatically lookup and use the longest previously cached prefix for follow-up messages.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
Co-authored-by: Mason Daugherty <github@mdrxy.com>
This commit is contained in: parent 1167e7458e, commit be83ce74a7
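As a rough illustration of the incremental-caching flow described in the commit message, a bound model can be reused across turns so that each call re-annotates the newest final block. The loop and message bookkeeping below are illustrative assumptions, not part of the change:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")
caching_llm = llm.bind(cache_control={"type": "ephemeral"})

history = []
for user_turn in ["First question ...", "Follow-up question ..."]:
    history.append(HumanMessage(user_turn))
    # The final content block of the final message gets the cache annotation,
    # so Anthropic can reuse the longest previously cached prefix next turn.
    ai_msg = caching_llm.invoke(history)
    history.append(ai_msg)
```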
```diff
@@ -916,8 +916,13 @@ class ChatAnthropic(BaseChatModel):
         or by setting ``stream_usage=False`` when initializing ChatAnthropic.
 
     Prompt caching:
-        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
-        for more detail.
+        Prompt caching reduces processing time and costs for repetitive tasks or prompts
+        with consistent elements.
+
+        .. note::
+            Only certain models support prompt caching.
+            See the `Claude documentation <https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#supported-models>`__
+            for a full list.
 
         .. code-block:: python
```
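The ``.. code-block::`` directive that closes this hunk belongs to a docstring example whose body sits outside the diff context. For orientation, a minimal sketch of the explicit per-block caching pattern the docstring describes; the placeholder document text and exact block layout are assumptions:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage, SystemMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")

messages = [
    SystemMessage(
        content=[
            {
                "type": "text",
                # Placeholder: caching only pays off for long, reusable context.
                "text": "<several thousand tokens of reference material>",
                "cache_control": {"type": "ephemeral"},
            }
        ]
    ),
    HumanMessage("Summarize the reference material."),
]
response = llm.invoke(messages)
# Cache writes/reads are reported in the usage metadata, e.g.
# {'cache_read': 0, 'cache_creation': 1458} on the first call.
print(response.usage_metadata["input_token_details"])
```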
```diff
@@ -953,6 +958,18 @@ class ChatAnthropic(BaseChatModel):
             {'cache_read': 0, 'cache_creation': 1458}
 
+        Alternatively, you may enable prompt caching at invocation time. You may want to
+        conditionally cache based on runtime conditions, such as the length of the
+        context. This is also useful for app-level decisions about what to
+        cache.
+
+        .. code-block:: python
+
+            response = llm.invoke(
+                messages,
+                cache_control={"type": "ephemeral"},
+            )
+
         .. dropdown:: Extended caching
 
             .. versionadded:: 0.3.15
```
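A sketch of the "conditionally cache based on runtime conditions" idea from the new docstring text; the token threshold and the use of ``get_num_tokens_from_messages`` are assumptions, not part of the change:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")
messages = [HumanMessage("<a potentially long conversation>")]

# Only request caching once the prompt is large enough to be worth caching
# (the 2048-token threshold is arbitrary and illustrative).
kwargs = {}
if llm.get_num_tokens_from_messages(messages) > 2048:
    kwargs["cache_control"] = {"type": "ephemeral"}

response = llm.invoke(messages, **kwargs)
```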
```diff
@@ -970,6 +987,10 @@ class ChatAnthropic(BaseChatModel):
             and specifying ``"cache_control": {"type": "ephemeral", "ttl": "1h"}``.
 
+            .. important::
+                Specifying a ``ttl`` key under ``cache_control`` will not work unless the
+                beta header is set!
+
             Details of cached token counts will be included on the ``InputTokenDetails``
             of response's ``usage_metadata``:
```
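Putting the two requirements together, a minimal sketch of enabling the one-hour TTL; the beta flag name comes from the warning added later in this change, while the model choice and placeholder prompt are assumptions:

```python
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment.
# Without this beta flag, the 'ttl' key is ignored (and, after this change, warned about).
llm = ChatAnthropic(
    model="claude-3-5-haiku-latest",
    betas=["extended-cache-ttl-2025-04-11"],
)

response = llm.invoke(
    [HumanMessage("<long, reusable prompt>")],
    cache_control={"type": "ephemeral", "ttl": "1h"},
)
```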
```diff
@@ -1068,7 +1089,7 @@ class ChatAnthropic(BaseChatModel):
             Total tokens: 408
 
     Built-in tools:
-        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/>`__
+        See LangChain `docs <https://python.langchain.com/docs/integrations/chat/anthropic/#built-in-tools>`__
         for more detail.
 
         .. dropdown:: Web search
```
```diff
@@ -1368,6 +1389,46 @@ class ChatAnthropic(BaseChatModel):
     ) -> dict:
         messages = self._convert_input(input_).to_messages()
         system, formatted_messages = _format_messages(messages)
 
+        # If cache_control is provided in kwargs, add it to last message
+        # and content block.
+        if "cache_control" in kwargs and formatted_messages:
+            cache_control = kwargs["cache_control"]
+
+            # Validate TTL usage requires extended cache TTL beta header
+            if (
+                isinstance(cache_control, dict)
+                and "ttl" in cache_control
+                and (
+                    not self.betas or "extended-cache-ttl-2025-04-11" not in self.betas
+                )
+            ):
+                msg = (
+                    "Specifying a 'ttl' under 'cache_control' requires enabling "
+                    "the 'extended-cache-ttl-2025-04-11' beta header. "
+                    "Set betas=['extended-cache-ttl-2025-04-11'] when initializing "
+                    "ChatAnthropic."
+                )
+                warnings.warn(msg, stacklevel=2)
+            if isinstance(formatted_messages[-1]["content"], list):
+                formatted_messages[-1]["content"][-1]["cache_control"] = kwargs.pop(
+                    "cache_control"
+                )
+            elif isinstance(formatted_messages[-1]["content"], str):
+                formatted_messages[-1]["content"] = [
+                    {
+                        "type": "text",
+                        "text": formatted_messages[-1]["content"],
+                        "cache_control": kwargs.pop("cache_control"),
+                    }
+                ]
+            else:
+                pass
+
+        # If cache_control remains in kwargs, it would be passed as a top-level param
+        # to the API, but Anthropic expects it nested within a message
+        _ = kwargs.pop("cache_control", None)
+
         payload = {
             "model": self.model,
             "max_tokens": self.max_tokens,
```
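To exercise the new validation in isolation (payload construction only, no request is sent), something like the following should surface the warning; the message content and assertion style are assumptions:

```python
import warnings

from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage

# Assumes ANTHROPIC_API_KEY is configured in the environment; note no `betas` set.
llm = ChatAnthropic(model="claude-3-5-haiku-latest")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    llm._get_request_payload(
        [HumanMessage("hi")],
        cache_control={"type": "ephemeral", "ttl": "1h"},
    )

assert any("extended-cache-ttl-2025-04-11" in str(w.message) for w in caught)
```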
```diff
@@ -1076,3 +1076,50 @@ def test_mcp_tracing() -> None:
     # Test headers are correctly propagated to request
     payload = llm._get_request_payload([input_message])
     assert payload["mcp_servers"][0]["authorization_token"] == "PLACEHOLDER"  # noqa: S105
+
+
+def test_cache_control_kwarg() -> None:
+    llm = ChatAnthropic(model="claude-3-5-haiku-latest")
+
+    messages = [HumanMessage("foo"), AIMessage("bar"), HumanMessage("baz")]
+    payload = llm._get_request_payload(messages)
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {"role": "user", "content": "baz"},
+    ]
+
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz", "cache_control": {"type": "ephemeral"}}
+            ],
+        },
+    ]
+
+    messages = [
+        HumanMessage("foo"),
+        AIMessage("bar"),
+        HumanMessage(
+            content=[
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux"},
+            ]
+        ),
+    ]
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux", "cache_control": {"type": "ephemeral"}},
+            ],
+        },
+    ]
```