diff --git a/libs/partners/anthropic/langchain_anthropic/chat_models.py b/libs/partners/anthropic/langchain_anthropic/chat_models.py
index 28e459993eb..d6292c59851 100644
--- a/libs/partners/anthropic/langchain_anthropic/chat_models.py
+++ b/libs/partners/anthropic/langchain_anthropic/chat_models.py
@@ -916,8 +916,13 @@ class ChatAnthropic(BaseChatModel):
         or by setting ``stream_usage=False`` when initializing ChatAnthropic.
 
     Prompt caching:
-        See LangChain `docs `__
-        for more detail.
+        Prompt caching reduces processing time and costs for repetitive tasks or
+        prompts with consistent elements.
+
+        .. note::
+            Only certain models support prompt caching.
+            See the `Claude documentation `__
+            for a full list.
 
         .. code-block:: python
 
@@ -953,6 +958,18 @@ class ChatAnthropic(BaseChatModel):
 
             {'cache_read': 0, 'cache_creation': 1458}
 
+        Alternatively, you can enable prompt caching at invocation time. This is
+        useful when you want to cache conditionally based on runtime factors, such
+        as the length of the context, or for app-level decisions about what to
+        cache.
+
+        .. code-block:: python
+
+            response = llm.invoke(
+                messages,
+                cache_control={"type": "ephemeral"},
+            )
+
         .. dropdown:: Extended caching
 
             .. versionadded:: 0.3.15
@@ -970,6 +987,10 @@ class ChatAnthropic(BaseChatModel):
 
             and specifying ``"cache_control": {"type": "ephemeral", "ttl": "1h"}``.
 
+            .. important::
+                Specifying a ``ttl`` key under ``cache_control`` will not work
+                unless the beta header is set!
+
             Details of cached token counts will be included on the ``InputTokenDetails``
             of response's ``usage_metadata``:
 
@@ -1068,7 +1089,7 @@ class ChatAnthropic(BaseChatModel):
             Total tokens: 408
 
     Built-in tools:
-        See LangChain `docs `__
+        See LangChain `docs `__
         for more detail.
 
         .. dropdown:: Web search
@@ -1368,6 +1389,46 @@ class ChatAnthropic(BaseChatModel):
     ) -> dict:
         messages = self._convert_input(input_).to_messages()
         system, formatted_messages = _format_messages(messages)
+
+        # If cache_control is provided in kwargs, add it to the last message
+        # and content block.
+        if "cache_control" in kwargs and formatted_messages:
+            cache_control = kwargs["cache_control"]
+
+            # Warn if a TTL is used without the extended cache TTL beta header
+            if (
+                isinstance(cache_control, dict)
+                and "ttl" in cache_control
+                and (
+                    not self.betas or "extended-cache-ttl-2025-04-11" not in self.betas
+                )
+            ):
+                msg = (
+                    "Specifying a 'ttl' under 'cache_control' requires enabling "
+                    "the 'extended-cache-ttl-2025-04-11' beta header. "
+                    "Set betas=['extended-cache-ttl-2025-04-11'] when initializing "
+                    "ChatAnthropic."
+                )
+                warnings.warn(msg, stacklevel=2)
+            if isinstance(formatted_messages[-1]["content"], list):
+                formatted_messages[-1]["content"][-1]["cache_control"] = kwargs.pop(
+                    "cache_control"
+                )
+            elif isinstance(formatted_messages[-1]["content"], str):
+                formatted_messages[-1]["content"] = [
+                    {
+                        "type": "text",
+                        "text": formatted_messages[-1]["content"],
+                        "cache_control": kwargs.pop("cache_control"),
+                    }
+                ]
+            else:
+                pass
+
+        # If cache_control remains in kwargs, it would be passed as a top-level param
+        # to the API, but Anthropic expects it nested within a message
+        _ = kwargs.pop("cache_control", None)
+
         payload = {
             "model": self.model,
             "max_tokens": self.max_tokens,
diff --git a/libs/partners/anthropic/tests/unit_tests/test_chat_models.py b/libs/partners/anthropic/tests/unit_tests/test_chat_models.py
index 93605b4dded..e2c8d5a36c1 100644
--- a/libs/partners/anthropic/tests/unit_tests/test_chat_models.py
+++ b/libs/partners/anthropic/tests/unit_tests/test_chat_models.py
@@ -1076,3 +1076,50 @@ def test_mcp_tracing() -> None:
     # Test headers are correctly propagated to request
     payload = llm._get_request_payload([input_message])
     assert payload["mcp_servers"][0]["authorization_token"] == "PLACEHOLDER"  # noqa: S105
+
+
+def test_cache_control_kwarg() -> None:
+    llm = ChatAnthropic(model="claude-3-5-haiku-latest")
+
+    messages = [HumanMessage("foo"), AIMessage("bar"), HumanMessage("baz")]
+    payload = llm._get_request_payload(messages)
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {"role": "user", "content": "baz"},
+    ]
+
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz", "cache_control": {"type": "ephemeral"}}
+            ],
+        },
+    ]
+
+    messages = [
+        HumanMessage("foo"),
+        AIMessage("bar"),
+        HumanMessage(
+            content=[
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux"},
+            ]
+        ),
+    ]
+    payload = llm._get_request_payload(messages, cache_control={"type": "ephemeral"})
+    assert payload["messages"] == [
+        {"role": "user", "content": "foo"},
+        {"role": "assistant", "content": "bar"},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "baz"},
+                {"type": "text", "text": "qux", "cache_control": {"type": "ephemeral"}},
+            ],
+        },
+    ]
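
A minimal usage sketch of the cache_control kwarg added above (not part of the diff;
assumes this change is installed and an Anthropic API key is configured):

    from langchain_anthropic import ChatAnthropic
    from langchain_core.messages import HumanMessage, SystemMessage

    llm = ChatAnthropic(model="claude-3-5-haiku-latest")

    messages = [
        SystemMessage("You are a helpful assistant."),
        HumanMessage("Summarize the attached report."),
    ]

    # The kwarg attaches cache_control to the last content block of the last
    # message, so the prompt prefix up to that point becomes cacheable.
    response = llm.invoke(messages, cache_control={"type": "ephemeral"})
    print(response.usage_metadata)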
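
And a sketch of opting into the extended cache TTL beta so that a "ttl" under
cache_control is honored rather than triggering the new warning (model name and TTL
value are illustrative):

    llm = ChatAnthropic(
        model="claude-3-5-haiku-latest",
        betas=["extended-cache-ttl-2025-04-11"],
    )
    response = llm.invoke(messages, cache_control={"type": "ephemeral", "ttl": "1h"})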