def count_tokens_approximately(
    messages: Iterable[MessageLikeRepresentation],
    *,
    chars_per_token: float = 4.0,
    extra_tokens_per_message: float = 3.0,
    count_name: bool = True,
) -> int:
    """Approximate the total number of tokens in messages.

    The token count includes stringified message content, role, and (optionally) name.
    - For AI messages, the token count also includes stringified tool calls.
    - For tool messages, the token count also includes the tool call ID.

    Args:
        messages: List of messages to count tokens for.
        chars_per_token: Number of characters per token to use for the approximation.
            Default is 4 (one token corresponds to ~4 chars for common English text).
            You can also specify float values for more fine-grained control.
            See more here: https://platform.openai.com/tokenizer
        extra_tokens_per_message: Number of extra tokens to add per message.
            Default is 3 (special tokens, including beginning/end of message).
            You can also specify float values for more fine-grained control.
            See more here:
            https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
        count_name: Whether to include message names in the count.
            Enabled by default.

    Returns:
        Approximate number of tokens in the messages.

    Note:
        This is a simple approximation that may not match the exact token count
        used by specific models. For accurate counts, use model-specific tokenizers.

    .. versionadded:: 0.3.46
    """
    # Accumulate as float because extra_tokens_per_message may be fractional;
    # a single final ceil converts the total back to an int.
    token_count = 0.0
    for message in convert_to_messages(messages):
        message_chars = 0
        if isinstance(message.content, str):
            message_chars += len(message.content)
        else:
            # Non-string (list) content is counted via its repr.
            # TODO: add support for approximate counting for image blocks
            content = repr(message.content)
            message_chars += len(content)

        if (
            isinstance(message, AIMessage)
            # exclude Anthropic format as tool calls are already included in the content
            and not isinstance(message.content, list)
            and message.tool_calls
        ):
            tool_calls_content = repr(message.tool_calls)
            message_chars += len(tool_calls_content)

        if isinstance(message, ToolMessage):
            message_chars += len(message.tool_call_id)

        # The role string (e.g. "user", "assistant") counts toward the total.
        role = _get_message_openai_role(message)
        message_chars += len(role)

        if message.name and count_name:
            message_chars += len(message.name)

        # NOTE: we're rounding up per message to ensure that
        # individual message token counts add up to the total count
        # for a list of messages
        token_count += math.ceil(message_chars / chars_per_token)

        # add extra tokens per message
        token_count += extra_tokens_per_message

    # round up one more time in case extra_tokens_per_message is a float
    return math.ceil(token_count)
def test_count_tokens_approximately_empty_messages() -> None:
    # Test with empty message list
    assert count_tokens_approximately([]) == 0

    # Test with empty content
    messages = [HumanMessage(content="")]
    # 4 role chars -> 1 + 3 = 4 tokens
    assert count_tokens_approximately(messages) == 4


def test_count_tokens_approximately_with_names() -> None:
    messages = [
        # 5 chars + 4 role chars -> 3 + 3 = 6 tokens
        # (with name: extra 4 name chars, so total = 4 + 3 = 7 tokens)
        HumanMessage(content="Hello", name="user"),
        # 8 chars + 9 role chars -> 5 + 3 = 8 tokens
        # (with name: extra 9 name chars, so total = 7 + 3 = 10 tokens)
        AIMessage(content="Hi there", name="assistant"),
    ]
    # With names included (default)
    assert count_tokens_approximately(messages) == 17

    # Without names
    without_names = count_tokens_approximately(messages, count_name=False)
    assert without_names == 14


def test_count_tokens_approximately_openai_format() -> None:
    # same as test_count_tokens_approximately_with_names, but in OpenAI format
    messages = [
        {"role": "user", "content": "Hello", "name": "user"},
        {"role": "assistant", "content": "Hi there", "name": "assistant"},
    ]
    # With names included (default)
    assert count_tokens_approximately(messages) == 17

    # Without names
    without_names = count_tokens_approximately(messages, count_name=False)
    assert without_names == 14


def test_count_tokens_approximately_string_content() -> None:
    messages = [
        # 5 chars + 4 role chars -> 3 + 3 = 6 tokens
        HumanMessage(content="Hello"),
        # 8 chars + 9 role chars -> 5 + 3 = 8 tokens
        AIMessage(content="Hi there"),
        # 12 chars + 4 role chars -> 4 + 3 = 7 tokens
        HumanMessage(content="How are you?"),
    ]
    assert count_tokens_approximately(messages) == 21


def test_count_tokens_approximately_list_content() -> None:
    messages = [
        # '[{"foo": "bar"}]' -> 16 chars + 4 role chars -> 5 + 3 = 8 tokens
        HumanMessage(content=[{"foo": "bar"}]),
        # '[{"test": 123}]' -> 15 chars + 9 role chars -> 6 + 3 = 9 tokens
        AIMessage(content=[{"test": 123}]),
    ]
    assert count_tokens_approximately(messages) == 17


def test_count_tokens_approximately_tool_calls() -> None:
    tool_calls = [{"name": "test_tool", "args": {"foo": "bar"}, "id": "1"}]
    messages = [
        # tool calls json -> 79 chars + 9 role chars -> 22 + 3 = 25 tokens
        AIMessage(content="", tool_calls=tool_calls),
        # 15 chars + 4 role chars -> 5 + 3 = 8 tokens
        HumanMessage(content="Regular message"),
    ]
    assert count_tokens_approximately(messages) == 33
    # AI message w/ both content and tool calls
    # 94 chars + 9 role chars -> 26 + 3 = 29 tokens
    messages = [
        AIMessage(content="Regular message", tool_calls=tool_calls),
    ]
    assert count_tokens_approximately(messages) == 29


def test_count_tokens_approximately_custom_token_length() -> None:
    messages = [
        # 11 chars + 4 role chars -> (4 tokens of length 4 / 8 tokens of length 2) + 3
        HumanMessage(content="Hello world"),
        # 7 chars + 9 role chars -> (4 tokens of length 4 / 8 tokens of length 2) + 3
        AIMessage(content="Testing"),
    ]
    assert count_tokens_approximately(messages, chars_per_token=4) == 14
    assert count_tokens_approximately(messages, chars_per_token=2) == 22


def test_count_tokens_approximately_large_message_content() -> None:
    # Test with large content to ensure no issues
    large_text = "x" * 10000
    messages = [HumanMessage(content=large_text)]
    # 10,000 chars + 4 role chars -> 2501 + 3 = 2504 tokens
    assert count_tokens_approximately(messages) == 2504


def test_count_tokens_approximately_large_number_of_messages() -> None:
    # Test with a large number of messages to ensure no issues
    messages = [HumanMessage(content="x")] * 1_000
    # 1 chars + 4 role chars -> 2 + 3 = 5 tokens
    assert count_tokens_approximately(messages) == 5_000


def test_count_tokens_approximately_mixed_content_types() -> None:
    # Test with a variety of content types in the same message list
    tool_calls = [{"name": "test_tool", "args": {"foo": "bar"}, "id": "1"}]
    messages = [
        # 13 chars + 6 role chars -> 5 + 3 = 8 tokens
        SystemMessage(content="System prompt"),
        # '[{"foo": "bar"}]' -> 16 chars + 4 role chars -> 5 + 3 = 8 tokens
        HumanMessage(content=[{"foo": "bar"}]),
        # tool calls json -> 79 chars + 9 role chars -> 22 + 3 = 25 tokens
        AIMessage(content="", tool_calls=tool_calls),
        # 13 chars + 4 role chars + 9 name chars + 1 tool call ID char ->
        # 7 + 3 = 10 tokens
        ToolMessage(content="Tool response", name="test_tool", tool_call_id="1"),
    ]
    token_count = count_tokens_approximately(messages)
    assert token_count == 51

    # Ensure that count is consistent if we do one message at a time
    assert sum(count_tokens_approximately([m]) for m in messages) == token_count