revert(anthropic): streaming token counting to defer input tokens until completion (#32587)

Reverts langchain-ai/langchain#32518
This commit is contained in:
Mason Daugherty
2025-08-18 09:48:33 -04:00
committed by GitHub
parent b8cdbc4eca
commit fd891ee3d4
2 changed files with 17 additions and 310 deletions

View File

@@ -70,20 +70,6 @@ class AnthropicTool(TypedDict):
cache_control: NotRequired[dict[str, str]]
class _CombinedUsage(BaseModel):
    """Combined usage model for deferred token counting in streaming.

    This mimics the Anthropic Usage structure while combining stored input usage
    with final output usage for accurate token reporting during streaming.

    An instance is assembled while handling a ``message_delta`` event, from the
    usage captured at ``message_start`` plus the usage carried by the delta
    event, and is then handed to ``_create_usage_metadata``.
    """

    # Input token count: the delta event's value when it is truthy, otherwise
    # the value read from the stored ``message_start`` usage (default 0).
    input_tokens: int = 0
    # Output token count, taken from the ``message_delta`` event's usage.
    output_tokens: int = 0
    # Cache-related fields are copied from the stored ``message_start`` usage
    # via ``getattr`` and stay ``None`` when that object lacks the attribute.
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    cache_creation: Optional[dict[str, Any]] = None
def _is_builtin_tool(tool: Any) -> bool:
if not isinstance(tool, dict):
return False
@@ -1522,18 +1508,12 @@ class ChatAnthropic(BaseChatModel):
and not _thinking_in_params(payload)
)
block_start_event = None
stored_input_usage = None
for event in stream:
(
msg,
block_start_event,
stored_input_usage,
) = _make_message_chunk_from_anthropic_event(
msg, block_start_event = _make_message_chunk_from_anthropic_event(
event,
stream_usage=stream_usage,
coerce_content_to_string=coerce_content_to_string,
block_start_event=block_start_event,
stored_input_usage=stored_input_usage,
)
if msg is not None:
chunk = ChatGenerationChunk(message=msg)
@@ -1564,18 +1544,12 @@ class ChatAnthropic(BaseChatModel):
and not _thinking_in_params(payload)
)
block_start_event = None
stored_input_usage = None
async for event in stream:
(
msg,
block_start_event,
stored_input_usage,
) = _make_message_chunk_from_anthropic_event(
msg, block_start_event = _make_message_chunk_from_anthropic_event(
event,
stream_usage=stream_usage,
coerce_content_to_string=coerce_content_to_string,
block_start_event=block_start_event,
stored_input_usage=stored_input_usage,
)
if msg is not None:
chunk = ChatGenerationChunk(message=msg)
@@ -2208,40 +2182,22 @@ def _make_message_chunk_from_anthropic_event(
stream_usage: bool = True,
coerce_content_to_string: bool,
block_start_event: Optional[anthropic.types.RawMessageStreamEvent] = None,
stored_input_usage: Optional[BaseModel] = None,
) -> tuple[
Optional[AIMessageChunk],
Optional[anthropic.types.RawMessageStreamEvent],
Optional[BaseModel],
]:
"""Convert Anthropic event to ``AIMessageChunk``.
) -> tuple[Optional[AIMessageChunk], Optional[anthropic.types.RawMessageStreamEvent]]:
"""Convert Anthropic event to AIMessageChunk.
Note that not all events will result in a message chunk. In these cases
we return ``None``.
Args:
event: The Anthropic streaming event to convert.
stream_usage: Whether to include usage metadata in the chunk.
coerce_content_to_string: Whether to coerce content blocks to strings.
block_start_event: Previous content block start event for context.
stored_input_usage: Usage metadata from ``message_start`` event to be used
in ``message_delta`` event for accurate input token counts.
Returns:
Tuple of ``(message_chunk, block_start_event, stored_usage)``
"""
message_chunk: Optional[AIMessageChunk] = None
updated_stored_usage = stored_input_usage
# See https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/lib/streaming/_messages.py # noqa: E501
if event.type == "message_start" and stream_usage:
# Store input usage for later use in message_delta but don't emit tokens yet
updated_stored_usage = event.message.usage
usage_metadata = UsageMetadata(
input_tokens=0,
output_tokens=0,
total_tokens=0,
usage_metadata = _create_usage_metadata(event.message.usage)
# We pick up a cumulative count of output_tokens at the end of the stream,
# so here we zero out to avoid double counting.
usage_metadata["total_tokens"] = (
usage_metadata["total_tokens"] - usage_metadata["output_tokens"]
)
usage_metadata["output_tokens"] = 0
if hasattr(event.message, "model"):
response_metadata = {"model_name": event.message.model}
else:
@@ -2329,37 +2285,11 @@ def _make_message_chunk_from_anthropic_event(
tool_call_chunks=tool_call_chunks,
)
elif event.type == "message_delta" and stream_usage:
# Create usage metadata combining stored input usage with final output usage
#
# Per Anthropic docs: "The token counts shown in the usage field of the
# message_delta event are cumulative." Thus, when MCP tools are called
# mid-stream, `input_tokens` may be updated with a higher cumulative count.
# We prioritize `event.usage.input_tokens` when available to handle this case.
if stored_input_usage is not None:
# Create a combined usage object that mimics the Anthropic Usage structure
combined_usage = _CombinedUsage(
input_tokens=event.usage.input_tokens
or getattr(stored_input_usage, "input_tokens", 0),
output_tokens=event.usage.output_tokens,
cache_creation_input_tokens=getattr(
stored_input_usage, "cache_creation_input_tokens", None
),
cache_read_input_tokens=getattr(
stored_input_usage, "cache_read_input_tokens", None
),
cache_creation=getattr(stored_input_usage, "cache_creation", None)
if hasattr(stored_input_usage, "cache_creation")
else None,
)
usage_metadata = _create_usage_metadata(combined_usage)
else:
# Fallback to just output tokens if no stored usage
usage_metadata = UsageMetadata(
input_tokens=event.usage.input_tokens or 0,
output_tokens=event.usage.output_tokens,
total_tokens=(event.usage.input_tokens or 0)
+ event.usage.output_tokens,
)
usage_metadata = UsageMetadata(
input_tokens=0,
output_tokens=event.usage.output_tokens,
total_tokens=event.usage.output_tokens,
)
message_chunk = AIMessageChunk(
content="",
usage_metadata=usage_metadata,
@@ -2371,7 +2301,7 @@ def _make_message_chunk_from_anthropic_event(
else:
pass
return message_chunk, block_start_event, updated_stored_usage
return message_chunk, block_start_event
@deprecated(since="0.1.0", removal="1.0.0", alternative="ChatAnthropic")