multiple: multi-modal content blocks (#30746)

Introduces a standard content block format for images, audio, and files.

## Examples

Image from url:
```
{
    "type": "image",
    "source_type": "url",
    "url": "https://path.to.image.png",
}
```


Image, in-line data:
```
{
    "type": "image",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "image/png",
}
```


PDF, in-line data:
```
{
    "type": "file",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "application/pdf",
}
```


File from ID:
```
{
    "type": "file",
    "source_type": "id",
    "id": "file-abc123",
}
```


Plain-text file:
```
{
    "type": "file",
    "source_type": "text",
    "text": "foo bar",
}
```
This commit is contained in:
ccurme
2025-04-15 09:48:06 -04:00
committed by GitHub
parent 09438857e8
commit 9cfe6bcacd
15 changed files with 854 additions and 25 deletions

View File

@@ -35,6 +35,7 @@ from langchain_core.messages import (
SystemMessage,
ToolCall,
ToolMessage,
is_data_content_block,
)
from langchain_core.messages.ai import InputTokenDetails, UsageMetadata
from langchain_core.messages.tool import tool_call_chunk as create_tool_call_chunk
@@ -177,8 +178,78 @@ def _merge_messages(
return merged
def _format_data_content_block(block: dict) -> dict:
"""Format standard data content block to format expected by Anthropic."""
if block["type"] == "image":
if block["source_type"] == "url":
if block["url"].startswith("data:"):
# Data URI
formatted_block = {
"type": "image",
"source": _format_image(block["url"]),
}
else:
formatted_block = {
"type": "image",
"source": {"type": "url", "url": block["url"]},
}
elif block["source_type"] == "base64":
formatted_block = {
"type": "image",
"source": {
"type": "base64",
"media_type": block["mime_type"],
"data": block["data"],
},
}
else:
raise ValueError(
"Anthropic only supports 'url' and 'base64' source_type for image "
"content blocks."
)
elif block["type"] == "file":
if block["source_type"] == "url":
formatted_block = {
"type": "document",
"source": {
"type": "url",
"url": block["url"],
},
}
elif block["source_type"] == "base64":
formatted_block = {
"type": "document",
"source": {
"type": "base64",
"media_type": block.get("mime_type") or "application/pdf",
"data": block["data"],
},
}
elif block["source_type"] == "text":
formatted_block = {
"type": "document",
"source": {
"type": "text",
"media_type": block.get("mime_type") or "text/plain",
"data": block["text"],
},
}
else:
raise ValueError(f"Block of type {block['type']} is not supported.")
if formatted_block and (metadata := block.get("metadata")):
if "cache_control" in metadata:
formatted_block["cache_control"] = metadata["cache_control"]
if "citations" in metadata:
formatted_block["citations"] = metadata["citations"]
return formatted_block
def _format_messages(
messages: list[BaseMessage],
messages: Sequence[BaseMessage],
) -> tuple[Union[str, list[dict], None], list[dict]]:
"""Format messages for anthropic."""
@@ -233,6 +304,8 @@ def _format_messages(
# convert format
source = _format_image(block["image_url"]["url"])
content.append({"type": "image", "source": source})
elif is_data_content_block(block):
content.append(_format_data_content_block(block))
elif block["type"] == "tool_use":
# If a tool_call with the same id as a tool_use content block
# exists, the tool_call is preferred.

View File

@@ -25,6 +25,14 @@ class TestAnthropicStandard(ChatModelIntegrationTests):
def supports_image_inputs(self) -> bool:
return True
@property
def supports_image_urls(self) -> bool:
    """Capability flag for image inputs given by URL; always ``True`` here."""
    return True
@property
def supports_pdf_inputs(self) -> bool:
    """Capability flag for PDF inputs; always ``True`` here."""
    return True
@property
def supports_image_tool_message(self) -> bool:
    """Capability flag for images inside tool messages; always ``True`` here."""
    return True

View File

@@ -690,6 +690,85 @@ def test__format_messages_with_cache_control() -> None:
assert expected_system == actual_system
assert expected_messages == actual_messages
# Test standard multi-modal format
messages = [
HumanMessage(
[
{
"type": "text",
"text": "Summarize this document:",
},
{
"type": "file",
"source_type": "base64",
"mime_type": "application/pdf",
"data": "<base64 data>",
"metadata": {"cache_control": {"type": "ephemeral"}},
},
]
)
]
actual_system, actual_messages = _format_messages(messages)
assert actual_system is None
expected_messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize this document:",
},
{
"type": "document",
"source": {
"type": "base64",
"media_type": "application/pdf",
"data": "<base64 data>",
},
"cache_control": {"type": "ephemeral"},
},
],
}
]
assert actual_messages == expected_messages
def test__format_messages_with_citations() -> None:
    """A text file block with citations metadata becomes a cited document block."""
    document_text = "The grass is green. The sky is blue."
    question = "What color is the grass and sky?"

    # Input: standard plain-text file block followed by the user's question.
    file_block = {
        "type": "file",
        "source_type": "text",
        "text": document_text,
        "mime_type": "text/plain",
        "metadata": {"citations": {"enabled": True}},
    }
    input_messages = [
        HumanMessage(content=[file_block, {"type": "text", "text": question}])
    ]

    # Expected: the file block is rewritten as a document block and the
    # citations entry is lifted out of ``metadata``.
    expected_document = {
        "type": "document",
        "source": {
            "type": "text",
            "media_type": "text/plain",
            "data": document_text,
        },
        "citations": {"enabled": True},
    }
    expected_messages = [
        {
            "role": "user",
            "content": [expected_document, {"type": "text", "text": question}],
        }
    ]

    actual_system, actual_messages = _format_messages(input_messages)

    assert actual_system is None
    assert actual_messages == expected_messages
def test__format_messages_with_multiple_system() -> None:
messages = [