mirror of
https://github.com/hwchase17/langchain.git
synced 2026-06-09 18:50:33 +00:00
multiple: multi-modal content blocks (#30746)
Introduces a standard content block format for images, audio, and files.
## Examples
Image from URL:
```
{
"type": "image",
"source_type": "url",
"url": "https://path.to.image.png",
}
```
Image, in-line data:
```
{
"type": "image",
"source_type": "base64",
"data": "<base64 string>",
"mime_type": "image/png",
}
```
PDF, in-line data:
```
{
"type": "file",
"source_type": "base64",
"data": "<base64 string>",
"mime_type": "application/pdf",
}
```
File from ID:
```
{
"type": "file",
"source_type": "id",
"id": "file-abc123",
}
```
Plain-text file:
```
{
"type": "file",
"source_type": "text",
"text": "foo bar",
}
```
This commit is contained in:
@@ -35,6 +35,7 @@ from langchain_core.messages import (
|
||||
SystemMessage,
|
||||
ToolCall,
|
||||
ToolMessage,
|
||||
is_data_content_block,
|
||||
)
|
||||
from langchain_core.messages.ai import InputTokenDetails, UsageMetadata
|
||||
from langchain_core.messages.tool import tool_call_chunk as create_tool_call_chunk
|
||||
@@ -177,8 +178,78 @@ def _merge_messages(
|
||||
return merged
|
||||
|
||||
|
||||
def _format_data_content_block(block: dict) -> dict:
|
||||
"""Format standard data content block to format expected by Anthropic."""
|
||||
if block["type"] == "image":
|
||||
if block["source_type"] == "url":
|
||||
if block["url"].startswith("data:"):
|
||||
# Data URI
|
||||
formatted_block = {
|
||||
"type": "image",
|
||||
"source": _format_image(block["url"]),
|
||||
}
|
||||
else:
|
||||
formatted_block = {
|
||||
"type": "image",
|
||||
"source": {"type": "url", "url": block["url"]},
|
||||
}
|
||||
elif block["source_type"] == "base64":
|
||||
formatted_block = {
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": block["mime_type"],
|
||||
"data": block["data"],
|
||||
},
|
||||
}
|
||||
else:
|
||||
raise ValueError(
|
||||
"Anthropic only supports 'url' and 'base64' source_type for image "
|
||||
"content blocks."
|
||||
)
|
||||
|
||||
elif block["type"] == "file":
|
||||
if block["source_type"] == "url":
|
||||
formatted_block = {
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "url",
|
||||
"url": block["url"],
|
||||
},
|
||||
}
|
||||
elif block["source_type"] == "base64":
|
||||
formatted_block = {
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": block.get("mime_type") or "application/pdf",
|
||||
"data": block["data"],
|
||||
},
|
||||
}
|
||||
elif block["source_type"] == "text":
|
||||
formatted_block = {
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "text",
|
||||
"media_type": block.get("mime_type") or "text/plain",
|
||||
"data": block["text"],
|
||||
},
|
||||
}
|
||||
|
||||
else:
|
||||
raise ValueError(f"Block of type {block['type']} is not supported.")
|
||||
|
||||
if formatted_block and (metadata := block.get("metadata")):
|
||||
if "cache_control" in metadata:
|
||||
formatted_block["cache_control"] = metadata["cache_control"]
|
||||
if "citations" in metadata:
|
||||
formatted_block["citations"] = metadata["citations"]
|
||||
|
||||
return formatted_block
|
||||
|
||||
|
||||
def _format_messages(
|
||||
messages: list[BaseMessage],
|
||||
messages: Sequence[BaseMessage],
|
||||
) -> tuple[Union[str, list[dict], None], list[dict]]:
|
||||
"""Format messages for anthropic."""
|
||||
|
||||
@@ -233,6 +304,8 @@ def _format_messages(
|
||||
# convert format
|
||||
source = _format_image(block["image_url"]["url"])
|
||||
content.append({"type": "image", "source": source})
|
||||
elif is_data_content_block(block):
|
||||
content.append(_format_data_content_block(block))
|
||||
elif block["type"] == "tool_use":
|
||||
# If a tool_call with the same id as a tool_use content block
|
||||
# exists, the tool_call is preferred.
|
||||
|
||||
@@ -25,6 +25,14 @@ class TestAnthropicStandard(ChatModelIntegrationTests):
|
||||
def supports_image_inputs(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
def supports_image_urls(self) -> bool:
    """Report image-URL inputs as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its image-URL test; confirm there.
    """
    return True
|
||||
|
||||
@property
def supports_pdf_inputs(self) -> bool:
    """Report PDF inputs as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its PDF-input test; confirm there.
    """
    return True
|
||||
|
||||
@property
def supports_image_tool_message(self) -> bool:
    """Report image content in tool messages as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite; confirm the exact test it gates.
    """
    return True
|
||||
|
||||
@@ -690,6 +690,85 @@ def test__format_messages_with_cache_control() -> None:
|
||||
assert expected_system == actual_system
|
||||
assert expected_messages == actual_messages
|
||||
|
||||
# Test standard multi-modal format
|
||||
messages = [
|
||||
HumanMessage(
|
||||
[
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Summarize this document:",
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"source_type": "base64",
|
||||
"mime_type": "application/pdf",
|
||||
"data": "<base64 data>",
|
||||
"metadata": {"cache_control": {"type": "ephemeral"}},
|
||||
},
|
||||
]
|
||||
)
|
||||
]
|
||||
actual_system, actual_messages = _format_messages(messages)
|
||||
assert actual_system is None
|
||||
expected_messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Summarize this document:",
|
||||
},
|
||||
{
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": "application/pdf",
|
||||
"data": "<base64 data>",
|
||||
},
|
||||
"cache_control": {"type": "ephemeral"},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
assert actual_messages == expected_messages
|
||||
|
||||
|
||||
def test__format_messages_with_citations() -> None:
    """Check that ``citations`` metadata on a plain-text file block is kept.

    A standard ``"file"`` block with ``source_type="text"`` should come out of
    ``_format_messages`` as a ``"document"`` block whose ``citations`` entry
    matches the one supplied in the block's ``metadata``.
    """
    document_text = "The grass is green. The sky is blue."
    question = "What color is the grass and sky?"

    file_block = {
        "type": "file",
        "source_type": "text",
        "text": document_text,
        "mime_type": "text/plain",
        "metadata": {"citations": {"enabled": True}},
    }
    input_messages = [
        HumanMessage(content=[file_block, {"type": "text", "text": question}])
    ]

    expected_document = {
        "type": "document",
        "source": {
            "type": "text",
            "media_type": "text/plain",
            "data": document_text,
        },
        "citations": {"enabled": True},
    }
    expected_messages = [
        {
            "role": "user",
            "content": [expected_document, {"type": "text", "text": question}],
        }
    ]

    actual_system, actual_messages = _format_messages(input_messages)

    # No system message was supplied, so none should be extracted.
    assert actual_system is None
    assert actual_messages == expected_messages
|
||||
|
||||
|
||||
def test__format_messages_with_multiple_system() -> None:
|
||||
messages = [
|
||||
|
||||
@@ -61,6 +61,8 @@ from langchain_core.messages import (
|
||||
ToolCall,
|
||||
ToolMessage,
|
||||
ToolMessageChunk,
|
||||
convert_to_openai_image_block,
|
||||
is_data_content_block,
|
||||
)
|
||||
from langchain_core.messages.ai import (
|
||||
InputTokenDetails,
|
||||
@@ -184,6 +186,32 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
|
||||
return ChatMessage(content=_dict.get("content", ""), role=role, id=id_) # type: ignore[arg-type]
|
||||
|
||||
|
||||
def _format_data_content_block(block: dict) -> dict:
|
||||
"""Format standard data content block to format expected by OpenAI."""
|
||||
if block["type"] == "image":
|
||||
formatted_block = convert_to_openai_image_block(block)
|
||||
|
||||
elif block["type"] == "file":
|
||||
if block["source_type"] == "base64":
|
||||
file = {"file_data": f"data:{block['mime_type']};base64,{block['data']}"}
|
||||
if (metadata := block.get("metadata")) and ("filename" in metadata):
|
||||
file["filename"] = metadata["filename"]
|
||||
else:
|
||||
warnings.warn(
|
||||
"OpenAI may require a filename for file inputs. Specify a filename "
|
||||
"in the metadata: {'type': 'file', 'source_type': 'base64', "
|
||||
"'mime_type': 'application/pdf', 'data': '...', "
|
||||
"'metadata': {'filename': 'my-pdf'}}"
|
||||
)
|
||||
formatted_block = {"type": "file", "file": file}
|
||||
elif block["source_type"] == "id":
|
||||
formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
|
||||
else:
|
||||
raise ValueError(f"Block of type {block['type']} is not supported.")
|
||||
|
||||
return formatted_block
|
||||
|
||||
|
||||
def _format_message_content(content: Any) -> Any:
|
||||
"""Format message content."""
|
||||
if content and isinstance(content, list):
|
||||
@@ -196,6 +224,8 @@ def _format_message_content(content: Any) -> Any:
|
||||
and block["type"] in ("tool_use", "thinking")
|
||||
):
|
||||
continue
|
||||
elif isinstance(block, dict) and is_data_content_block(block):
|
||||
formatted_content.append(_format_data_content_block(block))
|
||||
# Anthropic image blocks
|
||||
elif (
|
||||
isinstance(block, dict)
|
||||
@@ -3122,6 +3152,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
|
||||
if block["image_url"].get("detail"):
|
||||
new_block["detail"] = block["image_url"]["detail"]
|
||||
new_blocks.append(new_block)
|
||||
elif block["type"] == "file":
|
||||
new_block = {"type": "input_file", **block["file"]}
|
||||
new_blocks.append(new_block)
|
||||
elif block["type"] in ("input_text", "input_image", "input_file"):
|
||||
new_blocks.append(block)
|
||||
else:
|
||||
|
||||
@@ -30,6 +30,10 @@ class TestAzureOpenAIStandard(ChatModelIntegrationTests):
|
||||
def supports_image_inputs(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
def supports_image_urls(self) -> bool:
    """Report image-URL inputs as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its image-URL test; confirm there.
    """
    return True
|
||||
|
||||
@property
def supports_json_mode(self) -> bool:
    """Report JSON mode as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its JSON-mode test; confirm there.
    """
    return True
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
"""Standard LangChain interface tests"""
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import Literal, cast
|
||||
|
||||
import httpx
|
||||
from langchain_core.language_models import BaseChatModel
|
||||
from langchain_core.messages import AIMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
from langchain_tests.integration_tests import ChatModelIntegrationTests
|
||||
|
||||
from langchain_openai import ChatOpenAI
|
||||
@@ -25,6 +27,10 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
|
||||
def supports_image_inputs(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
def supports_image_urls(self) -> bool:
    """Report image-URL inputs as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its image-URL test; confirm there.
    """
    return True
|
||||
|
||||
@property
def supports_json_mode(self) -> bool:
    """Report JSON mode as supported for this test class.

    NOTE(review): presumably read by the shared ``ChatModelIntegrationTests``
    suite to decide whether to run its JSON-mode test; confirm there.
    """
    return True
|
||||
@@ -71,6 +77,31 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
|
||||
)
|
||||
return _invoke(llm, input_, stream)
|
||||
|
||||
@property
def supports_pdf_inputs(self) -> bool:
    """Report PDF inputs as unsupported for the shared standard tests.

    OpenAI requires a filename for PDF inputs, so for now PDF handling is
    exercised with a filename in the OpenAI-specific tests instead.
    """
    return False
|
||||
|
||||
def test_openai_pdf_inputs(self, model: BaseChatModel) -> None:
    """Test that the model can process PDF inputs.

    Downloads a small sample PDF, base64-encodes it, and sends it to the
    model as a standard ``"file"`` content block alongside a text prompt.
    The test passes if the invocation completes without raising.

    Args:
        model: Chat model under test (presumably supplied by the standard
            test suite's fixture; confirm against the base class).
    """
    url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    response = httpx.get(url)
    pdf_data = base64.b64encode(response.content).decode("utf-8")

    prompt_block = {"type": "text", "text": "Summarize this document:"}
    pdf_block = {
        "type": "file",
        "source_type": "base64",
        "mime_type": "application/pdf",
        "data": pdf_data,
        "metadata": {"filename": "my-pdf"},  # OpenAI requires a filename
    }

    message = HumanMessage([prompt_block, pdf_block])
    # Only successful completion matters; the response itself is not inspected.
    model.invoke([message])
|
||||
|
||||
|
||||
def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
|
||||
if stream:
|
||||
|
||||
@@ -649,6 +649,51 @@ def test_format_message_content() -> None:
|
||||
]
|
||||
assert [{"type": "text", "text": "hello"}] == _format_message_content(content)
|
||||
|
||||
# Standard multi-modal inputs
|
||||
content = [{"type": "image", "source_type": "url", "url": "https://..."}]
|
||||
expected = [{"type": "image_url", "image_url": {"url": "https://..."}}]
|
||||
assert expected == _format_message_content(content)
|
||||
|
||||
content = [
|
||||
{
|
||||
"type": "image",
|
||||
"source_type": "base64",
|
||||
"data": "<base64 data>",
|
||||
"mime_type": "image/png",
|
||||
}
|
||||
]
|
||||
expected = [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": "data:image/png;base64,<base64 data>"},
|
||||
}
|
||||
]
|
||||
assert expected == _format_message_content(content)
|
||||
|
||||
content = [
|
||||
{
|
||||
"type": "file",
|
||||
"source_type": "base64",
|
||||
"data": "<base64 data>",
|
||||
"mime_type": "application/pdf",
|
||||
"metadata": {"filename": "my_file"},
|
||||
}
|
||||
]
|
||||
expected = [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "my_file",
|
||||
"file_data": "data:application/pdf;base64,<base64 data>",
|
||||
},
|
||||
}
|
||||
]
|
||||
assert expected == _format_message_content(content)
|
||||
|
||||
content = [{"type": "file", "source_type": "id", "id": "file-abc123"}]
|
||||
expected = [{"type": "file", "file": {"file_id": "file-abc123"}}]
|
||||
assert expected == _format_message_content(content)
|
||||
|
||||
|
||||
class GenerateUsername(BaseModel):
|
||||
"Get a username based on someone's name and hair color."
|
||||
|
||||
Reference in New Issue
Block a user