multiple: multi-modal content blocks (#30746)

Introduces a standard content block format for images, audio, and files.

## Examples

Image from url:
```
{
    "type": "image",
    "source_type": "url",
    "url": "https://path.to.image.png",
}
```


Image, in-line data:
```
{
    "type": "image",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "image/png",
}
```


PDF, in-line data:
```
{
    "type": "file",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "application/pdf",
}
```


File from ID:
```
{
    "type": "file",
    "source_type": "id",
    "id": "file-abc123",
}
```


Plain-text file:
```
{
    "type": "file",
    "source_type": "text",
    "text": "foo bar",
}
```
This commit is contained in:
ccurme
2025-04-15 09:48:06 -04:00
committed by GitHub
parent 09438857e8
commit 9cfe6bcacd
15 changed files with 854 additions and 25 deletions

View File

@@ -61,6 +61,8 @@ from langchain_core.messages import (
ToolCall,
ToolMessage,
ToolMessageChunk,
convert_to_openai_image_block,
is_data_content_block,
)
from langchain_core.messages.ai import (
InputTokenDetails,
@@ -184,6 +186,32 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
return ChatMessage(content=_dict.get("content", ""), role=role, id=id_) # type: ignore[arg-type]
def _format_data_content_block(block: dict) -> dict:
"""Format standard data content block to format expected by OpenAI."""
if block["type"] == "image":
formatted_block = convert_to_openai_image_block(block)
elif block["type"] == "file":
if block["source_type"] == "base64":
file = {"file_data": f"data:{block['mime_type']};base64,{block['data']}"}
if (metadata := block.get("metadata")) and ("filename" in metadata):
file["filename"] = metadata["filename"]
else:
warnings.warn(
"OpenAI may require a filename for file inputs. Specify a filename "
"in the metadata: {'type': 'file', 'source_type': 'base64', "
"'mime_type': 'application/pdf', 'data': '...', "
"'metadata': {'filename': 'my-pdf'}}"
)
formatted_block = {"type": "file", "file": file}
elif block["source_type"] == "id":
formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
else:
raise ValueError(f"Block of type {block['type']} is not supported.")
return formatted_block
def _format_message_content(content: Any) -> Any:
"""Format message content."""
if content and isinstance(content, list):
@@ -196,6 +224,8 @@ def _format_message_content(content: Any) -> Any:
and block["type"] in ("tool_use", "thinking")
):
continue
elif isinstance(block, dict) and is_data_content_block(block):
formatted_content.append(_format_data_content_block(block))
# Anthropic image blocks
elif (
isinstance(block, dict)
@@ -3122,6 +3152,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
if block["image_url"].get("detail"):
new_block["detail"] = block["image_url"]["detail"]
new_blocks.append(new_block)
elif block["type"] == "file":
new_block = {"type": "input_file", **block["file"]}
new_blocks.append(new_block)
elif block["type"] in ("input_text", "input_image", "input_file"):
new_blocks.append(block)
else:

View File

@@ -30,6 +30,10 @@ class TestAzureOpenAIStandard(ChatModelIntegrationTests):
def supports_image_inputs(self) -> bool:
return True
@property
def supports_image_urls(self) -> bool:
    """Report image inputs passed by URL as supported (always ``True``)."""
    return True
@property
def supports_json_mode(self) -> bool:
    """Report JSON mode as supported (always ``True``)."""
    return True

View File

@@ -1,10 +1,12 @@
"""Standard LangChain interface tests"""
import base64
from pathlib import Path
from typing import Literal, cast
import httpx
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage
from langchain_core.messages import AIMessage, HumanMessage
from langchain_tests.integration_tests import ChatModelIntegrationTests
from langchain_openai import ChatOpenAI
@@ -25,6 +27,10 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
def supports_image_inputs(self) -> bool:
return True
@property
def supports_image_urls(self) -> bool:
    """Report image inputs passed by URL as supported (always ``True``)."""
    return True
@property
def supports_json_mode(self) -> bool:
    """Report JSON mode as supported (always ``True``)."""
    return True
@@ -71,6 +77,31 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
)
return _invoke(llm, input_, stream)
@property
def supports_pdf_inputs(self) -> bool:
    """Report PDF inputs as unsupported (always ``False``).

    OpenAI requires a filename for PDF inputs, so for now PDF handling is
    exercised with a filename in the OpenAI-specific tests instead.
    """
    return False
def test_openai_pdf_inputs(self, model: BaseChatModel) -> None:
    """Invoke the model with a base64-encoded PDF that carries a filename.

    The PDF is downloaded over HTTP, base64-encoded, and sent as a standard
    ``file`` content block next to a text prompt. The test only requires
    the invocation to complete; the response is not inspected.
    """
    pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    response = httpx.get(pdf_url)
    encoded_pdf = base64.b64encode(response.content).decode("utf-8")

    prompt_block = {"type": "text", "text": "Summarize this document:"}
    pdf_block = {
        "type": "file",
        "source_type": "base64",
        "mime_type": "application/pdf",
        "data": encoded_pdf,
        # OpenAI requires a filename
        "metadata": {"filename": "my-pdf"},
    }

    model.invoke([HumanMessage([prompt_block, pdf_block])])
def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
if stream:

View File

@@ -649,6 +649,51 @@ def test_format_message_content() -> None:
]
assert [{"type": "text", "text": "hello"}] == _format_message_content(content)
# Standard multi-modal inputs
content = [{"type": "image", "source_type": "url", "url": "https://..."}]
expected = [{"type": "image_url", "image_url": {"url": "https://..."}}]
assert expected == _format_message_content(content)
content = [
{
"type": "image",
"source_type": "base64",
"data": "<base64 data>",
"mime_type": "image/png",
}
]
expected = [
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,<base64 data>"},
}
]
assert expected == _format_message_content(content)
content = [
{
"type": "file",
"source_type": "base64",
"data": "<base64 data>",
"mime_type": "application/pdf",
"metadata": {"filename": "my_file"},
}
]
expected = [
{
"type": "file",
"file": {
"filename": "my_file",
"file_data": "data:application/pdf;base64,<base64 data>",
},
}
]
assert expected == _format_message_content(content)
content = [{"type": "file", "source_type": "id", "id": "file-abc123"}]
expected = [{"type": "file", "file": {"file_id": "file-abc123"}}]
assert expected == _format_message_content(content)
class GenerateUsername(BaseModel):
"Get a username based on someone's name and hair color."