multiple: multi-modal content blocks (#30746)

Introduces a standard content block format for images, audio, and files.

## Examples

Image from URL:
```
{
    "type": "image",
    "source_type": "url",
    "url": "https://path.to.image.png",
}
```

Image, in-line data:
```
{
    "type": "image",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "image/png",
}
```

PDF, in-line data:
```
{
    "type": "file",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "application/pdf",
}
```

File from ID:
```
{
    "type": "file",
    "source_type": "id",
    "id": "file-abc123",
}
```

Plain-text file:
```
{
    "type": "file",
    "source_type": "text",
    "text": "foo bar",
}
```
2026-06-09 18:50:33 +00:00 · 2025-04-15 09:48:06 -04:00
parent 09438857e8
commit 9cfe6bcacd
15 changed files with 854 additions and 25 deletions
--- a/libs/partners/anthropic/langchain_anthropic/chat_models.py
+++ b/libs/partners/anthropic/langchain_anthropic/chat_models.py
@@ -35,6 +35,7 @@ from langchain_core.messages import (
    SystemMessage,
    ToolCall,
    ToolMessage,
+    is_data_content_block,
 )
 from langchain_core.messages.ai import InputTokenDetails, UsageMetadata
 from langchain_core.messages.tool import tool_call_chunk as create_tool_call_chunk
@@ -177,8 +178,78 @@ def _merge_messages(
    return merged


+def _format_data_content_block(block: dict) -> dict:
+    """Format standard data content block to format expected by Anthropic.
+
+    Image blocks accept the ``url`` and ``base64`` source types. File blocks
+    accept ``url``, ``base64`` and ``text`` and are emitted as ``document``
+    blocks. ``cache_control`` and ``citations`` entries found in the block's
+    ``metadata`` are copied onto the formatted block.
+
+    Args:
+        block: Standard data content block (``type`` plus ``source_type``).
+
+    Returns:
+        The block in Anthropic's ``image`` / ``document`` content format.
+
+    Raises:
+        ValueError: If the block type, or the ``source_type`` for that type,
+            is not supported.
+    """
+    formatted_block: dict
+    if block["type"] == "image":
+        if block["source_type"] == "url":
+            if block["url"].startswith("data:"):
+                # Data URI: handled by this module's _format_image helper.
+                source = _format_image(block["url"])
+            else:
+                source = {"type": "url", "url": block["url"]}
+        elif block["source_type"] == "base64":
+            source = {
+                "type": "base64",
+                "media_type": block["mime_type"],
+                "data": block["data"],
+            }
+        else:
+            raise ValueError(
+                "Anthropic only supports 'url' and 'base64' source_type for image "
+                "content blocks."
+            )
+        formatted_block = {"type": "image", "source": source}
+
+    elif block["type"] == "file":
+        if block["source_type"] == "url":
+            source = {"type": "url", "url": block["url"]}
+        elif block["source_type"] == "base64":
+            source = {
+                "type": "base64",
+                # Fall back to PDF when the block carries no mime_type.
+                "media_type": block.get("mime_type") or "application/pdf",
+                "data": block["data"],
+            }
+        elif block["source_type"] == "text":
+            source = {
+                "type": "text",
+                # Fall back to plain text when the block carries no mime_type.
+                "media_type": block.get("mime_type") or "text/plain",
+                "data": block["text"],
+            }
+        else:
+            # Without this branch an unsupported source_type (e.g. "id") left
+            # ``formatted_block`` unbound and surfaced as UnboundLocalError.
+            raise ValueError(
+                "Anthropic only supports 'url', 'base64', and 'text' source_type "
+                "for file content blocks."
+            )
+        formatted_block = {"type": "document", "source": source}
+
+    else:
+        raise ValueError(f"Block of type {block['type']} is not supported.")
+
+    if metadata := block.get("metadata"):
+        if "cache_control" in metadata:
+            formatted_block["cache_control"] = metadata["cache_control"]
+        if "citations" in metadata:
+            formatted_block["citations"] = metadata["citations"]
+
+    return formatted_block
+
+
 def _format_messages(
-    messages: list[BaseMessage],
+    messages: Sequence[BaseMessage],
 ) -> tuple[Union[str, list[dict], None], list[dict]]:
    """Format messages for anthropic."""

@@ -233,6 +304,8 @@ def _format_messages(
                        # convert format
                        source = _format_image(block["image_url"]["url"])
                        content.append({"type": "image", "source": source})
+                    elif is_data_content_block(block):
+                        content.append(_format_data_content_block(block))
                    elif block["type"] == "tool_use":
                        # If a tool_call with the same id as a tool_use content block
                        # exists, the tool_call is preferred.
--- a/libs/partners/anthropic/tests/integration_tests/test_standard.py
+++ b/libs/partners/anthropic/tests/integration_tests/test_standard.py
@@ -25,6 +25,14 @@ class TestAnthropicStandard(ChatModelIntegrationTests):
    def supports_image_inputs(self) -> bool:
        return True

+    @property
+    def supports_image_urls(self) -> bool:
+        """Report image-by-URL input support as enabled for this test class."""
+        # NOTE(review): presumably read by the standard integration suite to
+        # decide whether to run its image-URL tests — confirm in langchain_tests.
+        return True
+
+    @property
+    def supports_pdf_inputs(self) -> bool:
+        """Report PDF input support as enabled for this test class."""
+        # NOTE(review): presumably read by the standard integration suite to
+        # decide whether to run its PDF-input tests — confirm in langchain_tests.
+        return True
+
    @property
    def supports_image_tool_message(self) -> bool:
        return True
--- a/libs/partners/anthropic/tests/unit_tests/test_chat_models.py
+++ b/libs/partners/anthropic/tests/unit_tests/test_chat_models.py
@@ -690,6 +690,85 @@ def test__format_messages_with_cache_control() -> None:
    assert expected_system == actual_system
    assert expected_messages == actual_messages

+    # Test standard multi-modal format
+    messages = [
+        HumanMessage(
+            [
+                {
+                    "type": "text",
+                    "text": "Summarize this document:",
+                },
+                {
+                    "type": "file",
+                    "source_type": "base64",
+                    "mime_type": "application/pdf",
+                    "data": "<base64 data>",
+                    "metadata": {"cache_control": {"type": "ephemeral"}},
+                },
+            ]
+        )
+    ]
+    actual_system, actual_messages = _format_messages(messages)
+    assert actual_system is None
+    expected_messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Summarize this document:",
+                },
+                {
+                    "type": "document",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "application/pdf",
+                        "data": "<base64 data>",
+                    },
+                    "cache_control": {"type": "ephemeral"},
+                },
+            ],
+        }
+    ]
+    assert actual_messages == expected_messages
+
+
+def test__format_messages_with_citations() -> None:
+    """Check formatting of a plain-text file block that carries citations.
+
+    The standard ``file`` block with ``source_type`` ``text`` should become an
+    Anthropic ``document`` block with a ``text`` source, and the ``citations``
+    entry from ``metadata`` should be lifted to the top level of that block.
+    """
+    input_messages = [
+        HumanMessage(
+            content=[
+                {
+                    "type": "file",
+                    "source_type": "text",
+                    "text": "The grass is green. The sky is blue.",
+                    "mime_type": "text/plain",
+                    "metadata": {"citations": {"enabled": True}},
+                },
+                {"type": "text", "text": "What color is the grass and sky?"},
+            ]
+        )
+    ]
+    expected_messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "document",
+                    "source": {
+                        "type": "text",
+                        "media_type": "text/plain",
+                        "data": "The grass is green. The sky is blue.",
+                    },
+                    "citations": {"enabled": True},
+                },
+                {"type": "text", "text": "What color is the grass and sky?"},
+            ],
+        }
+    ]
+    actual_system, actual_messages = _format_messages(input_messages)
+    # No SystemMessage was supplied, so no system prompt is expected back.
+    assert actual_system is None
+    assert actual_messages == expected_messages
+

 def test__format_messages_with_multiple_system() -> None:
    messages = [
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -61,6 +61,8 @@ from langchain_core.messages import (
    ToolCall,
    ToolMessage,
    ToolMessageChunk,
+    convert_to_openai_image_block,
+    is_data_content_block,
 )
 from langchain_core.messages.ai import (
    InputTokenDetails,
@@ -184,6 +186,32 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
        return ChatMessage(content=_dict.get("content", ""), role=role, id=id_)  # type: ignore[arg-type]


+def _format_data_content_block(block: dict) -> dict:
+    """Format standard data content block to format expected by OpenAI.
+
+    Image blocks are delegated to ``convert_to_openai_image_block``. File blocks
+    support the ``base64`` source type (sent inline as a data URI) and the
+    ``id`` source type (sent as a ``file_id`` reference).
+
+    Args:
+        block: Standard data content block (``type`` plus ``source_type``).
+
+    Returns:
+        The OpenAI-formatted content block.
+
+    Raises:
+        ValueError: If the block type, or the ``source_type`` of a file block,
+            is not supported.
+    """
+    if block["type"] == "image":
+        formatted_block = convert_to_openai_image_block(block)
+
+    elif block["type"] == "file":
+        if block["source_type"] == "base64":
+            file = {"file_data": f"data:{block['mime_type']};base64,{block['data']}"}
+            if (metadata := block.get("metadata")) and ("filename" in metadata):
+                file["filename"] = metadata["filename"]
+            else:
+                # Warn rather than fail: the request is still sent without a name.
+                warnings.warn(
+                    "OpenAI may require a filename for file inputs. Specify a filename "
+                    "in the metadata: {'type': 'file', 'source_type': 'base64', "
+                    "'mime_type': 'application/pdf', 'data': '...', "
+                    "'metadata': {'filename': 'my-pdf'}}"
+                )
+            formatted_block = {"type": "file", "file": file}
+        elif block["source_type"] == "id":
+            formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
+        else:
+            # Without this branch an unsupported source_type (e.g. "url" or
+            # "text") left ``formatted_block`` unbound and surfaced as
+            # UnboundLocalError at the return below.
+            raise ValueError(
+                "OpenAI only supports 'base64' and 'id' source_type for file "
+                "content blocks."
+            )
+    else:
+        raise ValueError(f"Block of type {block['type']} is not supported.")
+
+    return formatted_block
+
+
 def _format_message_content(content: Any) -> Any:
    """Format message content."""
    if content and isinstance(content, list):
@@ -196,6 +224,8 @@ def _format_message_content(content: Any) -> Any:
                and block["type"] in ("tool_use", "thinking")
            ):
                continue
+            elif isinstance(block, dict) and is_data_content_block(block):
+                formatted_content.append(_format_data_content_block(block))
            # Anthropic image blocks
            elif (
                isinstance(block, dict)
@@ -3122,6 +3152,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                        if block["image_url"].get("detail"):
                            new_block["detail"] = block["image_url"]["detail"]
                        new_blocks.append(new_block)
+                    elif block["type"] == "file":
+                        new_block = {"type": "input_file", **block["file"]}
+                        new_blocks.append(new_block)
                    elif block["type"] in ("input_text", "input_image", "input_file"):
                        new_blocks.append(block)
                    else:
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py
@@ -30,6 +30,10 @@ class TestAzureOpenAIStandard(ChatModelIntegrationTests):
    def supports_image_inputs(self) -> bool:
        return True

+    @property
+    def supports_image_urls(self) -> bool:
+        """Report image-by-URL input support as enabled for this test class."""
+        # NOTE(review): presumably read by the standard integration suite to
+        # decide whether to run its image-URL tests — confirm in langchain_tests.
+        return True
+
    @property
    def supports_json_mode(self) -> bool:
        return True
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
@@ -1,10 +1,12 @@
 """Standard LangChain interface tests"""

+import base64
 from pathlib import Path
 from typing import Literal, cast

+import httpx
 from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import AIMessage
+from langchain_core.messages import AIMessage, HumanMessage
 from langchain_tests.integration_tests import ChatModelIntegrationTests

 from langchain_openai import ChatOpenAI
@@ -25,6 +27,10 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
    def supports_image_inputs(self) -> bool:
        return True

+    @property
+    def supports_image_urls(self) -> bool:
+        """Report image-by-URL input support as enabled for this test class."""
+        # NOTE(review): presumably read by the standard integration suite to
+        # decide whether to run its image-URL tests — confirm in langchain_tests.
+        return True
+
    @property
    def supports_json_mode(self) -> bool:
        return True
@@ -71,6 +77,31 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
        )
        return _invoke(llm, input_, stream)

+    @property
+    def supports_pdf_inputs(self) -> bool:
+        """Report PDF input support as disabled for this test class."""
+        # OpenAI requires a filename for PDF inputs
+        # For now, we test with filename in OpenAI-specific tests
+        return False
+
+    def test_openai_pdf_inputs(self, model: BaseChatModel) -> None:
+        """Test that the model can process PDF inputs.
+
+        Downloads a sample PDF, base64-encodes it, and sends it to the model as
+        a standard ``file`` content block with a filename in its metadata. The
+        test passes if the invocation completes without raising.
+        """
+        url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
+        response = httpx.get(url)
+        # Fail fast on a bad download instead of sending an error page as a PDF.
+        response.raise_for_status()
+        pdf_data = base64.b64encode(response.content).decode("utf-8")
+
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "Summarize this document:"},
+                {
+                    "type": "file",
+                    "source_type": "base64",
+                    "mime_type": "application/pdf",
+                    "data": pdf_data,
+                    "metadata": {"filename": "my-pdf"},  # OpenAI requires a filename
+                },
+            ]
+        )
+        _ = model.invoke([message])
+

 def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
    if stream:
--- a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py
+++ b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py
@@ -649,6 +649,51 @@ def test_format_message_content() -> None:
    ]
    assert [{"type": "text", "text": "hello"}] == _format_message_content(content)

+    # Standard multi-modal inputs
+    content = [{"type": "image", "source_type": "url", "url": "https://..."}]
+    expected = [{"type": "image_url", "image_url": {"url": "https://..."}}]
+    assert expected == _format_message_content(content)
+
+    content = [
+        {
+            "type": "image",
+            "source_type": "base64",
+            "data": "<base64 data>",
+            "mime_type": "image/png",
+        }
+    ]
+    expected = [
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,<base64 data>"},
+        }
+    ]
+    assert expected == _format_message_content(content)
+
+    content = [
+        {
+            "type": "file",
+            "source_type": "base64",
+            "data": "<base64 data>",
+            "mime_type": "application/pdf",
+            "metadata": {"filename": "my_file"},
+        }
+    ]
+    expected = [
+        {
+            "type": "file",
+            "file": {
+                "filename": "my_file",
+                "file_data": "data:application/pdf;base64,<base64 data>",
+            },
+        }
+    ]
+    assert expected == _format_message_content(content)
+
+    content = [{"type": "file", "source_type": "id", "id": "file-abc123"}]
+    expected = [{"type": "file", "file": {"file_id": "file-abc123"}}]
+    assert expected == _format_message_content(content)
+

 class GenerateUsername(BaseModel):
    "Get a username based on someone's name and hair color."