Mirror of https://github.com/hwchase17/langchain.git, synced 2025-04-28 11:55:21 +00:00
core, standard-tests: support PDF and audio input in Chat Completions format (#30979)
Chat models currently implement support for:

- images in OpenAI Chat Completions format
- other multimodal types (e.g., PDF and audio) in a cross-provider [standard format](https://python.langchain.com/docs/how_to/multimodal_inputs/)

Here we update core to extend support to PDF and audio input in Chat Completions format. **If an OAI-format PDF or audio content block is passed into any chat model, it will be transformed to the LangChain standard format.** We assume that any chat model supporting OAI-format PDF or audio has implemented support for the standard format.
Parent: d4fc734250
Commit: faef3e5d50
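For concreteness, a minimal sketch of the transformation this commit performs (block values are illustrative, drawn from the unit tests below, not a spec):

```python
# An OpenAI Chat Completions "file" content block carrying a base64 data URI...
oai_block = {
    "type": "file",
    "file": {
        "filename": "draconomicon.pdf",
        "file_data": "data:application/pdf;base64,<base64 string>",
    },
}

# ...is normalized into the LangChain standard data content block:
standard_block = {
    "type": "file",
    "source_type": "base64",
    "data": "<base64 string>",
    "mime_type": "application/pdf",
    "filename": "draconomicon.pdf",
}
```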
libs/core/langchain_core/language_models/_utils.py (new file, 132 lines)
```python
import re
from typing import Optional

from langchain_core.messages import BaseMessage


def _is_openai_data_block(block: dict) -> bool:
    """Check if the block contains multimodal data in OpenAI Chat Completions format."""
    if block.get("type") == "image_url":
        url = block.get("image_url", {}).get("url")
        if isinstance(url, str) and set(block.keys()) <= {
            "type",
            "image_url",
            "detail",
        }:
            return True

    elif block.get("type") == "file":
        data = block.get("file", {}).get("file_data")
        if isinstance(data, str):
            return True

    elif block.get("type") == "input_audio":
        audio_data = block.get("input_audio", {}).get("data")
        audio_format = block.get("input_audio", {}).get("format")
        if isinstance(audio_data, str) and isinstance(audio_format, str):
            return True

    return False


def _parse_data_uri(uri: str) -> Optional[dict]:
    """Parse a data URI into its components. If parsing fails, return None.

    Example:

        .. code-block:: python

            data_uri = "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
            parsed = _parse_data_uri(data_uri)

            assert parsed == {
                "source_type": "base64",
                "mime_type": "image/jpeg",
                "data": "/9j/4AAQSkZJRg...",
            }
    """
    regex = r"^data:(?P<mime_type>[^;]+);base64,(?P<data>.+)$"
    match = re.match(regex, uri)
    if match is None:
        return None
    return {
        "source_type": "base64",
        "data": match.group("data"),
        "mime_type": match.group("mime_type"),
    }


def _convert_openai_format_to_data_block(block: dict) -> dict:
    """Convert OpenAI image content block to standard data content block.

    If parsing fails, pass-through.

    Args:
        block: The OpenAI image content block to convert.

    Returns:
        The converted standard data content block.
    """
    if block["type"] == "image_url":
        parsed = _parse_data_uri(block["image_url"]["url"])
        if parsed is not None:
            parsed["type"] = "image"
            return parsed
        return block

    if block["type"] == "file":
        parsed = _parse_data_uri(block["file"]["file_data"])
        if parsed is not None:
            parsed["type"] = "file"
            if filename := block["file"].get("filename"):
                parsed["filename"] = filename
            return parsed
        return block

    if block["type"] == "input_audio":
        data = block["input_audio"].get("data")
        format = block["input_audio"].get("format")
        if data and format:
            return {
                "type": "audio",
                "source_type": "base64",
                "data": data,
                "mime_type": f"audio/{format}",
            }
        return block

    return block


def _normalize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
    """Extend support for message formats.

    Chat models implement support for images in OpenAI Chat Completions format, as
    well as other multimodal data as standard data blocks. This function extends
    support to audio and file data in OpenAI Chat Completions format by converting
    them to standard data blocks.
    """
    formatted_messages = []
    for message in messages:
        formatted_message = message
        if isinstance(message.content, list):
            for idx, block in enumerate(message.content):
                if (
                    isinstance(block, dict)
                    # Subset to (PDF) files and audio, as most relevant chat models
                    # support images in OAI format (and some may not yet support the
                    # standard data block format)
                    and block.get("type") in ("file", "input_audio")
                    and _is_openai_data_block(block)
                ):
                    if formatted_message is message:
                        formatted_message = message.model_copy()
                        # Also shallow-copy content
                        formatted_message.content = list(formatted_message.content)

                    formatted_message.content[idx] = (  # type: ignore[index]  # mypy confused by .model_copy
                        _convert_openai_format_to_data_block(block)
                    )
        formatted_messages.append(formatted_message)

    return formatted_messages
```
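A minimal usage sketch (not part of the diff; it uses only the helpers above and mirrors the unit test expectations below): normalization is copy-on-write, so the returned message carries the standard block while the input message is left untouched.

```python
from langchain_core.language_models._utils import _normalize_messages
from langchain_core.messages import HumanMessage

message = HumanMessage(
    content=[
        {
            "type": "input_audio",
            "input_audio": {"data": "<base64 data>", "format": "wav"},
        }
    ]
)

(normalized,) = _normalize_messages([message])

# The OAI-format audio block is rewritten as a standard data block...
assert normalized.content == [
    {
        "type": "audio",
        "source_type": "base64",
        "data": "<base64 data>",
        "mime_type": "audio/wav",
    }
]
# ...and the original message is not mutated.
assert message.content[0]["type"] == "input_audio"
```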
The remaining core hunks update `BaseChatModel` so that every entry point normalizes messages before dispatching to the provider implementation. First, import the helper:

```diff
@@ -40,6 +40,7 @@ from langchain_core.callbacks import (
     Callbacks,
 )
 from langchain_core.globals import get_llm_cache
+from langchain_core.language_models._utils import _normalize_messages
 from langchain_core.language_models.base import (
     BaseLanguageModel,
     LangSmithParams,
```
```diff
@@ -489,7 +490,8 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):
             self.rate_limiter.acquire(blocking=True)

         try:
-            for chunk in self._stream(messages, stop=stop, **kwargs):
+            input_messages = _normalize_messages(messages)
+            for chunk in self._stream(input_messages, stop=stop, **kwargs):
                 if chunk.message.id is None:
                     chunk.message.id = f"run-{run_manager.run_id}"
                 chunk.message.response_metadata = _gen_info_and_msg_metadata(chunk)
```
```diff
@@ -574,8 +576,9 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):

         generation: Optional[ChatGenerationChunk] = None
         try:
+            input_messages = _normalize_messages(messages)
             async for chunk in self._astream(
-                messages,
+                input_messages,
                 stop=stop,
                 **kwargs,
             ):
```
```diff
@@ -753,7 +756,10 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):
             batch_size=len(messages),
         )
         results = []
-        for i, m in enumerate(messages):
+        input_messages = [
+            _normalize_messages(message_list) for message_list in messages
+        ]
+        for i, m in enumerate(input_messages):
             try:
                 results.append(
                     self._generate_with_cache(
```
```diff
@@ -865,6 +871,9 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):
                 run_id=run_id,
             )

+        input_messages = [
+            _normalize_messages(message_list) for message_list in messages
+        ]
         results = await asyncio.gather(
             *[
                 self._agenerate_with_cache(
```
```diff
@@ -873,7 +882,7 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):
                     run_manager=run_managers[i] if run_managers else None,
                     **kwargs,
                 )
-                for i, m in enumerate(messages)
+                for i, m in enumerate(input_messages)
             ],
             return_exceptions=True,
         )
```
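Note the shape difference between the paths: `stream`/`astream` take a single message list, while `generate`/`agenerate` take a batch (a list of message lists), so normalization is mapped over each inner list. A small sketch under that assumption (inputs are illustrative):

```python
from langchain_core.language_models._utils import _normalize_messages
from langchain_core.messages import BaseMessage, HumanMessage

# A batch of two single-message conversations, as passed to generate().
batches: list[list[BaseMessage]] = [
    [HumanMessage("Summarize this document:")],
    [HumanMessage("Describe this audio:")],
]

# Each inner message list is normalized independently before dispatch.
input_messages = [_normalize_messages(message_list) for message_list in batches]
assert len(input_messages) == len(batches)
```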
In the core unit tests, a regression test is appended after `test_trace_images_in_openai_format` (hunk `@@ -455,3 +455,115 @@`):

```python
def test_extend_support_to_openai_multimodal_formats() -> None:
    """Test that chat models normalize OpenAI file and audio inputs."""
    llm = ParrotFakeChatModel()
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Hello"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.png"},
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."},
                },
                {
                    "type": "file",
                    "file": {
                        "filename": "draconomicon.pdf",
                        "file_data": "data:application/pdf;base64,<base64 string>",
                    },
                },
                {
                    "type": "file",
                    "file": {
                        "file_data": "data:application/pdf;base64,<base64 string>",
                    },
                },
                {
                    "type": "file",
                    "file": {"file_id": "<file id>"},
                },
                {
                    "type": "input_audio",
                    "input_audio": {"data": "<base64 data>", "format": "wav"},
                },
            ],
        },
    ]
    expected_content = [
        {"type": "text", "text": "Hello"},
        {
            "type": "image_url",
            "image_url": {"url": "https://example.com/image.png"},
        },
        {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."},
        },
        {
            "type": "file",
            "source_type": "base64",
            "data": "<base64 string>",
            "mime_type": "application/pdf",
            "filename": "draconomicon.pdf",
        },
        {
            "type": "file",
            "source_type": "base64",
            "data": "<base64 string>",
            "mime_type": "application/pdf",
        },
        {
            "type": "file",
            "file": {"file_id": "<file id>"},
        },
        {
            "type": "audio",
            "source_type": "base64",
            "data": "<base64 data>",
            "mime_type": "audio/wav",
        },
    ]
    response = llm.invoke(messages)
    assert response.content == expected_content

    # Test no mutation
    assert messages[0]["content"] == [
        {"type": "text", "text": "Hello"},
        {
            "type": "image_url",
            "image_url": {"url": "https://example.com/image.png"},
        },
        {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."},
        },
        {
            "type": "file",
            "file": {
                "filename": "draconomicon.pdf",
                "file_data": "data:application/pdf;base64,<base64 string>",
            },
        },
        {
            "type": "file",
            "file": {
                "file_data": "data:application/pdf;base64,<base64 string>",
            },
        },
        {
            "type": "file",
            "file": {"file_id": "<file id>"},
        },
        {
            "type": "input_audio",
            "input_audio": {"data": "<base64 data>", "format": "wav"},
        },
    ]
```
In `TestOpenAIStandard`, the PDF test gains an OpenAI Chat Completions-format case:

```diff
@@ -103,6 +103,21 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
         )
         _ = model.invoke([message])

+        # Test OpenAI Chat Completions format
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "Summarize this document:"},
+                {
+                    "type": "file",
+                    "file": {
+                        "filename": "test file.pdf",
+                        "file_data": f"data:application/pdf;base64,{pdf_data}",
+                    },
+                },
+            ]
+        )
+        _ = model.invoke([message])
+

 def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
     if stream:
```
In the shared `ChatModelIntegrationTests` suite, the PDF input test gains the same case:

```diff
@@ -2036,6 +2036,24 @@ class ChatModelIntegrationTests(ChatModelTests):
         )
         _ = model.invoke([message])

+        # Test OpenAI Chat Completions format
+        message = HumanMessage(
+            [
+                {
+                    "type": "text",
+                    "text": "Summarize this document:",
+                },
+                {
+                    "type": "file",
+                    "file": {
+                        "filename": "test file.pdf",
+                        "file_data": f"data:application/pdf;base64,{pdf_data}",
+                    },
+                },
+            ]
+        )
+        _ = model.invoke([message])
+
     def test_audio_inputs(self, model: BaseChatModel) -> None:
         """Test that the model can process audio inputs.

```
As does the audio input test:

```diff
@@ -2093,6 +2111,21 @@ class ChatModelIntegrationTests(ChatModelTests):
         )
         _ = model.invoke([message])

+        # Test OpenAI Chat Completions format
+        message = HumanMessage(
+            [
+                {
+                    "type": "text",
+                    "text": "Describe this audio:",
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {"data": audio_data, "format": "wav"},
+                },
+            ]
+        )
+        _ = model.invoke([message])
+
     def test_image_inputs(self, model: BaseChatModel) -> None:
         """Test that the model can process image inputs.

```