standard-tests, openai[patch]: add support standard audio inputs (#30904)

2025-08-22 02:45:49 +00:00 · 2025-04-17 10:30:57 -04:00 · 2025-04-17 10:30:57 -04:00 · add6a78f98
commit add6a78f98
parent 2c2db1ab69
4 changed files with 157 additions and 1 deletions
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@ -208,6 +208,17 @@ def _format_data_content_block(block: dict) -> dict:
            formatted_block = {"type": "file", "file": file}
        elif block["source_type"] == "id":
            formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
        else:
            raise ValueError("source_type base64 or id is required for file blocks.")
    elif block["type"] == "audio":
        if block["source_type"] == "base64":
            format = block["mime_type"].split("/")[-1]
            formatted_block = {
                "type": "input_audio",
                "input_audio": {"data": block["data"], "format": format},
            }
        else:
            raise ValueError("source_type base64 is required for audio blocks.")
    else:
        raise ValueError(f"Block of type {block['type']} is not supported.")
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
@ -5,6 +5,7 @@ from pathlib import Path
 from typing import Literal, cast
 import httpx
 import pytest
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import AIMessage, HumanMessage
 from langchain_tests.integration_tests import ChatModelIntegrationTests
@ -111,3 +112,30 @@ def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
        return cast(AIMessage, full)
    else:
        return cast(AIMessage, llm.invoke(input_))
@pytest.mark.skip()  # Test either finishes in 5 seconds or 5 minutes.
def test_audio_model() -> None:
    """Run the standard audio-input test against an audio-capable OpenAI model.

    A throwaway ``ChatModelIntegrationTests`` subclass is declared inline so the
    audio-specific model parameters do not affect the other standard tests in
    this module. The test is skipped by default (see the decorator comment).
    """

    class AudioModelTests(ChatModelIntegrationTests):
        """Standard-test configuration for ``gpt-4o-audio-preview``."""

        @property
        def chat_model_class(self) -> type[ChatOpenAI]:
            return ChatOpenAI

        @property
        def chat_model_params(self) -> dict:
            return {
                "model": "gpt-4o-audio-preview",
                "temperature": 0,
                "model_kwargs": {
                    "modalities": ["text", "audio"],
                    "audio": {"voice": "alloy", "format": "wav"},
                },
            }

        @property
        def supports_audio_inputs(self) -> bool:
            return True

    # Use a single instance both to build the model and to run the test,
    # instead of constructing a second, otherwise identical, test object.
    test_instance = AudioModelTests()
    model = test_instance.chat_model_class(**test_instance.chat_model_params)
    test_instance.test_audio_inputs(model)
--- a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py
+++ b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py
@ -377,6 +377,33 @@ class ChatModelIntegrationTests(ChatModelTests):
            def supports_pdf_inputs(self) -> bool:
                return True
    .. dropdown:: supports_audio_inputs
        Boolean property indicating whether the chat model supports audio inputs.
        Defaults to ``False``.
        If set to ``True``, the chat model will be tested using content blocks of the
        form
        .. code-block:: python
            {
                "type": "audio",
                "source_type": "base64",
                "data": "<base64 audio data>",
                "mime_type": "audio/wav",  # or appropriate mime-type
            }
        See https://python.langchain.com/docs/concepts/multimodality/
        Example:
        .. code-block:: python
            @property
            def supports_audio_inputs(self) -> bool:
                return True
    .. dropdown:: supports_video_inputs
        Boolean property indicating whether the chat model supports video inputs.
@ -2009,6 +2036,63 @@ class ChatModelIntegrationTests(ChatModelTests):
        )
        _ = model.invoke([message])
    def test_audio_inputs(self, model: BaseChatModel) -> None:
        """Test that the model can process audio inputs.

        This test should be skipped (see Configuration below) if the model does not
        support audio inputs. These will take the form:

        .. code-block:: python

            {
                "type": "audio",
                "source_type": "base64",
                "data": "<base64 audio data>",
                "mime_type": "audio/wav",  # or appropriate mime-type
            }

        See https://python.langchain.com/docs/concepts/multimodality/

        The test downloads a WAV sample, base64-encodes it and sends it to the
        model together with a short text prompt. It passes if ``invoke`` does
        not raise; the content of the response is not inspected.

        .. dropdown:: Configuration

            To disable this test, set ``supports_audio_inputs`` to False in your
            test class:

            .. code-block:: python

                class TestMyChatModelIntegration(ChatModelIntegrationTests):
                    @property
                    def supports_audio_inputs(self) -> bool:
                        return False

        .. dropdown:: Troubleshooting

            If this test fails, check that the model can correctly handle messages
            with audio content blocks, specifically base64-encoded files. Otherwise,
            set the ``supports_audio_inputs`` property to False.
        """
        if not self.supports_audio_inputs:
            pytest.skip("Model does not support audio inputs.")

        url = "https://upload.wikimedia.org/wikipedia/commons/3/3d/Alcal%C3%A1_de_Henares_%28RPS_13-04-2024%29_canto_de_ruise%C3%B1or_%28Luscinia_megarhynchos%29_en_el_Soto_del_Henares.wav"
        response = httpx.get(url)
        # Fail fast on a bad download: otherwise the body of an HTTP error page
        # would be base64-encoded and sent to the model as if it were audio.
        response.raise_for_status()
        audio_data = base64.b64encode(response.content).decode("utf-8")

        message = HumanMessage(
            [
                {
                    "type": "text",
                    "text": "Describe this audio:",
                },
                {
                    "type": "audio",
                    "source_type": "base64",
                    "mime_type": "audio/wav",
                    "data": audio_data,
                },
            ]
        )
        _ = model.invoke([message])
    def test_image_inputs(self, model: BaseChatModel) -> None:
        """Test that the model can process image inputs.
--- a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
+++ b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
@ -171,9 +171,15 @@ class ChatModelTests(BaseStandardTests):
        """(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
        return False
    @property
    def supports_audio_inputs(self) -> bool:
        """(bool) whether the chat model supports audio inputs.

        Defaults to ``False``.
        """
        return False
    @property
    def supports_video_inputs(self) -> bool:
-        """(bool) whether the chat model supports video inputs, efaults to ``False``.
+        """(bool) whether the chat model supports video inputs, defaults to ``False``.
        No current tests are written for this feature."""
        return False
@ -463,6 +469,33 @@ class ChatModelUnitTests(ChatModelTests):
            def supports_pdf_inputs(self) -> bool:
                return True
    .. dropdown:: supports_audio_inputs
        Boolean property indicating whether the chat model supports audio inputs.
        Defaults to ``False``.
        If set to ``True``, the chat model will be tested using content blocks of the
        form
        .. code-block:: python
            {
                "type": "audio",
                "source_type": "base64",
                "data": "<base64 audio data>",
                "mime_type": "audio/wav",  # or appropriate mime-type
            }
        See https://python.langchain.com/docs/concepts/multimodality/
        Example:
        .. code-block:: python
            @property
            def supports_audio_inputs(self) -> bool:
                return True
    .. dropdown:: supports_video_inputs
        Boolean property indicating whether the chat model supports video inputs.