From add6a78f98ad725933469758fa688b28dc0efc3d Mon Sep 17 00:00:00 2001
From: ccurme
Date: Thu, 17 Apr 2025 10:30:57 -0400
Subject: [PATCH] standard-tests, openai[patch]: add support for standard
 audio inputs (#30904)

---
 .../langchain_openai/chat_models/base.py      | 11 +++
 .../chat_models/test_base_standard.py         | 28 +++++++
 .../integration_tests/chat_models.py          | 84 +++++++++++++++++++
 .../langchain_tests/unit_tests/chat_models.py | 35 +++++++-
 4 files changed, 157 insertions(+), 1 deletion(-)

diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py
index 119058038b3..346a173a924 100644
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -208,6 +208,17 @@ def _format_data_content_block(block: dict) -> dict:
             formatted_block = {"type": "file", "file": file}
         elif block["source_type"] == "id":
             formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
+        else:
+            raise ValueError("source_type base64 or id is required for file blocks.")
+    elif block["type"] == "audio":
+        if block["source_type"] == "base64":
+            format = block["mime_type"].split("/")[-1]
+            formatted_block = {
+                "type": "input_audio",
+                "input_audio": {"data": block["data"], "format": format},
+            }
+        else:
+            raise ValueError("source_type base64 is required for audio blocks.")
     else:
         raise ValueError(f"Block of type {block['type']} is not supported.")
 
diff --git a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
index 71e83fe0eaa..14e8865d594 100644
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from typing import Literal, cast
 
 import httpx
+import pytest
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import AIMessage, HumanMessage
 from langchain_tests.integration_tests import ChatModelIntegrationTests
@@ -111,3 +112,30 @@ def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
         return cast(AIMessage, full)
     else:
         return cast(AIMessage, llm.invoke(input_))
+
+
+@pytest.mark.skip()  # Test either finishes in 5 seconds or 5 minutes.
+def test_audio_model() -> None:
+    class AudioModelTests(ChatModelIntegrationTests):
+        @property
+        def chat_model_class(self) -> type[ChatOpenAI]:
+            return ChatOpenAI
+
+        @property
+        def chat_model_params(self) -> dict:
+            return {
+                "model": "gpt-4o-audio-preview",
+                "temperature": 0,
+                "model_kwargs": {
+                    "modalities": ["text", "audio"],
+                    "audio": {"voice": "alloy", "format": "wav"},
+                },
+            }
+
+        @property
+        def supports_audio_inputs(self) -> bool:
+            return True
+
+    test_instance = AudioModelTests()
+    model = test_instance.chat_model_class(**test_instance.chat_model_params)
+    test_instance.test_audio_inputs(model)
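Reviewer note: the new ``audio`` branch in ``_format_data_content_block`` maps
LangChain's standard audio content block onto OpenAI's ``input_audio`` content
part. A minimal standalone sketch of that mapping (a re-implementation for
illustration only, not the library function itself):

.. code-block:: python

    def format_audio_block(block: dict) -> dict:
        # Standard base64 audio block -> OpenAI Chat Completions content part.
        if block["source_type"] != "base64":
            raise ValueError("source_type base64 is required for audio blocks.")
        audio_format = block["mime_type"].split("/")[-1]  # "audio/wav" -> "wav"
        return {
            "type": "input_audio",
            "input_audio": {"data": block["data"], "format": audio_format},
        }


    block = {
        "type": "audio",
        "source_type": "base64",
        "data": "UklGRg==",  # placeholder base64 payload, not a real clip
        "mime_type": "audio/wav",
    }
    assert format_audio_block(block) == {
        "type": "input_audio",
        "input_audio": {"data": "UklGRg==", "format": "wav"},
    }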
diff --git a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py
index ae083c5bf36..c9414e5f52a 100644
--- a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py
+++ b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py
@@ -377,6 +377,33 @@ class ChatModelIntegrationTests(ChatModelTests):
             def supports_pdf_inputs(self) -> bool:
                 return True
 
+    .. dropdown:: supports_audio_inputs
+
+        Boolean property indicating whether the chat model supports audio inputs.
+        Defaults to ``False``.
+
+        If set to ``True``, the chat model will be tested using content blocks of the
+        form
+
+        .. code-block:: python
+
+            {
+                "type": "audio",
+                "source_type": "base64",
+                "data": "<base64 data string>",
+                "mime_type": "audio/wav",  # or appropriate mime-type
+            }
+
+        See https://python.langchain.com/docs/concepts/multimodality/
+
+        Example:
+
+        .. code-block:: python
+
+            @property
+            def supports_audio_inputs(self) -> bool:
+                return True
+
     .. dropdown:: supports_video_inputs
 
         Boolean property indicating whether the chat model supports video inputs.
@@ -2009,6 +2036,63 @@ class ChatModelIntegrationTests(ChatModelTests):
         )
         _ = model.invoke([message])
 
+    def test_audio_inputs(self, model: BaseChatModel) -> None:
+        """Test that the model can process audio inputs.
+
+        This test should be skipped (see Configuration below) if the model does not
+        support audio inputs. These will take the form:
+
+        .. code-block:: python
+
+            {
+                "type": "audio",
+                "source_type": "base64",
+                "data": "<base64 data string>",
+                "mime_type": "audio/wav",  # or appropriate mime-type
+            }
+
+        See https://python.langchain.com/docs/concepts/multimodality/
+
+        .. dropdown:: Configuration
+
+            To disable this test, set ``supports_audio_inputs`` to False in your
+            test class:
+
+            .. code-block:: python
+
+                class TestMyChatModelIntegration(ChatModelIntegrationTests):
+
+                    @property
+                    def supports_audio_inputs(self) -> bool:
+                        return False
+
+        .. dropdown:: Troubleshooting
+
+            If this test fails, check that the model can correctly handle messages
+            with audio content blocks, specifically base64-encoded files. Otherwise,
+            set the ``supports_audio_inputs`` property to False.
+        """
+        if not self.supports_audio_inputs:
+            pytest.skip("Model does not support audio inputs.")
+        url = "https://upload.wikimedia.org/wikipedia/commons/3/3d/Alcal%C3%A1_de_Henares_%28RPS_13-04-2024%29_canto_de_ruise%C3%B1or_%28Luscinia_megarhynchos%29_en_el_Soto_del_Henares.wav"
+        audio_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
+
+        message = HumanMessage(
+            [
+                {
+                    "type": "text",
+                    "text": "Describe this audio:",
+                },
+                {
+                    "type": "audio",
+                    "source_type": "base64",
+                    "mime_type": "audio/wav",
+                    "data": audio_data,
+                },
+            ]
+        )
+        _ = model.invoke([message])
+
     def test_image_inputs(self, model: BaseChatModel) -> None:
         """Test that the model can process image inputs.
 
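Reviewer note: outside pytest, the behavior ``test_audio_inputs`` exercises can
be reproduced with a short script. A hedged sketch, assuming ``OPENAI_API_KEY``
is set and the account has access to ``gpt-4o-audio-preview``; the clip is the
same nightingale recording the test downloads:

.. code-block:: python

    import base64

    import httpx
    from langchain_core.messages import HumanMessage
    from langchain_openai import ChatOpenAI

    url = (
        "https://upload.wikimedia.org/wikipedia/commons/3/3d/"
        "Alcal%C3%A1_de_Henares_%28RPS_13-04-2024%29_canto_de_ruise%C3%B1or_"
        "%28Luscinia_megarhynchos%29_en_el_Soto_del_Henares.wav"
    )
    audio_data = base64.b64encode(httpx.get(url).content).decode("utf-8")

    llm = ChatOpenAI(model="gpt-4o-audio-preview", temperature=0)
    message = HumanMessage(
        [
            {"type": "text", "text": "Describe this audio:"},
            {
                "type": "audio",
                "source_type": "base64",
                "mime_type": "audio/wav",
                "data": audio_data,
            },
        ]
    )
    response = llm.invoke([message])
    print(response.content)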
diff --git a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
index f3e22d04adf..e95cbb840fa 100644
--- a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
+++ b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
@@ -171,9 +171,15 @@ class ChatModelTests(BaseStandardTests):
         """(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
         return False
 
+    @property
+    def supports_audio_inputs(self) -> bool:
+        """(bool) whether the chat model supports audio inputs, defaults to
+        ``False``."""
+        return False
+
     @property
     def supports_video_inputs(self) -> bool:
-        """(bool) whether the chat model supports video inputs, efaults to ``False``.
+        """(bool) whether the chat model supports video inputs, defaults to ``False``.
 
         No current tests are written for this feature."""
         return False
@@ -463,6 +469,33 @@ class ChatModelUnitTests(ChatModelTests):
             def supports_pdf_inputs(self) -> bool:
                 return True
 
+    .. dropdown:: supports_audio_inputs
+
+        Boolean property indicating whether the chat model supports audio inputs.
+        Defaults to ``False``.
+
+        If set to ``True``, the chat model will be tested using content blocks of the
+        form
+
+        .. code-block:: python
+
+            {
+                "type": "audio",
+                "source_type": "base64",
+                "data": "<base64 data string>",
+                "mime_type": "audio/wav",  # or appropriate mime-type
+            }
+
+        See https://python.langchain.com/docs/concepts/multimodality/
+
+        Example:
+
+        .. code-block:: python
+
+            @property
+            def supports_audio_inputs(self) -> bool:
+                return True
+
     .. dropdown:: supports_video_inputs
 
         Boolean property indicating whether the chat model supports video inputs.
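Reviewer note: downstream integrations opt in by overriding the new property in
their standard-test subclass. A hedged sketch of the intended usage;
``my_package``, ``MyChatModel``, and the model name are hypothetical
placeholders, not part of this patch:

.. code-block:: python

    from langchain_tests.integration_tests import ChatModelIntegrationTests

    from my_package.chat_models import MyChatModel  # hypothetical package


    class TestMyChatModelIntegration(ChatModelIntegrationTests):
        @property
        def chat_model_class(self) -> type[MyChatModel]:
            return MyChatModel

        @property
        def chat_model_params(self) -> dict:
            return {"model": "my-audio-capable-model"}  # hypothetical params

        @property
        def supports_audio_inputs(self) -> bool:
            # Opt in to test_audio_inputs; the inherited default is False.
            return True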