standard-tests, openai[patch]: add support standard audio inputs (#30904)

This commit is contained in:
ccurme 2025-04-17 10:30:57 -04:00 committed by GitHub
parent 2c2db1ab69
commit add6a78f98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 157 additions and 1 deletions

View File

@ -208,6 +208,17 @@ def _format_data_content_block(block: dict) -> dict:
formatted_block = {"type": "file", "file": file}
elif block["source_type"] == "id":
formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
else:
raise ValueError("source_type base64 or id is required for file blocks.")
elif block["type"] == "audio":
if block["source_type"] == "base64":
format = block["mime_type"].split("/")[-1]
formatted_block = {
"type": "input_audio",
"input_audio": {"data": block["data"], "format": format},
}
else:
raise ValueError("source_type base64 is required for audio blocks.")
else:
raise ValueError(f"Block of type {block['type']} is not supported.")

View File

@ -5,6 +5,7 @@ from pathlib import Path
from typing import Literal, cast
import httpx
import pytest
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, HumanMessage
from langchain_tests.integration_tests import ChatModelIntegrationTests
@ -111,3 +112,30 @@ def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
return cast(AIMessage, full)
else:
return cast(AIMessage, llm.invoke(input_))
@pytest.mark.skip()  # Test either finishes in 5 seconds or 5 minutes.
def test_audio_model() -> None:
    """Run the standard audio-input integration test against gpt-4o-audio-preview.

    Builds a one-off ``ChatModelIntegrationTests`` subclass configured for
    OpenAI's audio-capable model and invokes its ``test_audio_inputs`` check.
    """

    class AudioModelTests(ChatModelIntegrationTests):
        """Standard-test harness configured for OpenAI's audio model."""

        @property
        def chat_model_class(self) -> type[ChatOpenAI]:
            return ChatOpenAI

        @property
        def chat_model_params(self) -> dict:
            # gpt-4o-audio-preview requires modalities/audio settings to be
            # passed via model_kwargs; temperature pinned for determinism.
            return {
                "model": "gpt-4o-audio-preview",
                "temperature": 0,
                "model_kwargs": {
                    "modalities": ["text", "audio"],
                    "audio": {"voice": "alloy", "format": "wav"},
                },
            }

        @property
        def supports_audio_inputs(self) -> bool:
            return True

    test_instance = AudioModelTests()
    model = test_instance.chat_model_class(**test_instance.chat_model_params)
    # Reuse the configured instance rather than constructing a second,
    # throwaway AudioModelTests() just to call the test method.
    test_instance.test_audio_inputs(model)

View File

@ -377,6 +377,33 @@ class ChatModelIntegrationTests(ChatModelTests):
def supports_pdf_inputs(self) -> bool:
return True
.. dropdown:: supports_audio_inputs
Boolean property indicating whether the chat model supports audio inputs.
Defaults to ``False``.
If set to ``True``, the chat model will be tested using content blocks of the
form
.. code-block:: python
{
"type": "audio",
"source_type": "base64",
"data": "<base64 audio data>",
"mime_type": "audio/wav", # or appropriate mime-type
}
See https://python.langchain.com/docs/concepts/multimodality/
Example:
.. code-block:: python
@property
def supports_audio_inputs(self) -> bool:
return True
.. dropdown:: supports_video_inputs
Boolean property indicating whether the chat model supports video inputs.
@ -2009,6 +2036,63 @@ class ChatModelIntegrationTests(ChatModelTests):
)
_ = model.invoke([message])
def test_audio_inputs(self, model: BaseChatModel) -> None:
    """Test that the model can process audio inputs.

    This test should be skipped (see Configuration below) if the model does not
    support audio inputs. These will take the form:

    .. code-block:: python

        {
            "type": "audio",
            "source_type": "base64",
            "data": "<base64 audio data>",
            "mime_type": "audio/wav",  # or appropriate mime-type
        }

    See https://python.langchain.com/docs/concepts/multimodality/

    .. dropdown:: Configuration

        To disable this test, set ``supports_audio_inputs`` to False in your
        test class:

        .. code-block:: python

            class TestMyChatModelIntegration(ChatModelIntegrationTests):
                @property
                def supports_audio_inputs(self) -> bool:
                    return False

    .. dropdown:: Troubleshooting

        If this test fails, check that the model can correctly handle messages
        with audio content blocks, specifically base64-encoded files. Otherwise,
        set the ``supports_audio_inputs`` property to False.
    """
    if not self.supports_audio_inputs:
        pytest.skip("Model does not support audio inputs.")
    # Short public-domain birdsong recording used as a stable audio fixture.
    url = "https://upload.wikimedia.org/wikipedia/commons/3/3d/Alcal%C3%A1_de_Henares_%28RPS_13-04-2024%29_canto_de_ruise%C3%B1or_%28Luscinia_megarhynchos%29_en_el_Soto_del_Henares.wav"
    raw_audio = httpx.get(url).content
    encoded_audio = base64.b64encode(raw_audio).decode("utf-8")
    text_block = {
        "type": "text",
        "text": "Describe this audio:",
    }
    audio_block = {
        "type": "audio",
        "source_type": "base64",
        "mime_type": "audio/wav",
        "data": encoded_audio,
    }
    # The model must accept a multimodal HumanMessage mixing text and audio.
    _ = model.invoke([HumanMessage([text_block, audio_block])])
def test_image_inputs(self, model: BaseChatModel) -> None:
"""Test that the model can process image inputs.

View File

@ -171,9 +171,15 @@ class ChatModelTests(BaseStandardTests):
"""(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
return False
@property
def supports_audio_inputs(self) -> bool:
    """(bool) whether the chat model supports audio inputs, defaults to
    ``False``.

    Override to return ``True`` to enable the audio-input standard tests.
    """
    return False
@property
def supports_video_inputs(self) -> bool:
"""(bool) whether the chat model supports video inputs, efaults to ``False``.
"""(bool) whether the chat model supports video inputs, defaults to ``False``.
No current tests are written for this feature."""
return False
@ -463,6 +469,33 @@ class ChatModelUnitTests(ChatModelTests):
def supports_pdf_inputs(self) -> bool:
return True
.. dropdown:: supports_audio_inputs
Boolean property indicating whether the chat model supports audio inputs.
Defaults to ``False``.
If set to ``True``, the chat model will be tested using content blocks of the
form
.. code-block:: python
{
"type": "audio",
"source_type": "base64",
"data": "<base64 audio data>",
"mime_type": "audio/wav", # or appropriate mime-type
}
See https://python.langchain.com/docs/concepts/multimodality/
Example:
.. code-block:: python
@property
def supports_audio_inputs(self) -> bool:
return True
.. dropdown:: supports_video_inputs
Boolean property indicating whether the chat model supports video inputs.