mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-30 04:45:23 +00:00
standard-tests, openai[patch]: add support standard audio inputs (#30904)
This commit is contained in:
parent
2c2db1ab69
commit
add6a78f98
@ -208,6 +208,17 @@ def _format_data_content_block(block: dict) -> dict:
|
|||||||
formatted_block = {"type": "file", "file": file}
|
formatted_block = {"type": "file", "file": file}
|
||||||
elif block["source_type"] == "id":
|
elif block["source_type"] == "id":
|
||||||
formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
|
formatted_block = {"type": "file", "file": {"file_id": block["id"]}}
|
||||||
|
else:
|
||||||
|
raise ValueError("source_type base64 or id is required for file blocks.")
|
||||||
|
elif block["type"] == "audio":
|
||||||
|
if block["source_type"] == "base64":
|
||||||
|
format = block["mime_type"].split("/")[-1]
|
||||||
|
formatted_block = {
|
||||||
|
"type": "input_audio",
|
||||||
|
"input_audio": {"data": block["data"], "format": format},
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise ValueError("source_type base64 is required for audio blocks.")
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Block of type {block['type']} is not supported.")
|
raise ValueError(f"Block of type {block['type']} is not supported.")
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||||||
from typing import Literal, cast
|
from typing import Literal, cast
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
import pytest
|
||||||
from langchain_core.language_models import BaseChatModel
|
from langchain_core.language_models import BaseChatModel
|
||||||
from langchain_core.messages import AIMessage, HumanMessage
|
from langchain_core.messages import AIMessage, HumanMessage
|
||||||
from langchain_tests.integration_tests import ChatModelIntegrationTests
|
from langchain_tests.integration_tests import ChatModelIntegrationTests
|
||||||
@ -111,3 +112,30 @@ def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
|
|||||||
return cast(AIMessage, full)
|
return cast(AIMessage, full)
|
||||||
else:
|
else:
|
||||||
return cast(AIMessage, llm.invoke(input_))
|
return cast(AIMessage, llm.invoke(input_))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Test either finishes in 5 seconds or 5 minutes.")
def test_audio_model() -> None:
    """Run the standard audio-input test against an audio-capable ChatOpenAI.

    A throwaway ``ChatModelIntegrationTests`` subclass is declared inline so
    that only ``test_audio_inputs`` is exercised, with the model configured
    for ``gpt-4o-audio-preview`` and both text and audio modalities.
    """

    class AudioModelTests(ChatModelIntegrationTests):
        @property
        def chat_model_class(self) -> type[ChatOpenAI]:
            return ChatOpenAI

        @property
        def chat_model_params(self) -> dict:
            return {
                "model": "gpt-4o-audio-preview",
                "temperature": 0,
                "model_kwargs": {
                    "modalities": ["text", "audio"],
                    "audio": {"voice": "alloy", "format": "wav"},
                },
            }

        @property
        def supports_audio_inputs(self) -> bool:
            # Opt in; the standard test skips itself when this is False.
            return True

    test_instance = AudioModelTests()
    model = test_instance.chat_model_class(**test_instance.chat_model_params)
    # Reuse the instance that produced the model instead of building a second one.
    test_instance.test_audio_inputs(model)
|
||||||
|
@ -377,6 +377,33 @@ class ChatModelIntegrationTests(ChatModelTests):
|
|||||||
def supports_pdf_inputs(self) -> bool:
|
def supports_pdf_inputs(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
.. dropdown:: supports_audio_inputs
|
||||||
|
|
||||||
|
Boolean property indicating whether the chat model supports audio inputs.
|
||||||
|
Defaults to ``False``.
|
||||||
|
|
||||||
|
If set to ``True``, the chat model will be tested using content blocks of the
|
||||||
|
form
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "audio",
|
||||||
|
"source_type": "base64",
|
||||||
|
"data": "<base64 audio data>",
|
||||||
|
"mime_type": "audio/wav", # or appropriate mime-type
|
||||||
|
}
|
||||||
|
|
||||||
|
See https://python.langchain.com/docs/concepts/multimodality/
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supports_audio_inputs(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
.. dropdown:: supports_video_inputs
|
.. dropdown:: supports_video_inputs
|
||||||
|
|
||||||
Boolean property indicating whether the chat model supports video inputs.
|
Boolean property indicating whether the chat model supports video inputs.
|
||||||
@ -2009,6 +2036,63 @@ class ChatModelIntegrationTests(ChatModelTests):
|
|||||||
)
|
)
|
||||||
_ = model.invoke([message])
|
_ = model.invoke([message])
|
||||||
|
|
||||||
|
def test_audio_inputs(self, model: BaseChatModel) -> None:
    """Test that the model can process audio inputs.

    This test should be skipped (see Configuration below) if the model does not
    support audio inputs. These will take the form:

    .. code-block:: python

        {
            "type": "audio",
            "source_type": "base64",
            "data": "<base64 audio data>",
            "mime_type": "audio/wav",  # or appropriate mime-type
        }

    See https://python.langchain.com/docs/concepts/multimodality/

    .. dropdown:: Configuration

        To disable this test, set ``supports_audio_inputs`` to False in your
        test class:

        .. code-block:: python

            class TestMyChatModelIntegration(ChatModelIntegrationTests):

                @property
                def supports_audio_inputs(self) -> bool:
                    return False

    .. dropdown:: Troubleshooting

        If this test fails, check that the model can correctly handle messages
        with audio content blocks, specifically base64-encoded files. Otherwise,
        set the ``supports_audio_inputs`` property to False.
    """
    if not self.supports_audio_inputs:
        pytest.skip("Model does not support audio inputs.")
    url = "https://upload.wikimedia.org/wikipedia/commons/3/3d/Alcal%C3%A1_de_Henares_%28RPS_13-04-2024%29_canto_de_ruise%C3%B1or_%28Luscinia_megarhynchos%29_en_el_Soto_del_Henares.wav"
    response = httpx.get(url)
    # Fail loudly on a bad download; otherwise an HTTP error body would be
    # base64-encoded and sent to the model labelled as WAV audio.
    response.raise_for_status()
    audio_data = base64.b64encode(response.content).decode("utf-8")

    message = HumanMessage(
        [
            {
                "type": "text",
                "text": "Describe this audio:",
            },
            {
                "type": "audio",
                "source_type": "base64",
                "mime_type": "audio/wav",
                "data": audio_data,
            },
        ]
    )
    # Only successful invocation is checked; the response content is not inspected.
    _ = model.invoke([message])
|
||||||
|
|
||||||
def test_image_inputs(self, model: BaseChatModel) -> None:
|
def test_image_inputs(self, model: BaseChatModel) -> None:
|
||||||
"""Test that the model can process image inputs.
|
"""Test that the model can process image inputs.
|
||||||
|
|
||||||
|
@ -171,9 +171,15 @@ class ChatModelTests(BaseStandardTests):
|
|||||||
"""(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
|
"""(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@property
def supports_audio_inputs(self) -> bool:
    """(bool) whether the chat model supports audio inputs; ``False`` by default."""
    return False
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def supports_video_inputs(self) -> bool:
|
def supports_video_inputs(self) -> bool:
|
||||||
"""(bool) whether the chat model supports video inputs, efaults to ``False``.
|
"""(bool) whether the chat model supports video inputs, defaults to ``False``.
|
||||||
No current tests are written for this feature."""
|
No current tests are written for this feature."""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -463,6 +469,33 @@ class ChatModelUnitTests(ChatModelTests):
|
|||||||
def supports_pdf_inputs(self) -> bool:
|
def supports_pdf_inputs(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
.. dropdown:: supports_audio_inputs
|
||||||
|
|
||||||
|
Boolean property indicating whether the chat model supports audio inputs.
|
||||||
|
Defaults to ``False``.
|
||||||
|
|
||||||
|
If set to ``True``, the chat model will be tested using content blocks of the
|
||||||
|
form
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "audio",
|
||||||
|
"source_type": "base64",
|
||||||
|
"data": "<base64 audio data>",
|
||||||
|
"mime_type": "audio/wav", # or appropriate mime-type
|
||||||
|
}
|
||||||
|
|
||||||
|
See https://python.langchain.com/docs/concepts/multimodality/
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supports_audio_inputs(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
.. dropdown:: supports_video_inputs
|
.. dropdown:: supports_video_inputs
|
||||||
|
|
||||||
Boolean property indicating whether the chat model supports video inputs.
|
Boolean property indicating whether the chat model supports video inputs.
|
||||||
|
Loading…
Reference in New Issue
Block a user