fix(langchain, openai): fix create_agent / response_format for Responses API (#33939)

This commit is contained in:
ccurme
2025-11-13 10:18:15 -05:00
committed by GitHub
parent 2bfbc29ccc
commit 74385e0ebd
12 changed files with 234 additions and 99 deletions

View File

@@ -1009,8 +1009,9 @@ def create_agent( # noqa: PLR0915
# Bind model based on effective response format
if isinstance(effective_response_format, ProviderStrategy):
# Use provider-specific structured output
kwargs = effective_response_format.to_model_kwargs()
kwargs: dict[str, Any] = {
"response_format": effective_response_format.schema_spec.json_schema
}
return (
request.model.bind_tools(
final_tools, strict=True, **kwargs, **request.model_settings

View File

@@ -1,79 +0,0 @@
import pytest
from langchain_core.messages import HumanMessage
from pydantic import BaseModel, Field
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
class WeatherBaseModel(BaseModel):
    """Structured weather report produced by the agent under test.

    Passed as ``response_format`` to ``create_agent`` below; the field
    descriptions are part of the schema handed to the model provider.
    """

    # Temperature in degrees Fahrenheit (tests expect 75.0).
    temperature: float = Field(description="The temperature in fahrenheit")
    # Short condition label (tests expect "sunny", case-insensitive).
    condition: str = Field(description="Weather condition")
def get_weather(city: str) -> str:
    """Return a canned weather report for ``city``.

    Test stub — no real lookup is performed. Previously the ``city``
    argument was ignored (suppressed with ``# noqa: ARG001``); it is now
    interpolated so the tool output reflects its input.
    """
    return f"The weather in {city} is sunny and 75°F."
@pytest.mark.requires("langchain_openai")
def test_inference_to_native_output() -> None:
    """Test that native output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-5")
    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool, "
            "then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=WeatherBaseModel,
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Native output: the final AI message carries the structured response,
    # so the trace is human -> ai (tool call) -> tool -> ai (structured).
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 4
    assert message_types == ["human", "ai", "tool", "ai"]
@pytest.mark.requires("langchain_openai")
def test_inference_to_tool_output() -> None:
    """Test that tool output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4")
    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool, "
            "then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=ToolStrategy(WeatherBaseModel),
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Tool-strategy output adds an artificial trailing tool message after
    # the structured-response tool call: human, ai, tool, ai, tool.
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 5
    assert message_types == ["human", "ai", "tool", "ai", "tool"]

View File

@@ -38,8 +38,7 @@ class FakeToolCallingModel(BaseChatModel, Generic[StructuredResponseT]):
**kwargs: Any,
) -> ChatResult:
"""Top Level call"""
rf = kwargs.get("response_format")
is_native = isinstance(rf, dict) and rf.get("type") == "json_schema"
is_native = kwargs.get("response_format")
if self.tool_calls:
if is_native:

View File

@@ -0,0 +1,142 @@
"""Test response_format for langchain-openai.
If tests fail, cassettes may need to be re-recorded.
To re-record cassettes:
1. Delete existing cassettes (`rm tests/cassettes/test_inference_to_*.yaml.gz`)
2. Re-run the tests with a valid OPENAI_API_KEY in your environment:
```bash
OPENAI_API_KEY=... uv run python -m pytest tests/unit_tests/agents/test_response_format_integration.py
```
The cassettes are compressed. To read them:
```bash
gunzip -c "tests/cassettes/test_inference_to_native_output[True].yaml.gz" | \
yq -o json . | \
jq '.requests[].body |= (gsub("\n";"") | @base64d | fromjson) |
.responses[].body.string |= (gsub("\n";"") | @base64d | fromjson)'
```
Or, in Python:
```python
import json
from langchain_tests.conftest import CustomPersister, CustomSerializer
def bytes_encoder(obj):
return obj.decode("utf-8", errors="replace")
path = "tests/cassettes/test_inference_to_native_output[True].yaml.gz"
requests, responses = CustomPersister().load_cassette(path, CustomSerializer())
assert len(requests) == len(responses)
for request, response in list(zip(requests, responses)):
print("------ REQUEST ------")
req = request._to_dict()
req["body"] = json.loads(req["body"])
print(json.dumps(req, indent=2, default=bytes_encoder))
print("\n\n ------ RESPONSE ------")
resp = response
print(json.dumps(resp, indent=2, default=bytes_encoder))
print("\n\n")
```
"""
import os
import pytest
from langchain_core.messages import HumanMessage
from pydantic import BaseModel, Field
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
class WeatherBaseModel(BaseModel):
    """Structured weather report produced by the agent under test.

    Used as the ``response_format`` for ``create_agent`` in these tests;
    the field descriptions are part of the schema sent to the provider.
    """

    # Temperature in degrees Fahrenheit (cassettes expect 75.0).
    temperature: float = Field(description="The temperature in fahrenheit")
    # Short condition label (cassettes expect "sunny", case-insensitive).
    condition: str = Field(description="Weather condition")
def get_weather(city: str) -> str:
    """Return a canned weather report for ``city`` (test stub, no real lookup)."""
    # The stale `# noqa: ARG001` was dropped: `city` is used now.
    return f"The weather in {city} is sunny and 75°F."
@pytest.mark.requires("langchain_openai")
@pytest.mark.vcr
@pytest.mark.parametrize("use_responses_api", [False, True])
def test_inference_to_native_output(use_responses_api: bool) -> None:
    """Test that native output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    init_kwargs = {"model": "gpt-5", "use_responses_api": use_responses_api}
    if "OPENAI_API_KEY" not in os.environ:
        # Replaying from cassettes — any placeholder key satisfies the client.
        init_kwargs["api_key"] = "foo"
    llm = ChatOpenAI(**init_kwargs)

    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool "
            "once, then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=WeatherBaseModel,
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather in Boston?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Native output: human -> ai (tool call) -> tool -> ai (structured).
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 4
    assert message_types == ["human", "ai", "tool", "ai"]
@pytest.mark.requires("langchain_openai")
@pytest.mark.vcr
@pytest.mark.parametrize("use_responses_api", [False, True])
def test_inference_to_tool_output(use_responses_api: bool) -> None:
    """Test that tool output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    init_kwargs = {"model": "gpt-5", "use_responses_api": use_responses_api}
    if "OPENAI_API_KEY" not in os.environ:
        # Replaying from cassettes — any placeholder key satisfies the client.
        init_kwargs["api_key"] = "foo"
    llm = ChatOpenAI(**init_kwargs)

    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool "
            "once, then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=ToolStrategy(WeatherBaseModel),
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # ToolStrategy appends an artificial tool message after the structured
    # response: human, ai, tool, ai, tool.
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 5
    assert message_types == ["human", "ai", "tool", "ai", "tool"]

View File

@@ -2,8 +2,52 @@
from collections.abc import Sequence
from importlib import util
from typing import Any
import pytest
from langchain_tests.conftest import CustomPersister, CustomSerializer
from langchain_tests.conftest import (
_base_vcr_config as _base_vcr_config,
)
from vcr import VCR
# Extra (header, replacement) pairs appended to the base VCR `filter_headers`
# config so account/client-identifying values never land in recorded cassettes.
_EXTRA_HEADERS = [
    ("openai-organization", "PLACEHOLDER"),
    ("user-agent", "PLACEHOLDER"),
    ("x-openai-client-user-agent", "PLACEHOLDER"),
]
def remove_request_headers(request: Any) -> Any:
    """Redact all request headers and the request URI before recording.

    Every header value is overwritten in place (keys are kept so the
    cassette still shows which headers were sent), and the URI is replaced
    wholesale so hostnames and query parameters are not recorded.

    Returns the mutated request, as VCR's ``before_record_request`` hook
    expects.
    """
    for k in request.headers:
        request.headers[k] = "**REDACTED**"
    request.uri = "**REDACTED**"
    return request
def remove_response_headers(response: dict) -> dict:
    """Blank out every response header value before it is recorded."""
    headers = response["headers"]
    for name in list(headers):
        headers[name] = "**REDACTED**"
    return response
@pytest.fixture(scope="session")
def vcr_config(_base_vcr_config: dict) -> dict:  # noqa: F811
    """Extend the default configuration coming from langchain_tests.

    Builds a fresh ``filter_headers`` list instead of extending the base
    fixture's list in place: ``dict.copy()`` is shallow, so
    ``config.setdefault("filter_headers", []).extend(...)`` would mutate
    the shared session-scoped ``_base_vcr_config`` list.
    """
    config = _base_vcr_config.copy()
    config["filter_headers"] = [
        *_base_vcr_config.get("filter_headers", []),
        *_EXTRA_HEADERS,
    ]
    config["before_record_request"] = remove_request_headers
    config["before_record_response"] = remove_response_headers
    # Cassettes are stored as gzip-compressed YAML.
    config["serializer"] = "yaml.gz"
    config["path_transformer"] = VCR.ensure_suffix(".yaml.gz")
    return config
def pytest_recording_configure(config: dict, vcr: VCR) -> None:  # noqa: ARG001
    """Register the gzip-aware persister and serializer with VCR.

    Lets pytest-recording load and save the ``.yaml.gz`` cassettes used by
    the `vcr_config` fixture.
    """
    vcr.register_persister(CustomPersister())
    vcr.register_serializer("yaml.gz", CustomSerializer())
def pytest_addoption(parser: pytest.Parser) -> None:

View File

@@ -1771,6 +1771,7 @@ class BaseChatOpenAI(BaseChatModel):
tool_choice: dict | str | bool | None = None,
strict: bool | None = None,
parallel_tool_calls: bool | None = None,
response_format: _DictOrPydanticClass | None = None,
**kwargs: Any,
) -> Runnable[LanguageModelInput, AIMessage]:
"""Bind tool-like objects to this chat model.
@@ -1796,6 +1797,9 @@ class BaseChatOpenAI(BaseChatModel):
be validated. If `None`, `strict` argument will not be passed to the model.
parallel_tool_calls: Set to `False` to disable parallel tool use.
Defaults to `None` (no specification, which allows parallel tool use).
response_format: Optional schema to format model response. If provided
and the model does not call a tool, the model will generate a
[structured response](https://platform.openai.com/docs/guides/structured-outputs).
kwargs: Any additional parameters are passed directly to `bind`.
""" # noqa: E501
if parallel_tool_calls is not None:
@@ -1838,6 +1842,11 @@ class BaseChatOpenAI(BaseChatModel):
)
raise ValueError(msg)
kwargs["tool_choice"] = tool_choice
if response_format:
kwargs["response_format"] = _convert_to_openai_response_format(
response_format
)
return super().bind(tools=formatted_tools, **kwargs)
def with_structured_output(
@@ -3479,6 +3488,7 @@ def _convert_to_openai_response_format(
strict is not None
and strict is not response_format["json_schema"].get("strict")
and isinstance(schema, dict)
and "strict" in schema.get("json_schema", {})
):
msg = (
f"Output schema already has 'strict' value set to "

View File

@@ -28,6 +28,7 @@ from langchain_tests.integration_tests.chat_models import (
magic_function,
)
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
from langchain_openai import ChatOpenAI
from tests.unit_tests.fake.callbacks import FakeCallbackHandler
@@ -1146,17 +1147,33 @@ def test_multi_party_conversation() -> None:
assert "Bob" in response.content
def test_structured_output_and_tools() -> None:
class ResponseFormat(BaseModel):
response: str
explanation: str
class ResponseFormat(BaseModel):
response: str
explanation: str
llm = ChatOpenAI(model="gpt-5-nano").bind_tools(
[GenerateUsername], strict=True, response_format=ResponseFormat
class ResponseFormatDict(TypedDict):
response: str
explanation: str
@pytest.mark.parametrize(
"schema", [ResponseFormat, ResponseFormat.model_json_schema(), ResponseFormatDict]
)
def test_structured_output_and_tools(schema: Any) -> None:
llm = ChatOpenAI(model="gpt-5-nano", verbosity="low").bind_tools(
[GenerateUsername], strict=True, response_format=schema
)
response = llm.invoke("What weighs more, a pound of feathers or a pound of gold?")
assert isinstance(response.additional_kwargs["parsed"], ResponseFormat)
if schema == ResponseFormat:
parsed = response.additional_kwargs["parsed"]
assert isinstance(parsed, ResponseFormat)
else:
parsed = json.loads(response.text)
assert isinstance(parsed, dict)
assert parsed["response"]
assert parsed["explanation"]
# Test streaming tool calls
full: BaseMessageChunk | None = None
@@ -1172,10 +1189,6 @@ def test_structured_output_and_tools() -> None:
def test_tools_and_structured_output() -> None:
class ResponseFormat(BaseModel):
response: str
explanation: str
llm = ChatOpenAI(model="gpt-5-nano").with_structured_output(
ResponseFormat, strict=True, include_raw=True, tools=[GenerateUsername]
)

View File

@@ -318,18 +318,23 @@ async def test_parsed_dict_schema_async(schema: Any) -> None:
assert isinstance(parsed["response"], str)
def test_function_calling_and_structured_output() -> None:
@pytest.mark.parametrize("schema", [Foo, Foo.model_json_schema(), FooDict])
def test_function_calling_and_structured_output(schema: Any) -> None:
def multiply(x: int, y: int) -> int:
"""return x * y"""
return x * y
llm = ChatOpenAI(model=MODEL_NAME, use_responses_api=True)
bound_llm = llm.bind_tools([multiply], response_format=Foo, strict=True)
bound_llm = llm.bind_tools([multiply], response_format=schema, strict=True)
# Test structured output
response = llm.invoke("how are ya", response_format=Foo)
parsed = Foo(**json.loads(response.text))
response = llm.invoke("how are ya", response_format=schema)
if schema == Foo:
parsed = schema(**json.loads(response.text))
assert parsed.response
else:
parsed = json.loads(response.text)
assert parsed["response"]
assert parsed == response.additional_kwargs["parsed"]
assert parsed.response
# Test function calling
ai_msg = cast(AIMessage, bound_llm.invoke("whats 5 * 4"))