fix(langchain, openai): fix create_agent / response_format for Responses API (#33939)

This commit is contained in:
ccurme
2025-11-13 10:18:15 -05:00
committed by GitHub
parent 2bfbc29ccc
commit 74385e0ebd
12 changed files with 234 additions and 99 deletions

View File

@@ -1009,8 +1009,9 @@ def create_agent( # noqa: PLR0915
# Bind model based on effective response format
if isinstance(effective_response_format, ProviderStrategy):
# Use provider-specific structured output
kwargs = effective_response_format.to_model_kwargs()
kwargs: dict[str, Any] = {
"response_format": effective_response_format.schema_spec.json_schema
}
return (
request.model.bind_tools(
final_tools, strict=True, **kwargs, **request.model_settings

View File

@@ -1,79 +0,0 @@
import pytest
from langchain_core.messages import HumanMessage
from pydantic import BaseModel, Field
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
class WeatherBaseModel(BaseModel):
    """Structured weather report produced by the agent under test.

    Passed as ``response_format`` to ``create_agent`` below; the field
    descriptions are part of the schema handed to the model provider.
    """

    # Temperature in degrees Fahrenheit (tests expect 75.0).
    temperature: float = Field(description="The temperature in fahrenheit")
    # Short condition label (tests expect "sunny", case-insensitive).
    condition: str = Field(description="Weather condition")
def get_weather(city: str) -> str:
    """Return a canned weather report for ``city``.

    Test stub — no real lookup is performed. Previously the ``city``
    argument was ignored (suppressed with ``# noqa: ARG001``); it is now
    interpolated so the tool output reflects its input.
    """
    return f"The weather in {city} is sunny and 75°F."
@pytest.mark.requires("langchain_openai")
def test_inference_to_native_output() -> None:
    """Test that native output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-5")
    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool, "
            "then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=WeatherBaseModel,
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Native output: the final AI message carries the structured response,
    # so the trace is human -> ai (tool call) -> tool -> ai (structured).
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 4
    assert message_types == ["human", "ai", "tool", "ai"]
@pytest.mark.requires("langchain_openai")
def test_inference_to_tool_output() -> None:
    """Test that tool output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4")
    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool, "
            "then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=ToolStrategy(WeatherBaseModel),
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Tool-strategy output adds an artificial trailing tool message after
    # the structured-response tool call: human, ai, tool, ai, tool.
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 5
    assert message_types == ["human", "ai", "tool", "ai", "tool"]

View File

@@ -38,8 +38,7 @@ class FakeToolCallingModel(BaseChatModel, Generic[StructuredResponseT]):
**kwargs: Any,
) -> ChatResult:
"""Top Level call"""
rf = kwargs.get("response_format")
is_native = isinstance(rf, dict) and rf.get("type") == "json_schema"
is_native = kwargs.get("response_format")
if self.tool_calls:
if is_native:

View File

@@ -0,0 +1,142 @@
"""Test response_format for langchain-openai.
If tests fail, cassettes may need to be re-recorded.
To re-record cassettes:
1. Delete existing cassettes (`rm tests/cassettes/test_inference_to_*.yaml.gz`)
2. Re-run the tests with a valid OPENAI_API_KEY in your environment:
```bash
OPENAI_API_KEY=... uv run python -m pytest tests/unit_tests/agents/test_response_format_integration.py
```
The cassettes are compressed. To read them:
```bash
gunzip -c "tests/cassettes/test_inference_to_native_output[True].yaml.gz" | \
yq -o json . | \
jq '.requests[].body |= (gsub("\n";"") | @base64d | fromjson) |
.responses[].body.string |= (gsub("\n";"") | @base64d | fromjson)'
```
Or, in Python:
```python
import json
from langchain_tests.conftest import CustomPersister, CustomSerializer
def bytes_encoder(obj):
return obj.decode("utf-8", errors="replace")
path = "tests/cassettes/test_inference_to_native_output[True].yaml.gz"
requests, responses = CustomPersister().load_cassette(path, CustomSerializer())
assert len(requests) == len(responses)
for request, response in list(zip(requests, responses)):
print("------ REQUEST ------")
req = request._to_dict()
req["body"] = json.loads(req["body"])
print(json.dumps(req, indent=2, default=bytes_encoder))
print("\n\n ------ RESPONSE ------")
resp = response
print(json.dumps(resp, indent=2, default=bytes_encoder))
print("\n\n")
```
"""
import os
import pytest
from langchain_core.messages import HumanMessage
from pydantic import BaseModel, Field
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
class WeatherBaseModel(BaseModel):
    """Structured weather report produced by the agent under test.

    Used as the ``response_format`` for ``create_agent`` in these tests;
    the field descriptions are part of the schema sent to the provider.
    """

    # Temperature in degrees Fahrenheit (cassettes expect 75.0).
    temperature: float = Field(description="The temperature in fahrenheit")
    # Short condition label (cassettes expect "sunny", case-insensitive).
    condition: str = Field(description="Weather condition")
def get_weather(city: str) -> str:
    """Return a canned weather report for ``city`` (test stub, no real lookup)."""
    # The stale `# noqa: ARG001` was dropped: `city` is used now.
    return f"The weather in {city} is sunny and 75°F."
@pytest.mark.requires("langchain_openai")
@pytest.mark.vcr
@pytest.mark.parametrize("use_responses_api", [False, True])
def test_inference_to_native_output(use_responses_api: bool) -> None:
    """Test that native output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    init_kwargs = {"model": "gpt-5", "use_responses_api": use_responses_api}
    if "OPENAI_API_KEY" not in os.environ:
        # Replaying from cassettes — any placeholder key satisfies the client.
        init_kwargs["api_key"] = "foo"
    llm = ChatOpenAI(**init_kwargs)

    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool "
            "once, then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=WeatherBaseModel,
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather in Boston?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # Native output: human -> ai (tool call) -> tool -> ai (structured).
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 4
    assert message_types == ["human", "ai", "tool", "ai"]
@pytest.mark.requires("langchain_openai")
@pytest.mark.vcr
@pytest.mark.parametrize("use_responses_api", [False, True])
def test_inference_to_tool_output(use_responses_api: bool) -> None:
    """Test that tool output is inferred when a model supports it."""
    from langchain_openai import ChatOpenAI

    init_kwargs = {"model": "gpt-5", "use_responses_api": use_responses_api}
    if "OPENAI_API_KEY" not in os.environ:
        # Replaying from cassettes — any placeholder key satisfies the client.
        init_kwargs["api_key"] = "foo"
    llm = ChatOpenAI(**init_kwargs)

    agent = create_agent(
        llm,
        system_prompt=(
            "You are a helpful weather assistant. Please call the get_weather tool "
            "once, then use the WeatherReport tool to generate the final response."
        ),
        tools=[get_weather],
        response_format=ToolStrategy(WeatherBaseModel),
    )
    result = agent.invoke({"messages": [HumanMessage("What's the weather?")]})

    structured = result["structured_response"]
    assert isinstance(structured, WeatherBaseModel)
    assert structured.temperature == 75.0
    assert structured.condition.lower() == "sunny"

    # ToolStrategy appends an artificial tool message after the structured
    # response: human, ai, tool, ai, tool.
    message_types = [m.type for m in result["messages"]]
    assert len(message_types) == 5
    assert message_types == ["human", "ai", "tool", "ai", "tool"]

View File

@@ -2,8 +2,52 @@
from collections.abc import Sequence
from importlib import util
from typing import Any
import pytest
from langchain_tests.conftest import CustomPersister, CustomSerializer
from langchain_tests.conftest import (
_base_vcr_config as _base_vcr_config,
)
from vcr import VCR
# Extra (header, replacement) pairs appended to the base VCR `filter_headers`
# config so account/client-identifying values never land in recorded cassettes.
_EXTRA_HEADERS = [
    ("openai-organization", "PLACEHOLDER"),
    ("user-agent", "PLACEHOLDER"),
    ("x-openai-client-user-agent", "PLACEHOLDER"),
]
def remove_request_headers(request: Any) -> Any:
    """Redact all request headers and the request URI before recording.

    Every header value is overwritten in place (keys are kept so the
    cassette still shows which headers were sent), and the URI is replaced
    wholesale so hostnames and query parameters are not recorded.

    Returns the mutated request, as VCR's ``before_record_request`` hook
    expects.
    """
    for k in request.headers:
        request.headers[k] = "**REDACTED**"
    request.uri = "**REDACTED**"
    return request
def remove_response_headers(response: dict) -> dict:
    """Blank out every response header value before it is recorded."""
    headers = response["headers"]
    for name in list(headers):
        headers[name] = "**REDACTED**"
    return response
@pytest.fixture(scope="session")
def vcr_config(_base_vcr_config: dict) -> dict:  # noqa: F811
    """Extend the default configuration coming from langchain_tests.

    Builds a fresh ``filter_headers`` list instead of extending the base
    fixture's list in place: ``dict.copy()`` is shallow, so
    ``config.setdefault("filter_headers", []).extend(...)`` would mutate
    the shared session-scoped ``_base_vcr_config`` list.
    """
    config = _base_vcr_config.copy()
    config["filter_headers"] = [
        *_base_vcr_config.get("filter_headers", []),
        *_EXTRA_HEADERS,
    ]
    config["before_record_request"] = remove_request_headers
    config["before_record_response"] = remove_response_headers
    # Cassettes are stored as gzip-compressed YAML.
    config["serializer"] = "yaml.gz"
    config["path_transformer"] = VCR.ensure_suffix(".yaml.gz")
    return config
def pytest_recording_configure(config: dict, vcr: VCR) -> None:  # noqa: ARG001
    """Register the gzip-aware persister and serializer with VCR.

    Lets pytest-recording load and save the ``.yaml.gz`` cassettes used by
    the `vcr_config` fixture.
    """
    vcr.register_persister(CustomPersister())
    vcr.register_serializer("yaml.gz", CustomSerializer())
def pytest_addoption(parser: pytest.Parser) -> None:

View File

@@ -1771,6 +1771,7 @@ class BaseChatOpenAI(BaseChatModel):
tool_choice: dict | str | bool | None = None,
strict: bool | None = None,
parallel_tool_calls: bool | None = None,
response_format: _DictOrPydanticClass | None = None,
**kwargs: Any,
) -> Runnable[LanguageModelInput, AIMessage]:
"""Bind tool-like objects to this chat model.
@@ -1796,6 +1797,9 @@ class BaseChatOpenAI(BaseChatModel):
be validated. If `None`, `strict` argument will not be passed to the model.
parallel_tool_calls: Set to `False` to disable parallel tool use.
Defaults to `None` (no specification, which allows parallel tool use).
response_format: Optional schema to format model response. If provided
and the model does not call a tool, the model will generate a
[structured response](https://platform.openai.com/docs/guides/structured-outputs).
kwargs: Any additional parameters are passed directly to `bind`.
""" # noqa: E501
if parallel_tool_calls is not None:
@@ -1838,6 +1842,11 @@ class BaseChatOpenAI(BaseChatModel):
)
raise ValueError(msg)
kwargs["tool_choice"] = tool_choice
if response_format:
kwargs["response_format"] = _convert_to_openai_response_format(
response_format
)
return super().bind(tools=formatted_tools, **kwargs)
def with_structured_output(
@@ -3479,6 +3488,7 @@ def _convert_to_openai_response_format(
strict is not None
and strict is not response_format["json_schema"].get("strict")
and isinstance(schema, dict)
and "strict" in schema.get("json_schema", {})
):
msg = (
f"Output schema already has 'strict' value set to "

View File

@@ -28,6 +28,7 @@ from langchain_tests.integration_tests.chat_models import (
magic_function,
)
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
from langchain_openai import ChatOpenAI
from tests.unit_tests.fake.callbacks import FakeCallbackHandler
@@ -1146,17 +1147,33 @@ def test_multi_party_conversation() -> None:
assert "Bob" in response.content
def test_structured_output_and_tools() -> None:
class ResponseFormat(BaseModel):
response: str
explanation: str
class ResponseFormat(BaseModel):
response: str
explanation: str
llm = ChatOpenAI(model="gpt-5-nano").bind_tools(
[GenerateUsername], strict=True, response_format=ResponseFormat
class ResponseFormatDict(TypedDict):
response: str
explanation: str
@pytest.mark.parametrize(
"schema", [ResponseFormat, ResponseFormat.model_json_schema(), ResponseFormatDict]
)
def test_structured_output_and_tools(schema: Any) -> None:
llm = ChatOpenAI(model="gpt-5-nano", verbosity="low").bind_tools(
[GenerateUsername], strict=True, response_format=schema
)
response = llm.invoke("What weighs more, a pound of feathers or a pound of gold?")
assert isinstance(response.additional_kwargs["parsed"], ResponseFormat)
if schema == ResponseFormat:
parsed = response.additional_kwargs["parsed"]
assert isinstance(parsed, ResponseFormat)
else:
parsed = json.loads(response.text)
assert isinstance(parsed, dict)
assert parsed["response"]
assert parsed["explanation"]
# Test streaming tool calls
full: BaseMessageChunk | None = None
@@ -1172,10 +1189,6 @@ def test_structured_output_and_tools() -> None:
def test_tools_and_structured_output() -> None:
class ResponseFormat(BaseModel):
response: str
explanation: str
llm = ChatOpenAI(model="gpt-5-nano").with_structured_output(
ResponseFormat, strict=True, include_raw=True, tools=[GenerateUsername]
)

View File

@@ -318,18 +318,23 @@ async def test_parsed_dict_schema_async(schema: Any) -> None:
assert isinstance(parsed["response"], str)
def test_function_calling_and_structured_output() -> None:
@pytest.mark.parametrize("schema", [Foo, Foo.model_json_schema(), FooDict])
def test_function_calling_and_structured_output(schema: Any) -> None:
def multiply(x: int, y: int) -> int:
"""return x * y"""
return x * y
llm = ChatOpenAI(model=MODEL_NAME, use_responses_api=True)
bound_llm = llm.bind_tools([multiply], response_format=Foo, strict=True)
bound_llm = llm.bind_tools([multiply], response_format=schema, strict=True)
# Test structured output
response = llm.invoke("how are ya", response_format=Foo)
parsed = Foo(**json.loads(response.text))
response = llm.invoke("how are ya", response_format=schema)
if schema == Foo:
parsed = schema(**json.loads(response.text))
assert parsed.response
else:
parsed = json.loads(response.text)
assert parsed["response"]
assert parsed == response.additional_kwargs["parsed"]
assert parsed.response
# Test function calling
ai_msg = cast(AIMessage, bound_llm.invoke("whats 5 * 4"))