openai[patch]: support multi-turn computer use (#30410)

Here we accept ToolMessages of the form

```python
ToolMessage(
    content=<representation of screenshot> (see below),
    tool_call_id="abc123",
    additional_kwargs={"type": "computer_call_output"},
)
```

and translate them to `computer_call_output` items for the Responses API.

We also propagate reasoning items (`additional_kwargs["reasoning"]`) from AIMessages.

## Example

### Load screenshots

```python
import base64

def load_png_as_base64(file_path):
    with open(file_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
        return encoded_string.decode('utf-8')

screenshot_1_base64 = load_png_as_base64("/path/to/screenshot/of/application.png")
screenshot_2_base64 = load_png_as_base64("/path/to/screenshot/of/desktop.png")
```

### Initial message and response

```python
from langchain_core.messages import HumanMessage, ToolMessage
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="computer-use-preview",
    model_kwargs={"truncation": "auto"},
)

tool = {
    "type": "computer_use_preview",
    "display_width": 1024,
    "display_height": 768,
    "environment": "browser"
}
llm_with_tools = llm.bind_tools([tool])

input_message = HumanMessage(
    content=[
        {
            "type": "text",
            "text": (
                "Click the red X to close and reveal my Desktop. "
                "Proceed, no confirmation needed."
            )
        },
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{screenshot_1_base64}",
        }
    ]
)

response = llm_with_tools.invoke(
    [input_message],
    reasoning={
        "generate_summary": "concise",
    },
)
response.additional_kwargs["tool_outputs"]
```

### Construct ToolMessage

```python
tool_call_id = response.additional_kwargs["tool_outputs"][0]["call_id"]

tool_message = ToolMessage(
    content=[
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{screenshot_2_base64}"
        }
    ],
    # content=f"data:image/png;base64,{screenshot_2_base64}",  # <-- also acceptable
    tool_call_id=tool_call_id,
    additional_kwargs={"type": "computer_call_output"},
)
```

### Invoke again

```python
messages = [
    input_message,
    response,
    tool_message,
]

response_2 = llm_with_tools.invoke(
    messages,
    reasoning={
        "generate_summary": "concise",
    },
)
```
2026-01-29 21:30:18 +00:00 · 2025-03-24 11:25:36 -04:00
parent 7bc50730aa
commit ed5e589191
3 changed files with 336 additions and 14 deletions
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -2291,7 +2291,7 @@ class ChatOpenAI(BaseChatOpenAI):  # type: ignore[override]
        self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
    ) -> Iterator[ChatGenerationChunk]:
        """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
            return super()._stream_responses(*args, **kwargs)
        else:
            stream_usage = self._should_stream_usage(stream_usage, **kwargs)
@@ -2309,7 +2309,7 @@ class ChatOpenAI(BaseChatOpenAI):  # type: ignore[override]
        self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
    ) -> AsyncIterator[ChatGenerationChunk]:
        """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
            async for chunk in super()._astream_responses(*args, **kwargs):
                yield chunk
        else:
@@ -2942,6 +2942,25 @@ def _construct_responses_api_payload(
    return payload


+def _make_computer_call_output_from_message(message: ToolMessage) -> dict:
+    """Convert a ToolMessage into a Responses API `computer_call_output` item."""
+    if isinstance(message.content, list):
+        # Use first input_image block; str blocks and untyped dicts are skipped
+        images = (
+            block
+            for block in message.content
+            if isinstance(block, dict) and block.get("type") == "input_image"
+        )
+        output = next(images, None)
+        if output is None:  # bare next() would leak an opaque StopIteration
+            raise ValueError("computer_call_output requires an input_image block.")
+    else:
+        # string, assume image_url
+        output = {"type": "input_image", "image_url": message.content}
+    call_id = message.tool_call_id
+    return {"call_id": call_id, "type": "computer_call_output", "output": output}
+
+
 def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
    input_ = []
    for lc_msg in messages:
@@ -2951,15 +2970,26 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
            msg.pop("name")
        if msg["role"] == "tool":
            tool_output = msg["content"]
-            if not isinstance(tool_output, str):
-                tool_output = _stringify(tool_output)
-            function_call_output = {
-                "type": "function_call_output",
-                "output": tool_output,
-                "call_id": msg["tool_call_id"],
-            }
-            input_.append(function_call_output)
+            if lc_msg.additional_kwargs.get("type") == "computer_call_output":
+                computer_call_output = _make_computer_call_output_from_message(
+                    cast(ToolMessage, lc_msg)
+                )
+                input_.append(computer_call_output)
+            else:
+                if not isinstance(tool_output, str):
+                    tool_output = _stringify(tool_output)
+                function_call_output = {
+                    "type": "function_call_output",
+                    "output": tool_output,
+                    "call_id": msg["tool_call_id"],
+                }
+                input_.append(function_call_output)
        elif msg["role"] == "assistant":
+            # Reasoning items
+            reasoning_items = []
+            if reasoning := lc_msg.additional_kwargs.get("reasoning"):
+                reasoning_items.append(reasoning)
+            # Function calls
            function_calls = []
            if tool_calls := msg.pop("tool_calls", None):
                # TODO: should you be able to preserve the function call object id on
@@ -2979,7 +3009,12 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                    ):
                        function_call["id"] = _id
                    function_calls.append(function_call)
-
+            # Computer calls
+            computer_calls = []
+            tool_outputs = lc_msg.additional_kwargs.get("tool_outputs", [])
+            for tool_output in tool_outputs:
+                if tool_output.get("type") == "computer_call":
+                    computer_calls.append(tool_output)
            msg["content"] = msg.get("content") or []
            if lc_msg.additional_kwargs.get("refusal"):
                if isinstance(msg["content"], str):
@@ -3013,7 +3048,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                msg["content"] = new_blocks
            if msg["content"]:
                input_.append(msg)
+            input_.extend(reasoning_items)
            input_.extend(function_calls)
+            input_.extend(computer_calls)
        elif msg["role"] == "user":
            if isinstance(msg["content"], list):
                new_blocks = []
@@ -3220,6 +3257,8 @@ def _convert_responses_chunk_to_generation_chunk(
        )
        if parsed := msg.additional_kwargs.get("parsed"):
            additional_kwargs["parsed"] = parsed
+        if reasoning := msg.additional_kwargs.get("reasoning"):
+            additional_kwargs["reasoning"] = reasoning
        usage_metadata = msg.usage_metadata
        response_metadata = {
            k: v for k, v in msg.response_metadata.items() if k != "id"
@@ -3245,6 +3284,7 @@ def _convert_responses_chunk_to_generation_chunk(
    elif chunk.type == "response.output_item.done" and chunk.item.type in (
        "web_search_call",
        "file_search_call",
+        "computer_call",
    ):
        additional_kwargs["tool_outputs"] = [
            chunk.item.model_dump(exclude_none=True, mode="json")
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
@@ -286,10 +286,14 @@ def test_reasoning() -> None:
    assert isinstance(response, AIMessage)
    assert response.additional_kwargs["reasoning"]

+    # Test init params + streaming
    llm = ChatOpenAI(model="o3-mini", reasoning_effort="low", use_responses_api=True)
-    response = llm.invoke("Hello")
-    assert isinstance(response, AIMessage)
-    assert response.additional_kwargs["reasoning"]
+    full: Optional[BaseMessageChunk] = None
+    for chunk in llm.stream("Hello"):
+        assert isinstance(chunk, AIMessageChunk)
+        full = chunk if full is None else full + chunk
+    assert isinstance(full, AIMessage)
+    assert full.additional_kwargs["reasoning"]


 def test_stateful_api() -> None:
@@ -304,6 +308,24 @@ def test_stateful_api() -> None:
    assert "bobo" in second_response.content[0]["text"].lower()  # type: ignore


+def test_route_from_model_kwargs() -> None:
+    chat = ChatOpenAI(model=MODEL_NAME, model_kwargs={"truncation": "auto"})
+    next(iter(chat.stream("Hello")))  # pull only the first chunk; value unused
+
+
+def test_computer_calls() -> None:
+    """With `tool_choice="any"`, the reply must carry non-empty `tool_outputs`."""
+    computer_tool = {
+        "type": "computer_use_preview",
+        "display_width": 1024,
+        "display_height": 768,
+        "environment": "browser",
+    }
+    llm = ChatOpenAI(model="computer-use-preview", model_kwargs={"truncation": "auto"})
+    bound = llm.bind_tools([computer_tool], tool_choice="any")
+    assert bound.invoke("Please wait a moment.").additional_kwargs["tool_outputs"]
+
+
 def test_file_search() -> None:
    pytest.skip()  # TODO: set up infra
    llm = ChatOpenAI(model=MODEL_NAME)