diff --git a/docs/docs/integrations/chat/openai.ipynb b/docs/docs/integrations/chat/openai.ipynb
index 93c9f73c8d6..cb60106cf26 100644
--- a/docs/docs/integrations/chat/openai.ipynb
+++ b/docs/docs/integrations/chat/openai.ipynb
@@ -655,6 +655,266 @@
     "response.additional_kwargs"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "82b2cfbe-a019-4c6b-a323-a5d7c158cb0d",
+   "metadata": {},
+   "source": [
+    "#### Computer use\n",
+    "\n",
+    "`ChatOpenAI` supports the `\"computer-use-preview\"` model, which is a specialized model for the built-in computer use tool. To enable, pass a [computer use tool](https://platform.openai.com/docs/guides/tools-computer-use) as you would pass another tool.\n",
+    "\n",
+    "Currently, tool outputs for computer use are present in `AIMessage.additional_kwargs[\"tool_outputs\"]`. To reply to the computer use tool call, construct a `ToolMessage` with `{\"type\": \"computer_call_output\"}` in its `additional_kwargs`. The content of the message will be a screenshot. Below, we demonstrate a simple example.\n",
+    "\n",
+    "First, load two screenshots:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0fab26a6-f041-4d40-8d7c-51ae8a1ad698",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "\n",
+    "\n",
+    "def load_png_as_base64(file_path):\n",
+    "    with open(file_path, \"rb\") as image_file:\n",
+    "        encoded_string = base64.b64encode(image_file.read())\n",
+    "    return encoded_string.decode(\"utf-8\")\n",
+    "\n",
+    "\n",
+    "screenshot_1_base64 = load_png_as_base64(\n",
+    "    \"/path/to/screenshot_1.png\"\n",
+    ")  # perhaps a screenshot of an application\n",
+    "screenshot_2_base64 = load_png_as_base64(\n",
+    "    \"/path/to/screenshot_2.png\"\n",
+    ")  # perhaps a screenshot of the Desktop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ff26e977-1bf2-467d-a853-719c1132bb43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "# Initialize model\n",
+    "llm = ChatOpenAI(\n",
+    "    model=\"computer-use-preview\",\n",
+    "    model_kwargs={\"truncation\": \"auto\"},\n",
+    ")\n",
+    "\n",
+    "# Bind computer-use tool\n",
+    "tool = {\n",
+    "    \"type\": \"computer_use_preview\",\n",
+    "    \"display_width\": 1024,\n",
+    "    \"display_height\": 768,\n",
+    "    \"environment\": \"browser\",\n",
+    "}\n",
+    "llm_with_tools = llm.bind_tools([tool])\n",
+    "\n",
+    "# Construct input message\n",
+    "input_message = {\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": [\n",
+    "        {\n",
+    "            \"type\": \"text\",\n",
+    "            \"text\": (\n",
+    "                \"Click the red X to close and reveal my Desktop. \"\n",
+    "                \"Proceed, no confirmation needed.\"\n",
+    "            ),\n",
+    "        },\n",
+    "        {\n",
+    "            \"type\": \"input_image\",\n",
+    "            \"image_url\": f\"data:image/png;base64,{screenshot_1_base64}\",\n",
+    "        },\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "# Invoke model\n",
+    "response = llm_with_tools.invoke(\n",
+    "    [input_message],\n",
+    "    reasoning={\n",
+    "        \"generate_summary\": \"concise\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "714bce19-6360-4c09-ba44-59034050527f",
+   "metadata": {},
+   "source": [
+    "The response will include a call to the computer-use tool in its `additional_kwargs`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e4a12d04-d1ab-4bd5-b93d-7028f9c818fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': {'id': 'rs_67ddb381c85081919c46e3e544a161e8051ff325ba1bad35',\n",
+       "  'summary': [{'text': 'Closing Visual Studio Code application',\n",
+       "    'type': 'summary_text'}],\n",
+       "  'type': 'reasoning'},\n",
+       " 'tool_outputs': [{'id': 'cu_67ddb385358c8191bf1a127b71bcf1ea051ff325ba1bad35',\n",
+       "   'action': {'button': 'left', 'type': 'click', 'x': 17, 'y': 38},\n",
+       "   'call_id': 'call_Ae3Ghz8xdqZQ01mosYhXXMho',\n",
+       "   'pending_safety_checks': [],\n",
+       "   'status': 'completed',\n",
+       "   'type': 'computer_call'}]}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response.additional_kwargs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f54e95aa-715e-4ebe-acbd-681ea832abb0",
+   "metadata": {},
+   "source": [
+    "We next construct a ToolMessage with these properties:\n",
+    "\n",
+    "1. It has a `tool_call_id` matching the `call_id` from the computer-call.\n",
+    "2. It has `{\"type\": \"computer_call_output\"}` in its `additional_kwargs`.\n",
+    "3. Its content is either an `image_url` or an `input_image` output block (see [OpenAI docs](https://platform.openai.com/docs/guides/tools-computer-use#5-repeat) for formatting)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "003626d2-82d9-41c2-995e-e9f8c1520d30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.messages import ToolMessage\n",
+    "\n",
+    "tool_call_id = response.additional_kwargs[\"tool_outputs\"][0][\"call_id\"]\n",
+    "\n",
+    "tool_message = ToolMessage(\n",
+    "    content=[\n",
+    "        {\n",
+    "            \"type\": \"input_image\",\n",
+    "            \"image_url\": f\"data:image/png;base64,{screenshot_2_base64}\",\n",
+    "        }\n",
+    "    ],\n",
+    "    # content=f\"data:image/png;base64,{screenshot_2_base64}\", # <-- also acceptable\n",
+    "    tool_call_id=tool_call_id,\n",
+    "    additional_kwargs={\"type\": \"computer_call_output\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ffa2bc27-389d-4c3a-b646-a9c7eedc2cb7",
+   "metadata": {},
+   "source": [
+    "We can now invoke the model again using the message history:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ad10a31a-9b81-4dde-8a37-1a656543345a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    input_message,\n",
+    "    response,\n",
+    "    tool_message,\n",
+    "]\n",
+    "\n",
+    "response_2 = llm_with_tools.invoke(\n",
+    "    messages,\n",
+    "    reasoning={\n",
+    "        \"generate_summary\": \"concise\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "fb3a7251-890a-467c-ab59-ae0331221964",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Done! The Desktop is now visible.'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response_2.text()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2759df1-317c-4dd9-823a-4aab65e41939",
+   "metadata": {},
+   "source": [
+    "Instead of passing back the entire sequence, we can also use the [previous_response_id](#passing-previous_response_id):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "6a40d11b-2426-48ec-bb5e-19e0b36fd74c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "previous_response_id = response.response_metadata[\"id\"]\n",
+    "\n",
+    "response_2 = llm_with_tools.invoke(\n",
+    "    [tool_message],\n",
+    "    previous_response_id=previous_response_id,\n",
+    "    reasoning={\n",
+    "        \"generate_summary\": \"concise\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "687d2f05-38b7-42a5-b640-bfd6b4753719",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The Visual Studio Code terminal has been closed and your desktop is now visible.'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response_2.text()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "6fda05f0-4b81-4709-9407-f316d760ad50",
diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py
index cb123cfe6b6..4e7faf2c0ed 100644
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -2291,7 +2291,7 @@ class ChatOpenAI(BaseChatOpenAI):  # type: ignore[override]
         self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
     ) -> Iterator[ChatGenerationChunk]:
         """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
             return super()._stream_responses(*args, **kwargs)
         else:
             stream_usage = self._should_stream_usage(stream_usage, **kwargs)
@@ -2309,7 +2309,7 @@
         self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
     ) -> AsyncIterator[ChatGenerationChunk]:
         """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
             async for chunk in super()._astream_responses(*args, **kwargs):
                 yield chunk
         else:
@@ -2942,6 +2942,25 @@ def _construct_responses_api_payload(
     return payload
 
 
+def _make_computer_call_output_from_message(message: ToolMessage) -> dict:
+    computer_call_output: dict = {
+        "call_id": message.tool_call_id,
+        "type": "computer_call_output",
+    }
+    if isinstance(message.content, list):
+        # Use first input_image block
+        output = next(
+            block
+            for block in message.content
+            if cast(dict, block)["type"] == "input_image"
+        )
+    else:
+        # string, assume image_url
+        output = {"type": "input_image", "image_url": message.content}
+    computer_call_output["output"] = output
+    return computer_call_output
+
+
 def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
     input_ = []
     for lc_msg in messages:
@@ -2951,15 +2970,26 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
             msg.pop("name")
         if msg["role"] == "tool":
             tool_output = msg["content"]
-            if not isinstance(tool_output, str):
-                tool_output = _stringify(tool_output)
-            function_call_output = {
-                "type": "function_call_output",
-                "output": tool_output,
-                "call_id": msg["tool_call_id"],
-            }
-            input_.append(function_call_output)
+            if lc_msg.additional_kwargs.get("type") == "computer_call_output":
+                computer_call_output = _make_computer_call_output_from_message(
+                    cast(ToolMessage, lc_msg)
+                )
+                input_.append(computer_call_output)
+            else:
+                if not isinstance(tool_output, str):
+                    tool_output = _stringify(tool_output)
+                function_call_output = {
+                    "type": "function_call_output",
+                    "output": tool_output,
+                    "call_id": msg["tool_call_id"],
+                }
+                input_.append(function_call_output)
         elif msg["role"] == "assistant":
+            # Reasoning items
+            reasoning_items = []
+            if reasoning := lc_msg.additional_kwargs.get("reasoning"):
+                reasoning_items.append(reasoning)
+            # Function calls
             function_calls = []
             if tool_calls := msg.pop("tool_calls", None):
                 # TODO: should you be able to preserve the function call object id on
@@ -2979,7 +3009,12 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                     ):
                         function_call["id"] = _id
                     function_calls.append(function_call)
-
+            # Computer calls
+            computer_calls = []
+            tool_outputs = lc_msg.additional_kwargs.get("tool_outputs", [])
+            for tool_output in tool_outputs:
+                if tool_output.get("type") == "computer_call":
+                    computer_calls.append(tool_output)
             msg["content"] = msg.get("content") or []
             if lc_msg.additional_kwargs.get("refusal"):
                 if isinstance(msg["content"], str):
@@ -3013,7 +3048,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                 msg["content"] = new_blocks
             if msg["content"]:
                 input_.append(msg)
+            input_.extend(reasoning_items)
             input_.extend(function_calls)
+            input_.extend(computer_calls)
         elif msg["role"] == "user":
             if isinstance(msg["content"], list):
                 new_blocks = []
@@ -3220,6 +3257,8 @@ def _convert_responses_chunk_to_generation_chunk(
         )
         if parsed := msg.additional_kwargs.get("parsed"):
             additional_kwargs["parsed"] = parsed
+        if reasoning := msg.additional_kwargs.get("reasoning"):
+            additional_kwargs["reasoning"] = reasoning
         usage_metadata = msg.usage_metadata
         response_metadata = {
             k: v for k, v in msg.response_metadata.items() if k != "id"
@@ -3245,6 +3284,7 @@ def _convert_responses_chunk_to_generation_chunk(
     elif chunk.type == "response.output_item.done" and chunk.item.type in (
         "web_search_call",
         "file_search_call",
+        "computer_call",
     ):
         additional_kwargs["tool_outputs"] = [
             chunk.item.model_dump(exclude_none=True, mode="json")
diff --git a/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py b/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
index fd4a6665761..d827c3c8469 100644
--- a/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
+++ b/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
@@ -286,10 +286,14 @@ def test_reasoning() -> None:
     assert isinstance(response, AIMessage)
     assert response.additional_kwargs["reasoning"]
 
+    # Test init params + streaming
     llm = ChatOpenAI(model="o3-mini", reasoning_effort="low", use_responses_api=True)
-    response = llm.invoke("Hello")
-    assert isinstance(response, AIMessage)
-    assert response.additional_kwargs["reasoning"]
+    full: Optional[BaseMessageChunk] = None
+    for chunk in llm.stream("Hello"):
+        assert isinstance(chunk, AIMessageChunk)
+        full = chunk if full is None else full + chunk
+    assert isinstance(full, AIMessage)
+    assert full.additional_kwargs["reasoning"]
 
 
 def test_stateful_api() -> None:
@@ -304,6 +308,24 @@ def test_stateful_api() -> None:
     assert "bobo" in second_response.content[0]["text"].lower()  # type: ignore
 
 
+def test_route_from_model_kwargs() -> None:
+    llm = ChatOpenAI(model=MODEL_NAME, model_kwargs={"truncation": "auto"})
+    _ = next(llm.stream("Hello"))
+
+
+def test_computer_calls() -> None:
+    llm = ChatOpenAI(model="computer-use-preview", model_kwargs={"truncation": "auto"})
+    tool = {
+        "type": "computer_use_preview",
+        "display_width": 1024,
+        "display_height": 768,
+        "environment": "browser",
+    }
+    llm_with_tools = llm.bind_tools([tool], tool_choice="any")
+    response = llm_with_tools.invoke("Please wait a moment.")
+    assert response.additional_kwargs["tool_outputs"]
+
+
 def test_file_search() -> None:
     pytest.skip()  # TODO: set up infra
     llm = ChatOpenAI(model=MODEL_NAME)