openai[patch]: support multi-turn computer use (#30410)
Here we accept ToolMessages of the form
```python
ToolMessage(
    content=<representation of screenshot>,  # see below
    tool_call_id="abc123",
    additional_kwargs={"type": "computer_call_output"},
)
```
and translate them to `computer_call_output` items for the Responses
API.
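Concretely, a message like the one above is translated to an item of roughly this shape (a sketch based on the translation helper in the diff below; base64 payload abbreviated):
```python
{
    "type": "computer_call_output",
    "call_id": "abc123",
    "output": {
        "type": "input_image",
        "image_url": "data:image/png;base64,<screenshot bytes>",
    },
}
```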
We also propagate `reasoning` items from AIMessages (carried in `additional_kwargs`).
## Example
### Load screenshots
```python
import base64


def load_png_as_base64(file_path):
    with open(file_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return encoded_string.decode("utf-8")


screenshot_1_base64 = load_png_as_base64("/path/to/screenshot/of/application.png")
screenshot_2_base64 = load_png_as_base64("/path/to/screenshot/of/desktop.png")
```
### Initial message and response
```python
from langchain_core.messages import HumanMessage, ToolMessage
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(
    model="computer-use-preview",
    model_kwargs={"truncation": "auto"},
)

tool = {
    "type": "computer_use_preview",
    "display_width": 1024,
    "display_height": 768,
    "environment": "browser",
}
llm_with_tools = llm.bind_tools([tool])
input_message = HumanMessage(
    content=[
        {
            "type": "text",
            "text": (
                "Click the red X to close and reveal my Desktop. "
                "Proceed, no confirmation needed."
            ),
        },
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{screenshot_1_base64}",
        },
    ]
)

response = llm_with_tools.invoke(
    [input_message],
    reasoning={
        "generate_summary": "concise",
    },
)
response.additional_kwargs["tool_outputs"]
```
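The final line surfaces the action the model wants taken: a list of `computer_call` items. Illustratively (values will vary; the exact fields are defined by OpenAI's Responses API), an entry looks something like:
```python
[
    {
        "type": "computer_call",
        "id": "cu_abc123",
        "call_id": "call_abc123",
        "action": {"type": "click", "button": "left", "x": 1012, "y": 380},
        "pending_safety_checks": [],
        "status": "completed",
    }
]
```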
### Construct ToolMessage
```python
tool_call_id = response.additional_kwargs["tool_outputs"][0]["call_id"]
tool_message = ToolMessage(
    content=[
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{screenshot_2_base64}",
        }
    ],
    # content=f"data:image/png;base64,{screenshot_2_base64}",  # <-- also acceptable
    tool_call_id=tool_call_id,
    additional_kwargs={"type": "computer_call_output"},
)
```
### Invoke again
```python
messages = [
    input_message,
    response,
    tool_message,
]
response_2 = llm_with_tools.invoke(
    messages,
    reasoning={
        "generate_summary": "concise",
    },
)
```
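Generalizing the two turns above: a minimal agent loop keeps executing the model's `computer_call` actions and feeding screenshots back until none remain. A sketch, assuming the `llm_with_tools`, `input_message`, and imports from the example above; `execute_action_and_screenshot` is a hypothetical helper you would implement with your own automation layer (e.g., Playwright):
```python
# Multi-turn loop sketch. `execute_action_and_screenshot` is a hypothetical
# helper that performs the requested action and returns a base64 screenshot.
messages = [input_message]
response = llm_with_tools.invoke(messages, reasoning={"generate_summary": "concise"})
while computer_calls := [
    item
    for item in response.additional_kwargs.get("tool_outputs", [])
    if item["type"] == "computer_call"
]:
    call = computer_calls[0]
    screenshot_base64 = execute_action_and_screenshot(call["action"])
    messages.append(response)
    messages.append(
        ToolMessage(
            content=[
                {
                    "type": "input_image",
                    "image_url": f"data:image/png;base64,{screenshot_base64}",
                }
            ],
            tool_call_id=call["call_id"],
            additional_kwargs={"type": "computer_call_output"},
        )
    )
    response = llm_with_tools.invoke(
        messages, reasoning={"generate_summary": "concise"}
    )
```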
The changes in `ChatOpenAI` and the Responses API translation helpers:
```diff
@@ -2291,7 +2291,7 @@ class ChatOpenAI(BaseChatOpenAI):  # type: ignore[override]
         self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
     ) -> Iterator[ChatGenerationChunk]:
         """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
             return super()._stream_responses(*args, **kwargs)
         else:
             stream_usage = self._should_stream_usage(stream_usage, **kwargs)
@@ -2309,7 +2309,7 @@ class ChatOpenAI(BaseChatOpenAI):  # type: ignore[override]
         self, *args: Any, stream_usage: Optional[bool] = None, **kwargs: Any
     ) -> AsyncIterator[ChatGenerationChunk]:
         """Set default stream_options."""
-        if self._use_responses_api(kwargs):
+        if self._use_responses_api({**kwargs, **self.model_kwargs}):
             async for chunk in super()._astream_responses(*args, **kwargs):
                 yield chunk
         else:
@@ -2942,6 +2942,25 @@ def _construct_responses_api_payload(
     return payload
 
 
+def _make_computer_call_output_from_message(message: ToolMessage) -> dict:
+    computer_call_output: dict = {
+        "call_id": message.tool_call_id,
+        "type": "computer_call_output",
+    }
+    if isinstance(message.content, list):
+        # Use first input_image block
+        output = next(
+            block
+            for block in message.content
+            if cast(dict, block)["type"] == "input_image"
+        )
+    else:
+        # string, assume image_url
+        output = {"type": "input_image", "image_url": message.content}
+    computer_call_output["output"] = output
+    return computer_call_output
+
+
 def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
     input_ = []
     for lc_msg in messages:
@@ -2951,15 +2970,26 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
             msg.pop("name")
         if msg["role"] == "tool":
             tool_output = msg["content"]
-            if not isinstance(tool_output, str):
-                tool_output = _stringify(tool_output)
-            function_call_output = {
-                "type": "function_call_output",
-                "output": tool_output,
-                "call_id": msg["tool_call_id"],
-            }
-            input_.append(function_call_output)
+            if lc_msg.additional_kwargs.get("type") == "computer_call_output":
+                computer_call_output = _make_computer_call_output_from_message(
+                    cast(ToolMessage, lc_msg)
+                )
+                input_.append(computer_call_output)
+            else:
+                if not isinstance(tool_output, str):
+                    tool_output = _stringify(tool_output)
+                function_call_output = {
+                    "type": "function_call_output",
+                    "output": tool_output,
+                    "call_id": msg["tool_call_id"],
+                }
+                input_.append(function_call_output)
         elif msg["role"] == "assistant":
+            # Reasoning items
+            reasoning_items = []
+            if reasoning := lc_msg.additional_kwargs.get("reasoning"):
+                reasoning_items.append(reasoning)
             # Function calls
             function_calls = []
             if tool_calls := msg.pop("tool_calls", None):
                 # TODO: should you be able to preserve the function call object id on
@@ -2979,7 +3009,12 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                 ):
                     function_call["id"] = _id
                 function_calls.append(function_call)
+            # Computer calls
+            computer_calls = []
+            tool_outputs = lc_msg.additional_kwargs.get("tool_outputs", [])
+            for tool_output in tool_outputs:
+                if tool_output.get("type") == "computer_call":
+                    computer_calls.append(tool_output)
             msg["content"] = msg.get("content") or []
             if lc_msg.additional_kwargs.get("refusal"):
                 if isinstance(msg["content"], str):
@@ -3013,7 +3048,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                 msg["content"] = new_blocks
             if msg["content"]:
                 input_.append(msg)
+            input_.extend(reasoning_items)
             input_.extend(function_calls)
+            input_.extend(computer_calls)
         elif msg["role"] == "user":
             if isinstance(msg["content"], list):
                 new_blocks = []
@@ -3220,6 +3257,8 @@ def _convert_responses_chunk_to_generation_chunk(
         )
     if parsed := msg.additional_kwargs.get("parsed"):
         additional_kwargs["parsed"] = parsed
+    if reasoning := msg.additional_kwargs.get("reasoning"):
+        additional_kwargs["reasoning"] = reasoning
     usage_metadata = msg.usage_metadata
     response_metadata = {
         k: v for k, v in msg.response_metadata.items() if k != "id"
@@ -3245,6 +3284,7 @@ def _convert_responses_chunk_to_generation_chunk(
     elif chunk.type == "response.output_item.done" and chunk.item.type in (
         "web_search_call",
         "file_search_call",
+        "computer_call",
     ):
         additional_kwargs["tool_outputs"] = [
             chunk.item.model_dump(exclude_none=True, mode="json")
```
And the accompanying integration tests:
```diff
@@ -286,10 +286,14 @@ def test_reasoning() -> None:
     assert isinstance(response, AIMessage)
     assert response.additional_kwargs["reasoning"]
 
-    # Test init params
+    # Test init params + streaming
     llm = ChatOpenAI(model="o3-mini", reasoning_effort="low", use_responses_api=True)
     response = llm.invoke("Hello")
     assert isinstance(response, AIMessage)
     assert response.additional_kwargs["reasoning"]
+    full: Optional[BaseMessageChunk] = None
+    for chunk in llm.stream("Hello"):
+        assert isinstance(chunk, AIMessageChunk)
+        full = chunk if full is None else full + chunk
+    assert isinstance(full, AIMessage)
+    assert full.additional_kwargs["reasoning"]
 
 
 def test_stateful_api() -> None:
@@ -304,6 +308,24 @@ def test_stateful_api() -> None:
     assert "bobo" in second_response.content[0]["text"].lower()  # type: ignore
 
 
+def test_route_from_model_kwargs() -> None:
+    llm = ChatOpenAI(model=MODEL_NAME, model_kwargs={"truncation": "auto"})
+    _ = next(llm.stream("Hello"))
+
+
+def test_computer_calls() -> None:
+    llm = ChatOpenAI(model="computer-use-preview", model_kwargs={"truncation": "auto"})
+    tool = {
+        "type": "computer_use_preview",
+        "display_width": 1024,
+        "display_height": 768,
+        "environment": "browser",
+    }
+    llm_with_tools = llm.bind_tools([tool], tool_choice="any")
+    response = llm_with_tools.invoke("Please wait a moment.")
+    assert response.additional_kwargs["tool_outputs"]
+
+
 def test_file_search() -> None:
     pytest.skip()  # TODO: set up infra
     llm = ChatOpenAI(model=MODEL_NAME)
```
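To make the new helper's behavior concrete, here is how the two accepted `ToolMessage` content forms map to the same `computer_call_output` item (a sketch mirroring `_make_computer_call_output_from_message` above; base64 payloads abbreviated):
```python
from langchain_core.messages import ToolMessage

# List form: the first "input_image" block is used verbatim as the output.
ToolMessage(
    content=[{"type": "input_image", "image_url": "data:image/png;base64,AAA="}],
    tool_call_id="call_1",
    additional_kwargs={"type": "computer_call_output"},
)
# -> {"call_id": "call_1", "type": "computer_call_output",
#     "output": {"type": "input_image", "image_url": "data:image/png;base64,AAA="}}

# String form: the content is assumed to already be an image URL / data URL.
ToolMessage(
    content="data:image/png;base64,AAA=",
    tool_call_id="call_2",
    additional_kwargs={"type": "computer_call_output"},
)
# -> {"call_id": "call_2", "type": "computer_call_output",
#     "output": {"type": "input_image", "image_url": "data:image/png;base64,AAA="}}
```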