From 1e35d8f7a988efd2f10af776739d5123d2c0a8af Mon Sep 17 00:00:00 2001 From: ccurme Date: Mon, 29 Jun 2026 08:40:50 -0400 Subject: [PATCH] chore(standard-tests): add sandbox integration test for offloading large `execute` results (#38537) --- .../integration_tests/sandboxes.py | 97 ++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py b/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py index 4c22812e966..115b961cddc 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py +++ b/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py @@ -37,12 +37,19 @@ import base64 import shlex import sys from abc import abstractmethod -from typing import TYPE_CHECKING +from collections.abc import ( + Iterator, # noqa: TC003 # runtime import: pydantic resolves the field annotation +) +from typing import Any import pytest +from langchain_core.language_models.fake_chat_models import GenericFakeChatModel +from langchain_core.messages import AIMessage, HumanMessage, ToolMessage +from pydantic import Field deepagents = pytest.importorskip("deepagents") +from deepagents.backends import CompositeBackend from deepagents.backends.protocol import ( ExecuteResponse, FileDownloadResponse, @@ -50,17 +57,29 @@ from deepagents.backends.protocol import ( ReadResult, SandboxBackendProtocol, ) +from deepagents.graph import create_deep_agent from langchain_tests.base import BaseStandardTests -if TYPE_CHECKING: - from collections.abc import Iterator - def _quote(path: str) -> str: return shlex.quote(path) +class _ScriptedToolModel(GenericFakeChatModel): + """Fake chat model that replays scripted `AIMessage`s and accepts any tools. + + `bind_tools` is a no-op so scripted tool calls pass through unchanged, and + `messages` is excluded from serialization so tracing cannot consume the + iterator before the agent pulls from it. 
+ """ + + messages: Iterator[AIMessage | str] = Field(exclude=True) + + def bind_tools(self, tools: Any, **kwargs: Any) -> _ScriptedToolModel: # noqa: ARG002 + return self + + class SandboxIntegrationTests(BaseStandardTests): """Standard integration tests for a `SandboxBackendProtocol` implementation.""" @@ -212,6 +231,76 @@ class SandboxIntegrationTests(BaseStandardTests): assert len(result.output) >= 500 * 1024 assert result.output.startswith("x") + def test_execute_capture_at_source_offload( + self, sandbox_backend: SandboxBackendProtocol, sandbox_test_root: str + ) -> None: + """A large `execute` result is offloaded to a file and read back. + + Drives a deep agent (fake model -> `execute` tool call) end-to-end and + checks that an over-budget result is not returned inline: the agent gets + a head/tail preview plus a `read_file` pointer, and the full output is + retrievable from the sandbox. The offload happens via capture-at-source + when the sandbox supports it, or the middleware's generic eviction + otherwise -- this asserts the outcome, not the mechanism. `artifacts_root` + is pinned under the writable test root so the offload file does not land + at the sandbox filesystem root. + """ + if not self.has_sync: + pytest.skip("Sync tests not supported.") + + backend = CompositeBackend( + default=sandbox_backend, routes={}, artifacts_root=sandbox_test_root + ) + # ~214 KiB of output, comfortably over the default eviction budget. 
+ command = ( + "for i in $(seq 1 5000); do " + 'echo "L$i: padding to clear the eviction budget"; done' + ) + model = _ScriptedToolModel( + messages=iter( + [ + AIMessage( + content="", + tool_calls=[ + { + "name": "execute", + "args": {"command": command}, + "id": "exec_1", + "type": "tool_call", + } + ], + ), + AIMessage(content="done"), + ] + ) + ) + agent = create_deep_agent(model=model, backend=backend) + + result = agent.invoke({"messages": [HumanMessage(content="run it")]}) + + tool_messages = [ + m + for m in result["messages"] + if isinstance(m, ToolMessage) and m.name == "execute" + ] + assert tool_messages, "execute tool was not called" + content = tool_messages[0].content + capture_path = self.sandbox_path( + "large_tool_results/exec_1", root_dir=sandbox_test_root + ) + # Offloaded: a read_file pointer and a head/tail preview, middle omitted. + assert capture_path in content + assert "read_file" in content + assert "L1:" in content + assert "L100:" not in content + + # The full output is preserved on the sandbox and readable back. + read_back = sandbox_backend.read(capture_path) + assert isinstance(read_back, ReadResult) + assert read_back.error is None + assert read_back.file_data is not None + assert "L100:" in read_back.file_data["content"] + def test_edit_single_occurrence( self, sandbox_backend: SandboxBackendProtocol, sandbox_test_root: str ) -> None: