chore(standard-tests): add sandbox integration test for offloading large execute results (#38537)

2026-06-30 06:05:52 +00:00 · 2026-06-29 08:40:50 -04:00
parent c863b92b9e
commit 1e35d8f7a9
1 changed file with 93 additions and 4 deletions
--- a/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py
+++ b/libs/standard-tests/langchain_tests/integration_tests/sandboxes.py
@@ -37,12 +37,19 @@ import base64
 import shlex
 import sys
 from abc import abstractmethod
-from typing import TYPE_CHECKING
+from collections.abc import (
+    Iterator,  # noqa: TC003  # runtime import: pydantic resolves the field annotation
+)
+from typing import Any

 import pytest
+from langchain_core.language_models.fake_chat_models import GenericFakeChatModel
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+from pydantic import Field

 deepagents = pytest.importorskip("deepagents")

+from deepagents.backends import CompositeBackend
 from deepagents.backends.protocol import (
    ExecuteResponse,
    FileDownloadResponse,
@@ -50,17 +57,29 @@ from deepagents.backends.protocol import (
    ReadResult,
    SandboxBackendProtocol,
 )
+from deepagents.graph import create_deep_agent

 from langchain_tests.base import BaseStandardTests

-if TYPE_CHECKING:
-    from collections.abc import Iterator
-

 def _quote(path: str) -> str:
    return shlex.quote(path)


+class _ScriptedToolModel(GenericFakeChatModel):
+    """Fake chat model that replays scripted `AIMessage`s and accepts any tools.
+
+    `bind_tools` is a no-op so scripted tool calls pass through unchanged, and
+    `messages` is excluded from serialization so tracing cannot consume the
+    iterator before the agent pulls from it.
+    """
+
+    messages: Iterator[AIMessage | str] = Field(exclude=True)  # single-pass iterator
+
+    def bind_tools(self, tools: Any, **kwargs: Any) -> _ScriptedToolModel:  # noqa: ARG002
+        return self  # `tools`/`kwargs` are ignored; the same instance is reused
+
+
 class SandboxIntegrationTests(BaseStandardTests):
    """Standard integration tests for a `SandboxBackendProtocol` implementation."""

@@ -212,6 +231,76 @@ class SandboxIntegrationTests(BaseStandardTests):
        assert len(result.output) >= 500 * 1024
        assert result.output.startswith("x")

+    def test_execute_capture_at_source_offload(
+        self, sandbox_backend: SandboxBackendProtocol, sandbox_test_root: str
+    ) -> None:
+        """A large `execute` result is offloaded to a file and read back.
+
+        Drives a deep agent (fake model -> `execute` tool call) end-to-end and
+        checks that an over-budget result is not returned inline: the agent gets
+        a head/tail preview plus a `read_file` pointer, and the full output is
+        retrievable from the sandbox. The offload happens via capture-at-source
+        when the sandbox supports it, or the middleware's generic eviction
+        otherwise -- this asserts the outcome, not the mechanism. `artifacts_root`
+        is pinned under the writable test root so the offload file does not land
+        at the sandbox filesystem root.
+        """
+        if not self.has_sync:
+            pytest.skip("Sync tests not supported.")
+
+        backend = CompositeBackend(
+            default=sandbox_backend, routes={}, artifacts_root=sandbox_test_root
+        )
+        # ~214 KiB of output, comfortably over the default eviction budget.
+        command = (
+            "for i in $(seq 1 5000); do "
+            'echo "L$i: padding to clear the eviction budget"; done'
+        )
+        model = _ScriptedToolModel(
+            messages=iter(
+                [
+                    AIMessage(
+                        content="",
+                        tool_calls=[
+                            {
+                                "name": "execute",
+                                "args": {"command": command},
+                                "id": "exec_1",  # appears in capture_path below
+                                "type": "tool_call",
+                            }
+                        ],
+                    ),
+                    AIMessage(content="done"),  # second reply: no tool calls
+                ]
+            )
+        )
+        agent = create_deep_agent(model=model, backend=backend)
+
+        result = agent.invoke({"messages": [HumanMessage(content="run it")]})
+
+        tool_messages = [
+            m
+            for m in result["messages"]
+            if isinstance(m, ToolMessage) and m.name == "execute"
+        ]
+        assert tool_messages, "execute tool was not called"
+        content = tool_messages[0].content  # assumed to be a str -- TODO confirm
+        capture_path = self.sandbox_path(
+            "large_tool_results/exec_1", root_dir=sandbox_test_root
+        )
+        # Offloaded: a read_file pointer and a head/tail preview, middle omitted.
+        assert capture_path in content
+        assert "read_file" in content
+        assert "L1:" in content  # first output line, expected in the head
+        assert "L100:" not in content  # assumes the head is < 100 lines
+
+        # The full output is preserved on the sandbox and readable back.
+        read_back = sandbox_backend.read(capture_path)  # read from the raw sandbox
+        assert isinstance(read_back, ReadResult)
+        assert read_back.error is None
+        assert read_back.file_data is not None
+        assert "L100:" in read_back.file_data["content"]  # middle line survived
+
    def test_edit_single_occurrence(
        self, sandbox_backend: SandboxBackendProtocol, sandbox_test_root: str
    ) -> None: