test(langchain): fix benchmark quality issues from code review

- Move middleware construction inside benchmarked lambdas for fresh instances
- Rework memory test to observation-only with print output (no hard assertion)
- Add deeply-nested Pydantic schema tool (RouteSchema) to LARGE_TOOLS (15 tools)
- Update docstrings to document '10 accesses per iteration' in schema benchmarks
- Replace `_ = expr` assignments with bare expressions in schema benchmarks
- Mark memory test with @pytest.mark.benchmark to exclude from normal runs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sydney Runkle
2026-04-21 08:29:36 -04:00
parent 3b030cb9f2
commit 4cd09d39ae

View File

@@ -114,6 +114,35 @@ def structured_tool_4(rows: int, cols: int, fill: float) -> list[list[float]]:
return [[fill] * cols for _ in range(rows)]
# Leaf schema for the deeply-nested tool benchmark: a bare GPS point.
# NOTE(review): deliberately no class docstring — pydantic surfaces it as the
# JSON-schema "description", which would change the schema being benchmarked.
class CoordinateSchema(BaseModel):
    lat: float = Field(description="Latitude")
    lon: float = Field(description="Longitude")
# Mid-level schema: nests CoordinateSchema one level deep and mixes a
# required nested-model field with an optional scalar default.
class LocationSchema(BaseModel):
    name: str = Field(description="Location name")
    coordinate: CoordinateSchema = Field(description="GPS coordinate")
    altitude_m: float = Field(default=0.0, description="Altitude in meters")
# Top-level schema: three levels of nesting (Route -> Location -> Coordinate)
# plus a list-of-models field with a default_factory, so schema conversion is
# exercised on the shapes real-world tools actually use.
class RouteSchema(BaseModel):
    origin: LocationSchema = Field(description="Starting location")
    destination: LocationSchema = Field(description="Ending location")
    waypoints: list[LocationSchema] = Field(default_factory=list, description="Intermediate stops")
    max_distance_km: float = Field(default=1000.0, description="Maximum route distance")
# Tool backed by the deeply-nested RouteSchema, added so the large-tools
# benchmarks cover schema conversion for multi-level nested pydantic models.
# NOTE(review): the docstring doubles as the tool description sent to the
# model, so its exact text is part of runtime behavior — left unchanged.
@tool(args_schema=RouteSchema)
def deep_nested_tool(
    origin: LocationSchema,
    destination: LocationSchema,
    waypoints: list[LocationSchema],
    max_distance_km: float,
) -> dict[str, Any]:
    """Plan a route between locations with deep nested schema."""
    # Trivial body on purpose: the cost being benchmarked is argument-schema
    # construction/validation, not route planning.
    return {"origin": origin.name, "destination": destination.name}
@tool
def complex_tool_1(
name: str,
@@ -187,6 +216,7 @@ LARGE_TOOLS = [
structured_tool_2,
structured_tool_3,
structured_tool_4,
deep_nested_tool,
complex_tool_1,
complex_tool_2,
complex_tool_3,
@@ -224,47 +254,48 @@ def test_create_agent_medium_tools(benchmark: BenchmarkFixture) -> None:
@pytest.mark.benchmark
def test_create_agent_large_tools(benchmark: BenchmarkFixture) -> None:
    """15 tools including complex nested schemas."""
    # deep_nested_tool brings LARGE_TOOLS from 14 to 15 entries.
    benchmark(lambda: create_agent(model=_make_model(), tools=LARGE_TOOLS))
@pytest.mark.benchmark
def test_create_agent_large_tools_with_middleware(benchmark: BenchmarkFixture) -> None:
    """15 tools + full middleware stack."""

    def run() -> None:
        # Build the middleware inside the benchmarked callable so every
        # iteration measures fresh instances instead of reusing state
        # constructed once outside the timing loop.
        middleware: Sequence[AgentMiddleware[Any, Any]] = (
            TodoListMiddleware(),
            ToolRetryMiddleware(),
            ModelRetryMiddleware(),
        )
        create_agent(
            model=_make_model(),
            tools=LARGE_TOOLS,
            middleware=middleware,
        )

    benchmark(run)
@pytest.mark.benchmark
def test_tool_call_schema_repeated_access(benchmark: BenchmarkFixture) -> None:
    """Measure cost of repeated .tool_call_schema access on a complex tool (10 accesses per iteration)."""
    t = structured_tool_1

    def access_schema_10x() -> None:
        for _ in range(10):
            # Bare expression on purpose: we only want the property access,
            # and a `_ =` assignment would add an irrelevant store.
            t.tool_call_schema

    benchmark(access_schema_10x)
@pytest.mark.benchmark
def test_tool_args_repeated_access(benchmark: BenchmarkFixture) -> None:
    """Measure cost of repeated .args access on a complex tool (10 accesses per iteration)."""
    t = structured_tool_1

    def access_args_10x() -> None:
        for _ in range(10):
            # Bare expression on purpose: we only want the property access,
            # and a `_ =` assignment would add an irrelevant store.
            t.args

    benchmark(access_args_10x)
@@ -272,12 +303,15 @@ def test_tool_args_repeated_access(benchmark: BenchmarkFixture) -> None:
@pytest.mark.benchmark
def test_create_agent_instantiation_with_middleware(benchmark: BenchmarkFixture) -> None:
    """Baseline with middleware, no tools."""

    def run() -> None:
        # Fresh middleware instances per iteration, mirroring the large-tools
        # middleware benchmark, so construction cost is included each time.
        middleware: Sequence[AgentMiddleware[Any, Any]] = (
            TodoListMiddleware(),
            ToolRetryMiddleware(),
            ModelRetryMiddleware(),
        )
        create_agent(model=_make_model(), middleware=middleware)

    benchmark(run)
# ---------------------------------------------------------------------------
@@ -285,11 +319,19 @@ def test_create_agent_instantiation_with_middleware(benchmark: BenchmarkFixture)
# ---------------------------------------------------------------------------
@pytest.mark.benchmark
def test_create_agent_large_tools_memory() -> None:
    """Observe peak memory for large-tools agent creation.

    This is not a hard assertion — it records the tracemalloc peak for the
    memory allocated *during* create_agent. Run before and after optimization
    passes to track improvement. Update the printed baseline comment below
    when the number changes significantly.
    """
    tracemalloc.start()
    try:
        create_agent(model=_make_model(), tools=LARGE_TOOLS)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if create_agent raises; otherwise the
        # tracemalloc overhead leaks into every subsequent test in the run.
        tracemalloc.stop()
    peak_kb = peak / 1024
    # Baseline (pre-optimization): ~recorded after first run
    print(f"\nPeak memory during create_agent (15 tools): {peak_kb:.1f} KB")