From 7807aa3d62a677ccb6ecdde72ba728c2f9490dbb Mon Sep 17 00:00:00 2001 From: Thejas N Date: Mon, 4 May 2026 14:47:37 +0530 Subject: [PATCH] agent: fix get_oom_event deadlock after connection restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the agent-protocol-forwarder's inbound connection restarts (e.g. during a Cloud API Adaptor restart in peer pod environments), the shim re-sends a GetOOMEvent request through the new connection. Since the forwarder→agent Unix socket survives the restart, the old handler from the previous connection remains alive, holding the event_rx lock while blocked in recv().await. The new handler acquires the sandbox lock, then attempts to acquire the event_rx lock — which is held by the old handler. Because the sandbox lock is still held during this wait, every subsequent RPC (ExecProcess, WaitProcess, StatsContainer, SignalProcess, etc.) blocks on the sandbox lock, rendering the pod completely unresponsive. The root cause is a lock ordering violation: get_oom_event held the sandbox lock while acquiring the event_rx lock. Fix this by scoping the sandbox lock acquisition so it is dropped before the event_rx lock is acquired. The sandbox lock is only needed to clone the Arc> — once cloned, it can be released immediately. Assisted-by: Claude Code Signed-off-by: Thejas N --- src/agent/src/rpc.rs | 78 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 22b8d5b73e..f0c0647d77 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -1667,10 +1667,11 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::GetOOMEventRequest, ) -> ttrpc::Result { is_allowed(&req).await?; - let s = self.sandbox.lock().await; - let event_rx = &s.event_rx.clone(); + let event_rx = { + let s = self.sandbox.lock().await; + s.event_rx.clone() + }; let mut event_rx = event_rx.lock().await; - drop(s); let container_id = event_rx .recv() @@ -3606,4 +3607,75 @@ COMMIT assert_eq!(d.result, result, "{msg}"); } } + + #[tokio::test] + async fn test_get_oom_event_no_deadlock() { + let logger = slog::Logger::root(slog::Discard, o!()); + let sandbox = Sandbox::new(&logger).unwrap(); + + let agent_service = Arc::new(AgentService { + sandbox: Arc::new(Mutex::new(sandbox)), + init_mode: true, + oma: None, + }); + + let svc1 = agent_service.clone(); + let handle1 = tokio::spawn(async move { + let ctx = mk_ttrpc_context(); + let req = protocols::agent::GetOOMEventRequest::default(); + svc1.get_oom_event(&ctx, req).await + }); + + // Yield until handler #1 has released the sandbox lock (entered recv()). + // Each yield_now() gives the spawned task a chance to make progress. + tokio::time::timeout(std::time::Duration::from_secs(1), async { + loop { + tokio::task::yield_now().await; + if agent_service.sandbox.try_lock().is_ok() { + return; + } + } + }) + .await + .expect("sandbox lock should be free while get_oom_event waits"); + + let svc2 = agent_service.clone(); + let handle2 = tokio::spawn(async move { + let ctx = mk_ttrpc_context(); + let req = protocols::agent::GetOOMEventRequest::default(); + svc2.get_oom_event(&ctx, req).await + }); + + // Yield until handler #2 has also released the sandbox lock (entered recv()). + tokio::time::timeout(std::time::Duration::from_secs(1), async { + loop { + tokio::task::yield_now().await; + if agent_service.sandbox.try_lock().is_ok() { + return; + } + } + }) + .await + .expect("sandbox lock should be free with two concurrent get_oom_event handlers"); + + let tx = { + let s = agent_service.sandbox.lock().await; + s.event_tx.as_ref().unwrap().clone() + }; + tx.send("container-1".to_string()).await.unwrap(); + tx.send("container-2".to_string()).await.unwrap(); + + let result1 = tokio::time::timeout(std::time::Duration::from_secs(5), handle1).await; + let result2 = tokio::time::timeout(std::time::Duration::from_secs(5), handle2).await; + + assert!(result1.is_ok(), "handler #1 timed out — possible deadlock"); + assert!(result2.is_ok(), "handler #2 timed out — possible deadlock"); + + let resp1 = result1.unwrap().unwrap().unwrap(); + let resp2 = result2.unwrap().unwrap().unwrap(); + + let mut ids: Vec = vec![resp1.container_id, resp2.container_id]; + ids.sort(); + assert_eq!(ids, vec!["container-1", "container-2"]); + } }