agent: fix get_oom_event deadlock after connection restart

When the agent-protocol-forwarder's inbound connection restarts (e.g.
during a Cloud API Adaptor restart in peer pod environments), the shim
re-sends a GetOOMEvent request through the new connection. Since the
forwarder→agent Unix socket survives the restart, the old handler from
the previous connection remains alive, holding the event_rx lock while
blocked in recv().await.

The new handler acquires the sandbox lock, then attempts to acquire the
event_rx lock — which is held by the old handler. Because the sandbox
lock is still held during this wait, every subsequent RPC
(ExecProcess, WaitProcess, StatsContainer, SignalProcess, etc.) blocks
on the sandbox lock, rendering the pod completely unresponsive.

The root cause is a lock ordering violation: get_oom_event held the
sandbox lock while acquiring the event_rx lock. Fix this by scoping the
sandbox lock acquisition so it is dropped before the event_rx lock is
acquired. The sandbox lock is only needed to clone the Arc<Mutex<Receiver>>
— once cloned, it can be released immediately.

Assisted-by: Claude Code <noreply@anthropic.com>
Signed-off-by: Thejas N <thn@redhat.com>
This commit is contained in:
Thejas N
2026-05-04 14:47:37 +05:30
committed by Fabiano Fidêncio
parent 37c4a0b6a2
commit 7807aa3d62

View File

@@ -1667,10 +1667,11 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::GetOOMEventRequest,
) -> ttrpc::Result<OOMEvent> {
is_allowed(&req).await?;
let s = self.sandbox.lock().await;
let event_rx = &s.event_rx.clone();
let event_rx = {
let s = self.sandbox.lock().await;
s.event_rx.clone()
};
let mut event_rx = event_rx.lock().await;
drop(s);
let container_id = event_rx
.recv()
@@ -3606,4 +3607,75 @@ COMMIT
assert_eq!(d.result, result, "{msg}");
}
}
#[tokio::test]
async fn test_get_oom_event_no_deadlock() {
let logger = slog::Logger::root(slog::Discard, o!());
let sandbox = Sandbox::new(&logger).unwrap();
let agent_service = Arc::new(AgentService {
sandbox: Arc::new(Mutex::new(sandbox)),
init_mode: true,
oma: None,
});
let svc1 = agent_service.clone();
let handle1 = tokio::spawn(async move {
let ctx = mk_ttrpc_context();
let req = protocols::agent::GetOOMEventRequest::default();
svc1.get_oom_event(&ctx, req).await
});
// Yield until handler #1 has released the sandbox lock (entered recv()).
// Each yield_now() gives the spawned task a chance to make progress.
tokio::time::timeout(std::time::Duration::from_secs(1), async {
loop {
tokio::task::yield_now().await;
if agent_service.sandbox.try_lock().is_ok() {
return;
}
}
})
.await
.expect("sandbox lock should be free while get_oom_event waits");
let svc2 = agent_service.clone();
let handle2 = tokio::spawn(async move {
let ctx = mk_ttrpc_context();
let req = protocols::agent::GetOOMEventRequest::default();
svc2.get_oom_event(&ctx, req).await
});
// Yield until handler #2 has also released the sandbox lock (entered recv()).
tokio::time::timeout(std::time::Duration::from_secs(1), async {
loop {
tokio::task::yield_now().await;
if agent_service.sandbox.try_lock().is_ok() {
return;
}
}
})
.await
.expect("sandbox lock should be free with two concurrent get_oom_event handlers");
let tx = {
let s = agent_service.sandbox.lock().await;
s.event_tx.as_ref().unwrap().clone()
};
tx.send("container-1".to_string()).await.unwrap();
tx.send("container-2".to_string()).await.unwrap();
let result1 = tokio::time::timeout(std::time::Duration::from_secs(5), handle1).await;
let result2 = tokio::time::timeout(std::time::Duration::from_secs(5), handle2).await;
assert!(result1.is_ok(), "handler #1 timed out — possible deadlock");
assert!(result2.is_ok(), "handler #2 timed out — possible deadlock");
let resp1 = result1.unwrap().unwrap().unwrap();
let resp2 = result2.unwrap().unwrap().unwrap();
let mut ids: Vec<String> = vec![resp1.container_id, resp2.container_id];
ids.sort();
assert_eq!(ids, vec!["container-1", "container-2"]);
}
}