runtime-rs: shut down shim daemon on a failed create

When CreateContainer fails before the runtime instance is registered
(e.g. a hypervisor/cgroup error), no sandbox exists to drive the normal
teardown. containerd's follow-up Shutdown RPC then reaches
get_runtime_instance(), fails with "runtime not ready", and returns
before the service loop is ever told to stop. Because the shim ignores
SIGTERM, the containerd-shim-kata-v2 daemon is left running and orphaned.

Make the Shutdown RPC force the daemon to exit when there is no runtime
instance, emitting the same Action::Shutdown that sandbox.shutdown()
sends on the normal path. This guarantees the shim process is reaped
after a failed create instead of leaking.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <noreply@cursor.com>
This commit is contained in:
Fabiano Fidêncio
2026-06-02 21:38:47 +02:00
parent 2a1ce7b8c4
commit 80e2473440

View File

@@ -535,6 +535,29 @@ impl RuntimeHandlerManager {
Ok(TaskResponse::CreateContainer(shim_pid))
} else {
// A teardown RPC must still make the shim daemon exit even when
// the runtime instance was never (fully) created -- e.g. after a
// failed CreateContainer. In that case containerd's follow-up
// Shutdown would otherwise hit `get_runtime_instance()`, fail with
// "runtime not ready", and the service loop would never receive
// `Action::Shutdown`. Because the shim ignores SIGTERM the daemon
// would then be left running and orphaned by containerd.
if let TaskRequest::ShutdownContainer(_) = &req {
if self.get_runtime_instance().await.is_err() {
warn!(
sl!(),
"shutdown requested but runtime instance is not ready; \
forcing shim exit to avoid an orphaned shim process"
);
let sender = self.inner.read().await.msg_sender.clone();
sender
.send(Message::new(Action::Shutdown))
.await
.context("send shutdown message")?;
return Ok(TaskResponse::ShutdownContainer);
}
}
self.handler_task_request(req)
.await
.context("handler TaskRequest")
@@ -950,3 +973,35 @@ fn configure_non_root_hypervisor(config: &mut Hypervisor) -> Result<()> {
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use common::types::ShutdownRequest;
use tokio::sync::mpsc::channel;
// A ShutdownContainer RPC that arrives before any runtime instance was
// created (e.g. after a failed CreateContainer) must still drive the shim
// daemon to exit, otherwise the process is orphaned. Verify it returns
// ShutdownContainer and emits Action::Shutdown on the service channel.
#[tokio::test]
async fn test_shutdown_without_runtime_instance_forces_exit() {
let (sender, mut receiver) = channel::<Message>(8);
let manager = RuntimeHandlerManager::new("test-sid", sender).unwrap();
let resp = manager
.handler_task_message(TaskRequest::ShutdownContainer(ShutdownRequest {
container_id: "test-sid".to_string(),
is_now: true,
}))
.await
.expect("shutdown should succeed even without a runtime instance");
assert!(matches!(resp, TaskResponse::ShutdownContainer));
let msg = receiver
.try_recv()
.expect("an Action::Shutdown message must be sent to stop the daemon");
assert!(matches!(msg.action, Action::Shutdown));
}
}