mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-08-31 16:36:38 +00:00
agent: Fix race condition with cgroup watchers
In the CI, test containers intermittently fail to start after creation,
with an error like below (see #10872 for more details):

    # State:          Terminated
    #       Reason:   StartError
    #       Message:  failed to start containerd task
    #         "afd43e77fae0815afbc7205eac78f94859e247968a6a4e8bcbb987690fcf10a6":
    #         No such file or directory (os error 2)

I've observed this error to repro with the following containers, which
have in common that they're all *very short-lived* by design (more tests
might be affected):

* k8s-job.bats
* k8s-seccomp.bats
* k8s-hostname.bats
* k8s-policy-job.bats
* k8s-policy-logs.bats

Furthermore, appending a `; sleep 1` to the command line for those
containers seemed to consistently get rid of the error.

Investigating further, I've uncovered a race between the end of the
container process and the setting up of the cgroup watchers (to report
OOMs). If the process terminates first, the agent will try to watch
cgroup paths that don't exist anymore, and it will fail to start the
container.

The added error context in notifier.rs confirms that the error comes
from the missing cgroup:

https://github.com/kata-containers/kata-containers/actions/runs/13450787436/job/37585901466#step:17:6536

The fix simply consists in creating the watchers *before* we start the
container but still *after* we create it -- this is non-blocking, and
IIUC the cgroup is guaranteed to already be present then.

Fixes: #10872

Signed-off-by: Aurélien Bombo <abombo@microsoft.com>
This commit is contained in:
@@ -77,9 +77,17 @@ async fn register_memory_event_v2(
     let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
 
     // watching oom kill
-    let ev_wd = inotify.add_watch(&event_control_path, WatchMask::MODIFY)?;
+    let ev_wd = inotify
+        .add_watch(&event_control_path, WatchMask::MODIFY)
+        .context(format!("failed to add watch for {:?}", &event_control_path))?;
 
     // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
-    let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
+    let cg_wd = inotify
+        .add_watch(&cgroup_event_control_path, WatchMask::MODIFY)
+        .context(format!(
+            "failed to add watch for {:?}",
+            &cgroup_event_control_path
+        ))?;
 
     info!(sl(), "ev_wd: {:?}", ev_wd);
+    info!(sl(), "cg_wd: {:?}", cg_wd);
@@ -342,24 +342,25 @@ impl AgentService {
     async fn do_start_container(&self, req: protocols::agent::StartContainerRequest) -> Result<()> {
         let mut s = self.sandbox.lock().await;
         let sid = s.id.clone();
-        let cid = req.container_id;
+        let cid = req.container_id.clone();
 
         let ctr = s
             .get_container(&cid)
             .ok_or_else(|| anyhow!("Invalid container id"))?;
-        ctr.exec().await?;
 
-        if sid == cid {
-            return Ok(());
+        if sid != cid {
+            // start oom event loop
+            if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
+                let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
+                s.run_oom_event_monitor(rx, cid.clone()).await;
+            }
         }
 
-        // start oom event loop
-        if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
-            let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
-            s.run_oom_event_monitor(rx, cid).await;
-        }
+        let ctr = s
+            .get_container(&cid)
+            .ok_or_else(|| anyhow!("Invalid container id"))?;
 
-        Ok(())
+        ctr.exec().await
     }
 
     #[instrument]
Reference in New Issue
Block a user