Merge pull request #10911 from kata-containers/sprt/fix-cgroup-race

agent: Fix race condition with cgroup watchers
This commit is contained in:
Aurélien Bombo
2025-02-24 10:28:58 -06:00
committed by GitHub
6 changed files with 23 additions and 42 deletions

View File

@@ -369,6 +369,8 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Install dependencies
env:
GITHUB_API_TOKEN: ${{ github.token }}
run: bash tests/integration/nerdctl/gha-run.sh install-dependencies
- name: get-kata-tarball

View File

@@ -77,9 +77,17 @@ async fn register_memory_event_v2(
let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
// watching oom kill
let ev_wd = inotify.add_watch(&event_control_path, WatchMask::MODIFY)?;
let ev_wd = inotify
.add_watch(&event_control_path, WatchMask::MODIFY)
.context(format!("failed to add watch for {:?}", &event_control_path))?;
// The cgroup file system emits no `unix.IN_DELETE|unix.IN_DELETE_SELF` events, so watch for all processes having exited instead
let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
let cg_wd = inotify
.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)
.context(format!(
"failed to add watch for {:?}",
&cgroup_event_control_path
))?;
info!(sl(), "ev_wd: {:?}", ev_wd);
info!(sl(), "cg_wd: {:?}", cg_wd);

View File

@@ -342,24 +342,25 @@ impl AgentService {
async fn do_start_container(&self, req: protocols::agent::StartContainerRequest) -> Result<()> {
let mut s = self.sandbox.lock().await;
let sid = s.id.clone();
let cid = req.container_id;
let cid = req.container_id.clone();
let ctr = s
.get_container(&cid)
.ok_or_else(|| anyhow!("Invalid container id"))?;
ctr.exec().await?;
if sid == cid {
return Ok(());
if sid != cid {
// start oom event loop
if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
s.run_oom_event_monitor(rx, cid.clone()).await;
}
}
// start oom event loop
if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
s.run_oom_event_monitor(rx, cid).await;
}
let ctr = s
.get_container(&cid)
.ok_or_else(|| anyhow!("Invalid container id"))?;
Ok(())
ctr.exec().await
}
#[instrument]

View File

@@ -1033,20 +1033,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
grpcSpec.Linux.Resources.CPU.Mems = ""
}
// We need agent systemd cgroup now.
// There are three main reasons not to apply systemd cgroups in the VM
// - Initrd image doesn't have systemd.
// - Nobody will be able to modify the resources of a specific container by using systemctl set-property.
// - docker is not running in the VM.
// if resCtrl.IsSystemdCgroup(grpcSpec.Linux.CgroupsPath) {
// // Convert systemd cgroup to cgroupfs
// slice := strings.Split(grpcSpec.Linux.CgroupsPath, ":")
// // 0 - slice: system.slice
// // 1 - prefix: docker
// // 2 - name: abc123
// grpcSpec.Linux.CgroupsPath = filepath.Join("/", slice[1], slice[2])
// }
// Disable network namespace since it is already handled on the host by
// virtcontainers. The network is a complex part which cannot be simply
// passed to the agent.

View File

@@ -9,10 +9,6 @@ load "${BATS_TEST_DIRNAME}/../../common.bash"
load "${BATS_TEST_DIRNAME}/tests_common.sh"
setup() {
if [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ]; then
skip "Test not stable on qemu-coco-dev. See issue #10616"
fi
get_pod_config_dir
job_name="job-pi-test"
yaml_file="${pod_config_dir}/job.yaml"
@@ -42,10 +38,6 @@ setup() {
}
teardown() {
if [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ]; then
skip "Test not stable on qemu-coco-dev. See issue #10616"
fi
# Debugging information
kubectl describe pod "$pod_name"
kubectl describe jobs/"$job_name"

View File

@@ -9,10 +9,6 @@ load "${BATS_TEST_DIRNAME}/../../common.bash"
load "${BATS_TEST_DIRNAME}/tests_common.sh"
setup() {
if [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ]; then
skip "Test not stable on qemu-coco-dev. See issue #10616"
fi
auto_generate_policy_enabled || skip "Auto-generated policy tests are disabled."
get_pod_config_dir
@@ -145,10 +141,6 @@ test_job_policy_error() {
}
teardown() {
if [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ]; then
skip "Test not stable on qemu-coco-dev. See issue #10616"
fi
auto_generate_policy_enabled || skip "Auto-generated policy tests are disabled."
# Debugging information