From ecb68d1ebb6b1ade327a3e32a5656ca160977396 Mon Sep 17 00:00:00 2001 From: mchtech Date: Wed, 26 Feb 2025 15:57:47 +0800 Subject: [PATCH] agent: support running command in nesting cgroup v2 Using a fixed sub-cgroup name "init" only supports the DinD (Docker-in-Docker) scenario. A more elegant approach is to obtain the path of the sub-cgroup based on the cgroup path of the first process in the container. This patch supports an init process running in a sub-cgroup with any name, including but not limited to systemd (init.scope), even if the user moves the cgroup of the init process after the container starts running. This is an enhancement to PR #10845. Fixes #10733 Signed-off-by: mchtech --- src/agent/rustjail/src/cgroups/fs/mod.rs | 44 +++++++++++++------ src/agent/rustjail/src/cgroups/mod.rs | 4 ++ .../rustjail/src/cgroups/systemd/manager.rs | 14 +++++- src/agent/rustjail/src/container.rs | 6 ++- 4 files changed, 50 insertions(+), 18 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index cc7801f0d1..fabd994c27 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -1167,21 +1167,37 @@ impl Manager { }) } - pub fn subcgroup(&self) -> &str { - // Check if we're in a Docker-in-Docker setup by verifying: - // 1. We're using cgroups v2 (which restricts direct process control) - // 2. 
An "init" subdirectory exists (used by DinD for process delegation) - let is_dind = cgroups::hierarchies::is_cgroup2_unified_mode() - && cgroups::hierarchies::auto() - .root() - .join(&self.cpath) - .join("init") - .exists(); - if is_dind { - "/init/" - } else { - "/" + pub fn subcgroup(&self, init_pid: pid_t, container_cgroup: &str) -> Result { + if init_pid <= 0 || !cgroups::hierarchies::is_cgroup2_unified_mode() { + return Ok("/".into()); } + // get sub-cgroup from container init pid + let cgroup_info = fs::read_to_string(format!("/proc/{}/cgroup", init_pid))?; + for line in cgroup_info.lines() { + // the entry for cgroup v2 is always in the format "0::$PATH" + // see https://docs.kernel.org/admin-guide/cgroup-v2.html + if let Some((_, init_cgroup_path)) = line.split_once("0::") { + let container_cgroup_path = format!("/{}", container_cgroup); + debug!( + sl(), + "subcgroup info: init_pid {}, init_cgroup_path {}, container_cgroup_path {}", + init_pid, + init_cgroup_path, + container_cgroup_path + ); + // get relative cgroup path of init_cgroup_path to container_cgroup_path + // container_cgroup_path: /a/b/c.slice + // init_cgroup_path: /a/b/c.slice/d.scope or /a/b/c.slice + if let Some((_, sub_path)) = init_cgroup_path.split_once(&container_cgroup_path) { + match sub_path { + "" => return Ok("/".into()), + _ => return Ok(sub_path.into()), + } + }; + break; + } + } + Ok("/".into()) } fn get_paths_and_mounts( diff --git a/src/agent/rustjail/src/cgroups/mod.rs b/src/agent/rustjail/src/cgroups/mod.rs index 18b64966fc..c94361be31 100644 --- a/src/agent/rustjail/src/cgroups/mod.rs +++ b/src/agent/rustjail/src/cgroups/mod.rs @@ -30,6 +30,10 @@ pub trait Manager { Err(anyhow!("not supported!".to_string())) } + fn set_init_pid(&mut self, _pid: i32) -> Result<()> { + Err(anyhow!("not supported!")) + } + fn get_pids(&self) -> Result> { Err(anyhow!("not supported!")) } diff --git a/src/agent/rustjail/src/cgroups/systemd/manager.rs 
b/src/agent/rustjail/src/cgroups/systemd/manager.rs index 748f18b2bf..a1fd23abf9 100644 --- a/src/agent/rustjail/src/cgroups/systemd/manager.rs +++ b/src/agent/rustjail/src/cgroups/systemd/manager.rs @@ -37,13 +37,15 @@ pub struct Manager { fs_manager: FsManager, // cgroup version for different dbus properties cg_hierarchy: CgroupHierarchy, + // pid of first process in container + init_pid: pid_t, } impl CgroupManager for Manager { fn apply(&self, pid: pid_t) -> Result<()> { if self.dbus_client.unit_exists()? { - let subcgroup = self.fs_manager.subcgroup(); - self.dbus_client.add_process(pid, subcgroup)?; + let subcgroup = self.fs_manager.subcgroup(self.init_pid, &self.cpath)?; + self.dbus_client.add_process(pid, subcgroup.as_str())?; } else { self.dbus_client.start_unit( (pid as u32).try_into().unwrap(), @@ -55,6 +57,13 @@ impl CgroupManager for Manager { Ok(()) } + fn set_init_pid(&mut self, pid: pid_t) -> Result<()> { + if self.init_pid == 0 { + self.init_pid = pid; + } + Ok(()) + } + fn set(&self, r: &LinuxResources, _: bool) -> Result<()> { let mut properties: Properties = vec![]; @@ -130,6 +139,7 @@ impl Manager { } else { CgroupHierarchy::Legacy }, + init_pid: 0, }) } } diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index 291f19af1f..7e95135d33 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -1216,7 +1216,7 @@ impl BaseContainer for LinuxContainer { &logger, spec, &p, - self.cgroup_manager.as_ref(), + self.cgroup_manager.as_mut(), self.config.use_systemd_cgroup, &st, &mut pipe_w, @@ -1517,7 +1517,7 @@ async fn join_namespaces( logger: &Logger, spec: &Spec, p: &Process, - cm: &(dyn Manager + Send + Sync), + cm: &mut (dyn Manager + Send + Sync), use_systemd_cgroup: bool, st: &OCIState, pipe_w: &mut PipeStream, @@ -1583,6 +1583,8 @@ async fn join_namespaces( } if p.init && res.is_some() { + info!(logger, "set init pid {} for {:p}", p.pid, cm); + cm.set_init_pid(p.pid)?; 
info!(logger, "set properties to cgroups!"); cm.set(res.unwrap(), false)?; }