From ef4c3844a38f49ef2216120e692aa3ab4407e8c9 Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Tue, 1 Aug 2023 15:33:57 +0800 Subject: [PATCH 1/6] agent: Restrict device access at upper node of container's cgroup The target is to guarantee that containers couldn't escape to access extra devices, like vm rootfs, etc. Assume that there is a cgroup, such as `/A/B`. The `B` is container cgroup, and the `A` is what we called pod cgroup. No matter what permissions are set for the container (`B`), the `A`'s permission is always `a *:* rwm`. It leads that containers could acquire permission to access to other devices in VM that not belongs to themselves. In order to set devices cgroup properly, the order of setting cgroups is that the pod cgroup comes first and the container cgroup comes after. The `Sandbox` has a new field, `devcg_info`, to save cgroup states. To avoid setting container cgroup too early, an initialization should be done carefully. `inited`, one of the states, is a boolean to indicate if the pod cgroup is initialized. If no, the pod cgroup should be created firstly, and set default permissions. After that, the pause container cgroup is created and inherits the permissions from the pod cgroup. If whitelist mode which allows containers to access all devices in VM is enabled, then device resources from OCI spec are ignored. This feature not supports systemd cgroup and cgroup v2, since: - Systemd cgroup implemented on Agent hasn't supported devices subsystem so far, see: https://github.com/kata-containers/kata-containers/issues/7506. - Cgroup v2's device controller depends on eBPF programs, which is out of scope of cgroup. Fixes: #7507 Signed-off-by: Xuewei Niu --- src/agent/Cargo.lock | 4 +- src/agent/Cargo.toml | 2 +- src/agent/rustjail/Cargo.toml | 2 +- src/agent/rustjail/src/cgroups/fs/mod.rs | 462 ++++++++++++++++-- src/agent/rustjail/src/cgroups/mock.rs | 11 +- src/agent/rustjail/src/cgroups/mod.rs | 28 +- .../rustjail/src/cgroups/systemd/manager.rs | 2 +- src/agent/rustjail/src/container.rs | 92 +++- src/agent/src/device.rs | 162 +++--- src/agent/src/rpc.rs | 54 +- src/agent/src/sandbox.rs | 69 ++- 11 files changed, 745 insertions(+), 143 deletions(-) diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index 80632c589d..4b60694095 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -355,9 +355,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "cgroups-rs" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b098e7c3a70d03c288fa0a96ccf13e770eb3d78c4cc0e1549b3c13215d5f965" +checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64" dependencies = [ "libc", "log", diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml index 47e1d378bc..e7cb0f7c2d 100644 --- a/src/agent/Cargo.toml +++ b/src/agent/Cargo.toml @@ -53,7 +53,7 @@ cfg-if = "1.0.0" prometheus = { version = "0.13.0", features = ["process"] } procfs = "0.12.0" anyhow = "1.0.32" -cgroups = { package = "cgroups-rs", version = "0.3.2" } +cgroups = { package = "cgroups-rs", version = "0.3.3" } # Tracing tracing = "0.1.26" diff --git a/src/agent/rustjail/Cargo.toml b/src/agent/rustjail/Cargo.toml index 231fa353ab..9c3cabb72e 100644 --- a/src/agent/rustjail/Cargo.toml +++ b/src/agent/rustjail/Cargo.toml @@ -25,7 +25,7 @@ scan_fmt = "0.2.6" regex = "1.5.6" path-absolutize = "1.2.0" anyhow = "1.0.32" -cgroups = { package = "cgroups-rs", version = "0.3.2" } +cgroups = { package = "cgroups-rs", version = "0.3.3" } rlimit = "0.5.3" cfg-if = "0.1.0" diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index 6145f5f9c0..b7f9d9056b 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -18,13 +18,14 @@ use cgroups::{ DeviceResource, HugePageResource, MaxValue, NetworkPriority, }; -use crate::cgroups::Manager as CgroupManager; +use crate::cgroups::{is_all_devices_rule, Manager as CgroupManager}; +#[cfg(not(test))] use crate::container::DEFAULT_DEVICES; use anyhow::{anyhow, Context, Result}; use libc::{self, pid_t}; use oci::{ LinuxBlockIo, LinuxCpu, LinuxDevice, LinuxDeviceCgroup, LinuxHugepageLimit, LinuxMemory, - LinuxNetwork, LinuxPids, LinuxResources, + LinuxNetwork, LinuxPids, LinuxResources, Spec, }; use protobuf::MessageField; @@ -35,7 +36,10 @@ use protocols::agent::{ use std::any::Any; use std::collections::HashMap; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, RwLock}; + +use super::DevicesCgroupInfo; const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online"; @@ -60,6 +64,10 @@ pub struct Manager { pub cpath: String, #[serde(skip)] cgroup: cgroups::Cgroup, + #[serde(skip)] + pod_cgroup: cgroups::Cgroup, + #[serde(skip)] + devcg_whitelist: bool, } // set_resource is used to set reources by cgroup controller. @@ -85,6 +93,7 @@ impl CgroupManager for Manager { ); let res = &mut cgroups::Resources::default(); + let pod_res = &mut cgroups::Resources::default(); // set cpuset and cpu reources if let Some(cpu) = &r.cpu { @@ -117,10 +126,16 @@ impl CgroupManager for Manager { } // set devices resources - set_devices_resources(&self.cgroup, &r.devices, res); - info!(sl(), "resources after processed {:?}", res); + if !self.devcg_whitelist { + set_devices_resources(&self.cgroup, &r.devices, res, pod_res); + } + info!( + sl(), + "Resources after processed, pod_res = {:?}, res = {:?}", pod_res, res + ); // apply resources + self.pod_cgroup.apply(pod_res)?; self.cgroup.apply(res)?; Ok(()) @@ -179,7 +194,15 @@ impl CgroupManager for Manager { } fn destroy(&mut self) -> Result<()> { - let _ = self.cgroup.delete(); + if let Err(err) = self.cgroup.delete() { + warn!( + sl(), + "Failed to delete cgroup {}: {}", + self.cgroup.path(), + err + ); + } + // TODO: Clean up pod cgroup if pod cgroup has no children. Ok(()) } @@ -300,28 +323,21 @@ fn set_devices_resources( _cg: &cgroups::Cgroup, device_resources: &[LinuxDeviceCgroup], res: &mut cgroups::Resources, + pod_res: &mut cgroups::Resources, ) { info!(sl(), "cgroup manager set devices"); let mut devices = vec![]; for d in device_resources.iter() { - if let Some(dev) = linux_device_group_to_cgroup_device(d) { - devices.push(dev); - } - } - - for d in DEFAULT_DEVICES.iter() { - if let Some(dev) = linux_device_to_cgroup_device(d) { - devices.push(dev); - } - } - - for d in DEFAULT_ALLOWED_DEVICES.iter() { - if let Some(dev) = linux_device_group_to_cgroup_device(d) { + if is_all_devices_rule(d) { + continue; + } + if let Some(dev) = linux_device_cgroup_to_device_resource(d) { devices.push(dev); } } + pod_res.devices.devices = devices.clone(); res.devices.devices = devices; } @@ -519,7 +535,8 @@ fn build_blk_io_device_throttle_resource( blk_io_device_throttle_resources } -fn linux_device_to_cgroup_device(d: &LinuxDevice) -> Option { +#[allow(dead_code)] +fn linux_device_to_device_resource(d: &LinuxDevice) -> Option { let dev_type = match DeviceType::from_char(d.r#type.chars().next()) { Some(t) => t, None => return None, @@ -540,7 +557,7 @@ fn linux_device_to_cgroup_device(d: &LinuxDevice) -> Option { }) } -fn linux_device_group_to_cgroup_device(d: &LinuxDeviceCgroup) -> Option { +fn linux_device_cgroup_to_device_resource(d: &LinuxDeviceCgroup) -> Option { let dev_type = match DeviceType::from_char(d.r#type.chars().next()) { Some(t) => t, None => return None, @@ -1001,13 +1018,134 @@ pub fn get_mounts(paths: &HashMap) -> Result, path: &str) -> Result { let valid_path = path.trim_start_matches('/').to_string(); cgroups::Cgroup::new(h, valid_path.as_str()).map_err(anyhow::Error::from) } +#[inline] +fn load_cgroup(h: Box, path: &str) -> Cgroup { + let valid_path = path.trim_start_matches('/').to_string(); + cgroups::Cgroup::load(h, valid_path.as_str()) +} + impl Manager { - pub fn new(cpath: &str) -> Result { + pub fn new( + cpath: &str, + spec: &Spec, + devcg_info: Arc>, + ) -> Result { + let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; + + // Do not expect poisoning lock + let mut devices_group_info = devcg_info.write().unwrap(); + + // Cgroup path of parent of container + let pod_cpath = PathBuf::from(cpath) + .parent() + .context("Get parent of cgroup path")? + .display() + .to_string(); + + // Create a cgroup for the pod if not exists. + // Note that this step MUST be done before the pause container's + // cgroup created, since children inherit upper nodes' rules, which + // have excessive permission. You'll feel painful to shrink upper + // nodes' permission. + let pod_cg = load_cgroup(cgroups::hierarchies::auto(), &pod_cpath); + + // The default is blacklist mode. + let is_whitelist_mode = Self::devcg_whitelist_rule(spec).unwrap_or(false); + if devices_group_info.inited { + // Do nothing for whitelist except for updating whitelist + // field of devices_group_info. + debug!(sl(), "Devices cgroup has been initialzied."); + + // Set allowed all devices to pod cgroup + if !devices_group_info.whitelist && is_whitelist_mode { + info!( + sl(), + "Pod devices cgroup is changed to whitelist mode, devices_group_info = {:?}", + devices_group_info + ); + Self::setup_whitelist_mode(&pod_cg) + .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; + devices_group_info.whitelist = true; + } + } else { + // This is the first container (aka pause container) + debug!(sl(), "Started to init devices cgroup"); + + pod_cg.create().context("Create pod cgroup")?; + + // Setup blacklist for pod cgroup + if !is_whitelist_mode { + // Setup blacklist rule and allowed default devices for + // blacklist. + Self::setup_blacklist_mode(&pod_cg) + .with_context(|| format!("Setup blacklist mode for {}", pod_cpath))?; + } else { + Self::setup_whitelist_mode(&pod_cg) + .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; + devices_group_info.whitelist = true; + } + + devices_group_info.inited = true + } + + // Create a cgroup for the container. + let cg = new_cgroup(cgroups::hierarchies::auto(), cpath)?; + // The rules of container cgroup are copied from its parent, which + // contains some permissions that the container doesn't need. + // Therefore, resetting the container's devices cgroup is required. + if !devices_group_info.whitelist { + Self::setup_blacklist_mode(&cg) + .with_context(|| format!("Setup blacklist mode for {}", cpath))?; + } + + Ok(Self { + paths, + mounts, + // rels: paths, + cpath: cpath.to_string(), + cgroup: cg, + pod_cgroup: pod_cg, + devcg_whitelist: devices_group_info.whitelist, + }) + } + + /// Create a cgroupfs manager without creating any cgroups. + /// A typical case is for systemd cgroup: Systemd manager retains a + /// cgroupfs manager to read cgroup information only. Writing cgroup + /// rules is done by the systemd. That is, the cgroupfs manager runs in + /// read-only mode. + pub fn new_read_only(cpath: &str) -> Result { + let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; + + // Cgroup path of parent of container + let pod_cpath = PathBuf::from(cpath) + .parent() + .context("Get parent of cgroup path")? + .display() + .to_string(); + + let pod_cg = load_cgroup(cgroups::hierarchies::auto(), &pod_cpath); + let cg = load_cgroup(cgroups::hierarchies::auto(), cpath); + + Ok(Self { + paths, + mounts, + cpath: cpath.to_string(), + pod_cgroup: pod_cg, + cgroup: cg, + devcg_whitelist: false, + }) + } + + fn get_paths_and_mounts( + cpath: &str, + ) -> Result<(HashMap, HashMap)> { let mut m = HashMap::new(); let paths = get_paths()?; @@ -1020,20 +1158,120 @@ impl Manager { continue; } - let p = format!("{}/{}", mnt.unwrap(), cpath); - - m.insert(key.to_string(), p); + m.insert(key.to_string(), format!("{}/{}", mnt.unwrap(), cpath)); } - let cg = new_cgroup(cgroups::hierarchies::auto(), cpath)?; + Ok((m, mounts)) + } - Ok(Self { - paths: m, - mounts, - // rels: paths, - cpath: cpath.to_string(), - cgroup: cg, - }) + fn setup_whitelist_mode(cgroup: &cgroups::Cgroup) -> Result<()> { + // Insert two rules: `b *:* rwm` and `c *:* rwm`. + // The reason of not inserting `a *:* rwm` is that the Linux kernel + // will deny writing `a` to `devices.allow` once a cgroup has + // children. You can refer to + // https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt. + let res = cgroups::Resources { + devices: cgroups::DeviceResources { + devices: vec![ + DeviceResource { + allow: true, + devtype: DeviceType::Block, + major: -1, + minor: -1, + access: vec![ + DevicePermissions::Read, + DevicePermissions::Write, + DevicePermissions::MkNod, + ], + }, + DeviceResource { + allow: true, + devtype: DeviceType::Char, + major: -1, + minor: -1, + access: vec![ + DevicePermissions::Read, + DevicePermissions::Write, + DevicePermissions::MkNod, + ], + }, + ], + }, + ..Default::default() + }; + cgroup.apply(&res)?; + + Ok(()) + } + + fn setup_blacklist_mode(cgroup: &cgroups::Cgroup) -> Result<()> { + #[allow(unused_mut)] + let mut dev_res_list = vec![DeviceResource { + allow: false, + devtype: DeviceType::All, + major: -1, + minor: -1, + access: vec![ + DevicePermissions::Read, + DevicePermissions::Write, + DevicePermissions::MkNod, + ], + }]; + // Do not append default allowed devices for simplicity while + // testing. + #[cfg(not(test))] + dev_res_list.append(&mut Self::default_allowed_devices()); + + let res = cgroups::Resources { + devices: cgroups::DeviceResources { + devices: dev_res_list, + }, + ..Default::default() + }; + cgroup.apply(&res)?; + + Ok(()) + } + + /// Generate a list for allowed devices including `DEFAULT_DEVICES` and + /// `DEFAULT_ALLOWED_DEVICES`. + #[cfg(not(test))] + fn default_allowed_devices() -> Vec { + let mut dev_res_list = Vec::new(); + DEFAULT_DEVICES.iter().for_each(|dev| { + if let Some(dev_res) = linux_device_to_device_resource(dev) { + dev_res_list.push(dev_res) + } + }); + DEFAULT_ALLOWED_DEVICES.iter().for_each(|dev| { + if let Some(dev_res) = linux_device_cgroup_to_device_resource(dev) { + dev_res_list.push(dev_res) + } + }); + dev_res_list + } + + /// Check if OCI spec contains a whiltlist rule. + /// + /// # Returns + /// + /// * `Some(true)` - Contains whitelist all rule; + /// * `Some(false)` - Contains blacklist all rule; + /// * `None` - Neither whitelist rule nor blacklist rule found. + fn devcg_whitelist_rule(spec: &Spec) -> Option { + let linux = match spec.linux.as_ref() { + Some(linux) => linux, + None => return None, + }; + let resources = match linux.resources.as_ref() { + Some(resource) => resource, + None => return None, + }; + resources + .devices + .iter() + .find(|dev| is_all_devices_rule(dev)) + .map(|dev| dev.allow) } } @@ -1085,6 +1323,12 @@ fn convert_memory_swap_to_v2_value(memory_swap: i64, memory: i64) -> Result #[cfg(test)] mod tests { + use std::process::Command; + + use oci::Linux; + use oci::LinuxResources; + use test_utils::skip_if_not_root; + use super::*; #[test] @@ -1133,4 +1377,156 @@ mod tests { ); } } + + struct MockSandbox { + devcg_info: Arc>, + } + + impl MockSandbox { + fn new() -> Self { + Self { + devcg_info: Arc::new(RwLock::new(DevicesCgroupInfo::default())), + } + } + } + + #[test] + fn test_new_fs_manager() { + skip_if_not_root!(); + + struct TestCase { + cpath: Vec, + devices: Vec>, + whitelist: Vec, + pod_devices_list: Vec, + container_devices_list: Vec, + } + + let allow_all = LinuxDeviceCgroup { + allow: true, + r#type: String::new(), + major: Some(0), + minor: Some(0), + access: String::from("rwm"), + }; + + let deny_all = LinuxDeviceCgroup { + allow: false, + r#type: String::new(), + major: Some(0), + minor: Some(0), + access: String::from("rwm"), + }; + + let test_cases = vec![ + TestCase { + cpath: vec![String::from( + "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", + )], + devices: vec![vec![allow_all.clone()]], + whitelist: vec![true], + pod_devices_list: vec![String::from("a *:* rwm\n")], + container_devices_list: vec![String::from("a *:* rwm\n")], + }, + TestCase { + cpath: vec![String::from( + "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", + )], + devices: vec![vec![deny_all.clone()]], + whitelist: vec![false], + pod_devices_list: vec![String::new()], + container_devices_list: vec![String::new()], + }, + TestCase { + cpath: vec![ + String::from( + "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", + ), + String::from( + "/kata-agent-fs-manager-test/1c7affca-1f65-427c-ba92-caff1cea61f6", + ), + ], + devices: vec![vec![deny_all.clone()], vec![allow_all.clone()]], + whitelist: vec![false, true], + pod_devices_list: vec![String::new(), String::from("b *:* rwm\nc *:* rwm\n")], + container_devices_list: vec![String::new(), String::from("b *:* rwm\nc *:* rwm\n")], + }, + TestCase { + cpath: vec![ + String::from( + "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", + ), + String::from( + "/kata-agent-fs-manager-test/1c7affca-1f65-427c-ba92-caff1cea61f6", + ), + ], + devices: vec![vec![allow_all], vec![deny_all]], + whitelist: vec![true, true], + pod_devices_list: vec![String::from("a *:* rwm\n"), String::from("a *:* rwm\n")], + container_devices_list: vec![ + String::from("a *:* rwm\n"), + String::from("a *:* rwm\n"), + ], + }, + ]; + + for (round, tc) in test_cases.iter().enumerate() { + let sandbox = MockSandbox::new(); + let devcg_info = sandbox.devcg_info.read().unwrap(); + assert!(!devcg_info.inited); + assert!(!devcg_info.whitelist); + drop(devcg_info); + let mut managers = Vec::with_capacity(tc.devices.len()); + + for cid in 0..tc.devices.len() { + let spec = Spec { + linux: Some(Linux { + resources: Some(LinuxResources { + devices: tc.devices[cid].clone(), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + managers + .push(Manager::new(&tc.cpath[cid], &spec, sandbox.devcg_info.clone()).unwrap()); + + let devcg_info = sandbox.devcg_info.read().unwrap(); + assert!(devcg_info.inited); + assert_eq!( + devcg_info.whitelist, tc.whitelist[cid], + "Round {}, cid {} whitelist assertion failure", + round, cid + ); + drop(devcg_info); + + // TODO: Assertions go here + let pod_devices_list = Command::new("cat") + .arg("/sys/fs/cgroup/devices/kata-agent-fs-manager-test/devices.list") + .output() + .unwrap(); + let container_devices_list = Command::new("cat") + .arg(&format!( + "/sys/fs/cgroup/devices{}/devices.list", + tc.cpath[cid] + )) + .output() + .unwrap(); + + let pod_devices_list = String::from_utf8(pod_devices_list.stdout).unwrap(); + let container_devices_list = + String::from_utf8(container_devices_list.stdout).unwrap(); + + assert_eq!(&pod_devices_list, &tc.pod_devices_list[cid]); + assert_eq!(&container_devices_list, &tc.container_devices_list[cid]) + } + + // Clean up cgroups + managers + .iter() + .for_each(|manager| manager.cgroup.delete().unwrap()); + managers[0].pod_cgroup.delete().unwrap(); + } + } } diff --git a/src/agent/rustjail/src/cgroups/mock.rs b/src/agent/rustjail/src/cgroups/mock.rs index 8ac77c63b2..117bd25223 100644 --- a/src/agent/rustjail/src/cgroups/mock.rs +++ b/src/agent/rustjail/src/cgroups/mock.rs @@ -10,10 +10,13 @@ use crate::protocols::agent::{BlkioStats, CgroupStats, CpuStats, MemoryStats, Pi use anyhow::Result; use cgroups::freezer::FreezerState; use libc::{self, pid_t}; -use oci::LinuxResources; +use oci::{LinuxResources, Spec}; use std::any::Any; use std::collections::HashMap; use std::string::String; +use std::sync::{Arc, RwLock}; + +use super::DevicesCgroupInfo; #[derive(Serialize, Deserialize, Debug, Clone)] pub struct Manager { @@ -72,7 +75,11 @@ impl CgroupManager for Manager { } impl Manager { - pub fn new(cpath: &str) -> Result { + pub fn new( + cpath: &str, + _spec: &Spec, + _devcg_info: Arc>, + ) -> Result { Ok(Self { paths: HashMap::new(), mounts: HashMap::new(), diff --git a/src/agent/rustjail/src/cgroups/mod.rs b/src/agent/rustjail/src/cgroups/mod.rs index c4e3b178b5..ec4727b406 100644 --- a/src/agent/rustjail/src/cgroups/mod.rs +++ b/src/agent/rustjail/src/cgroups/mod.rs @@ -5,7 +5,7 @@ use anyhow::{anyhow, Result}; use core::fmt::Debug; -use oci::LinuxResources; +use oci::{LinuxDeviceCgroup, LinuxResources}; use protocols::agent::CgroupStats; use std::any::Any; @@ -16,6 +16,15 @@ pub mod mock; pub mod notifier; pub mod systemd; +#[derive(Default, Debug)] +pub struct DevicesCgroupInfo { + /// Indicate if the pod cgroup is initialized. + inited: bool, + /// Indicate if pod's devices cgroup is in whitelist mode. Returns true + /// once one container requires `a *:* rwm` permission. + whitelist: bool, +} + pub trait Manager { fn apply(&self, _pid: i32) -> Result<()> { Err(anyhow!("not supported!".to_string())) @@ -61,3 +70,20 @@ impl Debug for dyn Manager + Send + Sync { write!(f, "{}", self.name()) } } + +/// Check if device cgroup is an all rule from OCI spec. +/// +/// The formats representing all devices between OCI spec and cgroups-rs +/// are different. +/// - OCI spec: major: 0, minor: 0, type: "" (All devices), access: "rwm"; +/// - Cgroups-rs: major: -1, minor: -1, type: "a", access: "rwm"; +/// - Linux: a *:* rwm +#[inline] +fn is_all_devices_rule(dev_cgroup: &LinuxDeviceCgroup) -> bool { + dev_cgroup.major.unwrap_or(0) == 0 + && dev_cgroup.minor.unwrap_or(0) == 0 + && (dev_cgroup.r#type.as_str() == "" || dev_cgroup.r#type.as_str() == "a") + && dev_cgroup.access.contains('r') + && dev_cgroup.access.contains('w') + && dev_cgroup.access.contains('m') +} diff --git a/src/agent/rustjail/src/cgroups/systemd/manager.rs b/src/agent/rustjail/src/cgroups/systemd/manager.rs index 19be1c9610..07e1ecb80d 100644 --- a/src/agent/rustjail/src/cgroups/systemd/manager.rs +++ b/src/agent/rustjail/src/cgroups/systemd/manager.rs @@ -113,7 +113,7 @@ impl Manager { let (parent_slice, unit_name) = cgroups_path.parse()?; let cpath = parent_slice + "/" + &unit_name; - let fs_manager = FsManager::new(cpath.as_str())?; + let fs_manager = FsManager::new_read_only(cpath.as_str())?; Ok(Manager { paths: fs_manager.paths.clone(), diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index e17cd32820..f7f5e1282b 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -23,7 +23,7 @@ use crate::cgroups::fs::Manager as FsManager; #[cfg(test)] use crate::cgroups::mock::Manager as FsManager; use crate::cgroups::systemd::manager::Manager as SystemdManager; -use crate::cgroups::Manager; +use crate::cgroups::{DevicesCgroupInfo, Manager}; #[cfg(feature = "standard-oci-runtime")] use crate::console; use crate::log_child; @@ -55,7 +55,7 @@ use regex::Regex; use std::collections::HashMap; use std::os::unix::io::FromRawFd; use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use slog::{info, o, Logger}; @@ -136,7 +136,7 @@ lazy_static! { m }; -// type to name hashmap, better to be in NAMESPACES + // type to name hashmap, better to be in NAMESPACES pub static ref TYPETONAME: HashMap<&'static str, &'static str> = { let mut m = HashMap::new(); m.insert("ipc", "ipc"); @@ -1494,6 +1494,7 @@ impl LinuxContainer { pub fn new + Display + Clone>( id: T, base: T, + devcg_info: Arc>, config: Config, logger: &Logger, ) -> Result { @@ -1535,19 +1536,12 @@ impl LinuxContainer { }; let cgroup_manager: Box = if config.use_systemd_cgroup { - Box::new(SystemdManager::new(cpath.as_str()).map_err(|e| { - anyhow!(format!( - "fail to create cgroup manager with path {}: {:}", - cpath, e - )) - })?) + Box::new(SystemdManager::new(cpath.as_str()).context("Create systemd manager")?) } else { - Box::new(FsManager::new(cpath.as_str()).map_err(|e| { - anyhow!(format!( - "fail to create cgroup manager with path {}: {:}", - cpath, e - )) - })?) + Box::new( + FsManager::new(cpath.as_str(), spec, devcg_info) + .context("Create cgroupfs manager")?, + ) }; info!(logger, "new cgroup_manager {:?}", &cgroup_manager); @@ -1611,12 +1605,16 @@ mod tests { use super::*; use crate::process::Process; use nix::unistd::Uid; + use oci::{LinuxDeviceCgroup, Root}; use std::fs; use std::os::unix::fs::MetadataExt; use std::os::unix::io::AsRawFd; + use std::time::UNIX_EPOCH; use tempfile::tempdir; use test_utils::skip_if_not_root; + const CGROUP_PARENT: &str = "kata.agent.test.k8s.io"; + fn sl() -> slog::Logger { slog_scope::logger() } @@ -1717,13 +1715,42 @@ mod tests { } fn create_dummy_opts() -> CreateOpts { - let mut root = oci::Root::default(); - root.path = "/tmp".to_string(); + let start = SystemTime::now(); + let since_the_epoch = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); - let linux = Linux::default(); - let mut spec = Spec::default(); - spec.root = Some(root).into(); - spec.linux = Some(linux).into(); + let root = Root { + path: String::from("/tmp"), + ..Default::default() + }; + + let linux_resources = LinuxResources { + devices: vec![LinuxDeviceCgroup { + allow: true, + r#type: String::new(), + major: None, + minor: None, + access: String::from("rwm"), + }], + ..Default::default() + }; + + let cgroups_path = format!( + "/{}/dummycontainer{}", + CGROUP_PARENT, + since_the_epoch.as_millis() + ); + + let spec = Spec { + linux: Some(Linux { + cgroups_path, + resources: Some(linux_resources), + ..Default::default() + }), + root: Some(root), + ..Default::default() + }; CreateOpts { cgroup_name: "".to_string(), @@ -1748,6 +1775,7 @@ mod tests { LinuxContainer::new( "some_id", &dir.path().join("rootfs").to_str().unwrap(), + Arc::new(RwLock::new(DevicesCgroupInfo::default())), create_dummy_opts(), &slog_scope::logger(), ), @@ -1777,9 +1805,14 @@ mod tests { #[test] fn test_linuxcontainer_pause() { let ret = new_linux_container_and_then(|mut c: LinuxContainer| { - c.cgroup_manager = Box::new(FsManager::new("").map_err(|e| { - anyhow!(format!("fail to create cgroup manager with path: {:}", e)) - })?); + c.cgroup_manager = Box::new( + FsManager::new( + "", + &Spec::default(), + Arc::new(RwLock::new(DevicesCgroupInfo::default())), + ) + .map_err(|e| anyhow!(format!("fail to create cgroup manager with path: {:}", e)))?, + ); c.pause().map_err(|e| anyhow!(e)) }); @@ -1801,9 +1834,14 @@ mod tests { #[test] fn test_linuxcontainer_resume() { let ret = new_linux_container_and_then(|mut c: LinuxContainer| { - c.cgroup_manager = Box::new(FsManager::new("").map_err(|e| { - anyhow!(format!("fail to create cgroup manager with path: {:}", e)) - })?); + c.cgroup_manager = Box::new( + FsManager::new( + "", + &Spec::default(), + Arc::new(RwLock::new(DevicesCgroupInfo::default())), + ) + .map_err(|e| anyhow!(format!("fail to create cgroup manager with path: {:}", e)))?, + ); // Change status to paused, this way we can resume it c.status.transition(ContainerState::Paused); c.resume().map_err(|e| anyhow!(e)) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 4e91bc1759..da11dad339 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -11,6 +11,7 @@ use std::fmt; use std::fs; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::MetadataExt; +use std::os::unix::prelude::FileTypeExt; use std::path::{Path, PathBuf}; use std::str::FromStr; use std::sync::Arc; @@ -31,7 +32,7 @@ fn sl() -> slog::Logger { slog_scope::logger().new(o!("subsystem" => "device")) } -const VM_ROOTFS: &str = "/"; +pub const VM_ROOTFS: &str = "/"; const BLOCK: &str = "block"; pub const DRIVER_9P_TYPE: &str = "9p"; pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; @@ -515,25 +516,53 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { } #[derive(Debug, Clone)] -struct DevNumUpdate { - // the major and minor numbers for the device within the guest +pub struct DeviceInfo { + // Device type, "b" for block device and "c" for character device + cgroup_type: String, + // The major and minor numbers for the device within the guest guest_major: i64, guest_minor: i64, } -impl DevNumUpdate { - fn from_vm_path>(vm_path: T) -> Result { - let vm_path = vm_path.as_ref(); +impl DeviceInfo { + /// Create a device info. + /// + /// # Arguments + /// + /// * `vm_path` - Device's vm path. + /// * `is_rdev` - If the vm_path is a device, set to true. If the + /// vm_path is a file in a device, set to false. + pub fn new(vm_path: &str, is_rdev: bool) -> Result { + let cgroup_type; + let devid; + let vm_path = PathBuf::from(vm_path); if !vm_path.exists() { return Err(anyhow!("VM device path {:?} doesn't exist", vm_path)); } - let devid = fs::metadata(vm_path)?.rdev(); + let metadata = fs::metadata(&vm_path)?; + + if is_rdev { + devid = metadata.rdev(); + let file_type = metadata.file_type(); + if file_type.is_block_device() { + cgroup_type = String::from("b"); + } else if file_type.is_char_device() { + cgroup_type = String::from("c"); + } else { + return Err(anyhow!("Unknown device {:?}'s cgroup type", vm_path)); + } + } else { + devid = metadata.dev(); + cgroup_type = String::from("b"); + } + let guest_major = stat::major(devid) as i64; let guest_minor = stat::minor(devid) as i64; - Ok(DevNumUpdate { + Ok(DeviceInfo { + cgroup_type, guest_major, guest_minor, }) @@ -544,25 +573,25 @@ impl DevNumUpdate { // spec needed for a particular device #[derive(Debug, Clone)] struct DevUpdate { - num: DevNumUpdate, + info: DeviceInfo, // an optional new path to update the device to in the "inner" container // specification final_path: Option, } impl DevUpdate { - fn from_vm_path>(vm_path: T, final_path: String) -> Result { + fn new(vm_path: &str, final_path: &str) -> Result { Ok(DevUpdate { - final_path: Some(final_path), - ..DevNumUpdate::from_vm_path(vm_path)?.into() + final_path: Some(final_path.to_owned()), + ..DeviceInfo::new(vm_path, true)?.into() }) } } -impl From for DevUpdate { - fn from(num: DevNumUpdate) -> Self { +impl From for DevUpdate { + fn from(info: DeviceInfo) -> Self { DevUpdate { - num, + info, final_path: None, } } @@ -596,7 +625,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - .linux .as_mut() .ok_or_else(|| anyhow!("Spec didn't contain linux field"))?; - let mut res_updates = HashMap::<(&str, i64, i64), DevNumUpdate>::with_capacity(updates.len()); + let mut res_updates = HashMap::<(&str, i64, i64), DeviceInfo>::with_capacity(updates.len()); for specdev in &mut linux.devices { if let Some(update) = updates.remove(specdev.path.as_str()) { @@ -610,13 +639,13 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - "type" => &specdev.r#type, "host_major" => host_major, "host_minor" => host_minor, - "guest_major" => update.num.guest_major, - "guest_minor" => update.num.guest_minor, + "guest_major" => update.info.guest_major, + "guest_minor" => update.info.guest_minor, "final_path" => update.final_path.as_ref(), ); - specdev.major = update.num.guest_major; - specdev.minor = update.num.guest_minor; + specdev.major = update.info.guest_major; + specdev.minor = update.info.guest_minor; if let Some(final_path) = update.final_path { specdev.path = final_path; } @@ -624,7 +653,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - if res_updates .insert( (specdev.r#type.as_str(), host_major, host_minor), - update.num, + update.info, ) .is_some() { @@ -728,7 +757,9 @@ async fn virtiommio_blk_device_handler( .context("failed to get mmio device name")?; } - Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into()) + Ok(DeviceInfo::new(device.vm_path(), true) + .context("New device info")? + .into()) } // device.Id should be a PCI path string @@ -740,7 +771,9 @@ async fn virtio_blk_device_handler( let pcipath = pci::Path::from_str(&device.id)?; let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) + Ok(DeviceInfo::new(&vm_path, true) + .context("New device info")? + .into()) } // device.id should be a CCW path string @@ -753,7 +786,7 @@ async fn virtio_blk_ccw_device_handler( let ccw_device = ccw::Device::from_str(&device.id)?; let vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; - Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) + Ok(DeviceInfo::new(&vm_path, true)?.into()) } #[cfg(not(target_arch = "s390x"))] @@ -770,7 +803,9 @@ async fn virtio_scsi_device_handler( ) -> Result { let vm_path = get_scsi_device_name(sandbox, &device.id).await?; - Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) + Ok(DeviceInfo::new(&vm_path, true) + .context("New device info")? + .into()) } #[instrument] @@ -782,7 +817,9 @@ async fn virtio_nvdimm_device_handler( return Err(anyhow!("Invalid path for nvdimm device")); } - Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into()) + Ok(DeviceInfo::new(device.vm_path(), true) + .context("New device info")? + .into()) } fn split_vfio_pci_option(opt: &str) -> Option<(&str, &str)> { @@ -842,7 +879,7 @@ async fn vfio_pci_device_handler( let vm_path = get_vfio_device_name(sandbox, group).await?; - Some(DevUpdate::from_vm_path(&vm_path, vm_path.clone())?) + Some(DevUpdate::new(&vm_path, &vm_path)?) } else { None }; @@ -887,7 +924,7 @@ pub async fn add_devices( let update = add_device(device, sandbox).await?; if let Some(dev_update) = update.dev { if dev_updates - .insert(&device.container_path, dev_update) + .insert(&device.container_path, dev_update.clone()) .is_some() { return Err(anyhow!( @@ -907,6 +944,10 @@ pub async fn add_devices( )); } } + + // Update cgroup to allow all devices added to guest. + insert_devices_cgroup_rule(spec, &dev_update.info, true, "rwm") + .context("Update device cgroup")?; } } @@ -948,16 +989,14 @@ async fn add_device(device: &Device, sandbox: &Arc>) -> Result Result<()> { - let meta = fs::metadata(VM_ROOTFS)?; - let rdev = meta.dev(); - let major = stat::major(rdev) as i64; - let minor = stat::minor(rdev) as i64; - +pub fn insert_devices_cgroup_rule( + spec: &mut Spec, + dev_info: &DeviceInfo, + allow: bool, + access: &str, +) -> Result<()> { let linux = spec .linux .as_mut() @@ -965,13 +1004,25 @@ pub fn update_device_cgroup(spec: &mut Spec) -> Result<()> { let resources = linux.resources.get_or_insert(LinuxResources::default()); - resources.devices.push(LinuxDeviceCgroup { - allow: false, - major: Some(major), - minor: Some(minor), - r#type: String::from("b"), - access: String::from("rw"), - }); + let cgroup = LinuxDeviceCgroup { + allow, + major: Some(dev_info.guest_major), + minor: Some(dev_info.guest_minor), + r#type: dev_info.cgroup_type.clone(), + access: access.to_owned(), + }; + + debug!( + sl(), + "Insert a devices cgroup rule"; + "linux_device_cgroup" => cgroup.allow, + "guest_major" => cgroup.major, + "guest_minor" => cgroup.minor, + "type" => cgroup.r#type.as_str(), + "access" => cgroup.access.as_str(), + ); + + resources.devices.push(cgroup); Ok(()) } @@ -991,7 +1042,8 @@ mod tests { ..Default::default() }; - update_device_cgroup(&mut spec).unwrap(); + let dev_info = DeviceInfo::new(VM_ROOTFS, false).unwrap(); + insert_devices_cgroup_rule(&mut spec, &dev_info, false, "rw").unwrap(); let devices = spec.linux.unwrap().resources.unwrap().devices; assert_eq!(devices.len(), 1); @@ -1011,7 +1063,7 @@ mod tests { let mut spec = Spec::default(); // vm_path empty - let update = DevNumUpdate::from_vm_path(""); + let update = DeviceInfo::new("", true); assert!(update.is_err()); // linux is empty @@ -1021,7 +1073,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!(res.is_err()); @@ -1033,7 +1085,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!(res.is_err()); @@ -1050,7 +1102,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!( @@ -1068,7 +1120,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!(res.is_ok()); @@ -1094,7 +1146,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!(res.is_ok()); @@ -1178,11 +1230,11 @@ mod tests { let updates = HashMap::from_iter(vec![ ( container_path_a, - DevNumUpdate::from_vm_path(vm_path_a).unwrap().into(), + DeviceInfo::new(vm_path_a, true).unwrap().into(), ), ( container_path_b, - DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(), + DeviceInfo::new(vm_path_b, true).unwrap().into(), ), ]); let res = update_spec_devices(&mut spec, updates); @@ -1263,7 +1315,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + DeviceInfo::new(vm_path, true).unwrap().into(), )]), ); assert!(res.is_ok()); @@ -1307,7 +1359,7 @@ mod tests { &mut spec, HashMap::from_iter(vec![( container_path, - DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(), + DevUpdate::new(vm_path, final_path).unwrap(), )]), ); assert!(res.is_ok()); diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 822a68cb2f..b12b3a8265 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -53,7 +53,8 @@ use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; use crate::device::{ - add_devices, get_virtio_blk_pci_device_name, update_device_cgroup, update_env_pci, + add_devices, get_virtio_blk_pci_device_name, insert_devices_cgroup_rule, update_env_pci, + DeviceInfo, VM_ROOTFS, }; use crate::linux_abi::*; use crate::metrics::get_metrics; @@ -239,7 +240,8 @@ impl AgentService { update_container_namespaces(&s, &mut oci, use_sandbox_pidns)?; // Add the root partition to the device cgroup to prevent access - update_device_cgroup(&mut oci)?; + let dev_info = DeviceInfo::new(VM_ROOTFS, false)?; + insert_devices_cgroup_rule(&mut oci, &dev_info, false, "rw")?; // Append guest hooks append_guest_hooks(&s, &mut oci)?; @@ -272,8 +274,13 @@ impl AgentService { container_name, }; - let mut ctr: LinuxContainer = - LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl())?; + let mut ctr: LinuxContainer = LinuxContainer::new( + cid.as_str(), + CONTAINER_BASE, + s.devcg_info.clone(), + opts, + &sl(), + )?; let pipe_size = AGENT_CONFIG.container_pipe_size; @@ -1981,16 +1988,22 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> { #[cfg(test)] mod tests { + use std::sync::RwLock; + use std::time::{SystemTime, UNIX_EPOCH}; + use super::*; use crate::{namespace::Namespace, protocols::agent_ttrpc_async::AgentService as _}; use nix::mount; use nix::sched::{unshare, CloneFlags}; - use oci::{Hook, Hooks, Linux, LinuxNamespace}; + use oci::{Hook, Hooks, Linux, LinuxDeviceCgroup, LinuxNamespace, LinuxResources}; + use rustjail::cgroups::DevicesCgroupInfo; use tempfile::{tempdir, TempDir}; use test_utils::{assert_result, skip_if_not_root}; use ttrpc::{r#async::TtrpcContext, MessageHeader}; use which::which; + const CGROUP_PARENT: &str = "kata.agent.test.k8s.io"; + fn check_command(cmd: &str) -> bool { which(cmd).is_ok() } @@ -2005,13 +2018,39 @@ mod tests { } fn create_dummy_opts() -> CreateOpts { + let start = SystemTime::now(); + let since_the_epoch = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + let root = Root { path: String::from("/"), ..Default::default() }; + let linux_resources = LinuxResources { + devices: vec![LinuxDeviceCgroup { + allow: true, + r#type: String::new(), + major: None, + minor: None, + access: String::from("rwm"), + }], + ..Default::default() + }; + + let cgroups_path = format!( + "/{}/dummycontainer{}", + CGROUP_PARENT, + since_the_epoch.as_millis() + ); + let spec = Spec { - linux: Some(oci::Linux::default()), + linux: Some(Linux { + cgroups_path, + resources: Some(linux_resources), + ..Default::default() + }), root: Some(root), ..Default::default() }; @@ -2035,6 +2074,7 @@ mod tests { LinuxContainer::new( "some_id", dir.path().join("rootfs").to_str().unwrap(), + Arc::new(RwLock::new(DevicesCgroupInfo::default())), create_dummy_opts(), &slog_scope::logger(), ) @@ -2141,6 +2181,8 @@ mod tests { #[tokio::test] async fn test_do_write_stream() { + skip_if_not_root!(); + #[derive(Debug)] struct TestData<'a> { create_container: bool, diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index accf364559..a1a857a646 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -12,7 +12,7 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::str::FromStr; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use std::time::{Duration, Instant}; use std::{thread, time}; @@ -26,7 +26,7 @@ use nix::sys::stat::Mode; use oci::{Hook, Hooks}; use protocols::agent::{OnlineCPUMemRequest, SharedMount}; use regex::Regex; -use rustjail::cgroups as rustjail_cgroups; +use rustjail::cgroups::{self as rustjail_cgroups, DevicesCgroupInfo}; use rustjail::container::BaseContainer; use rustjail::container::LinuxContainer; use rustjail::process::Process; @@ -118,6 +118,7 @@ pub struct Sandbox { pub event_tx: Option>, pub bind_watcher: BindWatcher, pub pcimap: HashMap, + pub devcg_info: Arc>, } impl Sandbox { @@ -151,6 +152,7 @@ impl Sandbox { event_tx: Some(tx), bind_watcher: BindWatcher::new(), pcimap: HashMap::new(), + devcg_info: Arc::new(RwLock::new(DevicesCgroupInfo::default())), }) } @@ -658,7 +660,7 @@ mod tests { use crate::mount::baremount; use anyhow::{anyhow, Error}; use nix::mount::MsFlags; - use oci::{Linux, Root, Spec}; + use oci::{Linux, LinuxDeviceCgroup, LinuxResources, Root, Spec}; use rustjail::container::LinuxContainer; use rustjail::process::Process; use rustjail::specconv::CreateOpts; @@ -667,9 +669,12 @@ mod tests { use std::io::prelude::*; use std::os::unix::fs::PermissionsExt; use std::path::Path; + use std::time::{SystemTime, UNIX_EPOCH}; use tempfile::{tempdir, Builder, TempDir}; use test_utils::skip_if_not_root; + const CGROUP_PARENT: &str = "kata.agent.test.k8s.io"; + fn bind_mount(src: &str, dst: &str, logger: &Logger) -> Result<(), Error> { let src_path = Path::new(src); let dst_path = Path::new(dst); @@ -824,13 +829,39 @@ mod tests { } fn create_dummy_opts() -> CreateOpts { + let start = SystemTime::now(); + let since_the_epoch = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + let root = Root { path: String::from("/"), ..Default::default() }; + let linux_resources = LinuxResources { + devices: vec![LinuxDeviceCgroup { + allow: true, + r#type: String::new(), + major: None, + minor: None, + access: String::from("rwm"), + }], + ..Default::default() + }; + + let cgroups_path = format!( + "/{}/dummycontainer{}", + CGROUP_PARENT, + since_the_epoch.as_millis() + ); + let spec = Spec { - linux: Some(Linux::default()), + linux: Some(Linux { + cgroups_path, + resources: Some(linux_resources), + ..Default::default() + }), root: Some(root), ..Default::default() }; @@ -853,22 +884,24 @@ mod tests { .map_err(|e| anyhow!(e).context("tempdir failed")) .unwrap(); - // Create a new container - ( - LinuxContainer::new( - "some_id", - dir.path().join("rootfs").to_str().unwrap(), - create_dummy_opts(), - &slog_scope::logger(), - ) - .unwrap(), - dir, + let container = LinuxContainer::new( + "some_id", + dir.path().join("rootfs").to_str().unwrap(), + Arc::new(RwLock::new(DevicesCgroupInfo::default())), + create_dummy_opts(), + &slog_scope::logger(), ) + .unwrap(); + + // Create a new container + (container, dir) } #[tokio::test] #[serial] async fn get_container_entry_exist() { + skip_if_not_root!(); + let logger = slog::Logger::root(slog::Discard, o!()); let mut s = Sandbox::new(&logger).unwrap(); let (linux_container, _root) = create_linuxcontainer(); @@ -892,6 +925,8 @@ mod tests { #[tokio::test] #[serial] async fn add_and_get_container() { + skip_if_not_root!(); + let logger = slog::Logger::root(slog::Discard, o!()); let mut s = Sandbox::new(&logger).unwrap(); let (linux_container, _root) = create_linuxcontainer(); @@ -903,6 +938,8 @@ mod tests { #[tokio::test] #[serial] async fn update_shared_pidns() { + skip_if_not_root!(); + let logger = slog::Logger::root(slog::Discard, o!()); let mut s = Sandbox::new(&logger).unwrap(); let test_pid = 9999; @@ -953,6 +990,8 @@ mod tests { #[tokio::test] async fn test_find_container_process() { + skip_if_not_root!(); + let logger = slog::Logger::root(slog::Discard, o!()); let mut s = Sandbox::new(&logger).unwrap(); let cid = "container-123"; @@ -998,6 +1037,8 @@ mod tests { #[tokio::test] async fn test_find_process() { + skip_if_not_root!(); + let logger = slog::Logger::root(slog::Discard, o!()); let test_pids = [std::i32::MIN, -1, 0, 1, std::i32::MAX]; From cec80447447d8e00c57bc29e795c650635115b0c Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Thu, 10 Aug 2023 15:44:50 +0800 Subject: [PATCH 2/6] agent: Make devcg_info optional for LinuxContainer::new() The runk is a standard OCI runtime that isnt' aware of concept of sandbox. Therefore, the `devcg_info` argument of `LinuxContainer::new()` is unneccessary to be provided. Signed-off-by: Xuewei Niu --- src/agent/rustjail/src/cgroups/fs/mod.rs | 133 ++++++++++--------- src/agent/rustjail/src/cgroups/mock.rs | 2 +- src/agent/rustjail/src/container.rs | 28 ++-- src/agent/src/rpc.rs | 6 +- src/agent/src/sandbox.rs | 2 +- src/tools/runk/Cargo.lock | 4 +- src/tools/runk/libcontainer/src/container.rs | 2 + 7 files changed, 87 insertions(+), 90 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index b7f9d9056b..bb21d04623 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -65,7 +65,7 @@ pub struct Manager { #[serde(skip)] cgroup: cgroups::Cgroup, #[serde(skip)] - pod_cgroup: cgroups::Cgroup, + pod_cgroup: Option, #[serde(skip)] devcg_whitelist: bool, } @@ -135,7 +135,9 @@ impl CgroupManager for Manager { ); // apply resources - self.pod_cgroup.apply(pod_res)?; + if let Some(pod_cg) = self.pod_cgroup.as_ref() { + pod_cg.apply(pod_res)?; + } self.cgroup.apply(res)?; Ok(()) @@ -1034,64 +1036,70 @@ impl Manager { pub fn new( cpath: &str, spec: &Spec, - devcg_info: Arc>, + devcg_info: Option>>, ) -> Result { let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; // Do not expect poisoning lock - let mut devices_group_info = devcg_info.write().unwrap(); + let mut devices_group_info = devcg_info.as_ref().map(|i| i.write().unwrap()); + let pod_cgroup: Option; - // Cgroup path of parent of container - let pod_cpath = PathBuf::from(cpath) - .parent() - .context("Get parent of cgroup path")? - .display() - .to_string(); + if let Some(devices_group_info) = devices_group_info.as_mut() { + // Cgroup path of parent of container + let pod_cpath = PathBuf::from(cpath) + .parent() + .context("Get parent of cgroup path")? + .display() + .to_string(); - // Create a cgroup for the pod if not exists. - // Note that this step MUST be done before the pause container's - // cgroup created, since children inherit upper nodes' rules, which - // have excessive permission. You'll feel painful to shrink upper - // nodes' permission. - let pod_cg = load_cgroup(cgroups::hierarchies::auto(), &pod_cpath); + // Create a cgroup for the pod if not exists. + // Note that this step MUST be done before the pause container's + // cgroup created, since children inherit upper nodes' rules, which + // have excessive permission. You'll feel painful to shrink upper + // nodes' permission. + pod_cgroup = Some(load_cgroup(cgroups::hierarchies::auto(), &pod_cpath)); + let pod_cg = pod_cgroup.as_ref().unwrap(); - // The default is blacklist mode. - let is_whitelist_mode = Self::devcg_whitelist_rule(spec).unwrap_or(false); - if devices_group_info.inited { - // Do nothing for whitelist except for updating whitelist - // field of devices_group_info. - debug!(sl(), "Devices cgroup has been initialzied."); + // The default is blacklist mode. + let is_whitelist_mode = Self::devcg_whitelist_rule(spec).unwrap_or(false); + if devices_group_info.inited { + // Do nothing for whitelist except for updating whitelist + // field of devices_group_info. + debug!(sl(), "Devices cgroup has been initialzied."); - // Set allowed all devices to pod cgroup - if !devices_group_info.whitelist && is_whitelist_mode { - info!( + // Set allowed all devices to pod cgroup + if !devices_group_info.whitelist && is_whitelist_mode { + info!( sl(), "Pod devices cgroup is changed to whitelist mode, devices_group_info = {:?}", devices_group_info ); - Self::setup_whitelist_mode(&pod_cg) - .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; - devices_group_info.whitelist = true; + Self::setup_whitelist_mode(pod_cg) + .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; + devices_group_info.whitelist = true; + } + } else { + // This is the first container (aka pause container) + debug!(sl(), "Started to init devices cgroup"); + + pod_cg.create().context("Create pod cgroup")?; + + // Setup blacklist for pod cgroup + if !is_whitelist_mode { + // Setup blacklist rule and allowed default devices for + // blacklist. + Self::setup_blacklist_mode(pod_cg) + .with_context(|| format!("Setup blacklist mode for {}", pod_cpath))?; + } else { + Self::setup_whitelist_mode(pod_cg) + .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; + devices_group_info.whitelist = true; + } + + devices_group_info.inited = true } } else { - // This is the first container (aka pause container) - debug!(sl(), "Started to init devices cgroup"); - - pod_cg.create().context("Create pod cgroup")?; - - // Setup blacklist for pod cgroup - if !is_whitelist_mode { - // Setup blacklist rule and allowed default devices for - // blacklist. - Self::setup_blacklist_mode(&pod_cg) - .with_context(|| format!("Setup blacklist mode for {}", pod_cpath))?; - } else { - Self::setup_whitelist_mode(&pod_cg) - .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; - devices_group_info.whitelist = true; - } - - devices_group_info.inited = true + pod_cgroup = None; } // Create a cgroup for the container. @@ -1099,9 +1107,11 @@ impl Manager { // The rules of container cgroup are copied from its parent, which // contains some permissions that the container doesn't need. // Therefore, resetting the container's devices cgroup is required. - if !devices_group_info.whitelist { - Self::setup_blacklist_mode(&cg) - .with_context(|| format!("Setup blacklist mode for {}", cpath))?; + if let Some(devices_group_info) = devices_group_info.as_ref() { + if !devices_group_info.whitelist { + Self::setup_blacklist_mode(&cg) + .with_context(|| format!("Setup blacklist mode for {}", cpath))?; + } } Ok(Self { @@ -1110,8 +1120,10 @@ impl Manager { // rels: paths, cpath: cpath.to_string(), cgroup: cg, - pod_cgroup: pod_cg, - devcg_whitelist: devices_group_info.whitelist, + pod_cgroup, + devcg_whitelist: devices_group_info + .map(|info| info.whitelist) + .unwrap_or(false), }) } @@ -1123,21 +1135,13 @@ impl Manager { pub fn new_read_only(cpath: &str) -> Result { let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; - // Cgroup path of parent of container - let pod_cpath = PathBuf::from(cpath) - .parent() - .context("Get parent of cgroup path")? - .display() - .to_string(); - - let pod_cg = load_cgroup(cgroups::hierarchies::auto(), &pod_cpath); let cg = load_cgroup(cgroups::hierarchies::auto(), cpath); Ok(Self { paths, mounts, cpath: cpath.to_string(), - pod_cgroup: pod_cg, + pod_cgroup: None, cgroup: cg, devcg_whitelist: false, }) @@ -1489,8 +1493,9 @@ mod tests { }), ..Default::default() }; - managers - .push(Manager::new(&tc.cpath[cid], &spec, sandbox.devcg_info.clone()).unwrap()); + managers.push( + Manager::new(&tc.cpath[cid], &spec, Some(sandbox.devcg_info.clone())).unwrap(), + ); let devcg_info = sandbox.devcg_info.read().unwrap(); assert!(devcg_info.inited); @@ -1501,7 +1506,6 @@ mod tests { ); drop(devcg_info); - // TODO: Assertions go here let pod_devices_list = Command::new("cat") .arg("/sys/fs/cgroup/devices/kata-agent-fs-manager-test/devices.list") .output() @@ -1526,7 +1530,8 @@ mod tests { managers .iter() .for_each(|manager| manager.cgroup.delete().unwrap()); - managers[0].pod_cgroup.delete().unwrap(); + // The pod_cgroup must not be None + managers[0].pod_cgroup.as_ref().unwrap().delete().unwrap(); } } } diff --git a/src/agent/rustjail/src/cgroups/mock.rs b/src/agent/rustjail/src/cgroups/mock.rs index 117bd25223..ce1cfb2bc7 100644 --- a/src/agent/rustjail/src/cgroups/mock.rs +++ b/src/agent/rustjail/src/cgroups/mock.rs @@ -78,7 +78,7 @@ impl Manager { pub fn new( cpath: &str, _spec: &Spec, - _devcg_info: Arc>, + _devcg_info: Option>>, ) -> Result { Ok(Self { paths: HashMap::new(), diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index f7f5e1282b..897c5a3c86 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -1494,7 +1494,7 @@ impl LinuxContainer { pub fn new + Display + Clone>( id: T, base: T, - devcg_info: Arc>, + devcg_info: Option>>, config: Config, logger: &Logger, ) -> Result { @@ -1775,7 +1775,7 @@ mod tests { LinuxContainer::new( "some_id", &dir.path().join("rootfs").to_str().unwrap(), - Arc::new(RwLock::new(DevicesCgroupInfo::default())), + None, create_dummy_opts(), &slog_scope::logger(), ), @@ -1805,14 +1805,10 @@ mod tests { #[test] fn test_linuxcontainer_pause() { let ret = new_linux_container_and_then(|mut c: LinuxContainer| { - c.cgroup_manager = Box::new( - FsManager::new( - "", - &Spec::default(), - Arc::new(RwLock::new(DevicesCgroupInfo::default())), - ) - .map_err(|e| anyhow!(format!("fail to create cgroup manager with path: {:}", e)))?, - ); + c.cgroup_manager = + Box::new(FsManager::new("", &Spec::default(), None).map_err(|e| { + anyhow!(format!("fail to create cgroup manager with path: {:}", e)) + })?); c.pause().map_err(|e| anyhow!(e)) }); @@ -1834,14 +1830,10 @@ mod tests { #[test] fn test_linuxcontainer_resume() { let ret = new_linux_container_and_then(|mut c: LinuxContainer| { - c.cgroup_manager = Box::new( - FsManager::new( - "", - &Spec::default(), - Arc::new(RwLock::new(DevicesCgroupInfo::default())), - ) - .map_err(|e| anyhow!(format!("fail to create cgroup manager with path: {:}", e)))?, - ); + c.cgroup_manager = + Box::new(FsManager::new("", &Spec::default(), None).map_err(|e| { + anyhow!(format!("fail to create cgroup manager with path: {:}", e)) + })?); // Change status to paused, this way we can resume it c.status.transition(ContainerState::Paused); c.resume().map_err(|e| anyhow!(e)) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index b12b3a8265..bb7e2a943a 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -277,7 +277,7 @@ impl AgentService { let mut ctr: LinuxContainer = LinuxContainer::new( cid.as_str(), CONTAINER_BASE, - s.devcg_info.clone(), + Some(s.devcg_info.clone()), opts, &sl(), )?; @@ -1988,7 +1988,6 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> { #[cfg(test)] mod tests { - use std::sync::RwLock; use std::time::{SystemTime, UNIX_EPOCH}; use super::*; @@ -1996,7 +1995,6 @@ mod tests { use nix::mount; use nix::sched::{unshare, CloneFlags}; use oci::{Hook, Hooks, Linux, LinuxDeviceCgroup, LinuxNamespace, LinuxResources}; - use rustjail::cgroups::DevicesCgroupInfo; use tempfile::{tempdir, TempDir}; use test_utils::{assert_result, skip_if_not_root}; use ttrpc::{r#async::TtrpcContext, MessageHeader}; @@ -2074,7 +2072,7 @@ mod tests { LinuxContainer::new( "some_id", dir.path().join("rootfs").to_str().unwrap(), - Arc::new(RwLock::new(DevicesCgroupInfo::default())), + None, create_dummy_opts(), &slog_scope::logger(), ) diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index a1a857a646..8137939c8e 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -887,7 +887,7 @@ mod tests { let container = LinuxContainer::new( "some_id", dir.path().join("rootfs").to_str().unwrap(), - Arc::new(RwLock::new(DevicesCgroupInfo::default())), + None, create_dummy_opts(), &slog_scope::logger(), ) diff --git a/src/tools/runk/Cargo.lock b/src/tools/runk/Cargo.lock index 73f9db2d58..c8884f898e 100644 --- a/src/tools/runk/Cargo.lock +++ b/src/tools/runk/Cargo.lock @@ -292,9 +292,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "cgroups-rs" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b098e7c3a70d03c288fa0a96ccf13e770eb3d78c4cc0e1549b3c13215d5f965" +checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64" dependencies = [ "libc", "log", diff --git a/src/tools/runk/libcontainer/src/container.rs b/src/tools/runk/libcontainer/src/container.rs index f8ddcdfe5f..c7c3e068be 100644 --- a/src/tools/runk/libcontainer/src/container.rs +++ b/src/tools/runk/libcontainer/src/container.rs @@ -361,6 +361,7 @@ pub fn create_linux_container( .map(|s| s.to_string()) .ok_or_else(|| anyhow!("failed to convert bundle path"))? .as_str(), + None, config, logger, )?; @@ -384,6 +385,7 @@ pub fn load_linux_container( .to_str() .map(|s| s.to_string()) .ok_or_else(|| anyhow!("failed to convert a root path"))?, + None, status.config.clone(), logger, )?; From 647782519538b2d73f7571ed63081d34ef1f47dc Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Thu, 17 Aug 2023 11:58:43 +0800 Subject: [PATCH 3/6] agent: Minor changes according to Zhou's comments The changes include: - Change to debug logging level for resources after processed. - Remove a todo for pod cgroup cleanup. - Add an anyhow context to `get_paths_and_mounts()`. - Remove code which denys access to VMROOTFS since it won't take effect. If blackmode is in use, the VMROOTFS will be denyed as default. Otherwise, device cgroups won't be updated in whitelist mode. - Add a unit test for `default_allowed_devices()`. Signed-off-by: Xuewei Niu --- src/agent/rustjail/src/cgroups/fs/mod.rs | 146 +++++++++++++++-------- src/agent/src/device.rs | 3 +- src/agent/src/rpc.rs | 9 +- 3 files changed, 96 insertions(+), 62 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index bb21d04623..9913e945ee 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -19,7 +19,6 @@ use cgroups::{ }; use crate::cgroups::{is_all_devices_rule, Manager as CgroupManager}; -#[cfg(not(test))] use crate::container::DEFAULT_DEVICES; use anyhow::{anyhow, Context, Result}; use libc::{self, pid_t}; @@ -129,7 +128,7 @@ impl CgroupManager for Manager { if !self.devcg_whitelist { set_devices_resources(&self.cgroup, &r.devices, res, pod_res); } - info!( + debug!( sl(), "Resources after processed, pod_res = {:?}, res = {:?}", pod_res, res ); @@ -204,7 +203,6 @@ impl CgroupManager for Manager { err ); } - // TODO: Clean up pod cgroup if pod cgroup has no children. Ok(()) } @@ -537,28 +535,6 @@ fn build_blk_io_device_throttle_resource( blk_io_device_throttle_resources } -#[allow(dead_code)] -fn linux_device_to_device_resource(d: &LinuxDevice) -> Option { - let dev_type = match DeviceType::from_char(d.r#type.chars().next()) { - Some(t) => t, - None => return None, - }; - - let permissions = vec![ - DevicePermissions::Read, - DevicePermissions::Write, - DevicePermissions::MkNod, - ]; - - Some(DeviceResource { - allow: true, - devtype: dev_type, - major: d.major, - minor: d.minor, - access: permissions, - }) -} - fn linux_device_cgroup_to_device_resource(d: &LinuxDeviceCgroup) -> Option { let dev_type = match DeviceType::from_char(d.r#type.chars().next()) { Some(t) => t, @@ -1038,7 +1014,7 @@ impl Manager { spec: &Spec, devcg_info: Option>>, ) -> Result { - let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; + let (paths, mounts) = Self::get_paths_and_mounts(cpath).context("Get paths and mounts")?; // Do not expect poisoning lock let mut devices_group_info = devcg_info.as_ref().map(|i| i.write().unwrap()); @@ -1053,10 +1029,11 @@ impl Manager { .to_string(); // Create a cgroup for the pod if not exists. - // Note that this step MUST be done before the pause container's - // cgroup created, since children inherit upper nodes' rules, which - // have excessive permission. You'll feel painful to shrink upper - // nodes' permission. + // Note that creating pod cgroup MUST be done before the pause + // container's cgroup created, since the upper node might have + // some excessive permissions, and children inherit upper + // node's rules. You'll feel painful to shrink upper nodes' + // permissions if the new permissions are subset of old. pod_cgroup = Some(load_cgroup(cgroups::hierarchies::auto(), &pod_cpath)); let pod_cg = pod_cgroup.as_ref().unwrap(); @@ -1133,7 +1110,7 @@ impl Manager { /// rules is done by the systemd. That is, the cgroupfs manager runs in /// read-only mode. pub fn new_read_only(cpath: &str) -> Result { - let (paths, mounts) = Self::get_paths_and_mounts(cpath)?; + let (paths, mounts) = Self::get_paths_and_mounts(cpath).context("Get paths and mounts")?; let cg = load_cgroup(cgroups::hierarchies::auto(), cpath); @@ -1224,7 +1201,7 @@ impl Manager { // Do not append default allowed devices for simplicity while // testing. #[cfg(not(test))] - dev_res_list.append(&mut Self::default_allowed_devices()); + dev_res_list.append(&mut default_allowed_devices()); let res = cgroups::Resources { devices: cgroups::DeviceResources { @@ -1237,24 +1214,6 @@ impl Manager { Ok(()) } - /// Generate a list for allowed devices including `DEFAULT_DEVICES` and - /// `DEFAULT_ALLOWED_DEVICES`. - #[cfg(not(test))] - fn default_allowed_devices() -> Vec { - let mut dev_res_list = Vec::new(); - DEFAULT_DEVICES.iter().for_each(|dev| { - if let Some(dev_res) = linux_device_to_device_resource(dev) { - dev_res_list.push(dev_res) - } - }); - DEFAULT_ALLOWED_DEVICES.iter().for_each(|dev| { - if let Some(dev_res) = linux_device_cgroup_to_device_resource(dev) { - dev_res_list.push(dev_res) - } - }); - dev_res_list - } - /// Check if OCI spec contains a whiltlist rule. /// /// # Returns @@ -1279,6 +1238,45 @@ impl Manager { } } +/// Generate a list for allowed devices including `DEFAULT_DEVICES` and +/// `DEFAULT_ALLOWED_DEVICES`. +fn default_allowed_devices() -> Vec { + let mut dev_res_list = Vec::new(); + DEFAULT_DEVICES.iter().for_each(|dev| { + if let Some(dev_res) = linux_device_to_device_resource(dev) { + dev_res_list.push(dev_res) + } + }); + DEFAULT_ALLOWED_DEVICES.iter().for_each(|dev| { + if let Some(dev_res) = linux_device_cgroup_to_device_resource(dev) { + dev_res_list.push(dev_res) + } + }); + dev_res_list +} + +/// Convert LinuxDevice to DeviceResource. +fn linux_device_to_device_resource(d: &LinuxDevice) -> Option { + let dev_type = match DeviceType::from_char(d.r#type.chars().next()) { + Some(t) => t, + None => return None, + }; + + let permissions = vec![ + DevicePermissions::Read, + DevicePermissions::Write, + DevicePermissions::MkNod, + ]; + + Some(DeviceResource { + allow: true, + devtype: dev_type, + major: d.major, + minor: d.minor, + access: permissions, + }) +} + // get the guest's online cpus. pub fn get_guest_cpuset() -> Result { let c = fs::read_to_string(GUEST_CPUS_PATH)?; @@ -1327,13 +1325,20 @@ fn convert_memory_swap_to_v2_value(memory_swap: i64, memory: i64) -> Result #[cfg(test)] mod tests { + use std::collections::HashMap; use std::process::Command; + use std::sync::{Arc, RwLock}; - use oci::Linux; - use oci::LinuxResources; + use cgroups::devices::{DevicePermissions, DeviceType}; + use oci::{Linux, LinuxDeviceCgroup, LinuxResources, Spec}; use test_utils::skip_if_not_root; - use super::*; + use super::default_allowed_devices; + use crate::cgroups::fs::{ + line_to_vec, lines_to_map, Manager, DEFAULT_ALLOWED_DEVICES, WILDCARD, + }; + use crate::cgroups::DevicesCgroupInfo; + use crate::container::DEFAULT_DEVICES; #[test] fn test_line_to_vec() { @@ -1534,4 +1539,39 @@ mod tests { managers[0].pod_cgroup.as_ref().unwrap().delete().unwrap(); } } + + #[test] + fn test_default_allowed_devices() { + let allowed_devices = default_allowed_devices(); + assert_eq!( + allowed_devices.len(), + DEFAULT_DEVICES.len() + DEFAULT_ALLOWED_DEVICES.len() + ); + + let allowed_permissions = vec![ + DevicePermissions::Read, + DevicePermissions::Write, + DevicePermissions::MkNod, + ]; + + let default_devices_0 = &allowed_devices[0]; + assert!(default_devices_0.allow); + assert_eq!(default_devices_0.devtype, DeviceType::Char); + assert_eq!(default_devices_0.major, 1); + assert_eq!(default_devices_0.minor, 3); + assert!(default_devices_0 + .access + .iter() + .all(|&p| allowed_permissions.iter().any(|&ap| ap == p))); + + let default_allowed_devices_0 = &allowed_devices[DEFAULT_DEVICES.len()]; + assert!(default_allowed_devices_0.allow); + assert_eq!(default_allowed_devices_0.devtype, DeviceType::Char); + assert_eq!(default_allowed_devices_0.major, WILDCARD); + assert_eq!(default_allowed_devices_0.minor, WILDCARD); + assert_eq!( + default_allowed_devices_0.access, + vec![DevicePermissions::MkNod] + ); + } } diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index da11dad339..5587e77991 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -32,7 +32,6 @@ fn sl() -> slog::Logger { slog_scope::logger().new(o!("subsystem" => "device")) } -pub const VM_ROOTFS: &str = "/"; const BLOCK: &str = "block"; pub const DRIVER_9P_TYPE: &str = "9p"; pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; @@ -1035,6 +1034,8 @@ mod tests { use std::iter::FromIterator; use tempfile::tempdir; + const VM_ROOTFS: &str = "/"; + #[test] fn test_update_device_cgroup() { let mut spec = Spec { diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index bb7e2a943a..276208bf3f 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -52,10 +52,7 @@ use nix::sys::{stat, statfs}; use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; -use crate::device::{ - add_devices, get_virtio_blk_pci_device_name, insert_devices_cgroup_rule, update_env_pci, - DeviceInfo, VM_ROOTFS, -}; +use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_env_pci}; use crate::linux_abi::*; use crate::metrics::get_metrics; use crate::mount::baremount; @@ -239,10 +236,6 @@ impl AgentService { update_container_namespaces(&s, &mut oci, use_sandbox_pidns)?; - // Add the root partition to the device cgroup to prevent access - let dev_info = DeviceInfo::new(VM_ROOTFS, false)?; - insert_devices_cgroup_rule(&mut oci, &dev_info, false, "rw")?; - // Append guest hooks append_guest_hooks(&s, &mut oci)?; From b5f3a8cb399f5e78d2e88eb0cdaa489545de23eb Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Fri, 13 Oct 2023 17:20:41 +0800 Subject: [PATCH 4/6] agent: Fix container launching failure with systemd cgroup FSManager of systemd cgroup manager is responsible for setting up cgroup path. The container launching will be failed if the FSManager is in read-only mode. Signed-off-by: Xuewei Niu --- src/agent/rustjail/src/cgroups/fs/mod.rs | 12 +++++------- src/agent/rustjail/src/cgroups/systemd/manager.rs | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index 9913e945ee..f601f06069 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -1104,15 +1104,13 @@ impl Manager { }) } - /// Create a cgroupfs manager without creating any cgroups. - /// A typical case is for systemd cgroup: Systemd manager retains a - /// cgroupfs manager to read cgroup information only. Writing cgroup - /// rules is done by the systemd. That is, the cgroupfs manager runs in - /// read-only mode. - pub fn new_read_only(cpath: &str) -> Result { + /// Create a cgroupfs manager for systemd cgroup. + /// The device cgroup is disabled in systemd cgroup, given that it is + /// implemented by eBPF. + pub fn new_systemd(cpath: &str) -> Result { let (paths, mounts) = Self::get_paths_and_mounts(cpath).context("Get paths and mounts")?; - let cg = load_cgroup(cgroups::hierarchies::auto(), cpath); + let cg = new_cgroup(cgroups::hierarchies::auto(), cpath)?; Ok(Self { paths, diff --git a/src/agent/rustjail/src/cgroups/systemd/manager.rs b/src/agent/rustjail/src/cgroups/systemd/manager.rs index 07e1ecb80d..b4974d2bb7 100644 --- a/src/agent/rustjail/src/cgroups/systemd/manager.rs +++ b/src/agent/rustjail/src/cgroups/systemd/manager.rs @@ -113,7 +113,7 @@ impl Manager { let (parent_slice, unit_name) = cgroups_path.parse()?; let cpath = parent_slice + "/" + &unit_name; - let fs_manager = FsManager::new_read_only(cpath.as_str())?; + let fs_manager = FsManager::new_systemd(cpath.as_str())?; Ok(Manager { paths: fs_manager.paths.clone(), From 136fb76222753082c702479b8942176ebfa618fb Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Sat, 7 Oct 2023 16:58:20 +0800 Subject: [PATCH 5/6] tests: Add a integrated test for device cgroup `TestDeviceCgroup` is added to cri-containerd's integration tests. The test launches two containers. Each container has a block device. It checks the validity of device cgroup. Signed-off-by: Xuewei Niu --- .../cri-containerd/integration-tests.sh | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/tests/integration/cri-containerd/integration-tests.sh b/tests/integration/cri-containerd/integration-tests.sh index 13caac55f7..58d28dcb71 100755 --- a/tests/integration/cri-containerd/integration-tests.sh +++ b/tests/integration/cri-containerd/integration-tests.sh @@ -430,6 +430,143 @@ function stop_containerd() { sudo systemctl stop containerd } +function mountLoopDevice() { + local loop_file="$1" + if [ -e "$loop_file" ]; then + sudo rm -f $loop_file + info "$loop_file was removed" + fi + + sudo dd if=/dev/zero of=$loop_file bs=100M count=2 + sudo mkfs.ext4 $loop_file + sudo losetup -fP $loop_file + local loinfo=$(sudo losetup -a | grep $loop_file) + local device=$(echo "$loinfo" | awk -F'[: ]' '{print $1}') + echo $device +} + +function startDeviceCgroupContainers() { + local pod_yaml=${REPORT_DIR}/device-cgroup-pod.yaml + local container1_yaml=${REPORT_DIR}/device-cgroup-container1.yaml + local container2_yaml=${REPORT_DIR}/device-cgroup-container2.yaml + local image="busybox:latest" + + cat > "$pod_yaml" < "$container1_yaml" < "$container2_yaml" < Date: Fri, 3 Nov 2023 23:58:08 +0800 Subject: [PATCH 6/6] agent: Changes according to Pan's comments - Disable device cgroup restriction while pod cgroup is not available. - Remove balcklist-related names and change whitelist-related names to allowed_all. Signed-off-by: Xuewei Niu --- src/agent/rustjail/src/cgroups/fs/mod.rs | 145 +++++++++++------------ src/agent/rustjail/src/cgroups/mod.rs | 8 +- 2 files changed, 76 insertions(+), 77 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index f601f06069..e00650494c 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -18,7 +18,7 @@ use cgroups::{ DeviceResource, HugePageResource, MaxValue, NetworkPriority, }; -use crate::cgroups::{is_all_devices_rule, Manager as CgroupManager}; +use crate::cgroups::{rule_for_all_devices, Manager as CgroupManager}; use crate::container::DEFAULT_DEVICES; use anyhow::{anyhow, Context, Result}; use libc::{self, pid_t}; @@ -66,7 +66,7 @@ pub struct Manager { #[serde(skip)] pod_cgroup: Option, #[serde(skip)] - devcg_whitelist: bool, + devcg_allowed_all: bool, } // set_resource is used to set reources by cgroup controller. @@ -125,7 +125,7 @@ impl CgroupManager for Manager { } // set devices resources - if !self.devcg_whitelist { + if !self.devcg_allowed_all { set_devices_resources(&self.cgroup, &r.devices, res, pod_res); } debug!( @@ -329,7 +329,7 @@ fn set_devices_resources( let mut devices = vec![]; for d in device_resources.iter() { - if is_all_devices_rule(d) { + if rule_for_all_devices(d) { continue; } if let Some(dev) = linux_device_cgroup_to_device_resource(d) { @@ -1024,56 +1024,57 @@ impl Manager { // Cgroup path of parent of container let pod_cpath = PathBuf::from(cpath) .parent() - .context("Get parent of cgroup path")? + .unwrap_or(Path::new("/")) .display() .to_string(); - // Create a cgroup for the pod if not exists. - // Note that creating pod cgroup MUST be done before the pause - // container's cgroup created, since the upper node might have - // some excessive permissions, and children inherit upper - // node's rules. You'll feel painful to shrink upper nodes' - // permissions if the new permissions are subset of old. - pod_cgroup = Some(load_cgroup(cgroups::hierarchies::auto(), &pod_cpath)); - let pod_cg = pod_cgroup.as_ref().unwrap(); - - // The default is blacklist mode. - let is_whitelist_mode = Self::devcg_whitelist_rule(spec).unwrap_or(false); - if devices_group_info.inited { - // Do nothing for whitelist except for updating whitelist - // field of devices_group_info. - debug!(sl(), "Devices cgroup has been initialzied."); - - // Set allowed all devices to pod cgroup - if !devices_group_info.whitelist && is_whitelist_mode { - info!( - sl(), - "Pod devices cgroup is changed to whitelist mode, devices_group_info = {:?}", - devices_group_info - ); - Self::setup_whitelist_mode(pod_cg) - .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; - devices_group_info.whitelist = true; - } + if pod_cpath.as_str() == "/" { + // Skip setting pod cgroup for cpath due to no parent path + pod_cgroup = None } else { - // This is the first container (aka pause container) - debug!(sl(), "Started to init devices cgroup"); + // Create a cgroup for the pod if not exists. + // Note that creating pod cgroup MUST be done before the pause + // container's cgroup created, since the upper node might have + // some excessive permissions, and children inherit upper + // node's rules. You'll feel painful to shrink upper nodes' + // permissions if the new permissions are subset of old. + pod_cgroup = Some(load_cgroup(cgroups::hierarchies::auto(), &pod_cpath)); + let pod_cg = pod_cgroup.as_ref().unwrap(); - pod_cg.create().context("Create pod cgroup")?; + let is_allowded_all = Self::has_allowed_all_devices_rule(spec); + if devices_group_info.inited { + debug!(sl(), "Devices cgroup has been initialzied."); - // Setup blacklist for pod cgroup - if !is_whitelist_mode { - // Setup blacklist rule and allowed default devices for - // blacklist. - Self::setup_blacklist_mode(pod_cg) - .with_context(|| format!("Setup blacklist mode for {}", pod_cpath))?; + // Set allowed all devices to pod cgroup + if !devices_group_info.allowed_all && is_allowded_all { + info!( + sl(), + "Pod devices cgroup is changed to allowed all devices mode, devices_group_info = {:?}", + devices_group_info + ); + Self::setup_allowed_all_mode(pod_cg).with_context(|| { + format!("Setup allowed all devices mode for {}", pod_cpath) + })?; + devices_group_info.allowed_all = true; + } } else { - Self::setup_whitelist_mode(pod_cg) - .with_context(|| format!("Setup whitelist mode for {}", pod_cpath))?; - devices_group_info.whitelist = true; - } + // This is the first container (aka pause container) + debug!(sl(), "Started to init devices cgroup"); - devices_group_info.inited = true + pod_cg.create().context("Create pod cgroup")?; + + if !is_allowded_all { + Self::setup_devcg_whitelist(pod_cg).with_context(|| { + format!("Setup device cgroup whitelist for {}", pod_cpath) + })?; + } else { + Self::setup_allowed_all_mode(pod_cg) + .with_context(|| format!("Setup allowed all mode for {}", pod_cpath))?; + devices_group_info.allowed_all = true; + } + + devices_group_info.inited = true + } } } else { pod_cgroup = None; @@ -1085,9 +1086,9 @@ impl Manager { // contains some permissions that the container doesn't need. // Therefore, resetting the container's devices cgroup is required. if let Some(devices_group_info) = devices_group_info.as_ref() { - if !devices_group_info.whitelist { - Self::setup_blacklist_mode(&cg) - .with_context(|| format!("Setup blacklist mode for {}", cpath))?; + if !devices_group_info.allowed_all { + Self::setup_devcg_whitelist(&cg) + .with_context(|| format!("Setup device cgroup whitelist for {}", cpath))?; } } @@ -1098,8 +1099,8 @@ impl Manager { cpath: cpath.to_string(), cgroup: cg, pod_cgroup, - devcg_whitelist: devices_group_info - .map(|info| info.whitelist) + devcg_allowed_all: devices_group_info + .map(|info| info.allowed_all) .unwrap_or(false), }) } @@ -1118,7 +1119,7 @@ impl Manager { cpath: cpath.to_string(), pod_cgroup: None, cgroup: cg, - devcg_whitelist: false, + devcg_allowed_all: false, }) } @@ -1143,7 +1144,7 @@ impl Manager { Ok((m, mounts)) } - fn setup_whitelist_mode(cgroup: &cgroups::Cgroup) -> Result<()> { + fn setup_allowed_all_mode(cgroup: &cgroups::Cgroup) -> Result<()> { // Insert two rules: `b *:* rwm` and `c *:* rwm`. // The reason of not inserting `a *:* rwm` is that the Linux kernel // will deny writing `a` to `devices.allow` once a cgroup has @@ -1183,7 +1184,10 @@ impl Manager { Ok(()) } - fn setup_blacklist_mode(cgroup: &cgroups::Cgroup) -> Result<()> { + /// Setup device cgroup whitelist: + /// - Deny all devices in order to cleanup device cgroup. + /// - Allow default devices and default allowed devices. + fn setup_devcg_whitelist(cgroup: &cgroups::Cgroup) -> Result<()> { #[allow(unused_mut)] let mut dev_res_list = vec![DeviceResource { allow: false, @@ -1212,27 +1216,22 @@ impl Manager { Ok(()) } - /// Check if OCI spec contains a whiltlist rule. - /// - /// # Returns - /// - /// * `Some(true)` - Contains whitelist all rule; - /// * `Some(false)` - Contains blacklist all rule; - /// * `None` - Neither whitelist rule nor blacklist rule found. - fn devcg_whitelist_rule(spec: &Spec) -> Option { + /// Check if OCI spec contains a rule of allowed all devices. + fn has_allowed_all_devices_rule(spec: &Spec) -> bool { let linux = match spec.linux.as_ref() { Some(linux) => linux, - None => return None, + None => return false, }; let resources = match linux.resources.as_ref() { Some(resource) => resource, - None => return None, + None => return false, }; resources .devices .iter() - .find(|dev| is_all_devices_rule(dev)) + .find(|dev| rule_for_all_devices(dev)) .map(|dev| dev.allow) + .unwrap_or_default() } } @@ -1404,7 +1403,7 @@ mod tests { struct TestCase { cpath: Vec, devices: Vec>, - whitelist: Vec, + allowed_all: Vec, pod_devices_list: Vec, container_devices_list: Vec, } @@ -1431,7 +1430,7 @@ mod tests { "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", )], devices: vec![vec![allow_all.clone()]], - whitelist: vec![true], + allowed_all: vec![true], pod_devices_list: vec![String::from("a *:* rwm\n")], container_devices_list: vec![String::from("a *:* rwm\n")], }, @@ -1440,7 +1439,7 @@ mod tests { "/kata-agent-fs-manager-test/449ccd81-9320-4f3e-bb67-78f84700fac9", )], devices: vec![vec![deny_all.clone()]], - whitelist: vec![false], + allowed_all: vec![false], pod_devices_list: vec![String::new()], container_devices_list: vec![String::new()], }, @@ -1454,7 +1453,7 @@ mod tests { ), ], devices: vec![vec![deny_all.clone()], vec![allow_all.clone()]], - whitelist: vec![false, true], + allowed_all: vec![false, true], pod_devices_list: vec![String::new(), String::from("b *:* rwm\nc *:* rwm\n")], container_devices_list: vec![String::new(), String::from("b *:* rwm\nc *:* rwm\n")], }, @@ -1468,7 +1467,7 @@ mod tests { ), ], devices: vec![vec![allow_all], vec![deny_all]], - whitelist: vec![true, true], + allowed_all: vec![true, true], pod_devices_list: vec![String::from("a *:* rwm\n"), String::from("a *:* rwm\n")], container_devices_list: vec![ String::from("a *:* rwm\n"), @@ -1481,7 +1480,7 @@ mod tests { let sandbox = MockSandbox::new(); let devcg_info = sandbox.devcg_info.read().unwrap(); assert!(!devcg_info.inited); - assert!(!devcg_info.whitelist); + assert!(!devcg_info.allowed_all); drop(devcg_info); let mut managers = Vec::with_capacity(tc.devices.len()); @@ -1503,8 +1502,8 @@ mod tests { let devcg_info = sandbox.devcg_info.read().unwrap(); assert!(devcg_info.inited); assert_eq!( - devcg_info.whitelist, tc.whitelist[cid], - "Round {}, cid {} whitelist assertion failure", + devcg_info.allowed_all, tc.allowed_all[cid], + "Round {}, cid {} allowed all assertion failure", round, cid ); drop(devcg_info); diff --git a/src/agent/rustjail/src/cgroups/mod.rs b/src/agent/rustjail/src/cgroups/mod.rs index ec4727b406..7eb8a73763 100644 --- a/src/agent/rustjail/src/cgroups/mod.rs +++ b/src/agent/rustjail/src/cgroups/mod.rs @@ -22,7 +22,7 @@ pub struct DevicesCgroupInfo { inited: bool, /// Indicate if pod's devices cgroup is in whitelist mode. Returns true /// once one container requires `a *:* rwm` permission. - whitelist: bool, + allowed_all: bool, } pub trait Manager { @@ -71,15 +71,15 @@ impl Debug for dyn Manager + Send + Sync { } } -/// Check if device cgroup is an all rule from OCI spec. +/// Check if device cgroup is a rule for all devices from OCI spec. /// /// The formats representing all devices between OCI spec and cgroups-rs /// are different. -/// - OCI spec: major: 0, minor: 0, type: "" (All devices), access: "rwm"; +/// - OCI spec: major: 0, minor: 0, type: "", access: "rwm"; /// - Cgroups-rs: major: -1, minor: -1, type: "a", access: "rwm"; /// - Linux: a *:* rwm #[inline] -fn is_all_devices_rule(dev_cgroup: &LinuxDeviceCgroup) -> bool { +fn rule_for_all_devices(dev_cgroup: &LinuxDeviceCgroup) -> bool { dev_cgroup.major.unwrap_or(0) == 0 && dev_cgroup.minor.unwrap_or(0) == 0 && (dev_cgroup.r#type.as_str() == "" || dev_cgroup.r#type.as_str() == "a")