diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 24b8be55b8..b789d24f27 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -35,9 +35,9 @@ const VM_ROOTFS: &str = "/"; const BLOCK: &str = "block"; pub const DRIVER_9P_TYPE: &str = "9p"; pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; -pub const DRIVER_BLK_TYPE: &str = "blk"; +pub const DRIVER_BLK_PCI_TYPE: &str = "blk"; pub const DRIVER_BLK_CCW_TYPE: &str = "blk-ccw"; -pub const DRIVER_MMIO_BLK_TYPE: &str = "mmioblk"; +pub const DRIVER_BLK_MMIO_TYPE: &str = "mmioblk"; pub const DRIVER_SCSI_TYPE: &str = "scsi"; pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; @@ -937,9 +937,9 @@ async fn add_device(device: &Device, sandbox: &Arc>) -> Result virtio_blk_device_handler(device, sandbox).await, + DRIVER_BLK_PCI_TYPE => virtio_blk_device_handler(device, sandbox).await, DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_device_handler(device, sandbox).await, - DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, sandbox).await, + DRIVER_BLK_MMIO_TYPE => virtiommio_blk_device_handler(device, sandbox).await, DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, sandbox).await, DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, sandbox).await, DRIVER_VFIO_PCI_GK_TYPE | DRIVER_VFIO_PCI_TYPE => { diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs index a869e5afa3..7e59e2daab 100644 --- a/src/agent/src/main.rs +++ b/src/agent/src/main.rs @@ -48,6 +48,7 @@ mod pci; pub mod random; mod sandbox; mod signal; +mod storage; mod uevent; mod util; mod version; diff --git a/src/agent/src/mount.rs b/src/agent/src/mount.rs index ca4253aea1..e7bbf96f8d 100644 --- a/src/agent/src/mount.rs +++ b/src/agent/src/mount.rs @@ -5,92 +5,22 @@ use std::collections::HashMap; use std::fmt::Debug; -use std::fs::{self, File, OpenOptions}; -use std::io::{BufRead, BufReader, Write}; -use std::iter; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader}; use std::ops::Deref; -use std::os::unix::fs::{MetadataExt, PermissionsExt}; use std::path::Path; -use std::str::FromStr; -use std::sync::Arc; use anyhow::{anyhow, Context, Result}; -use kata_sys_util::mount::get_linux_mount_info; +use kata_sys_util::mount::{get_linux_mount_info, parse_mount_options}; use nix::mount::MsFlags; -use nix::unistd::{Gid, Uid}; use regex::Regex; use slog::Logger; -use tokio::sync::Mutex; use tracing::instrument; -use crate::device::{ - get_scsi_device_name, get_virtio_blk_pci_device_name, get_virtio_mmio_device_name, - online_device, wait_for_pmem_device, DRIVER_9P_TYPE, DRIVER_BLK_CCW_TYPE, DRIVER_BLK_TYPE, - DRIVER_EPHEMERAL_TYPE, DRIVER_LOCAL_TYPE, DRIVER_MMIO_BLK_TYPE, DRIVER_NVDIMM_TYPE, - DRIVER_OVERLAYFS_TYPE, DRIVER_SCSI_TYPE, DRIVER_VIRTIOFS_TYPE, DRIVER_WATCHABLE_BIND_TYPE, - FS_TYPE_HUGETLB, -}; +use crate::device::online_device; use crate::linux_abi::*; -use crate::pci; -use crate::protocols::agent::Storage; -use crate::protocols::types::FSGroupChangePolicy; -use crate::Sandbox; -#[cfg(target_arch = "s390x")] -use crate::{ccw, device::get_virtio_blk_ccw_device_name}; pub const TYPE_ROOTFS: &str = "rootfs"; -pub const MOUNT_GUEST_TAG: &str = "kataShared"; - -// Allocating an FSGroup that owns the pod's volumes -const FS_GID: &str = "fsgid"; -const FS_GID_EQ: &str = "fsgid="; -const SYS_FS_HUGEPAGES_PREFIX: &str = "/sys/kernel/mm/hugepages"; - -const RW_MASK: u32 = 0o660; -const RO_MASK: u32 = 0o440; -const EXEC_MASK: u32 = 0o110; -const MODE_SETGID: u32 = 0o2000; - -#[rustfmt::skip] -lazy_static! { - pub static ref FLAGS: HashMap<&'static str, (bool, MsFlags)> = { - let mut m = HashMap::new(); - m.insert("defaults", (false, MsFlags::empty())); - m.insert("ro", (false, MsFlags::MS_RDONLY)); - m.insert("rw", (true, MsFlags::MS_RDONLY)); - m.insert("suid", (true, MsFlags::MS_NOSUID)); - m.insert("nosuid", (false, MsFlags::MS_NOSUID)); - m.insert("dev", (true, MsFlags::MS_NODEV)); - m.insert("nodev", (false, MsFlags::MS_NODEV)); - m.insert("exec", (true, MsFlags::MS_NOEXEC)); - m.insert("noexec", (false, MsFlags::MS_NOEXEC)); - m.insert("sync", (false, MsFlags::MS_SYNCHRONOUS)); - m.insert("async", (true, MsFlags::MS_SYNCHRONOUS)); - m.insert("dirsync", (false, MsFlags::MS_DIRSYNC)); - m.insert("remount", (false, MsFlags::MS_REMOUNT)); - m.insert("mand", (false, MsFlags::MS_MANDLOCK)); - m.insert("nomand", (true, MsFlags::MS_MANDLOCK)); - m.insert("atime", (true, MsFlags::MS_NOATIME)); - m.insert("noatime", (false, MsFlags::MS_NOATIME)); - m.insert("diratime", (true, MsFlags::MS_NODIRATIME)); - m.insert("nodiratime", (false, MsFlags::MS_NODIRATIME)); - m.insert("bind", (false, MsFlags::MS_BIND)); - m.insert("rbind", (false, MsFlags::MS_BIND | MsFlags::MS_REC)); - m.insert("unbindable", (false, MsFlags::MS_UNBINDABLE)); - m.insert("runbindable", (false, MsFlags::MS_UNBINDABLE | MsFlags::MS_REC)); - m.insert("private", (false, MsFlags::MS_PRIVATE)); - m.insert("rprivate", (false, MsFlags::MS_PRIVATE | MsFlags::MS_REC)); - m.insert("shared", (false, MsFlags::MS_SHARED)); - m.insert("rshared", (false, MsFlags::MS_SHARED | MsFlags::MS_REC)); - m.insert("slave", (false, MsFlags::MS_SLAVE)); - m.insert("rslave", (false, MsFlags::MS_SLAVE | MsFlags::MS_REC)); - m.insert("relatime", (false, MsFlags::MS_RELATIME)); - m.insert("norelatime", (true, MsFlags::MS_RELATIME)); - m.insert("strictatime", (false, MsFlags::MS_STRICTATIME)); - m.insert("nostrictatime", (true, MsFlags::MS_STRICTATIME)); - m - }; -} #[derive(Debug, PartialEq)] pub struct InitMount<'a> { @@ -133,19 +63,6 @@ lazy_static! { ]; } -pub const STORAGE_HANDLER_LIST: &[&str] = &[ - DRIVER_BLK_TYPE, - DRIVER_9P_TYPE, - DRIVER_VIRTIOFS_TYPE, - DRIVER_EPHEMERAL_TYPE, - DRIVER_OVERLAYFS_TYPE, - DRIVER_MMIO_BLK_TYPE, - DRIVER_LOCAL_TYPE, - DRIVER_SCSI_TYPE, - DRIVER_NVDIMM_TYPE, - DRIVER_WATCHABLE_BIND_TYPE, -]; - #[instrument] pub fn baremount( source: &Path, @@ -170,16 +87,12 @@ pub fn baremount( } let destination_str = destination.to_string_lossy(); - let mut already_mounted = false; if let Ok(m) = get_linux_mount_info(destination_str.deref()) { if m.fs_type == fs_type { - already_mounted = true; + slog_info!(logger, "{source:?} is already mounted at {destination:?}"); + return Ok(()); } } - if already_mounted { - slog_info!(logger, "{source:?} is already mounted at {destination:?}"); - return Ok(()); - } info!( logger, @@ -208,560 +121,6 @@ pub fn baremount( }) } -#[instrument] -async fn ephemeral_storage_handler( - logger: &Logger, - storage: &Storage, - _sandbox: &Arc>, -) -> Result { - // hugetlbfs - if storage.fstype == FS_TYPE_HUGETLB { - return handle_hugetlbfs_storage(logger, storage).await; - } - - // normal ephemeral storage - fs::create_dir_all(&storage.mount_point)?; - - if !storage.options.is_empty() { - // By now we only support one option field: "fsGroup" which - // isn't an valid mount option, thus we should remove it when - // do mount. - let mut new_storage = storage.clone(); - new_storage.options = Default::default(); - common_storage_handler(logger, &new_storage)?; - - let opts = parse_options(&storage.options); - - // ephemeral_storage didn't support mount options except fsGroup. - if let Some(fsgid) = opts.get(FS_GID) { - let gid = fsgid.parse::()?; - - nix::unistd::chown(storage.mount_point.as_str(), None, Some(Gid::from_raw(gid)))?; - - let meta = fs::metadata(&storage.mount_point)?; - let mut permission = meta.permissions(); - - let o_mode = meta.mode() | MODE_SETGID; - permission.set_mode(o_mode); - fs::set_permissions(&storage.mount_point, permission)?; - } - } else { - common_storage_handler(logger, storage)?; - } - - Ok("".to_string()) -} - -// update_ephemeral_mounts takes a list of ephemeral mounts and remounts them -// with mount options passed by the caller -#[instrument] -pub async fn update_ephemeral_mounts( - logger: Logger, - storages: &[Storage], - _sandbox: &Arc>, -) -> Result<()> { - for storage in storages { - let handler_name = &storage.driver; - let logger = logger.new(o!( - "msg" => "updating tmpfs storage", - "subsystem" => "storage", - "storage-type" => handler_name.to_owned())); - - match handler_name.as_str() { - DRIVER_EPHEMERAL_TYPE => { - fs::create_dir_all(&storage.mount_point)?; - - if storage.options.is_empty() { - continue; - } else { - // assume that fsGid has already been set - let mount_path = Path::new(&storage.mount_point); - let src_path = Path::new(&storage.source); - let opts = storage - .options - .iter() - .filter(|&opt| !opt.starts_with(FS_GID_EQ)); - let (flags, options) = parse_mount_flags_and_options(opts); - - info!(logger, "mounting storage"; - "mount-source" => src_path.display(), - "mount-destination" => mount_path.display(), - "mount-fstype" => storage.fstype.as_str(), - "mount-options" => options.as_str(), - ); - - baremount( - src_path, - mount_path, - storage.fstype.as_str(), - flags, - options.as_str(), - &logger, - )?; - } - } - _ => { - return Err(anyhow!( - "Unsupported storage type for syncing mounts {}. Only ephemeral storage update is supported", - storage.driver - )); - } - }; - } - - Ok(()) -} - -#[instrument] -async fn overlayfs_storage_handler( - logger: &Logger, - storage: &Storage, - cid: Option<&str>, - _sandbox: &Arc>, -) -> Result { - if storage - .options - .iter() - .any(|e| e == "io.katacontainers.fs-opt.overlay-rw") - { - let cid = cid.ok_or_else(|| anyhow!("No container id in rw overlay"))?; - let cpath = Path::new(crate::rpc::CONTAINER_BASE).join(cid); - let work = cpath.join("work"); - let upper = cpath.join("upper"); - - fs::create_dir_all(&work).context("Creating overlay work directory")?; - fs::create_dir_all(&upper).context("Creating overlay upper directory")?; - - let mut storage = storage.clone(); - storage.fstype = "overlay".into(); - storage - .options - .push(format!("upperdir={}", upper.to_string_lossy())); - storage - .options - .push(format!("workdir={}", work.to_string_lossy())); - return common_storage_handler(logger, &storage); - } - - common_storage_handler(logger, storage) -} - -#[instrument] -async fn local_storage_handler( - _logger: &Logger, - storage: &Storage, - _sandbox: &Arc>, -) -> Result { - fs::create_dir_all(&storage.mount_point).context(format!( - "failed to create dir all {:?}", - &storage.mount_point - ))?; - - let opts = parse_options(&storage.options); - - let mut need_set_fsgid = false; - if let Some(fsgid) = opts.get(FS_GID) { - let gid = fsgid.parse::()?; - - nix::unistd::chown(storage.mount_point.as_str(), None, Some(Gid::from_raw(gid)))?; - need_set_fsgid = true; - } - - if let Some(mode) = opts.get("mode") { - let mut permission = fs::metadata(&storage.mount_point)?.permissions(); - - let mut o_mode = u32::from_str_radix(mode, 8)?; - - if need_set_fsgid { - // set SetGid mode mask. - o_mode |= MODE_SETGID; - } - permission.set_mode(o_mode); - - fs::set_permissions(&storage.mount_point, permission)?; - } - - Ok("".to_string()) -} - -#[instrument] -async fn virtio9p_storage_handler( - logger: &Logger, - storage: &Storage, - _sandbox: &Arc>, -) -> Result { - common_storage_handler(logger, storage) -} - -#[instrument] -async fn handle_hugetlbfs_storage(logger: &Logger, storage: &Storage) -> Result { - info!(logger, "handle hugetlbfs storage"); - // Allocate hugepages before mount - // /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages - // /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - // options eg "pagesize=2097152,size=524288000"(2M, 500M) - allocate_hugepages(logger, &storage.options).context("allocate hugepages")?; - - common_storage_handler(logger, storage)?; - - // hugetlbfs return empty string as ephemeral_storage_handler do. - // this is a sandbox level storage, but not a container-level mount. - Ok("".to_string()) -} - -// Allocate hugepages by writing to sysfs -fn allocate_hugepages(logger: &Logger, options: &[String]) -> Result<()> { - info!(logger, "mounting hugePages storage options: {:?}", options); - - let (pagesize, size) = get_pagesize_and_size_from_option(options) - .context(format!("parse mount options: {:?}", &options))?; - - info!( - logger, - "allocate hugepages. pageSize: {}, size: {}", pagesize, size - ); - - // sysfs entry is always of the form hugepages-${pagesize}kB - // Ref: https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt - let path = Path::new(SYS_FS_HUGEPAGES_PREFIX) - .join(format!("hugepages-{}kB", pagesize / 1024)) - .join("nr_hugepages"); - - // write numpages to nr_hugepages file. - let numpages = format!("{}", size / pagesize); - info!(logger, "write {} pages to {:?}", &numpages, &path); - - let mut file = OpenOptions::new() - .write(true) - .open(&path) - .context(format!("open nr_hugepages directory {:?}", &path))?; - - file.write_all(numpages.as_bytes()) - .context(format!("write nr_hugepages failed: {:?}", &path))?; - - // Even if the write succeeds, the kernel isn't guaranteed to be - // able to allocate all the pages we requested. Verify that it - // did. - let verify = fs::read_to_string(&path).context(format!("reading {:?}", &path))?; - let allocated = verify - .trim_end() - .parse::() - .map_err(|_| anyhow!("Unexpected text {:?} in {:?}", &verify, &path))?; - if allocated != size / pagesize { - return Err(anyhow!( - "Only allocated {} of {} hugepages of size {}", - allocated, - numpages, - pagesize - )); - } - - Ok(()) -} - -// Parse filesystem options string to retrieve hugepage details -// options eg "pagesize=2048,size=107374182" -fn get_pagesize_and_size_from_option(options: &[String]) -> Result<(u64, u64)> { - let mut pagesize_str: Option<&str> = None; - let mut size_str: Option<&str> = None; - - for option in options { - let vars: Vec<&str> = option.trim().split(',').collect(); - - for var in vars { - if let Some(stripped) = var.strip_prefix("pagesize=") { - pagesize_str = Some(stripped); - } else if let Some(stripped) = var.strip_prefix("size=") { - size_str = Some(stripped); - } - - if pagesize_str.is_some() && size_str.is_some() { - break; - } - } - } - - if pagesize_str.is_none() || size_str.is_none() { - return Err(anyhow!("no pagesize/size options found")); - } - - let pagesize = pagesize_str - .unwrap() - .parse::() - .context(format!("parse pagesize: {:?}", &pagesize_str))?; - let size = size_str - .unwrap() - .parse::() - .context(format!("parse size: {:?}", &pagesize_str))?; - - Ok((pagesize, size)) -} - -// virtiommio_blk_storage_handler handles the storage for mmio blk driver. -#[instrument] -async fn virtiommio_blk_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, -) -> Result { - let storage = storage.clone(); - if !Path::new(&storage.source).exists() { - get_virtio_mmio_device_name(sandbox, &storage.source) - .await - .context("failed to get mmio device name")?; - } - //The source path is VmPath - common_storage_handler(logger, &storage) -} - -// virtiofs_storage_handler handles the storage for virtio-fs. -#[instrument] -async fn virtiofs_storage_handler( - logger: &Logger, - storage: &Storage, - _sandbox: &Arc>, -) -> Result { - common_storage_handler(logger, storage) -} - -// virtio_blk_storage_handler handles the storage for blk driver. -#[instrument] -async fn virtio_blk_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, -) -> Result { - let mut storage = storage.clone(); - // If hot-plugged, get the device node path based on the PCI path - // otherwise use the virt path provided in Storage Source - if storage.source.starts_with("/dev") { - let metadata = fs::metadata(&storage.source) - .context(format!("get metadata on file {:?}", &storage.source))?; - let mode = metadata.permissions().mode(); - if mode & libc::S_IFBLK == 0 { - return Err(anyhow!("Invalid device {}", &storage.source)); - } - } else { - let pcipath = pci::Path::from_str(&storage.source)?; - let dev_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - storage.source = dev_path; - } - - common_storage_handler(logger, &storage) -} - -// virtio_blk_ccw_storage_handler handles storage for the blk-ccw driver (s390x) -#[cfg(target_arch = "s390x")] -#[instrument] -async fn virtio_blk_ccw_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, -) -> Result { - let mut storage = storage.clone(); - let ccw_device = ccw::Device::from_str(&storage.source)?; - let dev_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; - storage.source = dev_path; - common_storage_handler(logger, &storage) -} - -#[cfg(not(target_arch = "s390x"))] -#[instrument] -async fn virtio_blk_ccw_storage_handler( - _: &Logger, - _: &Storage, - _: &Arc>, -) -> Result { - Err(anyhow!("CCW is only supported on s390x")) -} - -// virtio_scsi_storage_handler handles the storage for scsi driver. -#[instrument] -async fn virtio_scsi_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, -) -> Result { - let mut storage = storage.clone(); - - // Retrieve the device path from SCSI address. - let dev_path = get_scsi_device_name(sandbox, &storage.source).await?; - storage.source = dev_path; - - common_storage_handler(logger, &storage) -} - -#[instrument] -fn common_storage_handler(logger: &Logger, storage: &Storage) -> Result { - mount_storage(logger, storage)?; - set_ownership(logger, storage)?; - Ok(storage.mount_point.clone()) -} - -// nvdimm_storage_handler handles the storage for NVDIMM driver. -#[instrument] -async fn nvdimm_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, -) -> Result { - let storage = storage.clone(); - - // Retrieve the device path from NVDIMM address. - wait_for_pmem_device(sandbox, &storage.source).await?; - - common_storage_handler(logger, &storage) -} - -async fn bind_watcher_storage_handler( - logger: &Logger, - storage: &Storage, - sandbox: &Arc>, - cid: Option, -) -> Result<()> { - let mut locked = sandbox.lock().await; - - if let Some(cid) = cid { - locked - .bind_watcher - .add_container(cid, iter::once(storage.clone()), logger) - .await - } else { - Ok(()) - } -} - -// mount_storage performs the mount described by the storage structure. -#[instrument] -fn mount_storage(logger: &Logger, storage: &Storage) -> Result<()> { - let logger = logger.new(o!("subsystem" => "mount")); - - // There's a special mechanism to create mountpoint from a `sharedfs` instance before - // starting the kata-agent. Check for such cases. - if storage.source == MOUNT_GUEST_TAG && is_mounted(&storage.mount_point)? { - warn!( - logger, - "{} already mounted on {}, ignoring...", MOUNT_GUEST_TAG, &storage.mount_point - ); - return Ok(()); - } - - let mount_path = Path::new(&storage.mount_point); - let src_path = Path::new(&storage.source); - if storage.fstype == "bind" && !src_path.is_dir() { - ensure_destination_file_exists(mount_path).context("Could not create mountpoint file")?; - } else { - fs::create_dir_all(mount_path) - .map_err(anyhow::Error::from) - .context("Could not create mountpoint")?; - } - let (flags, options) = parse_mount_flags_and_options(storage.options.iter()); - - info!(logger, "mounting storage"; - "mount-source" => src_path.display(), - "mount-destination" => mount_path.display(), - "mount-fstype" => storage.fstype.as_str(), - "mount-options" => options.as_str(), - ); - - baremount( - src_path, - mount_path, - storage.fstype.as_str(), - flags, - options.as_str(), - &logger, - ) -} - -#[instrument] -pub fn set_ownership(logger: &Logger, storage: &Storage) -> Result<()> { - let logger = logger.new(o!("subsystem" => "mount", "fn" => "set_ownership")); - - // If fsGroup is not set, skip performing ownership change - if storage.fs_group.is_none() { - return Ok(()); - } - - let fs_group = storage.fs_group(); - let read_only = storage.options.contains(&String::from("ro")); - let mount_path = Path::new(&storage.mount_point); - let metadata = mount_path.metadata().map_err(|err| { - error!(logger, "failed to obtain metadata for mount path"; - "mount-path" => mount_path.to_str(), - "error" => err.to_string(), - ); - err - })?; - - if fs_group.group_change_policy == FSGroupChangePolicy::OnRootMismatch.into() - && metadata.gid() == fs_group.group_id - { - let mut mask = if read_only { RO_MASK } else { RW_MASK }; - mask |= EXEC_MASK; - - // With fsGroup change policy to OnRootMismatch, if the current - // gid of the mount path root directory matches the desired gid - // and the current permission of mount path root directory is correct, - // then ownership change will be skipped. - let current_mode = metadata.permissions().mode(); - if (mask & current_mode == mask) && (current_mode & MODE_SETGID != 0) { - info!(logger, "skipping ownership change for volume"; - "mount-path" => mount_path.to_str(), - "fs-group" => fs_group.group_id.to_string(), - ); - return Ok(()); - } - } - - info!(logger, "performing recursive ownership change"; - "mount-path" => mount_path.to_str(), - "fs-group" => fs_group.group_id.to_string(), - ); - recursive_ownership_change( - mount_path, - None, - Some(Gid::from_raw(fs_group.group_id)), - read_only, - ) -} - -#[instrument] -pub fn recursive_ownership_change( - path: &Path, - uid: Option, - gid: Option, - read_only: bool, -) -> Result<()> { - let mut mask = if read_only { RO_MASK } else { RW_MASK }; - if path.is_dir() { - for entry in fs::read_dir(path)? { - recursive_ownership_change(entry?.path().as_path(), uid, gid, read_only)?; - } - mask |= EXEC_MASK; - mask |= MODE_SETGID; - } - - // We do not want to change the permission of the underlying file - // using symlink. Hence we skip symlinks from recursive ownership - // and permission changes. - if path.is_symlink() { - return Ok(()); - } - - nix::unistd::chown(path, uid, gid)?; - - if gid.is_some() { - let metadata = path.metadata()?; - let mut permission = metadata.permissions(); - let target_mode = metadata.mode() | mask; - permission.set_mode(target_mode); - fs::set_permissions(path, permission)?; - } - - Ok(()) -} - /// Looks for `mount_point` entry in the /proc/mounts. #[instrument] pub fn is_mounted(mount_point: &str) -> Result { @@ -770,140 +129,25 @@ pub fn is_mounted(mount_point: &str) -> Result { Ok(found) } -#[instrument] -fn parse_mount_flags_and_options( - opts_iter: impl Iterator> + Debug, -) -> (MsFlags, String) { - let mut flags = MsFlags::empty(); - let mut options: String = "".to_string(); - - for opt in opts_iter { - let opt = opt.as_ref(); - if !opt.is_empty() { - match FLAGS.get(opt) { - Some(x) => { - let (clear, f) = *x; - if clear { - flags &= !f; - } else { - flags |= f; - } - } - None => { - if opt.starts_with("io.katacontainers.") { - continue; - } - - if !options.is_empty() { - options.push_str(format!(",{}", opt).as_str()); - } else { - options.push_str(opt); - } - } - }; - } - } - (flags, options) -} - -// add_storages takes a list of storages passed by the caller, and perform the -// associated operations such as waiting for the device to show up, and mount -// it to a specific location, according to the type of handler chosen, and for -// each storage. -#[instrument] -pub async fn add_storages( - logger: Logger, - storages: &[Storage], - sandbox: &Arc>, - cid: Option, -) -> Result> { - let mut mount_list = Vec::new(); - - for storage in storages { - let handler_name = &storage.driver; - let logger = - logger.new(o!( "subsystem" => "storage", "storage-type" => handler_name.to_string())); - - { - let mut sb = sandbox.lock().await; - let new_storage = sb.set_sandbox_storage(&storage.mount_point); - if !new_storage { - continue; - } - } - - let res = match handler_name.as_str() { - DRIVER_BLK_TYPE => virtio_blk_storage_handler(&logger, storage, sandbox).await, - DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_storage_handler(&logger, storage, sandbox).await, - DRIVER_9P_TYPE => virtio9p_storage_handler(&logger, storage, sandbox).await, - DRIVER_VIRTIOFS_TYPE => virtiofs_storage_handler(&logger, storage, sandbox).await, - DRIVER_EPHEMERAL_TYPE => ephemeral_storage_handler(&logger, storage, sandbox).await, - DRIVER_OVERLAYFS_TYPE => { - overlayfs_storage_handler(&logger, storage, cid.as_deref(), sandbox).await - } - DRIVER_MMIO_BLK_TYPE => virtiommio_blk_storage_handler(&logger, storage, sandbox).await, - DRIVER_LOCAL_TYPE => local_storage_handler(&logger, storage, sandbox).await, - DRIVER_SCSI_TYPE => virtio_scsi_storage_handler(&logger, storage, sandbox).await, - DRIVER_NVDIMM_TYPE => nvdimm_storage_handler(&logger, storage, sandbox).await, - DRIVER_WATCHABLE_BIND_TYPE => { - bind_watcher_storage_handler(&logger, storage, sandbox, cid.clone()).await?; - // Don't register watch mounts, they're handled separately by the watcher. - Ok(String::new()) - } - _ => { - return Err(anyhow!( - "Failed to find the storage handler {}", - storage.driver.to_owned() - )); - } - }; - - let mount_point = match res { - Err(e) => { - error!( - logger, - "add_storages failed, storage: {:?}, error: {:?} ", storage, e - ); - let mut sb = sandbox.lock().await; - if let Err(e) = sb.unset_sandbox_storage(&storage.mount_point) { - warn!(logger, "fail to unset sandbox storage {:?}", e); - } - return Err(e); - } - Ok(m) => m, - }; - - if !mount_point.is_empty() { - mount_list.push(mount_point); - } - } - - Ok(mount_list) -} - #[instrument] fn mount_to_rootfs(logger: &Logger, m: &InitMount) -> Result<()> { - let (flags, options) = parse_mount_flags_and_options(m.options.iter()); - fs::create_dir_all(m.dest).context("could not create directory")?; + let (flags, options) = parse_mount_options(&m.options)?; let source = Path::new(m.src); let dest = Path::new(m.dest); baremount(source, dest, m.fstype, flags, &options, logger).or_else(|e| { - if m.src != "dev" { - return Err(e); + if m.src == "dev" { + error!( + logger, + "Could not mount filesystem from {} to {}", m.src, m.dest + ); + Ok(()) + } else { + Err(e) } - - error!( - logger, - "Could not mount filesystem from {} to {}", m.src, m.dest - ); - - Ok(()) - })?; - - Ok(()) + }) } #[instrument] @@ -923,7 +167,7 @@ pub fn get_mount_fs_type(mount_point: &str) -> Result { } // get_mount_fs_type_from_file returns the FS type corresponding to the passed mount point and -// any error ecountered. +// any error encountered. #[instrument] pub fn get_mount_fs_type_from_file(mount_file: &str, mount_point: &str) -> Result { if mount_point.is_empty() { @@ -937,13 +181,10 @@ pub fn get_mount_fs_type_from_file(mount_file: &str, mount_point: &str) -> Resul // Read the file line by line using the lines() iterator from std::io::BufRead. for (_index, line) in content.lines().enumerate() { - let capes = match re.captures(line) { - Some(c) => c, - None => continue, - }; - - if capes.len() > 1 { - return Ok(capes[1].to_string()); + if let Some(capes) = re.captures(line) { + if capes.len() > 1 { + return Ok(capes[1].to_string()); + } } } @@ -1058,57 +299,25 @@ pub fn cgroups_mount(logger: &Logger, unified_cgroup_hierarchy: bool) -> Result< // Enable memory hierarchical account. // For more information see https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt - online_device("/sys/fs/cgroup/memory/memory.use_hierarchy")?; - Ok(()) + online_device("/sys/fs/cgroup/memory/memory.use_hierarchy") } #[instrument] -pub fn remove_mounts(mounts: &[String]) -> Result<()> { +pub fn remove_mounts + std::fmt::Debug>(mounts: &[P]) -> Result<()> { for m in mounts.iter() { - nix::mount::umount(m.as_str()).context(format!("failed to umount {:?}", m))?; + nix::mount::umount(m.as_ref()).context(format!("failed to umount {:?}", m.as_ref()))?; } Ok(()) } -#[instrument] -fn ensure_destination_file_exists(path: &Path) -> Result<()> { - if path.is_file() { - return Ok(()); - } else if path.exists() { - return Err(anyhow!("{:?} exists but is not a regular file", path)); - } - - let dir = path - .parent() - .ok_or_else(|| anyhow!("failed to find parent path for {:?}", path))?; - - fs::create_dir_all(dir).context(format!("create_dir_all {:?}", dir))?; - - fs::File::create(path).context(format!("create empty file {:?}", path))?; - - Ok(()) -} - -#[instrument] -fn parse_options(option_list: &[String]) -> HashMap { - let mut options = HashMap::new(); - for opt in option_list { - let fields: Vec<&str> = opt.split('=').collect(); - if fields.len() == 2 { - options.insert(fields[0].to_string(), fields[1].to_string()); - } - } - options -} - #[cfg(test)] mod tests { use super::*; - use protocols::agent::FSGroup; use slog::Drain; use std::fs::File; use std::fs::OpenOptions; use std::io::Write; + use std::os::unix::fs::PermissionsExt; use std::path::PathBuf; use tempfile::tempdir; use test_utils::TestUserType; @@ -1678,145 +887,6 @@ mod tests { } } - #[test] - fn test_ensure_destination_file_exists() { - let dir = tempdir().expect("failed to create tmpdir"); - - let mut testfile = dir.into_path(); - testfile.push("testfile"); - - let result = ensure_destination_file_exists(&testfile); - - assert!(result.is_ok()); - assert!(testfile.exists()); - - let result = ensure_destination_file_exists(&testfile); - assert!(result.is_ok()); - - assert!(testfile.is_file()); - } - - #[test] - fn test_mount_storage() { - #[derive(Debug)] - struct TestData<'a> { - test_user: TestUserType, - storage: Storage, - error_contains: &'a str, - - make_source_dir: bool, - make_mount_dir: bool, - deny_mount_permission: bool, - } - - impl Default for TestData<'_> { - fn default() -> Self { - TestData { - test_user: TestUserType::Any, - storage: Storage { - mount_point: "mnt".to_string(), - source: "src".to_string(), - fstype: "tmpfs".to_string(), - ..Default::default() - }, - make_source_dir: true, - make_mount_dir: false, - deny_mount_permission: false, - error_contains: "", - } - } - } - - let tests = &[ - TestData { - test_user: TestUserType::NonRootOnly, - error_contains: "EPERM: Operation not permitted", - ..Default::default() - }, - TestData { - test_user: TestUserType::RootOnly, - ..Default::default() - }, - TestData { - storage: Storage { - mount_point: "mnt".to_string(), - source: "src".to_string(), - fstype: "bind".to_string(), - ..Default::default() - }, - make_source_dir: false, - make_mount_dir: true, - error_contains: "Could not create mountpoint", - ..Default::default() - }, - TestData { - test_user: TestUserType::NonRootOnly, - deny_mount_permission: true, - error_contains: "Could not create mountpoint", - ..Default::default() - }, - ]; - - for (i, d) in tests.iter().enumerate() { - let msg = format!("test[{}]: {:?}", i, d); - - skip_loop_by_user!(msg, d.test_user); - - let drain = slog::Discard; - let logger = slog::Logger::root(drain, o!()); - - let tempdir = tempdir().unwrap(); - - let source = tempdir.path().join(&d.storage.source); - let mount_point = tempdir.path().join(&d.storage.mount_point); - - let storage = Storage { - source: source.to_str().unwrap().to_string(), - mount_point: mount_point.to_str().unwrap().to_string(), - ..d.storage.clone() - }; - - if d.make_source_dir { - fs::create_dir_all(&storage.source).unwrap(); - } - if d.make_mount_dir { - fs::create_dir_all(&storage.mount_point).unwrap(); - } - - if d.deny_mount_permission { - fs::set_permissions( - mount_point.parent().unwrap(), - fs::Permissions::from_mode(0o000), - ) - .unwrap(); - } - - let result = mount_storage(&logger, &storage); - - // restore permissions so tempdir can be cleaned up - if d.deny_mount_permission { - fs::set_permissions( - mount_point.parent().unwrap(), - fs::Permissions::from_mode(0o755), - ) - .unwrap(); - } - - if result.is_ok() { - nix::mount::umount(&mount_point).unwrap(); - } - - let msg = format!("{}: result: {:?}", msg, result); - if d.error_contains.is_empty() { - assert!(result.is_ok(), "{}", msg); - } else { - assert!(result.is_err(), "{}", msg); - let error_msg = format!("{}", result.unwrap_err()); - assert!(error_msg.contains(d.error_contains), "{}", msg); - } - } - } - #[test] fn test_mount_to_rootfs() { #[derive(Debug)] @@ -1916,62 +986,6 @@ mod tests { } } - #[test] - fn test_get_pagesize_and_size_from_option() { - let expected_pagesize = 2048; - let expected_size = 107374182; - let expected = (expected_pagesize, expected_size); - - let data = vec![ - // (input, expected, is_ok) - ("size-1=107374182,pagesize-1=2048", expected, false), - ("size-1=107374182,pagesize=2048", expected, false), - ("size=107374182,pagesize-1=2048", expected, false), - ("size=107374182,pagesize=abc", expected, false), - ("size=abc,pagesize=2048", expected, false), - ("size=,pagesize=2048", expected, false), - ("size=107374182,pagesize=", expected, false), - ("size=107374182,pagesize=2048", expected, true), - ("pagesize=2048,size=107374182", expected, true), - ("foo=bar,pagesize=2048,size=107374182", expected, true), - ( - "foo=bar,pagesize=2048,foo1=bar1,size=107374182", - expected, - true, - ), - ( - "pagesize=2048,foo1=bar1,foo=bar,size=107374182", - expected, - true, - ), - ( - "foo=bar,pagesize=2048,foo1=bar1,size=107374182,foo2=bar2", - expected, - true, - ), - ( - "foo=bar,size=107374182,foo1=bar1,pagesize=2048", - expected, - true, - ), - ]; - - for case in data { - let input = case.0; - let r = get_pagesize_and_size_from_option(&[input.to_string()]); - - let is_ok = case.2; - if is_ok { - let expected = case.1; - let (pagesize, size) = r.unwrap(); - assert_eq!(expected.0, pagesize); - assert_eq!(expected.1, size); - } else { - assert!(r.is_err()); - } - } - } - #[test] fn test_parse_mount_flags_and_options() { #[derive(Debug)] @@ -2014,7 +1028,7 @@ mod tests { for (i, d) in tests.iter().enumerate() { let msg = format!("test[{}]: {:?}", i, d); - let result = parse_mount_flags_and_options(d.options_vec.iter()); + let result = parse_mount_options(&d.options_vec).unwrap(); let msg = format!("{}: result: {:?}", msg, result); @@ -2022,207 +1036,4 @@ mod tests { assert_eq!(expected_result, result, "{}", msg); } } - - #[test] - fn test_set_ownership() { - skip_if_not_root!(); - - let logger = slog::Logger::root(slog::Discard, o!()); - - #[derive(Debug)] - struct TestData<'a> { - mount_path: &'a str, - fs_group: Option, - read_only: bool, - expected_group_id: u32, - expected_permission: u32, - } - - let tests = &[ - TestData { - mount_path: "foo", - fs_group: None, - read_only: false, - expected_group_id: 0, - expected_permission: 0, - }, - TestData { - mount_path: "rw_mount", - fs_group: Some(FSGroup { - group_id: 3000, - group_change_policy: FSGroupChangePolicy::Always.into(), - ..Default::default() - }), - read_only: false, - expected_group_id: 3000, - expected_permission: RW_MASK | EXEC_MASK | MODE_SETGID, - }, - TestData { - mount_path: "ro_mount", - fs_group: Some(FSGroup { - group_id: 3000, - group_change_policy: FSGroupChangePolicy::OnRootMismatch.into(), - ..Default::default() - }), - read_only: true, - expected_group_id: 3000, - expected_permission: RO_MASK | EXEC_MASK | MODE_SETGID, - }, - ]; - - let tempdir = tempdir().expect("failed to create tmpdir"); - - for (i, d) in tests.iter().enumerate() { - let msg = format!("test[{}]: {:?}", i, d); - - let mount_dir = tempdir.path().join(d.mount_path); - fs::create_dir(&mount_dir) - .unwrap_or_else(|_| panic!("{}: failed to create root directory", msg)); - - let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode(); - let mut storage_data = Storage::new(); - if d.read_only { - storage_data.set_options(vec!["foo".to_string(), "ro".to_string()]); - } - if let Some(fs_group) = d.fs_group.clone() { - storage_data.set_fs_group(fs_group); - } - storage_data.mount_point = mount_dir.clone().into_os_string().into_string().unwrap(); - - let result = set_ownership(&logger, &storage_data); - assert!(result.is_ok()); - - assert_eq!( - mount_dir.as_path().metadata().unwrap().gid(), - d.expected_group_id - ); - assert_eq!( - mount_dir.as_path().metadata().unwrap().permissions().mode(), - (directory_mode | d.expected_permission) - ); - } - } - - #[test] - fn test_recursive_ownership_change() { - skip_if_not_root!(); - - const COUNT: usize = 5; - - #[derive(Debug)] - struct TestData<'a> { - // Directory where the recursive ownership change should be performed on - path: &'a str, - - // User ID for ownership change - uid: u32, - - // Group ID for ownership change - gid: u32, - - // Set when the permission should be read-only - read_only: bool, - - // The expected permission of all directories after ownership change - expected_permission_directory: u32, - - // The expected permission of all files after ownership change - expected_permission_file: u32, - } - - let tests = &[ - TestData { - path: "no_gid_change", - uid: 0, - gid: 0, - read_only: false, - expected_permission_directory: 0, - expected_permission_file: 0, - }, - TestData { - path: "rw_gid_change", - uid: 0, - gid: 3000, - read_only: false, - expected_permission_directory: RW_MASK | EXEC_MASK | MODE_SETGID, - expected_permission_file: RW_MASK, - }, - TestData { - path: "ro_gid_change", - uid: 0, - gid: 3000, - read_only: true, - expected_permission_directory: RO_MASK | EXEC_MASK | MODE_SETGID, - expected_permission_file: RO_MASK, - }, - ]; - - let tempdir = tempdir().expect("failed to create tmpdir"); - - for (i, d) in tests.iter().enumerate() { - let msg = format!("test[{}]: {:?}", i, d); - - let mount_dir = tempdir.path().join(d.path); - fs::create_dir(&mount_dir) - .unwrap_or_else(|_| panic!("{}: failed to create root directory", msg)); - - let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode(); - let mut file_mode: u32 = 0; - - // create testing directories and files - for n in 1..COUNT { - let nest_dir = mount_dir.join(format!("nested{}", n)); - fs::create_dir(&nest_dir) - .unwrap_or_else(|_| panic!("{}: failed to create nest directory", msg)); - - for f in 1..COUNT { - let filename = nest_dir.join(format!("file{}", f)); - File::create(&filename) - .unwrap_or_else(|_| panic!("{}: failed to create file", msg)); - file_mode = filename.as_path().metadata().unwrap().permissions().mode(); - } - } - - let uid = if d.uid > 0 { - Some(Uid::from_raw(d.uid)) - } else { - None - }; - let gid = if d.gid > 0 { - Some(Gid::from_raw(d.gid)) - } else { - None - }; - let result = recursive_ownership_change(&mount_dir, uid, gid, d.read_only); - - assert!(result.is_ok()); - - assert_eq!(mount_dir.as_path().metadata().unwrap().gid(), d.gid); - assert_eq!( - mount_dir.as_path().metadata().unwrap().permissions().mode(), - (directory_mode | d.expected_permission_directory) - ); - - for n in 1..COUNT { - let nest_dir = mount_dir.join(format!("nested{}", n)); - for f in 1..COUNT { - let filename = nest_dir.join(format!("file{}", f)); - let file = Path::new(&filename); - - assert_eq!(file.metadata().unwrap().gid(), d.gid); - assert_eq!( - file.metadata().unwrap().permissions().mode(), - (file_mode | d.expected_permission_file) - ); - } - - let dir = Path::new(&nest_dir); - assert_eq!(dir.metadata().unwrap().gid(), d.gid); - assert_eq!( - dir.metadata().unwrap().permissions().mode(), - (directory_mode | d.expected_permission_directory) - ); - } - } - } } diff --git a/src/agent/src/namespace.rs b/src/agent/src/namespace.rs index a3ad266162..bf24cd1048 100644 --- a/src/agent/src/namespace.rs +++ b/src/agent/src/namespace.rs @@ -7,14 +7,14 @@ use anyhow::{anyhow, Result}; use nix::mount::MsFlags; use nix::sched::{unshare, CloneFlags}; use nix::unistd::{getpid, gettid}; +use slog::Logger; use std::fmt; use std::fs; use std::fs::File; use std::path::{Path, PathBuf}; use tracing::instrument; -use crate::mount::{baremount, FLAGS}; -use slog::Logger; +use crate::mount::baremount; const PERSISTENT_NS_DIR: &str = "/var/run/sandbox-ns"; pub const NSTYPEIPC: &str = "ipc"; @@ -116,15 +116,7 @@ impl Namespace { // Bind mount the new namespace from the current thread onto the mount point to persist it. let mut flags = MsFlags::empty(); - - if let Some(x) = FLAGS.get("rbind") { - let (clear, f) = *x; - if clear { - flags &= !f; - } else { - flags |= f; - } - }; + flags |= MsFlags::MS_BIND | MsFlags::MS_REC; baremount(source, destination, "none", flags, "", &logger).map_err(|e| { anyhow!( diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index effedb0b13..6db6bbcc0b 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -59,12 +59,13 @@ use crate::device::{ use crate::image_rpc; use crate::linux_abi::*; use crate::metrics::get_metrics; -use crate::mount::{add_storages, baremount, update_ephemeral_mounts, STORAGE_HANDLER_LIST}; +use crate::mount::baremount; use crate::namespace::{NSTYPEIPC, NSTYPEPID, NSTYPEUTS}; use crate::network::setup_guest_dns; use crate::pci; use crate::random; use crate::sandbox::Sandbox; +use crate::storage::{add_storages, update_ephemeral_mounts, STORAGE_HANDLERS}; use crate::version::{AGENT_VERSION, API_VERSION}; use crate::AGENT_CONFIG; @@ -231,13 +232,7 @@ impl AgentService { // After all those storages have been processed, no matter the order // here, the agent will rely on rustjail (using the oci.Mounts // list) to bind mount all of them inside the container. - let m = add_storages( - sl(), - &req.storages, - &self.sandbox, - Some(req.container_id.clone()), - ) - .await?; + let m = add_storages(sl(), req.storages, &self.sandbox, Some(req.container_id)).await?; let mut s = self.sandbox.lock().await; s.container_mounts.insert(cid.clone(), m); @@ -296,7 +291,7 @@ impl AgentService { if let Err(e) = ctr.destroy().await { error!(sl(), "failed to destroy container: {:?}", e); } - if let Err(e) = remove_container_resources(&mut s, &cid) { + if let Err(e) = remove_container_resources(&mut s, &cid).await { error!(sl(), "failed to remove container resources: {:?}", e); } return Err(err); @@ -348,7 +343,7 @@ impl AgentService { .ok_or_else(|| anyhow!("Invalid container id"))? .destroy() .await?; - remove_container_resources(&mut sandbox, &cid)?; + remove_container_resources(&mut sandbox, &cid).await?; return Ok(()); } @@ -370,7 +365,7 @@ impl AgentService { .await .map_err(|_| anyhow!(nix::Error::ETIME))???; - remove_container_resources(&mut *self.sandbox.lock().await, &cid) + remove_container_resources(&mut *self.sandbox.lock().await, &cid).await } #[instrument] @@ -1268,7 +1263,7 @@ impl agent_ttrpc::AgentService for AgentService { .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } - match add_storages(sl(), &req.storages, &self.sandbox, None).await { + match add_storages(sl(), req.storages, &self.sandbox, None).await { Ok(m) => { self.sandbox.lock().await.mounts = m; } @@ -1675,7 +1670,7 @@ fn get_agent_details() -> AgentDetails { detail.init_daemon = unistd::getpid() == Pid::from_raw(1); detail.device_handlers = Vec::new(); - detail.storage_handlers = STORAGE_HANDLER_LIST.iter().map(|x| x.to_string()).collect(); + detail.storage_handlers = STORAGE_HANDLERS.get_handlers(); detail } @@ -1780,21 +1775,21 @@ fn update_container_namespaces( Ok(()) } -fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> { +async fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> { let mut cmounts: Vec = vec![]; // Find the sandbox storage used by this container let mounts = sandbox.container_mounts.get(cid); if let Some(mounts) = mounts { for m in mounts.iter() { - if sandbox.storages.get(m).is_some() { + if sandbox.storages.contains_key(m) { cmounts.push(m.to_string()); } } } for m in cmounts.iter() { - if let Err(err) = sandbox.unset_and_remove_sandbox_storage(m) { + if let Err(err) = sandbox.remove_sandbox_storage(m).await { error!( sl(), "failed to unset_and_remove_sandbox_storage for container {}, error: {:?}", diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index 473c7cee7f..788f29278f 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -3,16 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::str::FromStr; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; use std::{thread, time}; use anyhow::{anyhow, Context, Result}; use kata_types::cpu::CpuSet; +use kata_types::mount::{StorageDevice, StorageDeviceGeneric}; use libc::pid_t; use oci::{Hook, Hooks}; use protocols::agent::OnlineCPUMemRequest; @@ -28,7 +32,7 @@ use tokio::sync::Mutex; use tracing::instrument; use crate::linux_abi::*; -use crate::mount::{get_mount_fs_type, remove_mounts, TYPE_ROOTFS}; +use crate::mount::{get_mount_fs_type, is_mounted, remove_mounts, TYPE_ROOTFS}; use crate::namespace::Namespace; use crate::netlink::Handle; use crate::network::Network; @@ -40,6 +44,46 @@ pub const ERR_INVALID_CONTAINER_ID: &str = "Invalid container id"; type UeventWatcher = (Box, oneshot::Sender); +#[derive(Clone)] +pub struct StorageState { + count: Arc, + device: Arc, +} + +impl Debug for StorageState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StorageState").finish() + } +} + +impl StorageState { + fn new() -> Self { + StorageState { + count: Arc::new(AtomicU32::new(1)), + device: Arc::new(StorageDeviceGeneric::new("".to_string())), + } + } + + pub fn from_device(device: Arc) -> Self { + Self { + count: Arc::new(AtomicU32::new(1)), + device, + } + } + + pub async fn ref_count(&self) -> u32 { + self.count.load(Ordering::Relaxed) + } + + async fn inc_ref_count(&self) { + self.count.fetch_add(1, Ordering::Acquire); + } + + async fn dec_and_test_ref_count(&self) -> bool { + self.count.fetch_sub(1, Ordering::AcqRel) == 1 + } +} + #[derive(Debug)] pub struct Sandbox { pub logger: Logger, @@ -54,7 +98,7 @@ pub struct Sandbox { pub shared_utsns: Namespace, pub shared_ipcns: Namespace, pub sandbox_pidns: Option, - pub storages: HashMap, + pub storages: HashMap, pub running: bool, pub no_pivot_root: bool, pub sender: Option>, @@ -100,52 +144,55 @@ impl Sandbox { }) } - // set_sandbox_storage sets the sandbox level reference - // counter for the sandbox storage. - // This method also returns a boolean to let - // callers know if the storage already existed or not. - // It will return true if storage is new. + /// Add a new storage object or increase reference count of existing one. + /// The caller may detect new storage object by checking `StorageState.refcount == 1`. #[instrument] - pub fn set_sandbox_storage(&mut self, path: &str) -> bool { - match self.storages.get_mut(path) { - None => { - self.storages.insert(path.to_string(), 1); - true + pub async fn add_sandbox_storage(&mut self, path: &str) -> StorageState { + match self.storages.entry(path.to_string()) { + Entry::Occupied(e) => { + let state = e.get().clone(); + state.inc_ref_count().await; + state } - Some(count) => { - *count += 1; - false + Entry::Vacant(e) => { + let state = StorageState::new(); + e.insert(state.clone()); + state } } } - // unset_sandbox_storage will decrement the sandbox storage - // reference counter. If there aren't any containers using - // that sandbox storage, this method will remove the - // storage reference from the sandbox and return 'true' to - // let the caller know that they can clean up the storage - // related directories by calling remove_sandbox_storage - #[instrument] - pub fn unset_sandbox_storage(&mut self, path: &str) -> Result { - match self.storages.get_mut(path) { - None => Err(anyhow!("Sandbox storage with path {} not found", path)), - Some(count) => { - *count -= 1; - if *count == 0 { - self.storages.remove(path); - return Ok(true); - } - Ok(false) - } + /// Update the storage device associated with a path. + pub fn update_sandbox_storage( + &mut self, + path: &str, + device: Arc, + ) -> std::result::Result, Arc> { + if !self.storages.contains_key(path) { + return Err(device); } + + let state = StorageState::from_device(device); + // Safe to unwrap() because we have just ensured existence of entry. + let state = self.storages.insert(path.to_string(), state).unwrap(); + Ok(state.device) } - // remove_sandbox_storage removes the sandbox storage if no - // containers are using that storage. + // Clean mount and directory of a mountpoint. + // This is actually StorageDeviceGeneric::cleanup(), kept here due to dependency chain. #[instrument] - pub fn remove_sandbox_storage(&mut self, path: &str) -> Result<()> { - let mounts = vec![path.to_string()]; - remove_mounts(&mounts)?; + fn cleanup_sandbox_storage(&mut self, path: &str) -> Result<()> { + if path.is_empty() { + return Err(anyhow!("mountpoint path is empty")); + } else if !Path::new(path).exists() { + return Ok(()); + } + + if matches!(is_mounted(path), Ok(true)) { + let mounts = vec![path.to_string()]; + remove_mounts(&mounts)?; + } + // "remove_dir" will fail if the mount point is backed by a read-only filesystem. // This is the case with the device mapper snapshotter, where we mount the block device directly // at the underlying sandbox path which was provided from the base RO kataShared path from the host. @@ -155,16 +202,23 @@ impl Sandbox { Ok(()) } - // unset_and_remove_sandbox_storage unsets the storage from sandbox - // and if there are no containers using this storage it will - // remove it from the sandbox. + /// Decrease reference count and destroy the storage object if reference count reaches zero. + /// Returns `Ok(true)` if the reference count has reached zero and the storage object has been + /// removed. #[instrument] - pub fn unset_and_remove_sandbox_storage(&mut self, path: &str) -> Result<()> { - if self.unset_sandbox_storage(path)? { - return self.remove_sandbox_storage(path); + pub async fn remove_sandbox_storage(&mut self, path: &str) -> Result { + match self.storages.get(path) { + None => Err(anyhow!("Sandbox storage with path {} not found", path)), + Some(state) => { + if state.dec_and_test_ref_count().await { + self.storages.remove(path); + self.cleanup_sandbox_storage(path)?; + Ok(true) + } else { + Ok(false) + } + } } - - Ok(()) } #[instrument] @@ -493,24 +547,22 @@ mod tests { let tmpdir_path = tmpdir.path().to_str().unwrap(); // Add a new sandbox storage - let new_storage = s.set_sandbox_storage(tmpdir_path); + let new_storage = s.add_sandbox_storage(tmpdir_path).await; // Check the reference counter - let ref_count = s.storages[tmpdir_path]; + let ref_count = new_storage.ref_count().await; assert_eq!( ref_count, 1, "Invalid refcount, got {} expected 1.", ref_count ); - assert!(new_storage); // Use the existing sandbox storage - let new_storage = s.set_sandbox_storage(tmpdir_path); - assert!(!new_storage, "Should be false as already exists."); + let new_storage = s.add_sandbox_storage(tmpdir_path).await; // Since we are using existing storage, the reference counter // should be 2 by now. - let ref_count = s.storages[tmpdir_path]; + let ref_count = new_storage.ref_count().await; assert_eq!( ref_count, 2, "Invalid refcount, got {} expected 2.", @@ -546,22 +598,20 @@ mod tests { .tempdir_in(tmpdir_path) .unwrap(); - assert!( - s.remove_sandbox_storage(srcdir_path).is_err(), - "Expect Err as the directory is not a mountpoint" - ); - - assert!(s.remove_sandbox_storage("").is_err()); + assert!(s.cleanup_sandbox_storage("").is_err()); let invalid_dir = emptydir.path().join("invalid"); assert!(s - .remove_sandbox_storage(invalid_dir.to_str().unwrap()) - .is_err()); + .cleanup_sandbox_storage(invalid_dir.to_str().unwrap()) + .is_ok()); assert!(bind_mount(srcdir_path, destdir_path, &logger).is_ok()); - assert!(s.remove_sandbox_storage(destdir_path).is_ok()); + assert!(s.cleanup_sandbox_storage(destdir_path).is_ok()); + + // remove a directory without umount + s.cleanup_sandbox_storage(srcdir_path).unwrap(); } #[tokio::test] @@ -573,8 +623,7 @@ mod tests { let mut s = Sandbox::new(&logger).unwrap(); assert!( - s.unset_and_remove_sandbox_storage("/tmp/testEphePath") - .is_err(), + s.remove_sandbox_storage("/tmp/testEphePath").await.is_err(), "Should fail because sandbox storage doesn't exist" ); @@ -595,8 +644,8 @@ mod tests { assert!(bind_mount(srcdir_path, destdir_path, &logger).is_ok()); - assert!(s.set_sandbox_storage(destdir_path)); - assert!(s.unset_and_remove_sandbox_storage(destdir_path).is_ok()); + s.add_sandbox_storage(destdir_path).await; + assert!(s.remove_sandbox_storage(destdir_path).await.is_ok()); let other_dir_str; { @@ -609,10 +658,10 @@ mod tests { let other_dir_path = other_dir.path().to_str().unwrap(); other_dir_str = other_dir_path.to_string(); - assert!(s.set_sandbox_storage(other_dir_path)); + s.add_sandbox_storage(other_dir_path).await; } - assert!(s.unset_and_remove_sandbox_storage(&other_dir_str).is_err()); + assert!(s.remove_sandbox_storage(&other_dir_str).await.is_ok()); } #[tokio::test] @@ -624,28 +673,30 @@ mod tests { let storage_path = "/tmp/testEphe"; // Add a new sandbox storage - assert!(s.set_sandbox_storage(storage_path)); + s.add_sandbox_storage(storage_path).await; // Use the existing sandbox storage + let state = s.add_sandbox_storage(storage_path).await; assert!( - !s.set_sandbox_storage(storage_path), + state.ref_count().await > 1, "Expects false as the storage is not new." ); assert!( - !s.unset_sandbox_storage(storage_path).unwrap(), + !s.remove_sandbox_storage(storage_path).await.unwrap(), "Expects false as there is still a storage." ); // Reference counter should decrement to 1. - let ref_count = s.storages[storage_path]; + let storage = &s.storages[storage_path]; + let refcount = storage.ref_count().await; assert_eq!( - ref_count, 1, + refcount, 1, "Invalid refcount, got {} expected 1.", - ref_count + refcount ); assert!( - s.unset_sandbox_storage(storage_path).unwrap(), + s.remove_sandbox_storage(storage_path).await.unwrap(), "Expects true as there is still a storage." ); @@ -661,7 +712,7 @@ mod tests { // If no container is using the sandbox storage, the reference // counter for it should not exist. assert!( - s.unset_sandbox_storage(storage_path).is_err(), + s.remove_sandbox_storage(storage_path).await.is_err(), "Expects false as the reference counter should no exist." ); } diff --git a/src/agent/src/storage/bind_watcher_handler.rs b/src/agent/src/storage/bind_watcher_handler.rs new file mode 100644 index 0000000000..3b50327d12 --- /dev/null +++ b/src/agent/src/storage/bind_watcher_handler.rs @@ -0,0 +1,37 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use kata_types::mount::StorageDevice; +use protocols::agent::Storage; +use std::iter; +use std::sync::Arc; +use tracing::instrument; + +use crate::storage::{new_device, StorageContext, StorageHandler}; + +#[derive(Debug)] +pub struct BindWatcherHandler {} + +#[async_trait::async_trait] +impl StorageHandler for BindWatcherHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + if let Some(cid) = ctx.cid { + ctx.sandbox + .lock() + .await + .bind_watcher + .add_container(cid.to_string(), iter::once(storage.clone()), ctx.logger) + .await?; + } + new_device("".to_string()) + } +} diff --git a/src/agent/src/storage/block_handler.rs b/src/agent/src/storage/block_handler.rs new file mode 100644 index 0000000000..60330253ce --- /dev/null +++ b/src/agent/src/storage/block_handler.rs @@ -0,0 +1,146 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::Path; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{anyhow, Context, Result}; +use kata_types::mount::StorageDevice; +use protocols::agent::Storage; +use tracing::instrument; + +use crate::device::{ + get_scsi_device_name, get_virtio_blk_pci_device_name, get_virtio_mmio_device_name, + wait_for_pmem_device, +}; +use crate::pci; +use crate::storage::{common_storage_handler, new_device, StorageContext, StorageHandler}; +#[cfg(target_arch = "s390x")] +use crate::{ccw, device::get_virtio_blk_ccw_device_name}; + +#[derive(Debug)] +pub struct VirtioBlkMmioHandler {} + +#[async_trait::async_trait] +impl StorageHandler for VirtioBlkMmioHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + if !Path::new(&storage.source).exists() { + get_virtio_mmio_device_name(ctx.sandbox, &storage.source) + .await + .context("failed to get mmio device name")?; + } + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} + +#[derive(Debug)] +pub struct VirtioBlkPciHandler {} + +#[async_trait::async_trait] +impl StorageHandler for VirtioBlkPciHandler { + #[instrument] + async fn create_device( + &self, + mut storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + // If hot-plugged, get the device node path based on the PCI path + // otherwise use the virt path provided in Storage Source + if storage.source.starts_with("/dev") { + let metadata = fs::metadata(&storage.source) + .context(format!("get metadata on file {:?}", &storage.source))?; + let mode = metadata.permissions().mode(); + if mode & libc::S_IFBLK == 0 { + return Err(anyhow!("Invalid device {}", &storage.source)); + } + } else { + let pcipath = pci::Path::from_str(&storage.source)?; + let dev_path = get_virtio_blk_pci_device_name(ctx.sandbox, &pcipath).await?; + storage.source = dev_path; + } + + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} + +#[derive(Debug)] +pub struct VirtioBlkCcwHandler {} + +#[async_trait::async_trait] +impl StorageHandler for VirtioBlkCcwHandler { + #[cfg(target_arch = "s390x")] + #[instrument] + async fn create_device( + &self, + mut storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + let ccw_device = ccw::Device::from_str(&storage.source)?; + let dev_path = get_virtio_blk_ccw_device_name(ctx.sandbox, &ccw_device).await?; + storage.source = dev_path; + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } + + #[cfg(not(target_arch = "s390x"))] + #[instrument] + async fn create_device( + &self, + _storage: Storage, + _ctx: &mut StorageContext, + ) -> Result> { + Err(anyhow!("CCW is only supported on s390x")) + } +} + +#[derive(Debug)] +pub struct ScsiHandler {} + +#[async_trait::async_trait] +impl StorageHandler for ScsiHandler { + #[instrument] + async fn create_device( + &self, + mut storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + // Retrieve the device path from SCSI address. + let dev_path = get_scsi_device_name(ctx.sandbox, &storage.source).await?; + storage.source = dev_path; + + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} + +#[derive(Debug)] +pub struct PmemHandler {} + +#[async_trait::async_trait] +impl StorageHandler for PmemHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + // Retrieve the device for pmem storage + wait_for_pmem_device(ctx.sandbox, &storage.source).await?; + + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} diff --git a/src/agent/src/storage/ephemeral_handler.rs b/src/agent/src/storage/ephemeral_handler.rs new file mode 100644 index 0000000000..8fc70f6959 --- /dev/null +++ b/src/agent/src/storage/ephemeral_handler.rs @@ -0,0 +1,293 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs; +use std::fs::OpenOptions; +use std::io::Write; +use std::os::unix::fs::{MetadataExt, PermissionsExt}; +use std::path::Path; +use std::sync::Arc; + +use anyhow::{anyhow, Context, Result}; +use kata_sys_util::mount::parse_mount_options; +use kata_types::mount::{StorageDevice, KATA_MOUNT_OPTION_FS_GID}; +use nix::unistd::Gid; +use protocols::agent::Storage; +use slog::Logger; +use tokio::sync::Mutex; +use tracing::instrument; + +use crate::device::{DRIVER_EPHEMERAL_TYPE, FS_TYPE_HUGETLB}; +use crate::mount::baremount; +use crate::sandbox::Sandbox; +use crate::storage::{ + common_storage_handler, new_device, parse_options, StorageContext, StorageHandler, MODE_SETGID, +}; + +const FS_GID_EQ: &str = "fsgid="; +const SYS_FS_HUGEPAGES_PREFIX: &str = "/sys/kernel/mm/hugepages"; + +#[derive(Debug)] +pub struct EphemeralHandler {} + +#[async_trait::async_trait] +impl StorageHandler for EphemeralHandler { + #[instrument] + async fn create_device( + &self, + mut storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + // hugetlbfs + if storage.fstype == FS_TYPE_HUGETLB { + info!(ctx.logger, "handle hugetlbfs storage"); + // Allocate hugepages before mount + // /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages + // /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + // options eg "pagesize=2097152,size=524288000"(2M, 500M) + Self::allocate_hugepages(ctx.logger, &storage.options.to_vec()) + .context("allocate hugepages")?; + common_storage_handler(ctx.logger, &storage)?; + } else if !storage.options.is_empty() { + // By now we only support one option field: "fsGroup" which + // isn't an valid mount option, thus we should remove it when + // do mount. + let opts = parse_options(&storage.options); + storage.options = Default::default(); + common_storage_handler(ctx.logger, &storage)?; + + // ephemeral_storage didn't support mount options except fsGroup. + if let Some(fsgid) = opts.get(KATA_MOUNT_OPTION_FS_GID) { + let gid = fsgid.parse::()?; + + nix::unistd::chown(storage.mount_point.as_str(), None, Some(Gid::from_raw(gid)))?; + + let meta = fs::metadata(&storage.mount_point)?; + let mut permission = meta.permissions(); + + let o_mode = meta.mode() | MODE_SETGID; + permission.set_mode(o_mode); + fs::set_permissions(&storage.mount_point, permission)?; + } + } else { + common_storage_handler(ctx.logger, &storage)?; + } + + new_device("".to_string()) + } +} + +impl EphemeralHandler { + // Allocate hugepages by writing to sysfs + fn allocate_hugepages(logger: &Logger, options: &[String]) -> Result<()> { + info!(logger, "mounting hugePages storage options: {:?}", options); + + let (pagesize, size) = Self::get_pagesize_and_size_from_option(options) + .context(format!("parse mount options: {:?}", &options))?; + + info!( + logger, + "allocate hugepages. pageSize: {}, size: {}", pagesize, size + ); + + // sysfs entry is always of the form hugepages-${pagesize}kB + // Ref: https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + let path = Path::new(SYS_FS_HUGEPAGES_PREFIX) + .join(format!("hugepages-{}kB", pagesize / 1024)) + .join("nr_hugepages"); + + // write numpages to nr_hugepages file. + let numpages = format!("{}", size / pagesize); + info!(logger, "write {} pages to {:?}", &numpages, &path); + + let mut file = OpenOptions::new() + .write(true) + .open(&path) + .context(format!("open nr_hugepages directory {:?}", &path))?; + + file.write_all(numpages.as_bytes()) + .context(format!("write nr_hugepages failed: {:?}", &path))?; + + // Even if the write succeeds, the kernel isn't guaranteed to be + // able to allocate all the pages we requested. Verify that it + // did. + let verify = fs::read_to_string(&path).context(format!("reading {:?}", &path))?; + let allocated = verify + .trim_end() + .parse::() + .map_err(|_| anyhow!("Unexpected text {:?} in {:?}", &verify, &path))?; + if allocated != size / pagesize { + return Err(anyhow!( + "Only allocated {} of {} hugepages of size {}", + allocated, + numpages, + pagesize + )); + } + + Ok(()) + } + + // Parse filesystem options string to retrieve hugepage details + // options eg "pagesize=2048,size=107374182" + fn get_pagesize_and_size_from_option(options: &[String]) -> Result<(u64, u64)> { + let mut pagesize_str: Option<&str> = None; + let mut size_str: Option<&str> = None; + + for option in options { + let vars: Vec<&str> = option.trim().split(',').collect(); + + for var in vars { + if let Some(stripped) = var.strip_prefix("pagesize=") { + pagesize_str = Some(stripped); + } else if let Some(stripped) = var.strip_prefix("size=") { + size_str = Some(stripped); + } + + if pagesize_str.is_some() && size_str.is_some() { + break; + } + } + } + + if pagesize_str.is_none() || size_str.is_none() { + return Err(anyhow!("no pagesize/size options found")); + } + + let pagesize = pagesize_str + .unwrap() + .parse::() + .context(format!("parse pagesize: {:?}", &pagesize_str))?; + let size = size_str + .unwrap() + .parse::() + .context(format!("parse size: {:?}", &pagesize_str))?; + + Ok((pagesize, size)) + } +} + +// update_ephemeral_mounts takes a list of ephemeral mounts and remounts them +// with mount options passed by the caller +#[instrument] +pub async fn update_ephemeral_mounts( + logger: Logger, + storages: &[Storage], + _sandbox: &Arc>, +) -> Result<()> { + for storage in storages { + let handler_name = &storage.driver; + let logger = logger.new(o!( + "msg" => "updating tmpfs storage", + "subsystem" => "storage", + "storage-type" => handler_name.to_owned())); + + match handler_name.as_str() { + DRIVER_EPHEMERAL_TYPE => { + fs::create_dir_all(&storage.mount_point)?; + + if storage.options.is_empty() { + continue; + } else { + // assume that fsGid has already been set + let mount_path = Path::new(&storage.mount_point); + let src_path = Path::new(&storage.source); + let opts: Vec<&String> = storage + .options + .iter() + .filter(|&opt| !opt.starts_with(FS_GID_EQ)) + .collect(); + let (flags, options) = parse_mount_options(&opts)?; + + info!(logger, "mounting storage"; + "mount-source" => src_path.display(), + "mount-destination" => mount_path.display(), + "mount-fstype" => storage.fstype.as_str(), + "mount-options" => options.as_str(), + ); + + baremount( + src_path, + mount_path, + storage.fstype.as_str(), + flags, + options.as_str(), + &logger, + )?; + } + } + _ => { + return Err(anyhow!( + "Unsupported storage type for syncing mounts {}. Only ephemeral storage update is supported", + storage.driver + )); + } + }; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_pagesize_and_size_from_option() { + let expected_pagesize = 2048; + let expected_size = 107374182; + let expected = (expected_pagesize, expected_size); + + let data = vec![ + // (input, expected, is_ok) + ("size-1=107374182,pagesize-1=2048", expected, false), + ("size-1=107374182,pagesize=2048", expected, false), + ("size=107374182,pagesize-1=2048", expected, false), + ("size=107374182,pagesize=abc", expected, false), + ("size=abc,pagesize=2048", expected, false), + ("size=,pagesize=2048", expected, false), + ("size=107374182,pagesize=", expected, false), + ("size=107374182,pagesize=2048", expected, true), + ("pagesize=2048,size=107374182", expected, true), + ("foo=bar,pagesize=2048,size=107374182", expected, true), + ( + "foo=bar,pagesize=2048,foo1=bar1,size=107374182", + expected, + true, + ), + ( + "pagesize=2048,foo1=bar1,foo=bar,size=107374182", + expected, + true, + ), + ( + "foo=bar,pagesize=2048,foo1=bar1,size=107374182,foo2=bar2", + expected, + true, + ), + ( + "foo=bar,size=107374182,foo1=bar1,pagesize=2048", + expected, + true, + ), + ]; + + for case in data { + let input = case.0; + let r = EphemeralHandler::get_pagesize_and_size_from_option(&[input.to_string()]); + + let is_ok = case.2; + if is_ok { + let expected = case.1; + let (pagesize, size) = r.unwrap(); + assert_eq!(expected.0, pagesize); + assert_eq!(expected.1, size); + } else { + assert!(r.is_err()); + } + } + } +} diff --git a/src/agent/src/storage/fs_handler.rs b/src/agent/src/storage/fs_handler.rs new file mode 100644 index 0000000000..fce59c0b14 --- /dev/null +++ b/src/agent/src/storage/fs_handler.rs @@ -0,0 +1,89 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs; +use std::path::Path; +use std::sync::Arc; + +use anyhow::{anyhow, Context, Result}; +use kata_types::mount::StorageDevice; +use protocols::agent::Storage; +use tracing::instrument; + +use crate::storage::{common_storage_handler, new_device, StorageContext, StorageHandler}; + +#[derive(Debug)] +pub struct OverlayfsHandler {} + +#[async_trait::async_trait] +impl StorageHandler for OverlayfsHandler { + #[instrument] + async fn create_device( + &self, + mut storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + if storage + .options + .iter() + .any(|e| e == "io.katacontainers.fs-opt.overlay-rw") + { + let cid = ctx + .cid + .clone() + .ok_or_else(|| anyhow!("No container id in rw overlay"))?; + let cpath = Path::new(crate::rpc::CONTAINER_BASE).join(cid); + let work = cpath.join("work"); + let upper = cpath.join("upper"); + + fs::create_dir_all(&work).context("Creating overlay work directory")?; + fs::create_dir_all(&upper).context("Creating overlay upper directory")?; + + storage.fstype = "overlay".into(); + storage + .options + .push(format!("upperdir={}", upper.to_string_lossy())); + storage + .options + .push(format!("workdir={}", work.to_string_lossy())); + } + + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} + +#[derive(Debug)] +pub struct Virtio9pHandler {} + +#[async_trait::async_trait] +impl StorageHandler for Virtio9pHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} + +#[derive(Debug)] +pub struct VirtioFsHandler {} + +#[async_trait::async_trait] +impl StorageHandler for VirtioFsHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result> { + let path = common_storage_handler(ctx.logger, &storage)?; + new_device(path) + } +} diff --git a/src/agent/src/storage/local_handler.rs b/src/agent/src/storage/local_handler.rs new file mode 100644 index 0000000000..5bcee2d01f --- /dev/null +++ b/src/agent/src/storage/local_handler.rs @@ -0,0 +1,61 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use kata_types::mount::{StorageDevice, KATA_MOUNT_OPTION_FS_GID}; +use nix::unistd::Gid; +use protocols::agent::Storage; +use tracing::instrument; + +use crate::storage::{new_device, parse_options, StorageContext, StorageHandler, MODE_SETGID}; + +#[derive(Debug)] +pub struct LocalHandler {} + +#[async_trait::async_trait] +impl StorageHandler for LocalHandler { + #[instrument] + async fn create_device( + &self, + storage: Storage, + _ctx: &mut StorageContext, + ) -> Result> { + fs::create_dir_all(&storage.mount_point).context(format!( + "failed to create dir all {:?}", + &storage.mount_point + ))?; + + let opts = parse_options(&storage.options); + + let mut need_set_fsgid = false; + if let Some(fsgid) = opts.get(KATA_MOUNT_OPTION_FS_GID) { + let gid = fsgid.parse::()?; + + nix::unistd::chown(storage.mount_point.as_str(), None, Some(Gid::from_raw(gid)))?; + need_set_fsgid = true; + } + + if let Some(mode) = opts.get("mode") { + let mut permission = fs::metadata(&storage.mount_point)?.permissions(); + + let mut o_mode = u32::from_str_radix(mode, 8)?; + + if need_set_fsgid { + // set SetGid mode mask. + o_mode |= MODE_SETGID; + } + permission.set_mode(o_mode); + + fs::set_permissions(&storage.mount_point, permission)?; + } + + new_device("".to_string()) + } +} diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs new file mode 100644 index 0000000000..84348c972c --- /dev/null +++ b/src/agent/src/storage/mod.rs @@ -0,0 +1,648 @@ +// Copyright (c) 2019 Ant Financial +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; +use std::fs; +use std::os::unix::fs::{MetadataExt, PermissionsExt}; +use std::path::Path; +use std::sync::Arc; + +use anyhow::{anyhow, Context, Result}; +use kata_sys_util::mount::{create_mount_destination, parse_mount_options}; +use kata_types::mount::{ + StorageDevice, StorageDeviceGeneric, StorageHandlerManager, KATA_SHAREDFS_GUEST_PREMOUNT_TAG, +}; +use nix::unistd::{Gid, Uid}; +use protocols::agent::Storage; +use protocols::types::FSGroupChangePolicy; +use slog::Logger; +use tokio::sync::Mutex; +use tracing::instrument; + +use self::bind_watcher_handler::BindWatcherHandler; +use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler}; +use self::ephemeral_handler::EphemeralHandler; +use self::fs_handler::{OverlayfsHandler, Virtio9pHandler, VirtioFsHandler}; +use self::local_handler::LocalHandler; +use crate::device::{ + DRIVER_9P_TYPE, DRIVER_BLK_MMIO_TYPE, DRIVER_BLK_PCI_TYPE, DRIVER_EPHEMERAL_TYPE, + DRIVER_LOCAL_TYPE, DRIVER_NVDIMM_TYPE, DRIVER_OVERLAYFS_TYPE, DRIVER_SCSI_TYPE, + DRIVER_VIRTIOFS_TYPE, DRIVER_WATCHABLE_BIND_TYPE, +}; +use crate::mount::{baremount, is_mounted}; +use crate::sandbox::Sandbox; + +pub use self::ephemeral_handler::update_ephemeral_mounts; + +mod bind_watcher_handler; +mod block_handler; +mod ephemeral_handler; +mod fs_handler; +mod local_handler; + +const RW_MASK: u32 = 0o660; +const RO_MASK: u32 = 0o440; +const EXEC_MASK: u32 = 0o110; +const MODE_SETGID: u32 = 0o2000; + +#[derive(Debug)] +pub struct StorageContext<'a> { + cid: &'a Option, + logger: &'a Logger, + sandbox: &'a Arc>, +} + +/// Trait object to handle storage device. +#[async_trait::async_trait] +pub trait StorageHandler: Send + Sync { + /// Create a new storage device. + async fn create_device( + &self, + storage: Storage, + ctx: &mut StorageContext, + ) -> Result>; +} + +#[rustfmt::skip] +lazy_static! { + pub static ref STORAGE_HANDLERS: StorageHandlerManager> = { + let mut manager: StorageHandlerManager> = StorageHandlerManager::new(); + manager.add_handler(DRIVER_9P_TYPE, Arc::new(Virtio9pHandler{})).unwrap(); + #[cfg(target_arch = "s390x")] + manager.add_handler(crate::device::DRIVER_BLK_CCW_TYPE, Arc::new(self::block_handler::VirtioBlkCcwHandler{})).unwrap(); + manager.add_handler(DRIVER_BLK_MMIO_TYPE, Arc::new(VirtioBlkMmioHandler{})).unwrap(); + manager.add_handler(DRIVER_BLK_PCI_TYPE, Arc::new(VirtioBlkPciHandler{})).unwrap(); + manager.add_handler(DRIVER_EPHEMERAL_TYPE, Arc::new(EphemeralHandler{})).unwrap(); + manager.add_handler(DRIVER_LOCAL_TYPE, Arc::new(LocalHandler{})).unwrap(); + manager.add_handler(DRIVER_NVDIMM_TYPE, Arc::new(PmemHandler{})).unwrap(); + manager.add_handler(DRIVER_OVERLAYFS_TYPE, Arc::new(OverlayfsHandler{})).unwrap(); + manager.add_handler(DRIVER_SCSI_TYPE, Arc::new(ScsiHandler{})).unwrap(); + manager.add_handler(DRIVER_VIRTIOFS_TYPE, Arc::new(VirtioFsHandler{})).unwrap(); + manager.add_handler(DRIVER_WATCHABLE_BIND_TYPE, Arc::new(BindWatcherHandler{})).unwrap(); + manager + }; +} + +// add_storages takes a list of storages passed by the caller, and perform the +// associated operations such as waiting for the device to show up, and mount +// it to a specific location, according to the type of handler chosen, and for +// each storage. +#[instrument] +pub async fn add_storages( + logger: Logger, + storages: Vec, + sandbox: &Arc>, + cid: Option, +) -> Result> { + let mut mount_list = Vec::new(); + + for storage in storages { + let path = storage.mount_point.clone(); + let state = sandbox.lock().await.add_sandbox_storage(&path).await; + if state.ref_count().await > 1 { + // The device already exists. + continue; + } + + if let Some(handler) = STORAGE_HANDLERS.handler(&storage.driver) { + let logger = + logger.new(o!( "subsystem" => "storage", "storage-type" => storage.driver.clone())); + let mut ctx = StorageContext { + cid: &cid, + logger: &logger, + sandbox, + }; + + match handler.create_device(storage, &mut ctx).await { + Ok(device) => { + match sandbox + .lock() + .await + .update_sandbox_storage(&path, device.clone()) + { + Ok(d) => { + let path = device.path().to_string(); + if !path.is_empty() { + mount_list.push(path.clone()); + } + drop(d); + } + Err(device) => { + error!(logger, "failed to update device for storage"); + if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await + { + warn!(logger, "failed to remove dummy sandbox storage {:?}", e); + } + device.cleanup(); + return Err(anyhow!("failed to update device for storage")); + } + } + } + Err(e) => { + error!(logger, "failed to create device for storage, error: {e:?}"); + if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await { + warn!(logger, "failed to remove dummy sandbox storage {e:?}"); + } + return Err(e); + } + } + } else { + return Err(anyhow!( + "Failed to find the storage handler {}", + storage.driver + )); + } + } + + Ok(mount_list) +} + +pub(crate) fn new_device(path: String) -> Result> { + let device = StorageDeviceGeneric::new(path); + Ok(Arc::new(device)) +} + +#[instrument] +pub(crate) fn common_storage_handler(logger: &Logger, storage: &Storage) -> Result { + mount_storage(logger, storage)?; + set_ownership(logger, storage)?; + Ok(storage.mount_point.clone()) +} + +// mount_storage performs the mount described by the storage structure. +#[instrument] +fn mount_storage(logger: &Logger, storage: &Storage) -> Result<()> { + let logger = logger.new(o!("subsystem" => "mount")); + + // There's a special mechanism to create mountpoint from a `sharedfs` instance before + // starting the kata-agent. Check for such cases. + if storage.source == KATA_SHAREDFS_GUEST_PREMOUNT_TAG && is_mounted(&storage.mount_point)? { + warn!( + logger, + "{} already mounted on {}, ignoring...", + KATA_SHAREDFS_GUEST_PREMOUNT_TAG, + &storage.mount_point + ); + return Ok(()); + } + + let (flags, options) = parse_mount_options(&storage.options)?; + let mount_path = Path::new(&storage.mount_point); + let src_path = Path::new(&storage.source); + create_mount_destination(src_path, mount_path, "", &storage.fstype) + .context("Could not create mountpoint")?; + + info!(logger, "mounting storage"; + "mount-source" => src_path.display(), + "mount-destination" => mount_path.display(), + "mount-fstype" => storage.fstype.as_str(), + "mount-options" => options.as_str(), + ); + + baremount( + src_path, + mount_path, + storage.fstype.as_str(), + flags, + options.as_str(), + &logger, + ) +} + +#[instrument] +pub(crate) fn parse_options(option_list: &[String]) -> HashMap { + let mut options = HashMap::new(); + for opt in option_list { + let fields: Vec<&str> = opt.split('=').collect(); + if fields.len() == 2 { + options.insert(fields[0].to_string(), fields[1].to_string()); + } + } + options +} + +#[instrument] +pub fn set_ownership(logger: &Logger, storage: &Storage) -> Result<()> { + let logger = logger.new(o!("subsystem" => "mount", "fn" => "set_ownership")); + + // If fsGroup is not set, skip performing ownership change + if storage.fs_group.is_none() { + return Ok(()); + } + + let fs_group = storage.fs_group(); + let read_only = storage.options.contains(&String::from("ro")); + let mount_path = Path::new(&storage.mount_point); + let metadata = mount_path.metadata().map_err(|err| { + error!(logger, "failed to obtain metadata for mount path"; + "mount-path" => mount_path.to_str(), + "error" => err.to_string(), + ); + err + })?; + + if fs_group.group_change_policy == FSGroupChangePolicy::OnRootMismatch.into() + && metadata.gid() == fs_group.group_id + { + let mut mask = if read_only { RO_MASK } else { RW_MASK }; + mask |= EXEC_MASK; + + // With fsGroup change policy to OnRootMismatch, if the current + // gid of the mount path root directory matches the desired gid + // and the current permission of mount path root directory is correct, + // then ownership change will be skipped. + let current_mode = metadata.permissions().mode(); + if (mask & current_mode == mask) && (current_mode & MODE_SETGID != 0) { + info!(logger, "skipping ownership change for volume"; + "mount-path" => mount_path.to_str(), + "fs-group" => fs_group.group_id.to_string(), + ); + return Ok(()); + } + } + + info!(logger, "performing recursive ownership change"; + "mount-path" => mount_path.to_str(), + "fs-group" => fs_group.group_id.to_string(), + ); + recursive_ownership_change( + mount_path, + None, + Some(Gid::from_raw(fs_group.group_id)), + read_only, + ) +} + +#[instrument] +pub fn recursive_ownership_change( + path: &Path, + uid: Option, + gid: Option, + read_only: bool, +) -> Result<()> { + let mut mask = if read_only { RO_MASK } else { RW_MASK }; + if path.is_dir() { + for entry in fs::read_dir(path)? { + recursive_ownership_change(entry?.path().as_path(), uid, gid, read_only)?; + } + mask |= EXEC_MASK; + mask |= MODE_SETGID; + } + + // We do not want to change the permission of the underlying file + // using symlink. Hence we skip symlinks from recursive ownership + // and permission changes. + if path.is_symlink() { + return Ok(()); + } + + nix::unistd::chown(path, uid, gid)?; + + if gid.is_some() { + let metadata = path.metadata()?; + let mut permission = metadata.permissions(); + let target_mode = metadata.mode() | mask; + permission.set_mode(target_mode); + fs::set_permissions(path, permission)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use protocols::agent::FSGroup; + use std::fs::File; + use tempfile::tempdir; + use test_utils::{ + skip_if_not_root, skip_loop_by_user, skip_loop_if_not_root, skip_loop_if_root, TestUserType, + }; + + #[test] + fn test_mount_storage() { + #[derive(Debug)] + struct TestData<'a> { + test_user: TestUserType, + storage: Storage, + error_contains: &'a str, + + make_source_dir: bool, + make_mount_dir: bool, + deny_mount_permission: bool, + } + + impl Default for TestData<'_> { + fn default() -> Self { + TestData { + test_user: TestUserType::Any, + storage: Storage { + mount_point: "mnt".to_string(), + source: "src".to_string(), + fstype: "tmpfs".to_string(), + ..Default::default() + }, + make_source_dir: true, + make_mount_dir: false, + deny_mount_permission: false, + error_contains: "", + } + } + } + + let tests = &[ + TestData { + test_user: TestUserType::NonRootOnly, + error_contains: "EPERM: Operation not permitted", + ..Default::default() + }, + TestData { + test_user: TestUserType::RootOnly, + ..Default::default() + }, + TestData { + storage: Storage { + mount_point: "mnt".to_string(), + source: "src".to_string(), + fstype: "bind".to_string(), + ..Default::default() + }, + make_source_dir: false, + make_mount_dir: true, + error_contains: "Could not create mountpoint", + ..Default::default() + }, + TestData { + test_user: TestUserType::NonRootOnly, + deny_mount_permission: true, + error_contains: "Could not create mountpoint", + ..Default::default() + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + + skip_loop_by_user!(msg, d.test_user); + + let drain = slog::Discard; + let logger = slog::Logger::root(drain, o!()); + + let tempdir = tempdir().unwrap(); + + let source = tempdir.path().join(&d.storage.source); + let mount_point = tempdir.path().join(&d.storage.mount_point); + + let storage = Storage { + source: source.to_str().unwrap().to_string(), + mount_point: mount_point.to_str().unwrap().to_string(), + ..d.storage.clone() + }; + + if d.make_source_dir { + fs::create_dir_all(&storage.source).unwrap(); + } + if d.make_mount_dir { + fs::create_dir_all(&storage.mount_point).unwrap(); + } + + if d.deny_mount_permission { + fs::set_permissions( + mount_point.parent().unwrap(), + fs::Permissions::from_mode(0o000), + ) + .unwrap(); + } + + let result = mount_storage(&logger, &storage); + + // restore permissions so tempdir can be cleaned up + if d.deny_mount_permission { + fs::set_permissions( + mount_point.parent().unwrap(), + fs::Permissions::from_mode(0o755), + ) + .unwrap(); + } + + if result.is_ok() { + nix::mount::umount(&mount_point).unwrap(); + } + + let msg = format!("{}: result: {:?}", msg, result); + if d.error_contains.is_empty() { + assert!(result.is_ok(), "{}", msg); + } else { + assert!(result.is_err(), "{}", msg); + let error_msg = format!("{}", result.unwrap_err()); + assert!(error_msg.contains(d.error_contains), "{}", msg); + } + } + } + + #[test] + fn test_set_ownership() { + skip_if_not_root!(); + + let logger = slog::Logger::root(slog::Discard, o!()); + + #[derive(Debug)] + struct TestData<'a> { + mount_path: &'a str, + fs_group: Option, + read_only: bool, + expected_group_id: u32, + expected_permission: u32, + } + + let tests = &[ + TestData { + mount_path: "foo", + fs_group: None, + read_only: false, + expected_group_id: 0, + expected_permission: 0, + }, + TestData { + mount_path: "rw_mount", + fs_group: Some(FSGroup { + group_id: 3000, + group_change_policy: FSGroupChangePolicy::Always.into(), + ..Default::default() + }), + read_only: false, + expected_group_id: 3000, + expected_permission: RW_MASK | EXEC_MASK | MODE_SETGID, + }, + TestData { + mount_path: "ro_mount", + fs_group: Some(FSGroup { + group_id: 3000, + group_change_policy: FSGroupChangePolicy::OnRootMismatch.into(), + ..Default::default() + }), + read_only: true, + expected_group_id: 3000, + expected_permission: RO_MASK | EXEC_MASK | MODE_SETGID, + }, + ]; + + let tempdir = tempdir().expect("failed to create tmpdir"); + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + + let mount_dir = tempdir.path().join(d.mount_path); + fs::create_dir(&mount_dir) + .unwrap_or_else(|_| panic!("{}: failed to create root directory", msg)); + + let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode(); + let mut storage_data = Storage::new(); + if d.read_only { + storage_data.set_options(vec!["foo".to_string(), "ro".to_string()]); + } + if let Some(fs_group) = d.fs_group.clone() { + storage_data.set_fs_group(fs_group); + } + storage_data.mount_point = mount_dir.clone().into_os_string().into_string().unwrap(); + + let result = set_ownership(&logger, &storage_data); + assert!(result.is_ok()); + + assert_eq!( + mount_dir.as_path().metadata().unwrap().gid(), + d.expected_group_id + ); + assert_eq!( + mount_dir.as_path().metadata().unwrap().permissions().mode(), + (directory_mode | d.expected_permission) + ); + } + } + + #[test] + fn test_recursive_ownership_change() { + skip_if_not_root!(); + + const COUNT: usize = 5; + + #[derive(Debug)] + struct TestData<'a> { + // Directory where the recursive ownership change should be performed on + path: &'a str, + + // User ID for ownership change + uid: u32, + + // Group ID for ownership change + gid: u32, + + // Set when the permission should be read-only + read_only: bool, + + // The expected permission of all directories after ownership change + expected_permission_directory: u32, + + // The expected permission of all files after ownership change + expected_permission_file: u32, + } + + let tests = &[ + TestData { + path: "no_gid_change", + uid: 0, + gid: 0, + read_only: false, + expected_permission_directory: 0, + expected_permission_file: 0, + }, + TestData { + path: "rw_gid_change", + uid: 0, + gid: 3000, + read_only: false, + expected_permission_directory: RW_MASK | EXEC_MASK | MODE_SETGID, + expected_permission_file: RW_MASK, + }, + TestData { + path: "ro_gid_change", + uid: 0, + gid: 3000, + read_only: true, + expected_permission_directory: RO_MASK | EXEC_MASK | MODE_SETGID, + expected_permission_file: RO_MASK, + }, + ]; + + let tempdir = tempdir().expect("failed to create tmpdir"); + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + + let mount_dir = tempdir.path().join(d.path); + fs::create_dir(&mount_dir) + .unwrap_or_else(|_| panic!("{}: failed to create root directory", msg)); + + let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode(); + let mut file_mode: u32 = 0; + + // create testing directories and files + for n in 1..COUNT { + let nest_dir = mount_dir.join(format!("nested{}", n)); + fs::create_dir(&nest_dir) + .unwrap_or_else(|_| panic!("{}: failed to create nest directory", msg)); + + for f in 1..COUNT { + let filename = nest_dir.join(format!("file{}", f)); + File::create(&filename) + .unwrap_or_else(|_| panic!("{}: failed to create file", msg)); + file_mode = filename.as_path().metadata().unwrap().permissions().mode(); + } + } + + let uid = if d.uid > 0 { + Some(Uid::from_raw(d.uid)) + } else { + None + }; + let gid = if d.gid > 0 { + Some(Gid::from_raw(d.gid)) + } else { + None + }; + let result = recursive_ownership_change(&mount_dir, uid, gid, d.read_only); + + assert!(result.is_ok()); + + assert_eq!(mount_dir.as_path().metadata().unwrap().gid(), d.gid); + assert_eq!( + mount_dir.as_path().metadata().unwrap().permissions().mode(), + (directory_mode | d.expected_permission_directory) + ); + + for n in 1..COUNT { + let nest_dir = mount_dir.join(format!("nested{}", n)); + for f in 1..COUNT { + let filename = nest_dir.join(format!("file{}", f)); + let file = Path::new(&filename); + + assert_eq!(file.metadata().unwrap().gid(), d.gid); + assert_eq!( + file.metadata().unwrap().permissions().mode(), + (file_mode | d.expected_permission_file) + ); + } + + let dir = Path::new(&nest_dir); + assert_eq!(dir.metadata().unwrap().gid(), d.gid); + assert_eq!( + dir.metadata().unwrap().permissions().mode(), + (directory_mode | d.expected_permission_directory) + ); + } + } + } +} diff --git a/src/libs/kata-sys-util/src/mount.rs b/src/libs/kata-sys-util/src/mount.rs index 522ce3acb1..873db5f5b9 100644 --- a/src/libs/kata-sys-util/src/mount.rs +++ b/src/libs/kata-sys-util/src/mount.rs @@ -58,7 +58,8 @@ use crate::fs::is_symlink; use crate::sl; /// Default permission for directories created for mountpoint. -const MOUNT_PERM: u32 = 0o755; +const MOUNT_DIR_PERM: u32 = 0o755; +const MOUNT_FILE_PERM: u32 = 0o644; pub const PROC_MOUNTS_FILE: &str = "/proc/mounts"; const PROC_FIELDS_PER_LINE: usize = 6; @@ -187,13 +188,16 @@ pub fn create_mount_destination, D: AsRef, R: AsRef>( .parent() .ok_or_else(|| Error::InvalidPath(dst.to_path_buf()))?; let mut builder = fs::DirBuilder::new(); - builder.mode(MOUNT_PERM).recursive(true).create(parent)?; + builder + .mode(MOUNT_DIR_PERM) + .recursive(true) + .create(parent)?; if fs_type == "bind" { // The source and destination for bind mounting must be the same type: file or directory. if !src.as_ref().is_dir() { fs::OpenOptions::new() - .mode(MOUNT_PERM) + .mode(MOUNT_FILE_PERM) .write(true) .create(true) .open(dst)?; @@ -390,19 +394,17 @@ fn do_rebind_mount>(path: P, readonly: bool, flags: MsFlags) -> R } /// Take fstab style mount options and parses them for use with a standard mount() syscall. -fn parse_mount_options(options: &[String]) -> Result<(MsFlags, String)> { +pub fn parse_mount_options>(options: &[T]) -> Result<(MsFlags, String)> { let mut flags: MsFlags = MsFlags::empty(); let mut data: Vec = Vec::new(); for opt in options.iter() { - if opt == "defaults" { - continue; - } else if opt == "loop" { + if opt.as_ref() == "loop" { return Err(Error::InvalidMountOption("loop".to_string())); - } else if let Some(v) = parse_mount_flags(flags, opt) { + } else if let Some(v) = parse_mount_flags(flags, opt.as_ref()) { flags = v; } else { - data.push(opt.clone()); + data.push(opt.as_ref().to_string()); } } @@ -441,6 +443,7 @@ fn parse_mount_flags(mut flags: MsFlags, flag_str: &str) -> Option { // overridden by subsequent options, as in the option line users,exec,dev,suid). match flag_str { // Clear flags + "defaults" => {} "async" => flags &= !MsFlags::MS_SYNCHRONOUS, "atime" => flags &= !MsFlags::MS_NOATIME, "dev" => flags &= !MsFlags::MS_NODEV, @@ -464,6 +467,14 @@ fn parse_mount_flags(mut flags: MsFlags, flag_str: &str) -> Option { "noexec" => flags |= MsFlags::MS_NOEXEC, "nosuid" => flags |= MsFlags::MS_NOSUID, "rbind" => flags |= MsFlags::MS_BIND | MsFlags::MS_REC, + "unbindable" => flags |= MsFlags::MS_UNBINDABLE, + "runbindable" => flags |= MsFlags::MS_UNBINDABLE | MsFlags::MS_REC, + "private" => flags |= MsFlags::MS_PRIVATE, + "rprivate" => flags |= MsFlags::MS_PRIVATE | MsFlags::MS_REC, + "shared" => flags |= MsFlags::MS_SHARED, + "rshared" => flags |= MsFlags::MS_SHARED | MsFlags::MS_REC, + "slave" => flags |= MsFlags::MS_SLAVE, + "rslave" => flags |= MsFlags::MS_SLAVE | MsFlags::MS_REC, "relatime" => flags |= MsFlags::MS_RELATIME, "remount" => flags |= MsFlags::MS_REMOUNT, "ro" => flags |= MsFlags::MS_RDONLY, @@ -1030,7 +1041,7 @@ mod tests { #[test] fn test_parse_mount_options() { - let options = vec![]; + let options: Vec<&str> = vec![]; let (flags, data) = parse_mount_options(&options).unwrap(); assert!(flags.is_empty()); assert!(data.is_empty()); diff --git a/src/libs/kata-types/src/mount.rs b/src/libs/kata-types/src/mount.rs index 1c0e69d3ec..521a24a4e7 100644 --- a/src/libs/kata-types/src/mount.rs +++ b/src/libs/kata-types/src/mount.rs @@ -5,7 +5,9 @@ // use anyhow::{anyhow, Context, Error, Result}; +use std::collections::hash_map::Entry; use std::convert::TryFrom; +use std::fmt::Formatter; use std::{collections::HashMap, fs, path::PathBuf}; /// Prefix to mark a volume as Kata special. @@ -14,6 +16,9 @@ pub const KATA_VOLUME_TYPE_PREFIX: &str = "kata:"; /// The Mount should be ignored by the host and handled by the guest. pub const KATA_GUEST_MOUNT_PREFIX: &str = "kata:guest-mount:"; +/// The sharedfs volume is mounted by guest OS before starting the kata-agent. +pub const KATA_SHAREDFS_GUEST_PREMOUNT_TAG: &str = "kataShared"; + /// KATA_EPHEMERAL_DEV_TYPE creates a tmpfs backed volume for sharing files between containers. pub const KATA_EPHEMERAL_VOLUME_TYPE: &str = "ephemeral"; @@ -23,6 +28,9 @@ pub const KATA_HOST_DIR_VOLUME_TYPE: &str = "kata:hostdir"; /// KATA_MOUNT_INFO_FILE_NAME is used for the file that holds direct-volume mount info pub const KATA_MOUNT_INFO_FILE_NAME: &str = "mountInfo.json"; +/// Specify `fsgid` for a volume or mount, `fsgid=1`. +pub const KATA_MOUNT_OPTION_FS_GID: &str = "fsgid"; + /// KATA_DIRECT_VOLUME_ROOT_PATH is the root path used for concatenating with the direct-volume mount info file path pub const KATA_DIRECT_VOLUME_ROOT_PATH: &str = "/run/kata-containers/shared/direct-volumes"; @@ -421,6 +429,84 @@ impl TryFrom<&NydusExtraOptions> for KataVirtualVolume { } } +/// An implementation of generic storage device. +pub struct StorageDeviceGeneric { + path: String, +} + +impl std::fmt::Debug for StorageDeviceGeneric { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StorageState") + .field("path", &self.path) + .finish() + } +} + +impl StorageDeviceGeneric { + /// Create a new instance of `StorageStateCommon`. + pub fn new(path: String) -> Self { + StorageDeviceGeneric { path } + } +} + +impl StorageDevice for StorageDeviceGeneric { + fn path(&self) -> &str { + &self.path + } + + fn cleanup(&self) {} +} + +/// Trait object for storage device. +pub trait StorageDevice: Send + Sync { + /// Path + fn path(&self) -> &str; + + /// Clean up resources related to the storage device. + fn cleanup(&self); +} + +/// Manager to manage registered storage device handlers. +pub struct StorageHandlerManager { + handlers: HashMap, +} + +impl Default for StorageHandlerManager { + fn default() -> Self { + Self::new() + } +} + +impl StorageHandlerManager { + /// Create a new instance of `StorageHandlerManager`. + pub fn new() -> Self { + Self { + handlers: HashMap::new(), + } + } + + /// Register a storage device handler. + pub fn add_handler(&mut self, id: &str, handler: H) -> Result<()> { + match self.handlers.entry(id.to_string()) { + Entry::Occupied(_) => Err(anyhow!("storage handler for {} already exists", id)), + Entry::Vacant(entry) => { + entry.insert(handler); + Ok(()) + } + } + } + + /// Get storage handler with specified `id`. + pub fn handler(&self, id: &str) -> Option<&H> { + self.handlers.get(id) + } + + /// Get names of registered handlers. + pub fn get_handlers(&self) -> Vec { + self.handlers.keys().map(|v| v.to_string()).collect() + } +} + /// Join user provided volume path with kata direct-volume root path. /// /// The `volume_path` is base64-encoded and then safely joined to the `prefix`