From 8da7cd161103a7365f50ee801c0f2613822d44f9 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 7 May 2025 16:10:00 +0800 Subject: [PATCH 1/6] runtime-rs: Impl recursive directory copy with metadata preservation Add async directory traversal using BFS algorithm: (1) Support file type handling: Regular files (S_IFREG) with content streaming; Directories (S_IFDIR) with mode preservation; Symbolic links (S_IFLNK) with target recreation; (2) Maintain POSIX metadata: UID/GID preservation, file mode bits, and directory permissions; (3) Implement async I/O operations for: Directory enumeration, file reading, symlink target resolution. Fixes #11237 Signed-off-by: alex.lyn --- .../resource/src/volume/share_fs_volume.rs | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs index 7aa30c52b1..535bfdfcf0 100644 --- a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs @@ -5,6 +5,7 @@ // use std::{ + collections::VecDeque, fs::File, io::Read, os::unix::fs::MetadataExt, @@ -18,6 +19,8 @@ use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::device::device_manager::DeviceManager; use kata_sys_util::mount::{get_mount_options, get_mount_path, get_mount_type}; +use nix::sys::stat::SFlag; +use tokio::io::AsyncReadExt; use tokio::sync::RwLock; use super::Volume; @@ -307,6 +310,130 @@ impl Volume for ShareFsVolume { } } +#[allow(dead_code)] +async fn copy_dir_recursively>( + src_dir: P, + dest_dir: &str, + agent: &Arc, +) -> Result<()> { + let mut queue = VecDeque::new(); + queue.push_back((src_dir.as_ref().to_path_buf(), dest_dir.to_string())); + + while let Some((current_src, current_dest)) = queue.pop_front() { + let mut entries = tokio::fs::read_dir(&current_src) + .await + .context(format!("read directory: {:?}", current_src))?; + + while let 
Some(entry) = entries + .next_entry() + .await + .context(format!("read directory entry in {:?}", current_src))? + { + let entry_path = entry.path(); + let file_name = entry_path + .file_name() + .ok_or_else(|| anyhow!("get file name for {:?}", entry_path))? + .to_string_lossy() + .to_string(); + + let dest_path = format!("{}/{}", current_dest, file_name); + + let metadata = entry + .metadata() + .await + .context(format!("read metadata for {:?}", entry_path))?; + + if metadata.is_symlink() { + // handle symlinks + let entry_path_err = entry_path.clone(); + let entry_path_clone = entry_path.clone(); + let link_target = + tokio::task::spawn_blocking(move || std::fs::read_link(&entry_path_clone)) + .await + .context(format!( + "failed to spawn blocking task for symlink: {:?}", + entry_path_err + ))??; + + let link_target_str = link_target.to_string_lossy().into_owned(); + let symlink_request = agent::CopyFileRequest { + path: dest_path.clone(), + file_size: link_target_str.len() as i64, + uid: metadata.uid() as i32, + gid: metadata.gid() as i32, + file_mode: SFlag::S_IFLNK.bits(), + data: link_target_str.clone().into_bytes(), + ..Default::default() + }; + info!( + sl!(), + "copying symlink_request {:?} in sandbox with file_mode: {:?}", + dest_path.clone(), + symlink_request.file_mode + ); + + agent.copy_file(symlink_request).await.context(format!( + "failed to create symlink: {:?} -> {:?}", + dest_path, link_target_str + ))?; + } else if metadata.is_dir() { + // handle directory + let dir_request = agent::CopyFileRequest { + path: dest_path.clone(), + file_size: 0, + uid: metadata.uid() as i32, + gid: metadata.gid() as i32, + dir_mode: metadata.mode(), + file_mode: SFlag::S_IFDIR.bits(), + data: vec![], + ..Default::default() + }; + info!( + sl!(), + "copying subdirectory {:?} in sandbox with file_mode: {:?}", + dir_request.path, + dir_request.file_mode + ); + agent + .copy_file(dir_request) + .await + .context(format!("Failed to create subdirectory: {:?}", 
dest_path))?; + + // push back the sub-dir into queue to handle it in time + queue.push_back((entry_path, dest_path)); + } else if metadata.is_file() { + // async read file + let mut file = tokio::fs::File::open(&entry_path) + .await + .context(format!("open file: {:?}", entry_path))?; + + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer) + .await + .context(format!("read file: {:?}", entry_path))?; + + let file_request = agent::CopyFileRequest { + path: dest_path.clone(), + file_size: metadata.len() as i64, + uid: metadata.uid() as i32, + gid: metadata.gid() as i32, + file_mode: SFlag::S_IFREG.bits(), + data: buffer, + ..Default::default() + }; + + info!(sl!(), "copy file {:?} to guest", dest_path.clone()); + agent + .copy_file(file_request) + .await + .context(format!("copy file: {:?} -> {:?}", entry_path, dest_path))?; + } + } + } + + Ok(()) +} + pub(crate) fn is_share_fs_volume(m: &oci::Mount) -> bool { let mount_type = get_mount_type(m); (mount_type == "bind" || mount_type == mount::KATA_EPHEMERAL_VOLUME_TYPE) From 79b832b2f58f3751bf19f276f9ec4ab49141ac76 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 7 May 2025 16:28:27 +0800 Subject: [PATCH 2/6] runtime-rs: Propagate k8s configs correctly when sharedfs is disabled In Kubernetes (k8s), while Kata Pods often use virtiofs for injecting Service Accounts, Secrets, and ConfigMaps, security-sensitive environments like CoCo disable host-guest sharing. Consequently, when SharedFs is disabled, we propagate these configurations into the guest via file copy and bind mount for correct container access. 
Fixes #11237 Signed-off-by: alex.lyn --- .../resource/src/volume/share_fs_volume.rs | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs index 535bfdfcf0..5905f73f67 100644 --- a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs @@ -136,6 +136,70 @@ impl ShareFsVolume { oci_mount.set_source(Some(PathBuf::from(&dest))); oci_mount.set_options(m.options().clone()); volume.mounts.push(oci_mount); + } else if src.is_dir() { + // source_path: "/var/lib/kubelet/pods/6dad7281-57ff-49e4-b844-c588ceabec16/volumes/kubernetes.io~projected/kube-api-access-8s2nl" + info!(sl!(), "copying directory {:?} to guest", &source_path); + + // create target path in guest + let dest_dir = [ + DEFAULT_KATA_GUEST_SANDBOX_DIR, + PASSTHROUGH_FS_DIR, + file_name.clone().as_str(), + ] + .join("/"); + + // create directory + let dir_metadata = std::fs::metadata(src.clone()) + .context(format!("read metadata from directory: {:?}", src))?; + + // ttRPC request for creating directory + let dir_request = agent::CopyFileRequest { + path: dest_dir.clone(), + file_size: 0, // useless for dir + uid: dir_metadata.uid() as i32, + gid: dir_metadata.gid() as i32, + dir_mode: dir_metadata.mode(), + file_mode: SFlag::S_IFDIR.bits(), + data: vec![], // no files + ..Default::default() + }; + + // dest_dir: "/run/kata-containers/sandbox/passthrough/sandbox-b2790ec0-kube-api-access-8s2nl" + info!( + sl!(), + "creating directory: {:?} in sandbox with file_mode: {:?}", + dest_dir, + dir_request.file_mode + ); + + // send request for creating directory + agent + .copy_file(dir_request) + .await + .context(format!("create directory in sandbox: {:?}", dest_dir))?; + + // recursively copy files from this directory + // similar to `scp -r $source_dir $target_dir` + copy_dir_recursively(src.clone(), 
&dest_dir, &agent) + .await + .context(format!("failed to copy directory contents: {:?}", src))?; + + // handle special mount options + let mut options = m.options().clone().unwrap_or_default(); + if !options.iter().any(|x| x == "rbind") { + options.push("rbind".into()); + } + if !options.iter().any(|x| x == "rprivate") { + options.push("rprivate".into()); + } + + // add OCI Mount + let mut oci_mount = oci::Mount::default(); + oci_mount.set_destination(m.destination().clone()); + oci_mount.set_typ(Some("bind".to_string())); + oci_mount.set_source(Some(PathBuf::from(&dest_dir))); + oci_mount.set_options(Some(options)); + volume.mounts.push(oci_mount); } else { // If not, we can ignore it. Let's issue a warning so that the user knows. warn!( From 654e6db91f2eb6c2a8016a0a77607058cdbc12e7 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Fri, 9 May 2025 21:31:54 +0800 Subject: [PATCH 3/6] runtime-rs: Add inotify-based real-time directory synchronization Introduce an event-driven file sync mechanism between host and guest for use when sharedfs is disabled; it monitors the host path and promptly syncs file changes to the guest: 1. Introduce FsWatcher to monitor directory changes via inotify; 2. Support recursive watching with configurable filters; 3. Add debounce logic (default 500ms cooldown) to handle burst events; 4. Trigger `copy_dir_recursively` on stable state; 5. 
Handle CREATE/MODIFY/DELETE/MOVED/CLOSE_WRITE events; Fixes #11237 Signed-off-by: alex.lyn --- src/runtime-rs/Cargo.lock | 52 +++++ src/runtime-rs/crates/resource/Cargo.toml | 2 + .../resource/src/volume/share_fs_volume.rs | 215 +++++++++++++++++- 3 files changed, 266 insertions(+), 3 deletions(-) diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock index a4d0c3342a..91395f1c26 100644 --- a/src/runtime-rs/Cargo.lock +++ b/src/runtime-rs/Cargo.lock @@ -1860,6 +1860,28 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "inotify" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" +dependencies = [ + "bitflags 2.9.0", + "futures-core", + "inotify-sys", + "libc", + "tokio", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "instant" version = "0.1.12" @@ -3575,6 +3597,7 @@ dependencies = [ "cgroups-rs", "futures 0.3.28", "hypervisor", + "inotify", "kata-sys-util", "kata-types", "lazy_static", @@ -3599,6 +3622,7 @@ dependencies = [ "tokio", "tracing", "uuid 0.4.0", + "walkdir", ] [[package]] @@ -3806,6 +3830,15 @@ dependencies = [ "libc", ] +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.22" @@ -5075,6 +5108,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -5220,6 +5263,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/src/runtime-rs/crates/resource/Cargo.toml b/src/runtime-rs/crates/resource/Cargo.toml index 54a8e014aa..0ec974d001 100644 --- a/src/runtime-rs/crates/resource/Cargo.toml +++ b/src/runtime-rs/crates/resource/Cargo.toml @@ -33,6 +33,8 @@ tokio = { workspace = true, features = ["process"] } tracing = { workspace = true } uuid = { version = "0.4", features = ["v4"] } oci-spec = { workspace = true } +inotify = "0.11.0" +walkdir = "2.5.0" ## Dependencies from `rust-netlink` netlink-packet-route = "0.22" diff --git a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs index 5905f73f67..ba06ea118b 100644 --- a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs @@ -5,23 +5,30 @@ // use std::{ - collections::VecDeque, + collections::{HashSet, VecDeque}, fs::File, io::Read, os::unix::fs::MetadataExt, path::{Path, PathBuf}, str::FromStr, sync::Arc, + time::Duration, }; use agent::Agent; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::device::device_manager::DeviceManager; +use inotify::{EventMask, Inotify, WatchMask}; use kata_sys_util::mount::{get_mount_options, get_mount_path, get_mount_type}; use nix::sys::stat::SFlag; -use 
tokio::io::AsyncReadExt; -use tokio::sync::RwLock; +use tokio::{ + io::AsyncReadExt, + sync::{Mutex, RwLock}, + task::JoinHandle, + time::Instant, +}; +use walkdir::WalkDir; use super::Volume; use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR; @@ -31,6 +38,8 @@ use kata_types::mount; use oci_spec::runtime as oci; const SYS_MOUNT_PREFIX: [&str; 2] = ["/proc", "/sys"]; +const MONITOR_INTERVAL: Duration = Duration::from_millis(100); +const DEBOUNCE_TIME: Duration = Duration::from_millis(500); // copy file to container's rootfs if filesystem sharing is not supported, otherwise // bind mount it in the shared directory. @@ -42,6 +51,198 @@ pub(crate) struct ShareFsVolume { share_fs: Option>, mounts: Vec, storages: Vec, + monitor_task: Option>, +} + +/// Directory Monitor Config +/// path: the to be watched target directory +/// recursive: recursively monitor sub-dirs or not, +/// follow_symlinks: track symlinks or not, +/// exclude_hidden: exclude hidden files or not, +/// watch_events: Watcher Event types with CREATE/DELETE/MODIFY/MOVED_FROM/MOVED_TO +#[derive(Clone, Debug)] +struct MonitorConfig { + path: PathBuf, + recursive: bool, + follow_symlinks: bool, + exclude_hidden: bool, + watch_events: WatchMask, +} + +impl MonitorConfig { + fn new(path: &Path) -> Self { + Self { + path: path.to_path_buf(), + recursive: true, + follow_symlinks: false, + exclude_hidden: true, + watch_events: WatchMask::CREATE + | WatchMask::DELETE + | WatchMask::MODIFY + | WatchMask::MOVED_FROM + | WatchMask::MOVED_TO + | WatchMask::CLOSE_WRITE, + } + } +} + +#[derive(Clone)] +struct FsWatcher { + config: MonitorConfig, + inotify: Arc>, + watch_dirs: Arc>>, + pending_events: Arc>>, + need_sync: Arc>, +} + +impl FsWatcher { + async fn new(source_path: &Path) -> Result { + let inotify = Inotify::init()?; + let mon_cfg = MonitorConfig::new(source_path); + let mut watcher = Self { + config: mon_cfg, + inotify: Arc::new(Mutex::new(inotify)), + pending_events: 
Arc::new(Mutex::new(HashSet::new())), + watch_dirs: Arc::new(Mutex::new(HashSet::new())), + need_sync: Arc::new(Mutex::new(false)), + }; + + watcher.add_watchers().await?; + + Ok(watcher) + } + + /// add watched directory recursively + async fn add_watchers(&mut self) -> Result<()> { + let mut watched_dirs = self.watch_dirs.lock().await; + let config: &MonitorConfig = &self.config; + let walker = WalkDir::new(&config.path) + .follow_links(config.follow_symlinks) + .min_depth(0) + .max_depth(if config.recursive { usize::MAX } else { 1 }) + .into_iter() + .filter_entry(|e| { + !(config.exclude_hidden + && e.file_name() + .to_str() + .map(|s| s.starts_with('.')) + .unwrap_or(false)) + }); + + for entry in walker.filter_map(|e| e.ok()) { + if entry.file_type().is_dir() { + let path = entry.path(); + if watched_dirs.insert(path.to_path_buf()) { + self.inotify + .lock() + .await + .watches() + .add(path, config.watch_events)?; // we don't use WatchMask::ALL_EVENTS + } + } + } + + Ok(()) + } + + /// start monitor + pub async fn start_monitor( + &self, + agent: Arc, + src: PathBuf, + dst: PathBuf, + ) -> JoinHandle<()> { + let need_sync = self.need_sync.clone(); + let pending_events = self.pending_events.clone(); + let inotify = self.inotify.clone(); + let monitor_config = self.config.clone(); + + tokio::spawn(async move { + let mut buffer = [0u8; 4096]; + let mut last_event_time = None; + + loop { + // use cloned inotify instance + match inotify.lock().await.read_events(&mut buffer) { + Ok(events) => { + for event in events { + if !event.mask.intersects( + EventMask::CREATE + | EventMask::MODIFY + | EventMask::DELETE + | EventMask::MOVED_FROM + | EventMask::MOVED_TO, + ) { + continue; + } + + if let Some(file_name) = event.name { + let full_path = &monitor_config.path.join(file_name); + let event_types: Vec<&str> = event + .mask + .iter() + .map(|m| match m { + EventMask::CREATE => "CREATE", + EventMask::DELETE => "DELETE", + EventMask::MODIFY => "MODIFY", + 
EventMask::MOVED_FROM => "MOVED_FROM", + EventMask::MOVED_TO => "MOVED_TO", + EventMask::CLOSE_WRITE => "CLOSE_WRITE", + _ => "OTHER", + }) + .collect(); + + info!( + sl!(), + "handle events [{}] {:?} -> {:?}", + event_types.join("|"), + event.mask, + full_path + ); + pending_events.lock().await.insert(full_path.clone()); + } + } + } + Err(e) => eprintln!("inotify error: {}", e), + } + + // handle events to be synchronized + let events_paths = { + let mut pending = pending_events.lock().await; + pending.drain().collect::>() + }; + if !events_paths.is_empty() { + *need_sync.lock().await = true; + last_event_time = Some(Instant::now()); + } + + // Debounce handling + // It is used to prevent unnecessary repeated copies when file changes are triggered + // multiple times in a short period; we only execute the last one. + if let Some(t) = last_event_time { + if Instant::now().duration_since(t) > DEBOUNCE_TIME && *need_sync.lock().await { + info!(sl!(), "debounce handle copyfile {:?} -> {:?}", &src, &dst); + if let Err(e) = + copy_dir_recursively(&src, &dst.display().to_string(), &agent).await + { + error!( + sl!(), + "debounce handle copyfile {:?} -> {:?} failed with error: {:?}", + &src, + &dst, + e + ); + eprintln!("sync host/guest files failed: {}", e); + } + *need_sync.lock().await = false; + last_event_time = None; + } + } + + tokio::time::sleep(MONITOR_INTERVAL).await; + } + }) + } } impl ShareFsVolume { @@ -65,6 +266,7 @@ impl ShareFsVolume { share_fs: share_fs.as_ref().map(Arc::clone), mounts: vec![], storages: vec![], + monitor_task: None, }; match share_fs { None => { @@ -200,6 +402,13 @@ impl ShareFsVolume { oci_mount.set_source(Some(PathBuf::from(&dest_dir))); oci_mount.set_options(Some(options)); volume.mounts.push(oci_mount); + + // start monitoring + let watcher = FsWatcher::new(Path::new(&source_path)).await?; + let monitor_task = watcher + .start_monitor(agent.clone(), src.clone(), dest_dir.into()) + .await; + volume.monitor_task = Some(monitor_task); 
} else { // If not, we can ignore it. Let's issue a warning so that the user knows. warn!( From 6fa409df1aff3e20c602551fe7e45fcb28309993 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Fri, 9 May 2025 23:02:26 +0800 Subject: [PATCH 4/6] kata-agent: Improve file sync handling and address symlink issues When synchronizing file changes on the host, a "symlink AlreadyExists" issue occurs, primarily due to improper handling of symbolic links (symlinks). Additionally, there are other related problems. This patch will try to address these problems. (1) Handle symlink target existence (files, dirs, symlinks) during host file sync. Use appropriate removal methods (unlink, remove_file, remove_dir_all). (2) Enhance temporary file handling for safer operations and implement truncate only at offset 0 for resume support. (3) Set permissions and ownership for parent directories. (4) Check and clean target path for regular files before rename. Fixes #11237 Signed-off-by: alex.lyn --- src/agent/src/rpc.rs | 49 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index ee879bd4b1..05c3924943 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -2013,22 +2013,35 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { )); } + // Create parent directories if missing if let Some(parent) = path.parent() { if !parent.exists() { let dir = parent.to_path_buf(); + // Attempt to create directory, ignore AlreadyExists errors if let Err(e) = fs::create_dir_all(&dir) { if e.kind() != std::io::ErrorKind::AlreadyExists { return Err(e.into()); } - } else { - std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(req.dir_mode))?; } + + // Set directory permissions and ownership + std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(req.dir_mode))?; + unistd::chown( + &dir, + Some(Uid::from_raw(req.uid as u32)), + Some(Gid::from_raw(req.gid as u32)), + )?; } } let sflag = 
stat::SFlag::from_bits_truncate(req.file_mode); if sflag.contains(stat::SFlag::S_IFDIR) { + // Remove existing non-directory file if present + if path.exists() && !path.is_dir() { + fs::remove_file(&path)?; + } + fs::create_dir(&path).or_else(|e| { if e.kind() != std::io::ErrorKind::AlreadyExists { return Err(e); @@ -2047,16 +2060,25 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { return Ok(()); } + // Handle symlink creation if sflag.contains(stat::SFlag::S_IFLNK) { - // After kubernetes secret's volume update, the '..data' symlink should point to - // the new timestamped directory. - // TODO:The old and deleted timestamped dir still exists due to missing DELETE api in agent. - // Hence, Unlink the existing symlink. - if path.is_symlink() && path.exists() { - unistd::unlink(&path)?; + // Clean up existing path (whether symlink, dir, or file) + if path.exists() || path.is_symlink() { + // Use appropriate removal method based on path type + if path.is_symlink() { + unistd::unlink(&path)?; + } else if path.is_dir() { + fs::remove_dir_all(&path)?; + } else { + fs::remove_file(&path)?; + } } + + // Create new symbolic link let src = PathBuf::from(OsStr::from_bytes(&req.data)); unistd::symlinkat(&src, None, &path)?; + + // Set symlink ownership (permissions not supported for symlinks) let path_str = CString::new(path.as_os_str().as_bytes())?; let ret = unsafe { libc::lchown(path_str.as_ptr(), req.uid as u32, req.gid as u32) }; @@ -2071,7 +2093,7 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { let file = OpenOptions::new() .write(true) .create(true) - .truncate(false) + .truncate(req.offset == 0) // Only truncate when offset is 0 .open(&tmpfile)?; file.write_all_at(req.data.as_slice(), req.offset as u64)?; @@ -2089,6 +2111,15 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { Some(Gid::from_raw(req.gid as u32)), )?; + // Remove existing target path before rename + if path.exists() || path.is_symlink() { + if path.is_dir() { + 
fs::remove_dir_all(&path)?; + } else { + fs::remove_file(&path)?; + } + } + fs::rename(tmpfile, path)?; Ok(()) From 8910bddce84256711f4b229cfda6d7c02dd71854 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Thu, 15 May 2025 16:29:23 +0800 Subject: [PATCH 5/6] kata-types: Introduce k8s special volumes for projected and downward-api Fixes #11237 Signed-off-by: alex.lyn --- src/libs/kata-types/src/k8s.rs | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/libs/kata-types/src/k8s.rs b/src/libs/kata-types/src/k8s.rs index 63d9382d0b..8af2e324bf 100644 --- a/src/libs/kata-types/src/k8s.rs +++ b/src/libs/kata-types/src/k8s.rs @@ -16,6 +16,10 @@ const K8S_EMPTY_DIR: &str = "kubernetes.io~empty-dir"; const K8S_CONFIGMAP: &str = "kubernetes.io~configmap"; // K8S_SECRET is the K8s specific path for `secret` volumes const K8S_SECRET: &str = "kubernetes.io~secret"; +// K8S_PROJECTED is the K8s specific path for `projected` volumes +const K8S_PROJECTED: &str = "kubernetes.io~projected"; +// K8S_DOWNWARD_API is the K8s specific path for `downward-api` volumes +const K8S_DOWNWARD_API: &str = "kubernetes.io~downward-api"; /// Check whether the path is a K8s empty directory. pub fn is_empty_dir>(path: P) -> bool { @@ -32,6 +36,16 @@ pub fn is_secret>(path: P) -> bool { is_special_dir(path, K8S_SECRET) } +/// Check whether the path is a K8s projected volume. +pub fn is_projected>(path: P) -> bool { + is_special_dir(path, K8S_PROJECTED) +} + +/// Check whether the path is a K8s downward-api volume. +pub fn is_downward_api>(path: P) -> bool { + is_special_dir(path, K8S_DOWNWARD_API) +} + /// Check whether the path is a K8s empty directory, configmap, or secret. 
/// /// For example, given a K8s EmptyDir, Kubernetes mounts @@ -321,6 +335,36 @@ mod tests { assert!(!is_secret(path)); } + #[test] + fn test_is_projected() { + let path = "/volumes/kubernetes.io~projected/foo"; + assert!(is_projected(path)); + + let path = "/volumes/kubernetes.io~projected//foo"; + assert!(is_projected(path)); + + let path = "/volumes/kubernetes.io~projected-test/foo"; + assert!(!is_projected(path)); + + let path = "/volumes/kubernetes.io~projected"; + assert!(!is_projected(path)); + } + + #[test] + fn test_is_downward_api() { + let path = "/volumes/kubernetes.io~downward-api/foo"; + assert!(is_downward_api(path)); + + let path = "/volumes/kubernetes.io~downward-api//foo"; + assert!(is_downward_api(path)); + + let path = "/volumes/kubernetes.io~downward-api-test/foo"; + assert!(!is_downward_api(path)); + + let path = "/volumes/kubernetes.io~downward-api"; + assert!(!is_downward_api(path)); + } + #[test] fn test_container_type() { let sid = "sid".to_string(); From 4b27ca9233126a5ae1713064c398db66af45879f Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Thu, 15 May 2025 17:15:21 +0800 Subject: [PATCH 6/6] runtime-rs: Implement volume copy allowlist check For security reasons, we have restricted directory copying. Introduce the `is_allowlisted_copy_volume` function to verify whether a given volume path is located under an allowed copy directory. This enhances security by ensuring only permitted volumes are copied. Currently, only directories under the path `/var/lib/kubelet/pods/<uid>/volumes/{kubernetes.io~configmap, kubernetes.io~secret, kubernetes.io~downward-api, kubernetes.io~projected}` are allowed to be copied into the guest. Copying of other directories will be prohibited. 
Fixes #11237 Signed-off-by: alex.lyn --- .../resource/src/volume/share_fs_volume.rs | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs index ba06ea118b..26abc2dc74 100644 --- a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs @@ -34,7 +34,10 @@ use super::Volume; use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR; use crate::share_fs::PASSTHROUGH_FS_DIR; use crate::share_fs::{MountedInfo, ShareFs, ShareFsVolumeConfig}; -use kata_types::mount; +use kata_types::{ + k8s::{is_configmap, is_downward_api, is_projected, is_secret}, + mount, +}; use oci_spec::runtime as oci; const SYS_MOUNT_PREFIX: [&str; 2] = ["/proc", "/sys"]; @@ -338,7 +341,11 @@ impl ShareFsVolume { oci_mount.set_source(Some(PathBuf::from(&dest))); oci_mount.set_options(m.options().clone()); volume.mounts.push(oci_mount); - } else if src.is_dir() { + } else if is_allowlisted_copy_volume(&src) { + // For security reasons, we have restricted directory copying. Currently, only directories under + // the path `/var/lib/kubelet/pods//volumes/{kubernetes.io~configmap, kubernetes.io~secret, kubernetes.io~downward-api, kubernetes.io~projected}` + // are allowed to be copied into the guest. Copying of other directories will be prohibited. + // source_path: "/var/lib/kubelet/pods/6dad7281-57ff-49e4-b844-c588ceabec16/volumes/kubernetes.io~projected/kube-api-access-8s2nl" info!(sl!(), "copying directory {:?} to guest", &source_path); @@ -763,6 +770,20 @@ pub fn generate_mount_path(id: &str, file_name: &str) -> String { format!("{}-{}-{}", nid, uid, file_name) } +/// This function is used to check whether a given volume is in the allowed copy allowlist. 
+/// More specifically, it determines whether the volume's path is located under a predefined +/// list of allowed copy directories. +pub(crate) fn is_allowlisted_copy_volume(source_path: &PathBuf) -> bool { + if !source_path.is_dir() { + return false; + } + // allowlist: { kubernetes.io~projected, kubernetes.io~configmap, kubernetes.io~secret, kubernetes.io~downward-api } + is_projected(source_path) + || is_downward_api(source_path) + || is_secret(source_path) + || is_configmap(source_path) +} + #[cfg(test)] mod test { use super::*; @@ -781,4 +802,34 @@ mod test { assert!(is_system_mount(proc_sub_dir)); assert!(!is_system_mount(not_sys_dir)); } + + #[test] + fn test_is_allowlisted_copy_volume() { + // The configmap is /var/lib/kubelet/pods//volumes/kubernetes.io~configmap/kube-configmap-0s2no/{..data, key1, key2,...} + // The secret is /var/lib/kubelet/pods//volumes/kubernetes.io~secret/kube-secret-2s2np/{..data, key1, key2,...} + // The projected is /var/lib/kubelet/pods//volumes/kubernetes.io~projected/kube-api-access-8s2nl/{..data, key1, key2,...} + // The downward-api is /var/lib/kubelet/pods//volumes/kubernetes.io~downward-api/downward-api-xxxx/{..data, key1, key2,...} + let configmap = + "var/lib/kubelet/pods/1000/volumes/kubernetes.io~configmap/kube-configmap-0s2no"; + let secret = "var/lib/kubelet/pods/1000/volumes/kubernetes.io~secret/kube-secret-2s2np"; + let projected = + "var/lib/kubelet/1000//volumes/kubernetes.io~projected/kube-api-access-8s2nl"; + let downward_api = + "var/lib/kubelet/1000//volumes/kubernetes.io~downward-api/downward-api-xxxx"; + + let temp_dir = tempfile::tempdir().unwrap(); + let cm_path = temp_dir.path().join(configmap); + std::fs::create_dir_all(&cm_path).unwrap(); + let secret_path = temp_dir.path().join(secret); + std::fs::create_dir_all(&secret_path).unwrap(); + let projected_path = temp_dir.path().join(projected); + std::fs::create_dir_all(&projected_path).unwrap(); + let downward_api_path = 
temp_dir.path().join(downward_api); + std::fs::create_dir_all(&downward_api_path).unwrap(); + + assert!(is_allowlisted_copy_volume(&cm_path)); + assert!(is_allowlisted_copy_volume(&secret_path)); + assert!(is_allowlisted_copy_volume(&projected_path)); + assert!(is_allowlisted_copy_volume(&downward_api_path)); + } }