From bdbba40381c447843c829732f5619a6a8c01ac08 Mon Sep 17 00:00:00 2001 From: Alex Lyn Date: Tue, 31 Mar 2026 17:25:56 +0800 Subject: [PATCH] agent: Register MultiLayerErofsHandler and process multiple EROFS Introduce MultiLayerErofsHandler and method of handle_multi_layer_storage for multi-layer storage: (1) Register MultiLayerErofsHandler to STORAGE_HANDLERS to handle multi-layer EROFS storage with driver type 'multi-layer-erofs'. (2) Add handle_multi_layer_erofs function to process multiple EROFS storages with X-kata.multi-layer marker together in guest. Signed-off-by: Alex Lyn --- src/agent/src/storage/mod.rs | 106 +++++- src/agent/src/storage/multi_layer_erofs.rs | 390 +++++++++++++-------- 2 files changed, 345 insertions(+), 151 deletions(-) diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs index a9920c6d8c..ef80193b51 100644 --- a/src/agent/src/storage/mod.rs +++ b/src/agent/src/storage/mod.rs @@ -4,7 +4,7 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs; use std::os::unix::fs::{MetadataExt, PermissionsExt}; use std::path::Path; @@ -26,6 +26,7 @@ use self::ephemeral_handler::EphemeralHandler; use self::fs_handler::{OverlayfsHandler, VirtioFsHandler}; use self::image_pull_handler::ImagePullHandler; use self::local_handler::LocalHandler; +use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_storage}; use crate::mount::{baremount, is_mounted, remove_mounts}; use crate::sandbox::Sandbox; @@ -37,6 +38,7 @@ mod ephemeral_handler; mod fs_handler; mod image_pull_handler; mod local_handler; +mod multi_layer_erofs; const RW_MASK: u32 = 0o660; const RO_MASK: u32 = 0o440; @@ -146,6 +148,7 @@ lazy_static! { #[cfg(target_arch = "s390x")] Arc::new(self::block_handler::VirtioBlkCcwHandler {}), Arc::new(ImagePullHandler {}), + Arc::new(self::multi_layer_erofs::MultiLayerErofsHandler {}), ]; for handler in handlers { @@ -156,6 +159,89 @@ lazy_static! { }; } +async fn handle_multi_layer_storage( + logger: &Logger, + storage: &Storage, + storages: &[Storage], + sandbox: &Arc>, + cid: &Option, + processed_multi_layer_mount_points: &mut HashSet, + mount_list: &mut Vec, +) -> Result { + if !is_multi_layer_storage(storage) { + return Ok(false); + } + + if processed_multi_layer_mount_points.contains(&storage.mount_point) { + return Ok(true); + } + + slog::info!( + logger, + "processing multi-layer EROFS storage from unified loop"; + "mount-point" => &storage.mount_point, + "source" => &storage.source, + "driver" => &storage.driver, + "fstype" => &storage.fstype, + ); + + let result = match handle_multi_layer_erofs_group(storage, storages, cid, sandbox, logger).await + { + Ok(r) => r, + Err(e) => { + error!(logger, "failed to handle multi-layer EROFS: {:?}", e); + return Err(e); + } + }; + + for processed_mount_point in &result.processed_mount_points { + processed_multi_layer_mount_points.insert(processed_mount_point.clone()); + } + + for grouped_storage in storages { + if !result + .processed_mount_points + .iter() + .any(|mp| mp == &grouped_storage.mount_point) + { + continue; + } + + let path = grouped_storage.mount_point.clone(); + let state = sandbox + .lock() + .await + .add_sandbox_storage(&path, grouped_storage.shared) + .await; + + if state.ref_count().await > 1 { + continue; + } + + let device = new_device(result.mount_point.clone())?; + if let Err(device) = sandbox + .lock() + .await + .update_sandbox_storage(&path, device.clone()) + { + error!(logger, "failed to update device for multi-layer storage"); + if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await { + warn!(logger, "failed to remove dummy sandbox storage {:?}", e); + } + if let Err(e) = device.cleanup() { + error!( + logger, + "failed to clean state for multi-layer storage device {}, {}", path, e + ); + } + return Err(anyhow!("failed to update device for multi-layer storage")); + } + } + + mount_list.push(result.mount_point); + Ok(true) +} + // add_storages takes a list of storages passed by the caller, and perform the // associated operations such as waiting for the device to show up, and mount // it to a specific location, according to the type of handler chosen, and for @@ -168,8 +254,22 @@ pub async fn add_storages( cid: Option, ) -> Result> { let mut mount_list = Vec::new(); + let mut processed_multi_layer_mount_points = HashSet::new(); - for storage in storages { + for storage in &storages { + if handle_multi_layer_storage( + &logger, + storage, + &storages, + sandbox, + &cid, + &mut processed_multi_layer_mount_points, + &mut mount_list, + ) + .await? + { + continue; + } let path = storage.mount_point.clone(); let state = sandbox .lock() @@ -195,7 +295,7 @@ pub async fn add_storages( sandbox, }; - match handler.create_device(storage, &mut ctx).await { + match handler.create_device(storage.clone(), &mut ctx).await { Ok(device) => { match sandbox .lock() diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs index f29bfe2fb7..f06d9f62bb 100644 --- a/src/agent/src/storage/multi_layer_erofs.rs +++ b/src/agent/src/storage/multi_layer_erofs.rs @@ -12,12 +12,15 @@ //! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::Arc; use anyhow::{anyhow, Context, Result}; +use kata_sys_util::mount::create_mount_destination; use kata_types::mount::StorageDevice; use protocols::agent::Storage; +use regex::Regex; +use slog::Logger; use tokio::sync::Mutex; use crate::device::BLOCK; @@ -25,9 +28,6 @@ use crate::mount::baremount; use crate::sandbox::Sandbox; use crate::storage::{StorageContext, StorageHandler}; use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher}; -use kata_sys_util::mount::create_mount_destination; -use regex::Regex; -use slog::Logger; /// EROFS Type const EROFS_TYPE: &str = "erofs"; @@ -68,6 +68,19 @@ impl UeventMatcher for VirtioBlkMatcher { #[derive(Debug)] pub struct MultiLayerErofsHandler {} +#[derive(Debug, Clone)] +pub struct MultiLayerErofsResult { + pub mount_point: String, + pub processed_mount_points: Vec, +} + +#[allow(dead_code)] +#[derive(Debug)] +struct MkdirDirective { + raw_path: String, + mode: Option, +} + #[async_trait::async_trait] impl StorageHandler for MultiLayerErofsHandler { fn driver_types(&self) -> &[&str] { @@ -95,42 +108,52 @@ impl StorageHandler for MultiLayerErofsHandler { } } -/// Handle multi-layer EROFS storage by combining multiple storages -pub async fn handle_multi_layer_erofs( - storages: &[&Storage], +pub fn is_multi_layer_storage(storage: &Storage) -> bool { + storage.options.iter().any(|o| o == OPT_MULTI_LAYER) + || storage.driver == DRIVER_MULTI_LAYER_EROFS +} + +pub async fn handle_multi_layer_erofs_group( + trigger: &Storage, + storages: &[Storage], cid: &Option, sandbox: &Arc>, logger: &Logger, -) -> Result { - let logger = logger.new(o!("subsystem" => "multi-layer-erofs")); +) -> Result { + let logger = logger.new(o!( + "subsystem" => "multi-layer-erofs", + "trigger-mount-point" => trigger.mount_point.clone(), + )); + + let multi_layer_storages: Vec<&Storage> = storages + .iter() + .filter(|s| is_multi_layer_storage(s)) + .collect(); + + if multi_layer_storages.is_empty() { + return Err(anyhow!("no multi-layer storages found")); + } - // Find ext4 (upper) and erofs (lower) storages let mut ext4_storage: Option<&Storage> = None; let mut erofs_storages: Vec<&Storage> = Vec::new(); - let mut mkdir_dirs: Vec<(String, Option)> = Vec::new(); // (path, mode) + let mut mkdir_dirs: Vec = Vec::new(); - for storage in storages { - if storage.options.iter().any(|o| o == OPT_OVERLAY_UPPER) - || (storage.fstype == EXT4_TYPE && storage.options.iter().any(|o| o == OPT_MULTI_LAYER)) - { + for storage in &multi_layer_storages { + if is_upper_storage(storage) { + if ext4_storage.is_some() { + return Err(anyhow!( + "multi-layer erofs currently supports exactly one ext4 upper layer" + )); + } ext4_storage = Some(*storage); // Extract mkdir directories from X-kata.mkdir.path options for opt in &storage.options { if let Some(mkdir_spec) = opt.strip_prefix(OPT_MKDIR_PATH) { - // Format: path:mode or path:mode:uid:gid - let parts: Vec<&str> = mkdir_spec.splitn(2, ':').collect(); - if !parts.is_empty() { - let path = parts[0].to_string(); - let mode = parts.get(1).map(|s| s.to_string()); - mkdir_dirs.push((path, mode)); - } + mkdir_dirs.push(parse_mkdir_directive(mkdir_spec)?); } } - } else if storage.options.iter().any(|o| o == OPT_OVERLAY_LOWER) - || (storage.fstype == EROFS_TYPE - && storage.options.iter().any(|o| o == OPT_MULTI_LAYER)) - { + } else if is_lower_storage(storage) { erofs_storages.push(*storage); } } @@ -146,40 +169,200 @@ pub async fn handle_multi_layer_erofs( slog::info!( logger, - "handling multi-layer erofs"; + "handling multi-layer erofs group"; "ext4-device" => &ext4.source, - "erofs-devices" => erofs_storages.iter().map(|s| s.source.as_str()).collect::>().join(", "), + "erofs-devices" => erofs_storages + .iter() + .map(|s| s.source.as_str()) + .collect::>() + .join(","), "mount-point" => &ext4.mount_point, - "ext4-fstype" => &ext4.fstype, - "ext4-options" => ext4.options.join(","), "mkdir-dirs-count" => mkdir_dirs.len(), ); // Create temporary mount points for upper and lower layers let cid_str = cid.as_deref().unwrap_or("sandbox"); - let temp_base = - std::path::PathBuf::from(format!("/run/kata-containers/{}/multi-layer", cid_str)); + let temp_base = PathBuf::from(format!("/run/kata-containers/{}/multi-layer", cid_str)); fs::create_dir_all(&temp_base).context("failed to create temp mount base")?; let upper_mount = temp_base.join("upper"); - let lower_mount = temp_base.join("lower"); + fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?; - // work_dir MUST be inside the ext4 mount for overlay to work - // It will be set to upper_mount/work after ext4 is mounted + wait_and_mount_upper(ext4, &upper_mount, sandbox, &logger).await?; + + for mkdir_dir in &mkdir_dirs { + // As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it. + // Just skip it for now and handle it in a second pass after mounting the lower layers. + if mkdir_dir.raw_path.contains("{{ mount 1 }}") { + continue; + } + let resolved_path = resolve_mkdir_path(&mkdir_dir.raw_path, &upper_mount, None); + slog::info!( + logger, + "creating mkdir directory in upper layer"; + "raw-path" => &mkdir_dir.raw_path, + "resolved-path" => &resolved_path, + ); + + fs::create_dir_all(&resolved_path) + .with_context(|| format!("failed to create mkdir directory: {}", resolved_path))?; + } + + let mut lower_mounts = Vec::new(); + for (index, erofs) in erofs_storages.iter().enumerate() { + let lower_mount = temp_base.join(format!("lower-{}", index)); + fs::create_dir_all(&lower_mount).with_context(|| { + format!("failed to create lower mount dir {}", lower_mount.display()) + })?; + + wait_and_mount_lower(erofs, &lower_mount, sandbox, &logger).await?; + lower_mounts.push(lower_mount); + } + + // If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount. + // This matches current supported placeholder behavior without inventing a broader template scheme. + for mkdir_dir in &mkdir_dirs { + if mkdir_dir.raw_path.contains("{{ mount 1 }}") { + let first_lower = lower_mounts + .first() + .ok_or_else(|| anyhow!("lower mount is missing while resolving mkdir path"))?; + let resolved_path = + resolve_mkdir_path(&mkdir_dir.raw_path, &upper_mount, Some(first_lower)); + slog::info!( + logger, + "creating deferred mkdir directory"; + "raw-path" => &mkdir_dir.raw_path, + "resolved-path" => &resolved_path, + ); + + fs::create_dir_all(&resolved_path).with_context(|| { + format!( + "failed to create deferred mkdir directory: {}", + resolved_path + ) + })?; + } + } + + let upperdir = upper_mount.join("upper"); + let workdir = upper_mount.join("work"); + + if !upperdir.exists() { + fs::create_dir_all(&upperdir).context("failed to create upperdir")?; + } + fs::create_dir_all(&workdir).context("failed to create workdir")?; + + let lowerdir = lower_mounts + .iter() + .map(|p| p.display().to_string()) + .collect::>() + .join(":"); slog::info!( logger, - "created temp mount directories"; - "temp-base" => temp_base.display(), - "upper-mount" => upper_mount.display(), - "lower-mount" => lower_mount.display(), + "creating overlay mount"; + "upperdir" => upperdir.display(), + "lowerdir" => &lowerdir, + "workdir" => workdir.display(), + "target" => &ext4.mount_point, ); - fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?; - fs::create_dir_all(&lower_mount).context("failed to create lower mount dir")?; + create_mount_destination( + Path::new(OVERLAY_TYPE), + Path::new(&ext4.mount_point), + "", + OVERLAY_TYPE, + ) + .context("failed to create overlay mount destination")?; - // Wait for block devices to be ready before mounting - // Extract device name from source (e.g., /dev/vda -> vda) + let overlay_options = format!( + "upperdir={},lowerdir={},workdir={}", + upperdir.display(), + lowerdir, + workdir.display() + ); + + baremount( + Path::new(OVERLAY_TYPE), + Path::new(&ext4.mount_point), + OVERLAY_TYPE, + nix::mount::MsFlags::empty(), + &overlay_options, + &logger, + ) + .context("failed to mount overlay")?; + + slog::info!( + logger, + "multi-layer erofs overlay mounted successfully"; + "mount-point" => &ext4.mount_point, + ); + + // Collect all unique mount points to maintain a clean resource state. + // + // In multi-layer EROFS configurations, upper and lower storages may share + // the same mount point. + // We must deduplicate these entries before processing to prevent: + // 1. Double-incrementing sandbox refcounts for the same resource. + // 2. Redundant bookkeeping operations that could lead to state inconsistency. + // + // Note: We maintain the original order of insertion, which is essential for + // ensuring a predictable and correct sequence during resource cleanup. + let processed_mount_points = multi_layer_storages.iter().fold(Vec::new(), |mut acc, s| { + if !acc.contains(&s.mount_point) { + acc.push(s.mount_point.clone()); + } + acc + }); + + Ok(MultiLayerErofsResult { + mount_point: ext4.mount_point.clone(), + processed_mount_points, + }) +} + +fn is_upper_storage(storage: &Storage) -> bool { + storage.options.iter().any(|o| o == OPT_OVERLAY_UPPER) + || (storage.fstype == EXT4_TYPE && storage.options.iter().any(|o| o == OPT_MULTI_LAYER)) +} + +fn is_lower_storage(storage: &Storage) -> bool { + storage.options.iter().any(|o| o == OPT_OVERLAY_LOWER) + || (storage.fstype == EROFS_TYPE && storage.options.iter().any(|o| o == OPT_MULTI_LAYER)) +} + +fn parse_mkdir_directive(spec: &str) -> Result { + let parts: Vec<&str> = spec.splitn(2, ':').collect(); + if parts.is_empty() || parts[0].is_empty() { + return Err(anyhow!("invalid X-kata.mkdir.path directive: '{}'", spec)); + } + + Ok(MkdirDirective { + raw_path: parts[0].to_string(), + mode: parts.get(1).map(|s| s.to_string()), + }) +} + +fn resolve_mkdir_path( + raw_path: &str, + upper_mount: &Path, + first_lower_mount: Option<&Path>, +) -> String { + let mut resolved = raw_path.replace("{{ mount 0 }}", upper_mount.to_str().unwrap_or("")); + + if let Some(lower) = first_lower_mount { + resolved = resolved.replace("{{ mount 1 }}", lower.to_str().unwrap_or("")); + } + + resolved +} + +async fn wait_and_mount_upper( + ext4: &Storage, + upper_mount: &Path, + sandbox: &Arc>, + logger: &Logger, +) -> Result<()> { let ext4_devname = extract_device_name(&ext4.source)?; slog::info!( logger, @@ -193,7 +376,6 @@ pub async fn handle_multi_layer_erofs( .await .context("timeout waiting for ext4 block device")?; - // Step 1: Mount upper layer (ext4) slog::info!( logger, "mounting ext4 upper layer"; @@ -203,7 +385,7 @@ pub async fn handle_multi_layer_erofs( "options" => ext4.options.join(","), ); - create_mount_destination(Path::new(&ext4.source), &upper_mount, "", &ext4.fstype) + create_mount_destination(Path::new(&ext4.source), upper_mount, "", &ext4.fstype) .context("failed to create upper mount destination")?; // Filter out X-kata.* custom options before mount @@ -225,57 +407,23 @@ pub async fn handle_multi_layer_erofs( let (flags, options) = kata_sys_util::mount::parse_mount_options(&mount_options)?; baremount( Path::new(&ext4.source), - &upper_mount, + upper_mount, &ext4.fstype, flags, options.as_str(), - &logger, + logger, ) .context("failed to mount ext4 upper layer")?; - slog::info!( - logger, - "ext4 upper layer mounted successfully"; - "mount-point" => upper_mount.display(), - ); + Ok(()) +} - // Step 2: Create mkdir directories specified in X-kata.mkdir.path options - for (raw_path, _mode) in &mkdir_dirs { - // Resolve template variables like {{ mount 0 }}/upper - // Currently we only support {{ mount 0 }} which references upper_mount - let resolved_path = if raw_path.contains("{{ mount 0 }}") { - raw_path.replace("{{ mount 0 }}", upper_mount.to_str().unwrap_or("")) - } else if raw_path.contains("{{ mount 1 }}") { - // This will be resolved after erofs mount - slog::warn!( - logger, - "mkdir path references unmounted layer, deferring creation"; - "path" => raw_path - ); - continue; - } else { - raw_path.clone() - }; - - slog::info!( - logger, - "creating mkdir directory"; - "raw-path" => raw_path, - "resolved-path" => &resolved_path, - ); - - // Create directory with default permissions (0755) - // Mode parsing can be added in future if needed - fs::create_dir_all(&resolved_path).context(format!( - "failed to create mkdir directory: {}", - resolved_path - ))?; - } - - // Step 3: Mount lower layers (erofs) - let erofs = erofs_storages[0]; - - // Wait for erofs block device to be ready before mounting +async fn wait_and_mount_lower( + erofs: &Storage, + lower_mount: &Path, + sandbox: &Arc>, + logger: &Logger, +) -> Result<()> { let erofs_devname = extract_device_name(&erofs.source)?; slog::info!( logger, @@ -291,79 +439,25 @@ pub async fn handle_multi_layer_erofs( slog::info!( logger, - "block device is ready, mounting erofs lower layer"; + "mounting erofs lower layer"; "device" => &erofs.source, "mount-point" => lower_mount.display(), ); - create_mount_destination(Path::new(&erofs.source), &lower_mount, "", EROFS_TYPE) + create_mount_destination(Path::new(&erofs.source), lower_mount, "", EROFS_TYPE) .context("failed to create lower mount destination")?; baremount( Path::new(&erofs.source), - &lower_mount, + lower_mount, EROFS_TYPE, nix::mount::MsFlags::MS_RDONLY, "ro", - &logger, + logger, ) .context("failed to mount erofs lower layer")?; - // Step 4: Create upperdir and workdir within the ext4 mount - // Both MUST be on the same filesystem (the ext4 mount) - // mkdir step already created {{ mount 0 }}/upper, so upperdir is upper_mount/upper - let upperdir = upper_mount.join("upper"); - let workdir = upper_mount.join("work"); - - // Ensure upperdir exists (mkdir step should have created it, but check again) - if !upperdir.exists() { - fs::create_dir_all(&upperdir).context("failed to create upperdir")?; - } - // workdir MUST be created inside ext4 mount, not in temp_base - fs::create_dir_all(&workdir).context("failed to create workdir")?; - - // Step 5: Create overlay mount at final mount_point - slog::info!( - logger, - "creating overlay mount"; - "upperdir" => upperdir.display(), - "lowerdir" => lower_mount.display(), - "workdir" => workdir.display(), - "target" => &ext4.mount_point, - ); - - create_mount_destination( - Path::new(OVERLAY_TYPE), - Path::new(&ext4.mount_point), - "", - OVERLAY_TYPE, - ) - .context("failed to create overlay mount destination")?; - - let overlay_options = format!( - "upperdir={},lowerdir={},workdir={}", - upperdir.display(), - lower_mount.display(), - workdir.display() - ); - - baremount( - Path::new(OVERLAY_TYPE), - Path::new(&ext4.mount_point), - OVERLAY_TYPE, - nix::mount::MsFlags::empty(), - &overlay_options, - &logger, - ) - .context("failed to mount overlay")?; - - slog::info!( - logger, - "multi-layer erofs overlay mounted successfully"; - "mount-point" => &ext4.mount_point, - ); - - Ok(ext4.mount_point.clone()) + Ok(()) } /// Extract device name from a device path