diff --git a/Cargo.lock b/Cargo.lock index ad16dd5faf..6699601bd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -139,7 +139,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -150,7 +150,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1963,7 +1963,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2458,6 +2458,18 @@ dependencies = [ "cfg-if 0.1.10", ] +[[package]] +name = "gpt" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3696fafb1ecdcc2ae3ce337de73e9202806068594b77d22fdf2f3573c5ec2219" +dependencies = [ + "bitflags 2.11.1", + "crc", + "simple-bytes", + "uuid 1.23.1", +] + [[package]] name = "h2" version = "0.3.27" @@ -3163,7 +3175,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi 0.5.2", "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3632,8 +3644,10 @@ dependencies = [ "base64 0.13.1", "bitmask-enum", "byte-unit", + "crc", "flate2", "glob", + "gpt", "lazy_static", "nix 0.26.4", "num_cpus", @@ -3641,6 +3655,7 @@ dependencies = [ "regex", "rstest 0.18.2", "safe-path 0.1.0", + "scopeguard", "serde", "serde-enum-str", "serde_json", @@ -4390,7 +4405,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5295,7 
+5310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52" dependencies = [ "libc", - "nix 0.23.2", + "nix 0.30.1", ] [[package]] @@ -6516,7 +6531,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -6617,7 +6632,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -7182,6 +7197,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" +[[package]] +name = "simple-bytes" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11532d9d241904f095185f35dcdaf930b1427a94d5b01d7002d74ba19b44cc4" + [[package]] name = "siphasher" version = "1.0.3" @@ -7305,7 +7326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -7546,7 +7567,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix 1.1.4", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -7555,7 +7576,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -8259,7 +8280,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset 0.9.1", "tempfile", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -8842,7 +8863,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a1ad09f9fb..49e5c96ebb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -197,6 +197,7 @@ tracing-subscriber = "0.3.20" ttrpc = "0.8.4" url = "2.5.4" which = "4.3.0" +gpt = "4.1.0" # Per-package release profile overrides for kata-deploy. The kata-deploy # binary runs once at pod start and then idles waiting for SIGTERM, so we diff --git a/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md b/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md index bc2405a5d0..2460f5a780 100644 --- a/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md +++ b/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md @@ -90,7 +90,7 @@ version = 3 [plugins.'io.containerd.snapshotter.v1.erofs'] default_size = '' # SIZE=6G or 10G or other size - max_unmerged_layers = 1 + max_unmerged_layers = 0 ``` #### Verify the EROFS plugins are loaded diff --git a/src/agent/src/device/block_device_handler.rs b/src/agent/src/device/block_device_handler.rs index 3d78c84201..5e22ac28d4 100644 --- a/src/agent/src/device/block_device_handler.rs +++ b/src/agent/src/device/block_device_handler.rs @@ -173,7 +173,8 @@ pub struct VirtioBlkPciMatcher { impl VirtioBlkPciMatcher { pub fn new(relpath: &str, root_complex: &str) -> VirtioBlkPciMatcher { let root_bus = create_pci_root_bus_path(root_complex); - let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/"); + // [^/]+$ ensures it only match the whole-disk uevent (e.g. 
block/vdx) + let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/[^/]+$"); VirtioBlkPciMatcher { rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"), @@ -259,6 +260,17 @@ mod tests { assert!(matcher_b.is_match(&uev_b)); assert!(!matcher_b.is_match(&uev_a)); assert!(!matcher_a.is_match(&uev_b)); + + // Partition uevents must NOT match (only the whole-disk uevent should match) + let mut uev_part = uev_a.clone(); + uev_part.devname = "vda1".to_string(); + uev_part.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda1"); + assert!(!matcher_a.is_match(&uev_part)); + + let mut uev_part91 = uev_a.clone(); + uev_part91.devname = "vda91".to_string(); + uev_part91.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda91"); + assert!(!matcher_a.is_match(&uev_part91)); } #[cfg(target_arch = "s390x")] diff --git a/src/agent/src/device/scsi_device_handler.rs b/src/agent/src/device/scsi_device_handler.rs index 972fc7ee78..7a9e68acae 100644 --- a/src/agent/src/device/scsi_device_handler.rs +++ b/src/agent/src/device/scsi_device_handler.rs @@ -17,6 +17,9 @@ use std::sync::Arc; use tokio::sync::Mutex; use tracing::instrument; +/// The path segment in the uevent devpath that separates the SCSI path and the block device name. +const BLOCK_SEGMENT: &str = "/block/"; + #[derive(Debug)] pub struct ScsiDeviceHandler {} @@ -53,20 +56,41 @@ pub async fn get_scsi_device_name( // SCSI host. #[derive(Debug)] pub struct ScsiBlockMatcher { - search: String, + /// Expected SCSI path suffix before `/block/`, e.g. 
`/0:0:2:0` + scsi_path_suffix: String, } impl ScsiBlockMatcher { pub fn new(scsi_addr: &str) -> ScsiBlockMatcher { - let search = format!(r"/0:0:{scsi_addr}/block/"); + ScsiBlockMatcher { + scsi_path_suffix: format!("/0:0:{scsi_addr}"), + } + } - ScsiBlockMatcher { search } + fn split_block_devpath<'a>(&self, devpath: &'a str) -> Option<(&'a str, &'a str)> { + let idx = devpath.find(BLOCK_SEGMENT)?; + let prefix = &devpath[..idx]; + let suffix = &devpath[idx + BLOCK_SEGMENT.len()..]; + Some((prefix, suffix)) } } impl UeventMatcher for ScsiBlockMatcher { fn is_match(&self, uev: &Uevent) -> bool { - uev.subsystem == BLOCK && uev.devpath.contains(&self.search) && !uev.devname.is_empty() + if uev.action != U_EVENT_ACTION_ADD { + return false; + } + + if uev.subsystem != BLOCK || uev.devname.is_empty() { + return false; + } + + let (prefix, suffix) = match self.split_block_devpath(&uev.devpath) { + Some(parts) => parts, + None => return false, + }; + + prefix.ends_with(&self.scsi_path_suffix) && !suffix.contains('/') && suffix == uev.devname } } @@ -106,6 +130,23 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { #[cfg(test)] mod tests { use super::*; + use crate::linux_abi::U_EVENT_ACTION_ADD; + + fn make_scsi_block_uevent(addr: &str, devname: &str, devpath_suffix: &str) -> Uevent { + let root_bus = create_pci_root_bus_path("00"); + + let mut uev = Uevent::default(); + uev.action = U_EVENT_ACTION_ADD.to_string(); + uev.subsystem = BLOCK.to_string(); + uev.devname = devname.to_string(); + uev.devpath = format!( + "{root_bus}/0000:00:00.0/virtio0/host0/target0:0:{target}/0:0:{addr}/block/{devpath_suffix}", + target = addr.split(':').next().unwrap_or("0"), + addr = addr, + devpath_suffix = devpath_suffix, + ); + uev + } #[tokio::test] #[allow(clippy::redundant_clone)] @@ -124,6 +165,7 @@ mod tests { let mut uev_b = uev_a.clone(); let addr_b = "2:0"; + uev_b.devname = "sdb".to_string(); uev_b.devpath = 
format!("{root_bus}/0000:00:00.0/virtio0/host0/target0:0:2/0:0:{addr_b}/block/sdb"); let matcher_b = ScsiBlockMatcher::new(addr_b); @@ -133,4 +175,21 @@ mod tests { assert!(!matcher_b.is_match(&uev_a)); assert!(!matcher_a.is_match(&uev_b)); } + + #[tokio::test] + async fn test_scsi_block_matcher_rejects_partitions() { + let uev_whole = make_scsi_block_uevent("0:0", "sda", "sda"); + let uev_part = make_scsi_block_uevent("0:0", "sda1", "sda/sda1"); + + let matcher = ScsiBlockMatcher::new("0:0"); + + assert!( + matcher.is_match(&uev_whole), + "whole disk uevent should match" + ); + assert!( + !matcher.is_match(&uev_part), + "partition uevent should not match" + ); + } } diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs index 6b24bf1a9e..48eb4bfe37 100644 --- a/src/agent/src/storage/mod.rs +++ b/src/agent/src/storage/mod.rs @@ -22,6 +22,7 @@ use tracing::instrument; use self::bind_watcher_handler::BindWatcherHandler; use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler}; +pub use self::ephemeral_handler::update_ephemeral_mounts; use self::ephemeral_handler::EphemeralHandler; use self::fs_handler::{OverlayfsHandler, VirtioFsHandler}; use self::image_pull_handler::ImagePullHandler; @@ -30,15 +31,13 @@ use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_sto use crate::mount::{baremount, is_mounted, remove_mounts}; use crate::sandbox::Sandbox; -pub use self::ephemeral_handler::update_ephemeral_mounts; - mod bind_watcher_handler; mod block_handler; mod ephemeral_handler; mod fs_handler; mod image_pull_handler; mod local_handler; -mod multi_layer_erofs; +pub mod multi_layer_erofs; const RW_MASK: u32 = 0o660; const RO_MASK: u32 = 0o440; @@ -168,6 +167,8 @@ struct MultiLayerProcessResult { /// Temporary mount points (upper/lower) backing the overlay, needed for /// container-scoped cleanup via `container_mounts`. 
temp_mount_points: Vec, + /// dm-verity device paths that need to be destroyed during cleanup + verity_devices: Vec, } /// Handle multi-layer storage by creating the overlay device. @@ -209,6 +210,7 @@ async fn handle_multi_layer_storage( device, processed_mount_points: result.processed_mount_points, temp_mount_points: result.temp_mount_points, + verity_devices: result.verity_devices, })) } @@ -303,6 +305,7 @@ pub async fn add_storages( } } mount_list.extend(result.temp_mount_points); + mount_list.extend(result.verity_devices); continue; } diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs index 2195bdfcd7..088fd4edf1 100644 --- a/src/agent/src/storage/multi_layer_erofs.rs +++ b/src/agent/src/storage/multi_layer_erofs.rs @@ -10,10 +10,14 @@ //! - Storage with X-kata.overlay-lower: erofs layers (lowerdir) //! - Creates overlay to combine them //! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount +//! - Supports GPT-partitioned disks where each layer is a separate partition +use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::time::Duration; +use tokio::time::sleep; use crate::device::block_device_handler::get_virtio_blk_pci_device_name; use crate::device::scsi_device_handler::get_scsi_device_name; @@ -44,7 +48,9 @@ pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer"; const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper"; const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower"; const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true"; +const OPT_GPT_PARTITIONED: &str = "X-kata.gpt-partitioned=true"; const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path="; +const OPT_PARTITION_NUMBER: &str = "X-kata.partition-number="; #[derive(Debug)] pub struct MultiLayerErofsHandler {} @@ -57,13 +63,19 @@ pub struct MultiLayerErofsResult { /// overlay. 
These must be tracked so they are unmounted *after* the /// overlay target during container teardown. pub temp_mount_points: Vec, + /// dm-verity device paths that need to be destroyed during cleanup + pub verity_devices: Vec, } -#[allow(dead_code)] #[derive(Debug)] struct MkdirDirective { raw_path: String, - mode: Option, +} + +/// Helper struct to track layer mount information including dm-verity devices +#[derive(Debug)] +struct LayerMountInfo { + verity_device: Option, } #[async_trait::async_trait] @@ -122,6 +134,7 @@ pub async fn handle_multi_layer_erofs_group( let mut ext4_storage: Option<&Storage> = None; let mut erofs_storages: Vec<&Storage> = Vec::new(); let mut mkdir_dirs: Vec = Vec::new(); + let mut has_gpt_partition: bool = false; for storage in &multi_layer_storages { if is_upper_storage(storage) { @@ -139,19 +152,33 @@ pub async fn handle_multi_layer_erofs_group( } } } else if is_lower_storage(storage) { + // Each GPT partition is provided as a separate storage entry by the host + if !has_gpt_partition && is_gpt_partitioned(storage) { + has_gpt_partition = true; + } erofs_storages.push(*storage); } } - let ext4 = ext4_storage - .ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?; - if erofs_storages.is_empty() { return Err(anyhow!( "multi-layer erofs missing erofs lower layer storage" )); } + // Only sort erofs layers by partition number in GPT mode. + // In GPT mode, each storage carries X-kata.partition-number=N and layers + // must be ordered by partition number so that the overlay lowerdir + // precedence is correct (lower partition number = higher overlay priority). + // In non-GPT mode all partition numbers are None, so sorting would be a + // no-op that needlessly reorders elements. 
+ if has_gpt_partition { + erofs_storages.sort_by_key(|storage| get_partition_number(storage).unwrap_or(u32::MAX)); + } + + let ext4 = ext4_storage + .ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?; + info!( logger, "Handling multi-layer erofs group"; @@ -178,7 +205,7 @@ pub async fn handle_multi_layer_erofs_group( let upper_mount = temp_base.join("upper"); fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?; - wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger).await?; + wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger, None).await?; for mkdir_dir in &mkdir_dirs { // As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it. @@ -201,6 +228,9 @@ pub async fn handle_multi_layer_erofs_group( } let mut lower_mounts = Vec::new(); + let mut verity_devices = Vec::new(); + let mut base_device_cache: HashMap = HashMap::new(); + for (index, erofs) in erofs_storages.iter().enumerate() { let lower_mount = temp_base.join(format!("lower-{}", index)); fs::create_dir_all(&lower_mount).context(format!( @@ -208,8 +238,25 @@ pub async fn handle_multi_layer_erofs_group( lower_mount.display() ))?; - wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger).await?; + let base_dev_path = if is_gpt_partitioned(erofs) { + Some( + base_device_cache + .entry(erofs.source.clone()) + .or_insert(resolve_base_device_path(erofs, sandbox).await?) + .clone(), + ) + } else { + None + }; + + let mount_info = + wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, base_dev_path).await?; lower_mounts.push(lower_mount); + + // Collect dm-verity device for cleanup + if let Some(verity_dev) = mount_info.verity_device { + verity_devices.push(verity_dev); + } } // If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount. 
@@ -318,6 +365,7 @@ pub async fn handle_multi_layer_erofs_group( mount_point: ext4.mount_point.clone(), processed_mount_points, temp_mount_points, + verity_devices, }) } @@ -407,7 +455,6 @@ fn parse_mkdir_directive(spec: &str) -> Result { Ok(MkdirDirective { raw_path: raw_path.to_string(), - mode: parts.get(1).map(|s| s.to_string()), }) } @@ -467,7 +514,8 @@ async fn wait_and_mount_layer( layer_mount: &Path, sandbox: &Arc>, logger: &Logger, -) -> Result<()> { + base_dev_path: Option, +) -> Result { info!( logger, "Waiting for layer device"; @@ -475,22 +523,41 @@ async fn wait_and_mount_layer( "driver" => &layer.driver, "mount-point" => layer_mount.display(), ); - let dev_path = match layer.driver.as_str() { - DRIVER_SCSI_TYPE => { - // For SCSI devices, we need to wait for the device to appear and get its path before mounting. - get_scsi_device_name(sandbox, &layer.source).await? - } - DRIVER_BLK_PCI_TYPE => { - let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?; - get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await? - } - _ => { - // For non-SCSI devices, we can assume the source is directly mountable. 
+ + let is_gpt = is_gpt_partitioned(layer); + let partition_num = get_partition_number(layer); + + // Get the base device path + let dev_path = match base_dev_path { + Some(path) => path, + None => resolve_base_device_path(layer, sandbox).await?, + }; + + // For GPT-partitioned disks, use the partition device path + let dev_path = if is_gpt { + if let Some(part_num) = partition_num { + let path = get_partition_device_path(&dev_path, part_num); + info!( + logger, + "GPT-partitioned mode: using partition device"; + "base-device" => &dev_path, + "partition-number" => part_num, + "partition-device" => &path, + ); + + // Wait for partition device node to appear + wait_for_partition_device(&path, logger).await?; + + path + } else { return Err(anyhow!( - "unsupported driver type '{}' for multi-layer erofs", - layer.driver + "GPT-partitioned storage missing partition number: {:?}", + layer )); } + } else { + // Non-GPT mode: use base device directly + dev_path.clone() }; info!( @@ -500,6 +567,7 @@ async fn wait_and_mount_layer( "fstype" => &layer.fstype, "devname" => &dev_path, "mount-point" => layer_mount.display(), + "gpt-mode" => is_gpt, ); create_mount_destination(Path::new(&dev_path), layer_mount, "", &layer.fstype) @@ -548,7 +616,106 @@ async fn wait_and_mount_layer( // After successfully mounting the layer, we track the mount point for cleanup. track_temporary_mount_for_cleanup(sandbox, layer_mount, logger).await?; - Ok(()) + Ok(LayerMountInfo { + verity_device: None, + }) +} + +async fn resolve_base_device_path( + layer: &Storage, + sandbox: &Arc>, +) -> Result { + let base_dev_path = match layer.driver.as_str() { + DRIVER_SCSI_TYPE => { + // For SCSI devices, we need to wait for the device to appear and get its path before mounting. + get_scsi_device_name(sandbox, &layer.source).await? + } + DRIVER_BLK_PCI_TYPE => { + let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?; + get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await? 
/// Builds the device path of partition `partition_number` on the disk `base_path`.
///
/// Follows the kernel naming rule: when the base device name ends with a
/// decimal digit, a `p` separator is inserted before the partition number to
/// avoid ambiguity; otherwise the number is appended directly.
///
/// - `/dev/vda`     -> `/dev/vda1`      (no trailing digit, bare number)
/// - `/dev/sda`     -> `/dev/sda1`
/// - `/dev/nvme0n1` -> `/dev/nvme0n1p1` (trailing digit, needs `p`)
/// - `/dev/mmcblk0` -> `/dev/mmcblk0p1`
/// - `/dev/loop0`   -> `/dev/loop0p1`
///
/// Only ASCII digits (`0`-`9`) count as a trailing digit. `char::is_numeric`
/// would also accept Unicode numerics such as `¾` or `②`, which is not what the
/// naming rule means.
fn get_partition_device_path(base_path: &str, partition_number: u32) -> String {
    let separator = if base_path.ends_with(|c: char| c.is_ascii_digit()) {
        "p"
    } else {
        ""
    };
    format!("{base_path}{separator}{partition_number}")
}
+#[allow(dead_code)] +async fn wait_for_partition_device(device_path: &str, logger: &Logger) -> Result<()> { + let device_path_buf = PathBuf::from(device_path); + if device_path_buf.exists() { + return Ok(()); + } + + const MAX_WAIT_MS: u64 = 1000; + const POLL_INTERVAL_MS: u64 = 50; + + for attempt in 0..(MAX_WAIT_MS / POLL_INTERVAL_MS) { + sleep(Duration::from_millis(POLL_INTERVAL_MS)).await; + if device_path_buf.exists() { + info!( + logger, + "Partition device node appeared after polling: {} (attempt {})", + device_path, + attempt + 1 + ); + return Ok(()); + } + } + + Err(anyhow!( + "partition device {} did not appear within {} ms", + device_path, + MAX_WAIT_MS + )) } #[cfg(test)] @@ -603,27 +770,6 @@ mod tests { // --- parse_mkdir_directive --- - #[rstest] - #[case("some/path", true, "some/path", None)] - #[case("some/path:0755", true, "some/path", Some("0755"))] - #[case("path:mode:extra", true, "path", Some("mode:extra"))] - #[case("", false, "", None)] - fn test_parse_mkdir_directive( - #[case] spec: &str, - #[case] should_pass: bool, - #[case] expected_path: &str, - #[case] expected_mode: Option<&str>, - ) { - let result = parse_mkdir_directive(spec); - if should_pass { - let d = result.expect("expected Ok"); - assert_eq!(d.raw_path, expected_path); - assert_eq!(d.mode.as_deref(), expected_mode); - } else { - assert!(result.is_err(), "expected Err for spec {:?}", spec); - } - } - #[test] fn test_parse_mkdir_directive_rejects_null_bytes() { assert!(parse_mkdir_directive("foo\0bar").is_err()); @@ -728,4 +874,29 @@ mod tests { s.options ); } + + // --- get_partition_device_path --- + + #[rstest] + #[case("/dev/vda", 1, "/dev/vda1")] + #[case("/dev/sda", 3, "/dev/sda3")] + #[case("/dev/hda", 2, "/dev/hda2")] + #[case("/dev/nvme0n1", 1, "/dev/nvme0n1p1")] + #[case("/dev/nvme0n1", 2, "/dev/nvme0n1p2")] + #[case("/dev/mmcblk0", 1, "/dev/mmcblk0p1")] + #[case("/dev/loop0", 1, "/dev/loop0p1")] + #[case("/dev/nbd0", 3, "/dev/nbd0p3")] + fn 
test_get_partition_device_path( + #[case] base: &str, + #[case] part: u32, + #[case] expected: &str, + ) { + assert_eq!( + get_partition_device_path(base, part), + expected, + "get_partition_device_path({}, {})", + base, + part + ); + } } diff --git a/src/libs/kata-types/Cargo.toml b/src/libs/kata-types/Cargo.toml index 3f2c682558..e6202e6486 100644 --- a/src/libs/kata-types/Cargo.toml +++ b/src/libs/kata-types/Cargo.toml @@ -31,7 +31,9 @@ sha2 = "0.10.8" flate2 = "1.1" nix = "0.26.4" oci-spec = { version = "0.8.1", features = ["runtime"] } - +gpt = "4.1.0" +scopeguard = "1.0.0" +crc = "3.4.0" safe-path = { path = "../safe-path", optional = true } [target.'cfg(target_os = "macos")'.dependencies] diff --git a/src/libs/kata-types/src/gpt_disk.rs b/src/libs/kata-types/src/gpt_disk.rs new file mode 100644 index 0000000000..50f2c5e578 --- /dev/null +++ b/src/libs/kata-types/src/gpt_disk.rs @@ -0,0 +1,463 @@ +// Copyright (c) 2026 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// +// GPT (GUID Partition Table) disk metadata generation for EROFS multi-layer rootfs. +// +// This module generates a GPT metadata file (gpt_meta_head.img) that is used +// in conjunction with VMDK descriptors to present multiple EROFS layers as a +// single virtual disk with multiple GPT partitions to the guest VM. +// Backup GPT structures are omitted — the virtual disk is ephemeral and +// read-only, so backup recovery serves no purpose. 
+// +// Key features: +// - Only includes read-only EROFS layers in GPT partitions (rw layer handled separately) +// - Preserves the original order of layers from rootfs_mounts +// - Generates minimal GPT metadata without copying layer data +// - Supports 1MiB alignment for partitions +// - Creates VMDK-compatible descriptor with head/layer/pad extents + +use anyhow::{anyhow, Context, Result}; +use crc::Crc; +use gpt::{disk::LogicalBlockSize, mbr::ProtectiveMBR, partition_types, GptConfig}; +use scopeguard; +use std::convert::TryFrom; +use std::fs; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; + +use crate::sl; + +/// GPT disk parameters (using gpt crate constants where available) +/// DEFAULT_SECTOR_SIZE is LogicalBlockSize enum, not u64 +const SECTOR_SIZE: u64 = 512; +/// 1 MiB alignment start +const FIRST_PARTITION_LBA: u64 = 2048; +/// 1 MiB alignment +const ALIGNMENT_LBA: u64 = 2048; +/// bytes per GPT partition entry (UEFI standard) +const GPT_ENTRY_SIZE: u64 = 128; +/// standard GPT partition entry count +const MAX_GPT_PARTITIONS: usize = 128; +/// 32 sectors for partition entries (128 entries * 128 bytes each / 512 bytes per sector) +const ENTRIES_SECTORS: u64 = (MAX_GPT_PARTITIONS as u64 * GPT_ENTRY_SIZE) / SECTOR_SIZE; +/// GPT header size in bytes (UEFI specification) +const GPT_HEADER_SIZE: usize = 92; +/// Offset (in bytes) of the GPT primary header within the head file (LBA 1) +const GPT_HEADER_FILE_OFFSET: u64 = SECTOR_SIZE; +/// CRC-32/ISO-HDLC — the same algorithm the `gpt` crate uses internally. 
/// Extracts the snapshot ID from a layer source path.
///
/// The ID is the name of the directory that directly contains the file, e.g.
/// `".../snapshots/35/layer.erofs"` yields `"35"`.
///
/// Returns `"unknown"` when the path has no parent, or when the parent has no
/// final component (for example a bare file name such as `"layer.erofs"`, or a
/// file directly under `/`).
pub fn extract_snapshot_id(source: &str) -> String {
    Path::new(source)
        .parent()
        .and_then(Path::file_name)
        .map_or_else(
            || "unknown".to_string(),
            // `into_owned` reuses the buffer when the lossy conversion already
            // allocated, instead of copying it a second time.
            |id| id.to_string_lossy().into_owned(),
        )
}
/// Rounds `lba` up to the nearest multiple of `alignment`.
///
/// A value that is already aligned is returned unchanged.
///
/// # Panics
///
/// Panics when `alignment` is zero. Overflow behaviour is that of
/// `u64::next_multiple_of`: a panic in debug builds, wrapping in release builds.
fn align_up(lba: u64, alignment: u64) -> u64 {
    lba.next_multiple_of(alignment)
}
+ if layers.len() > MAX_GPT_PARTITIONS { + return Err(anyhow!( + "The layers for GPT: {} exceeds maximum {} partitions \ + (ENTRIES_SECTORS is sized for {} entries)", + layers.len(), + MAX_GPT_PARTITIONS, + MAX_GPT_PARTITIONS, + )); + } + + // Validate that all layers have non-zero size + for (idx, layer) in layers.iter().enumerate() { + if layer.size_sectors == 0 { + return Err(anyhow!( + "EROFS layer {} ({}) has size_sectors = 0, cannot generate GPT partition", + idx, + layer.path + )); + } + } + + let lb_size = SECTOR_SIZE; + let first_usable_lba = FIRST_PARTITION_LBA; + + // Calculate partition positions + let mut partitions = Vec::with_capacity(layers.len()); + let mut current_lba = first_usable_lba; + + for (idx, layer) in layers.iter().enumerate() { + // Align start LBA to 1MiB boundary + let start_lba = align_up(current_lba, ALIGNMENT_LBA); + let end_lba = start_lba + layer.size_sectors - 1; + + // Generate partition name: erofs-{index}-s{snapshot_id} + let name = format!("erofs-{}-s{}", idx, layer.snapshot_id); + // Truncate to fit GPT name limit without slicing through a UTF-8 codepoint. 
+ let name = match name.char_indices().nth(36) { + Some((truncate_at, _)) => name[..truncate_at].to_string(), + None => name, + }; + + partitions.push(PartitionLayout { + layer: layer.clone(), + partition_number: (idx + 1) as u32, + start_lba, + end_lba, + name, + }); + + // Next partition starts after this one + current_lba = end_lba + 1; + } + + // Calculate backup GPT position + // Backup entries are placed after the last partition, aligned + let backup_entries_lba = align_up(current_lba, ALIGNMENT_LBA); + let backup_header_lba = backup_entries_lba + ENTRIES_SECTORS; + let total_sectors = backup_header_lba + 1; + + let last_usable_lba = backup_entries_lba - 1; + + // Validate that all partitions fit in usable area + for (idx, part) in partitions.iter().enumerate() { + if part.end_lba > last_usable_lba { + return Err(anyhow!( + "partition {} (end_lba={}) exceeds last usable LBA ({})", + idx, + part.end_lba, + last_usable_lba + )); + } + } + + Ok(GptDiskLayout { + partitions, + total_sectors, + lb_size, + }) +} + +/// Generate GPT head metadata and return layout information +/// +/// This is the main entry point for GPT metadata generation. +/// It creates a temporary full GPT image (needed by the gpt crate to +/// produce valid primary structures), extracts the head region, patches +/// the primary header to remove references to backup GPT, and discards +/// the rest. 
+///
+/// Output:
+/// - gpt_meta_head.img: Primary GPT structures (MBR + GPT header + partition entries + padding)
+///
+/// # Arguments
+/// * `sid`, `cid` - sandbox and container identifiers; not read by this function.
+/// * `erofs_layers` - EROFS layers to describe, one GPT partition per layer.
+/// * `container_dir` - directory in which the temporary full image and the
+///   head image are created.
+///
+/// # Errors
+/// Returns an error when `erofs_layers` is empty, when the layout cannot be
+/// computed, or when generating, extracting or patching the image files
+/// fails. The temporary full image is removed in every case; the head image
+/// is removed when it could not be completed.
+// `sid` / `cid` are part of the signature but unused in the body.
+#[allow(unused_variables)]
+pub fn generate_gpt_metadata(
+    sid: &str,
+    cid: &str,
+    erofs_layers: Vec<ErofsLayer>,
+    container_dir: &Path,
+) -> Result<(GptDiskLayout, GptMetadataFiles)> {
+    if erofs_layers.is_empty() {
+        return Err(anyhow!(
+            "no EROFS layers provided for GPT metadata generation"
+        ));
+    }
+
+    let mut layout = calculate_gpt_layout(&erofs_layers)?;
+    // Resolve the end of the last partition up front so that an empty layout
+    // is reported as an error instead of panicking later on.
+    let last_partition_end = layout
+        .partitions
+        .last()
+        .map(|part| part.end_lba)
+        .ok_or_else(|| anyhow!("no partitions in layout, cannot generate GPT metadata"))?;
+
+    let full_path = container_dir.join(GPT_META_FULL_IMG);
+    // Install the cleanup guard BEFORE generating the image so a partially
+    // written temporary file is removed even when generation fails.
+    let _cleanup = scopeguard::guard((), |_| {
+        let _ = fs::remove_file(&full_path);
+    });
+    generate_full_gpt_image(&layout, &full_path).context("failed to generate full GPT image")?;
+
+    // Extract head: LBA 0 to FIRST_PARTITION_LBA (2048 sectors = 1 MiB)
+    let lb_size = layout.lb_size;
+    let head_sectors = FIRST_PARTITION_LBA;
+    let head_size = head_sectors * lb_size;
+    let head_path = container_dir.join(GPT_META_HEAD_IMG);
+
+    // Patch the primary GPT header so AlternateLBA / LastUsableLBA are
+    // consistent with a virtual disk that carries no backup GPT.
+    let head_result = extract_file_range(&full_path, &head_path, 0, head_size)
+        .context("failed to extract GPT head metadata")
+        .and_then(|()| {
+            patch_primary_gpt_header(&head_path, last_partition_end)
+                .context("failed to patch primary GPT header")
+        });
+    if let Err(err) = head_result {
+        // Best effort: do not leave a truncated or unpatched head image behind.
+        let _ = fs::remove_file(&head_path);
+        return Err(err);
+    }
+
+    // Adjust the layout to reflect the virtual disk size (no backup).
+    // This must happen after the full image was generated, which still needs
+    // the size that includes the backup GPT.
+    layout.total_sectors = last_partition_end + 1;
+
+    info!(
+        sl!(),
+        "Generated GPT head file: {} ({} sectors, {} bytes, virtual disk {} sectors)",
+        head_path.display(),
+        head_sectors,
+        head_size,
+        layout.total_sectors
+    );
+
+    let metadata_files = GptMetadataFiles {
+        head_path,
+        head_sectors,
+        pad_paths: Vec::new(),
+    };
+
+    Ok((layout, metadata_files))
+}
+
+/// Write a full GPT disk image for `layout` to `output_path`.
+///
+/// The file is created (or truncated) and sized to `layout.total_sectors`
+/// logical blocks. A protective MBR is written to LBA 0, then one
+/// Linux-filesystem partition per entry of `layout.partitions` is added and
+/// the partition table is written through the `gpt` crate.
+///
+/// # Errors
+/// Fails when the image size overflows `u64`, or when any file or GPT
+/// operation fails.
+fn generate_full_gpt_image(layout: &GptDiskLayout, output_path: &Path) -> Result<()> {
+    let lb_size = layout.lb_size;
+    // A wrapped product would silently create an undersized image.
+    let total_size = layout.total_sectors.checked_mul(lb_size).ok_or_else(|| {
+        anyhow!(
+            "full GPT image size overflows: {} sectors of {} bytes",
+            layout.total_sectors,
+            lb_size
+        )
+    })?;
+
+    let mut file = fs::OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(output_path)
+        .with_context(|| {
+            format!(
+                "failed to create full GPT image: {}",
+                output_path.display()
+            )
+        })?;
+
+    file.set_len(total_size)
+        .context("failed to pre-allocate full GPT image")?;
+
+    // The protective MBR takes a 32-bit size; larger disks are clamped.
+    let mbr_sectors = u32::try_from(layout.total_sectors.saturating_sub(1)).unwrap_or(0xFFFF_FFFF);
+    let mbr = ProtectiveMBR::with_lb_size(mbr_sectors);
+    mbr.overwrite_lba0(&mut file)
+        .context("failed to write Protective MBR")?;
+
+    // NOTE(review): the block size is fixed to 512 here while sizes are
+    // computed from `layout.lb_size` — presumably both are 512; confirm.
+    let mut gdisk = GptConfig::new()
+        .writable(true)
+        .logical_block_size(LogicalBlockSize::Lb512)
+        .change_partition_count(true)
+        .create_from_device(file, None)
+        .context("failed to initialize GPT config")?;
+
+    // NOTE(review): placement is decided by the gpt crate from size and
+    // alignment; this assumes it yields the same start LBAs as `layout` —
+    // confirm against the crate's allocation rules.
+    for part_layout in &layout.partitions {
+        let part_size_bytes = (part_layout.end_lba - part_layout.start_lba + 1) * lb_size;
+        gdisk
+            .add_partition(
+                &part_layout.name,
+                part_size_bytes,
+                partition_types::LINUX_FS,
+                0,
+                Some(ALIGNMENT_LBA),
+            )
+            .with_context(|| format!("failed to add partition '{}'", part_layout.name))?;
+    }
+
+    let mut file = gdisk
+        .write()
+        .context("failed to write GPT partition table")?;
+    file.flush().context("failed to flush full GPT image")?;
+
+    Ok(())
+}
+
+/// Patch the primary GPT header in the extracted head file to remove
+/// backup GPT references.
+///
+/// Sets `AlternateLBA` to one sector beyond the virtual disk (so the kernel
+/// detects "no valid backup" and falls back to the primary) and
+/// `LastUsableLBA` to the end of the last partition, then recomputes the
+/// header CRC32.
+///
+/// # Errors
+/// Fails when the head file cannot be opened, read or written, or when the
+/// bytes at the header offset do not look like a GPT header of the expected
+/// size (in which case nothing is written).
+fn patch_primary_gpt_header(head_path: &Path, last_partition_end_lba: u64) -> Result<()> {
+    let mut file = fs::OpenOptions::new()
+        .read(true)
+        .write(true)
+        .open(head_path)
+        .context("failed to open head file for patching")?;
+
+    // Read the 92-byte GPT header starting at LBA 1.
+    file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))
+        .context("failed to seek to primary GPT header")?;
+    let mut header = [0u8; GPT_HEADER_SIZE];
+    file.read_exact(&mut header)
+        .context("failed to read primary GPT header")?;
+
+    // Signature (offset 0, 8 bytes) — refuse to patch anything that is not a
+    // GPT header, otherwise unrelated bytes would be overwritten.
+    if !header.starts_with(b"EFI PART") {
+        return Err(anyhow!(
+            "no GPT signature at offset {} of {}",
+            GPT_HEADER_FILE_OFFSET,
+            head_path.display()
+        ));
+    }
+
+    // HeaderSize (offset 12, 4 bytes LE) — the CRC below covers exactly
+    // GPT_HEADER_SIZE bytes, so the header must declare that same size.
+    let declared_size = u32::from_le_bytes([header[12], header[13], header[14], header[15]]);
+    if u64::from(declared_size) != GPT_HEADER_SIZE as u64 {
+        return Err(anyhow!(
+            "unexpected GPT header size {} (expected {})",
+            declared_size,
+            GPT_HEADER_SIZE
+        ));
+    }
+
+    // AlternateLBA (offset 32, 8 bytes LE) — point beyond virtual disk
+    let alternate_lba = last_partition_end_lba + 1;
+    header[32..40].copy_from_slice(&alternate_lba.to_le_bytes());
+
+    // LastUsableLBA (offset 48, 8 bytes LE) — last partition end
+    header[48..56].copy_from_slice(&last_partition_end_lba.to_le_bytes());
+
+    // Zero HeaderCRC32 (offset 16, 4 bytes LE) before computing new CRC
+    header[16..20].copy_from_slice(&0u32.to_le_bytes());
+
+    let new_crc = {
+        let mut digest = CRC_32.digest();
+        digest.update(&header);
+        digest.finalize()
+    };
+    header[16..20].copy_from_slice(&new_crc.to_le_bytes());
+
+    // Write patched header back
+    file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))
+        .context("failed to seek back to primary GPT header")?;
+    file.write_all(&header)
+        .context("failed to write patched GPT header")?;
+    file.flush().context("failed to flush patched GPT header")?;
+
+    info!(
+        sl!(),
+        "Patched primary GPT header: AlternateLBA={}, LastUsableLBA={}, CRC32={:#010x}",
+        alternate_lba,
+        last_partition_end_lba,
+        new_crc
+    );
+
+    Ok(())
+}
+
+/// Copy `size` bytes starting at `offset` from `src` into `dst`.
+///
+/// `dst` is created or truncated and sized to `size` before copying.
+///
+/// # Errors
+/// Fails on any I/O error, and when `src` provides fewer than `size` bytes
+/// from `offset` (the output would otherwise be silently zero-padded).
+fn extract_file_range(src: &Path, dst: &Path, offset: u64, size: u64) -> Result<()> {
+    let mut src_file = fs::OpenOptions::new()
+        .read(true)
+        .open(src)
+        .with_context(|| format!("failed to open source file: {}", src.display()))?;
+    src_file
+        .seek(SeekFrom::Start(offset))
+        .context("failed to seek source file")?;
+
+    let mut dst_file = fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(dst)
+        .with_context(|| format!("failed to create output file: {}", dst.display()))?;
+
+    dst_file
+        .set_len(size)
+        .context("failed to pre-allocate output file")?;
+
+    let mut limited = src_file.take(size);
+    let copied =
+        std::io::copy(&mut limited, &mut dst_file).context("failed to copy file range")?;
+    // The output was pre-sized, so a short copy would otherwise go unnoticed.
+    if copied != size {
+        return Err(anyhow!(
+            "short read from {}: expected {} bytes at offset {}, copied {}",
+            src.display(),
+            size,
+            offset,
+            copied
+        ));
+    }
+    dst_file.flush().context("failed to flush output file")?;
+
+    Ok(())
+}
+
+/// Create a zero-filled padding file of `size_sectors` sectors.
+///
+/// The file at `output_path` is created or truncated and then extended to
+/// `size_sectors * SECTOR_SIZE` bytes.
+///
+/// Returns the size in sectors (the `size_sectors` argument).
+///
+/// # Errors
+/// Fails when the requested size is zero or overflows `u64`, or when the
+/// file cannot be created or resized.
+pub fn generate_padding_file(output_path: &Path, size_sectors: u64) -> Result<u64> {
+    // Checked so that an overflowing request cannot wrap to a small size.
+    let size_bytes = size_sectors.checked_mul(SECTOR_SIZE).ok_or_else(|| {
+        anyhow!(
+            "padding file size overflows: {} sectors of {} bytes",
+            size_sectors,
+            SECTOR_SIZE
+        )
+    })?;
+
+    if size_bytes == 0 {
+        return Err(anyhow!("cannot create zero-size padding file"));
+    }
+
+    let mut file = fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(output_path)
+        .with_context(|| {
+            format!(
+                "failed to create padding file: {}",
+                output_path.display()
+            )
+        })?;
+
+    // Pre-allocate with zeros
+    file.set_len(size_bytes)
+        .context("failed to pre-allocate padding file")?;
+    file.flush().context("failed to flush padding file")?;
+    drop(file);
+
+    info!(
+        sl!(),
+        "Generated padding file: {} ({} sectors, {} bytes)",
+        output_path.display(),
+        size_sectors,
+        size_bytes
+    );
+
+    Ok(size_sectors)
+}
diff --git a/src/libs/kata-types/src/lib.rs b/src/libs/kata-types/src/lib.rs
index 402b9ce54e..9049caf54d 100644
--- a/src/libs/kata-types/src/lib.rs
+++ b/src/libs/kata-types/src/lib.rs
@@ -54,6 +54,9 @@ pub mod rootless;
 /// machine type
 pub mod machine_type;
 
+/// GPT (GUID Partition Table) disk layout and metadata generation.
+pub mod gpt_disk; + use std::path::Path; use crate::rootless::{is_rootless, rootless_dir}; diff --git a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs index 5c1093a171..36e03b2802 100644 --- a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs +++ b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs @@ -24,6 +24,10 @@ use hypervisor::{ use kata_types::device::{ DRIVER_BLK_CCW_TYPE as KATA_CCW_DEV_TYPE, DRIVER_BLK_PCI_TYPE as KATA_BLK_DEV_TYPE, }; +use kata_types::gpt_disk::{ + extract_snapshot_id, generate_gpt_metadata, generate_padding_file, get_erofs_layer_size, + ErofsLayer, GptDiskLayout, GptMetadataFiles, +}; use kata_types::mount::Mount; use oci_spec::runtime as oci; use std::fs; @@ -39,14 +43,11 @@ pub(crate) const EROFS_ROOTFS_TYPE: &str = "erofs"; pub(crate) const RW_LAYER_ROOTFS_TYPE: &str = "ext4"; /// VMDK file extension for merged EROFS image const EROFS_MERGED_VMDK: &str = "merged_fs.vmdk"; -/// Maximum number of virtio-blk devices allowed for multi-layer EROFS rootfs. -/// -/// This defensive limit prevents exhausting PCI slot resources, especially on -/// lightweight VMMs (Dragonball, Cloud Hypervisor) where the PCIe root bus has -/// only 32 slots (PCIE_ROOT_BUS_SLOTS_CAPACITY). For QEMU with PCI bridges -/// (30 slots/bridge), this limit is conservative but still applies as a uniform -/// safeguard across all hypervisor backends. -const MAX_VIRTIO_BLK_DEVICES: usize = 10; + +/// Maximum number of rootfs layer devices (erofs + rw layer) allowed in multi-layer EROFS mode. +/// This is a pre-flight sanity check before VMDK merging, to prevent excessive block devices +/// when many layers are used without fsmerge. 
+const MAX_ROOTFS_LAYER_DEVICES: usize = 129; // 128 EROFS layers + 1 rw layer (129 total)
 /// Maximum sectors per 2GB extent (2GB / 512 bytes per sector)
 const MAX_2GB_EXTENT_SECTORS: u64 = 0x8000_0000 >> 9;
 /// Sectors per track for VMDK geometry
@@ -60,12 +61,25 @@ const VMDK_ADAPTER_TYPE: &str = "ide";
 /// VMDK hardware version
 const VMDK_HW_VERSION: &str = "4";
 /// Default shared directory for guest rootfs VMDK files (for multi-layer EROFS)
-const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
+pub(crate) const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
 /// Template for mkdir option in overlay mount (X-containerd.mkdir.path)
 const X_CONTAINERD_MKDIR_PATH: &str = "X-containerd.mkdir.path=";
 /// Template for mkdir option passed to guest agent (X-kata.mkdir.path)
 const X_KATA_MKDIR_PATH: &str = "X-kata.mkdir.path=";
 
+/// Create the per-container directory under the shared filesystem root.
+///
+/// Returns the path `<shared root>/<sid>/<cid>`, creating it (and any missing
+/// parents) when needed.
+///
+/// # Errors
+/// Fails when `sid` or `cid` is not a single plain path component (empty,
+/// absolute, containing a separator, `.` or `..`), or when the directory
+/// cannot be created.
+pub(crate) fn ensure_container_dir(sid: &str, cid: &str) -> Result<PathBuf> {
+    // `PathBuf::join` replaces the base for absolute inputs and keeps `..`
+    // as-is, so each identifier must be exactly one normal component to stay
+    // inside the shared root.
+    for (label, id) in [("sandbox id", sid), ("container id", cid)] {
+        let mut components = Path::new(id).components();
+        let is_single_normal = matches!(
+            (components.next(), components.next()),
+            (Some(Component::Normal(_)), None)
+        );
+        if !is_single_normal {
+            return Err(anyhow!(
+                "invalid {} for container directory: {:?}",
+                label,
+                id
+            ));
+        }
+    }
+
+    let dir = PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS))
+        .join(sid)
+        .join(cid);
+    fs::create_dir_all(&dir)
+        .with_context(|| format!("failed to create container directory: {}", dir.display()))?;
+
+    Ok(dir)
+}
+
 /// Generate merged VMDK file from multiple EROFS devices
 ///
 /// Creates a VMDK descriptor that combines multiple EROFS images into a single
@@ -104,14 +118,7 @@ async fn generate_merged_erofs_vmdk(
     }
 
     // For multiple devices, create VMDK descriptor
-    let sandbox_dir =
-        PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS)).join(sid);
-    let container_dir = sandbox_dir.join(cid);
-    fs::create_dir_all(&container_dir).context(format!(
-        "failed to create container directory: {}",
-        container_dir.display()
-    ))?;
-
+    let container_dir = ensure_container_dir(sid, cid)?;
     let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
 
     info!(
@@ -129,6 +136,105 @@ async fn generate_merged_erofs_vmdk(
Ok((vmdk_path.display().to_string(), BlockDeviceFormat::Vmdk)) } +/// Helper struct for writing VMDK descriptor files atomically. +/// +/// Encapsulates the common VMDK descriptor format: header, extent descriptions, +/// DDB footer, and atomic write (temp file + rename). Used by both fsmerge mode +/// (`create_vmdk_descriptor`) and GPT mode (`create_gpt_vmdk_descriptor`). +struct VmdkDescriptorWriter { + writer: BufWriter, + temp_path: PathBuf, + final_path: PathBuf, +} + +impl VmdkDescriptorWriter { + fn new(vmdk_path: &Path) -> Result { + let temp_path = vmdk_path.with_extension("vmdk.tmp"); + if temp_path.components().any(|c| c == Component::ParentDir) { + return Err(anyhow!("Invalid input: {}", temp_path.display())); + } + let file = fs::File::create(&temp_path).context(format!( + "failed to create temp VMDK file: {}", + temp_path.display() + ))?; + let mut writer = BufWriter::new(file); + + writeln!(writer, "# Disk DescriptorFile")?; + writeln!(writer, "version=1")?; + writeln!(writer, "CID=fffffffe")?; + writeln!(writer, "parentCID=ffffffff")?; + writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?; + writeln!(writer)?; + writeln!(writer, "# Extent description")?; + + Ok(Self { + writer, + temp_path, + final_path: vmdk_path.to_path_buf(), + }) + } + + // Write a single extent line (no 2GB chunking). + fn write_extent(&mut self, path: &str, sectors: u64, file_offset: u64) -> Result<()> { + writeln!( + self.writer, + "RW {} FLAT \"{}\" {}", + sectors, path, file_offset + )?; + Ok(()) + } + + // Write extent lines with 2GB chunking for large files. + fn write_extent_chunked(&mut self, path: &str, total_sectors: u64) -> Result<()> { + let mut remaining = total_sectors; + let mut file_offset: u64 = 0; + while remaining > 0 { + let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS); + self.write_extent(path, chunk, file_offset)?; + file_offset += chunk; + remaining -= chunk; + } + Ok(()) + } + + // Write DDB footer, flush, and atomically rename to final path. 
+ fn finalize(mut self, total_sectors: u64) -> Result<()> { + writeln!(self.writer)?; + + let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS); + + writeln!(self.writer, "# The Disk Data Base")?; + writeln!(self.writer, "#DDB")?; + writeln!(self.writer)?; + writeln!( + self.writer, + "ddb.virtualHWVersion = \"{}\"", + VMDK_HW_VERSION + )?; + writeln!(self.writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?; + writeln!(self.writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?; + writeln!( + self.writer, + "ddb.geometry.sectors = \"{}\"", + SECTORS_PER_TRACK + )?; + writeln!(self.writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?; + + self.writer + .flush() + .context("failed to flush VMDK descriptor")?; + drop(self.writer); + + fs::rename(&self.temp_path, &self.final_path).context(format!( + "failed to rename temp VMDK {} -> {}", + self.temp_path.display(), + self.final_path.display() + ))?; + + Ok(()) + } +} + /// Create VMDK descriptor for multiple EROFS extents (flatten device) /// /// Generates a VMDK descriptor file (twoGbMaxExtentFlat format) that references @@ -141,7 +247,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<() )); } - // collect extent information without writing anything. struct ExtentInfo { path: String, total_sectors: u64, @@ -160,9 +265,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<() continue; } - // round up to whole sectors to avoid losing tail bytes on non-aligned files. - // VMDK extents are measured in 512-byte sectors; a file that is not sector-aligned - // still needs the last partial sector to be addressable by the VM. let sectors = file_size.div_ceil(512); if file_size % 512 != 0 { @@ -197,43 +299,9 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<() )); } - // write descriptor to a temp file, then atomically rename. 
- let tmp_path = vmdk_path.with_extension("vmdk.tmp"); - // Prevent path traversal attacks by rejecting paths containing '..'. - if tmp_path.components().any(|c| c == Component::ParentDir) { - return Err(anyhow!("Invalid input: {}", tmp_path.display())); - } - let file = fs::File::create(&tmp_path).context(format!( - "failed to create temp VMDK file: {}", - tmp_path.display() - ))?; - let mut writer = BufWriter::new(file); - - // Header - writeln!(writer, "# Disk DescriptorFile")?; - writeln!(writer, "version=1")?; - writeln!(writer, "CID=fffffffe")?; - writeln!(writer, "parentCID=ffffffff")?; - writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?; - writeln!(writer)?; - - // Extent descriptions - writeln!(writer, "# Extent description")?; + let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?; for extent in &extents { - let mut remaining = extent.total_sectors; - let mut file_offset: u64 = 0; - - while remaining > 0 { - let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS); - writeln!( - writer, - "RW {} FLAT \"{}\" {}", - chunk, extent.path, file_offset - )?; - file_offset += chunk; - remaining -= chunk; - } - + vmdk.write_extent_chunked(&extent.path, extent.total_sectors)?; info!( sl!(), "VMDK extent: {} ({} sectors, {} extent chunk(s))", @@ -242,45 +310,149 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<() extent.total_sectors.div_ceil(MAX_2GB_EXTENT_SECTORS) ); } - writeln!(writer)?; - // Disk Data Base (DDB) - // Geometry: cylinders = ceil(total_sectors / (sectors_per_track * heads)) - let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS); - - writeln!(writer, "# The Disk Data Base")?; - writeln!(writer, "#DDB")?; - writeln!(writer)?; - writeln!(writer, "ddb.virtualHWVersion = \"{}\"", VMDK_HW_VERSION)?; - writeln!(writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?; - writeln!(writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?; - writeln!(writer, "ddb.geometry.sectors = \"{}\"", SECTORS_PER_TRACK)?; - 
writeln!(writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
-
-    // Flush the BufWriter to ensure all data is written before rename.
-    writer.flush().context("failed to flush VMDK descriptor")?;
-    // Explicitly drop to close the file handle before rename.
-    drop(writer);
-
-    // atomic rename: tmp -> final path.
-    fs::rename(&tmp_path, vmdk_path).context(format!(
-        "failed to rename temp VMDK {} -> {}",
-        tmp_path.display(),
-        vmdk_path.display()
-    ))?;
+    vmdk.finalize(total_sectors)?;
 
     info!(
         sl!(),
-        "VMDK descriptor created: {} (total {} sectors, {} extents, {} cylinders)",
+        "VMDK descriptor created: {} (total {} sectors, {} extents)",
         vmdk_path.display(),
         total_sectors,
-        extents.len(),
-        cylinders
+        extents.len()
     );
 
     Ok(())
 }
 
+/// Generate GPT-partitioned VMDK and return layout information for per-partition storage creation
+///
+/// Returns: (vmdk_path, BlockDeviceFormat::Vmdk, GptDiskLayout, GptMetadataFiles)
+///
+/// # Errors
+/// Fails when no layers are given, when a layer path is not an accessible
+/// regular file, or when the container directory, the GPT metadata or the
+/// VMDK descriptor cannot be created. If the descriptor step fails, the
+/// already generated GPT head image is removed (best effort).
+fn generate_gpt_vmdk_with_layout(
+    sid: &str,
+    cid: &str,
+    erofs_layers: Vec<ErofsLayer>,
+) -> Result<(String, BlockDeviceFormat, GptDiskLayout, GptMetadataFiles)> {
+    if erofs_layers.is_empty() {
+        return Err(anyhow!("no EROFS layers provided for GPT VMDK generation"));
+    }
+
+    // Validate all layer paths exist and are regular files
+    for layer in &erofs_layers {
+        let metadata = fs::metadata(&layer.path)
+            .with_context(|| format!("EROFS layer path not accessible: {}", layer.path))?;
+        if !metadata.is_file() {
+            return Err(anyhow!(
+                "EROFS layer path is not a regular file: {}",
+                layer.path
+            ));
+        }
+    }
+
+    // Create container directory
+    let container_dir = ensure_container_dir(sid, cid)?;
+    let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
+
+    info!(
+        sl!(),
+        "creating GPT-partitioned VMDK for {} EROFS layers: {}",
+        erofs_layers.len(),
+        vmdk_path.display()
+    );
+
+    // Generate GPT metadata files
+    let (layout, mut gpt_files) = generate_gpt_metadata(sid, cid, erofs_layers, &container_dir)
+        .context("failed to generate GPT metadata")?;
+
+    // Create VMDK descriptor with GPT layout and collect generated padding paths
+    let pad_paths = match create_gpt_vmdk_descriptor(&vmdk_path, &layout, &gpt_files) {
+        Ok(paths) => paths,
+        Err(err) => {
+            // The caller only learns about the head image on success, so it
+            // has to be removed here to avoid orphaning it.
+            let _ = fs::remove_file(&gpt_files.head_path);
+            return Err(err.context("failed to create GPT VMDK descriptor"));
+        }
+    };
+    gpt_files.pad_paths = pad_paths;
+
+    Ok((
+        vmdk_path.display().to_string(),
+        BlockDeviceFormat::Vmdk,
+        layout,
+        gpt_files,
+    ))
+}
+
+/// Create VMDK descriptor for GPT-partitioned disk
+///
+/// Returns the list of generated padding file paths for cleanup tracking.
+///
+/// # Errors
+/// Fails when an extent cannot be written, a padding file cannot be
+/// generated, a partition starts before the end of the previous extent, or
+/// the descriptor cannot be finalized. Padding files created before the
+/// failure are removed (best effort).
+fn create_gpt_vmdk_descriptor(
+    vmdk_path: &Path,
+    layout: &GptDiskLayout,
+    gpt_files: &GptMetadataFiles,
+) -> Result<Vec<PathBuf>> {
+    let mut pad_paths: Vec<PathBuf> = Vec::new();
+
+    if let Err(err) = write_gpt_vmdk_extents(vmdk_path, layout, gpt_files, &mut pad_paths) {
+        // Nothing is reported back to the caller on failure, so the padding
+        // files generated so far would otherwise never be cleaned up.
+        for pad_path in &pad_paths {
+            let _ = fs::remove_file(pad_path);
+        }
+        return Err(err);
+    }
+
+    info!(
+        sl!(),
+        "GPT VMDK descriptor created: {} (total {} sectors, {} partitions)",
+        vmdk_path.display(),
+        layout.total_sectors,
+        layout.partitions.len()
+    );
+
+    Ok(pad_paths)
+}
+
+// Write the head, padding and layer extents of a GPT-partitioned disk and
+// finalize the descriptor. Every padding file path is recorded in
+// `pad_paths` before the file is generated, so the caller can clean up.
+fn write_gpt_vmdk_extents(
+    vmdk_path: &Path,
+    layout: &GptDiskLayout,
+    gpt_files: &GptMetadataFiles,
+    pad_paths: &mut Vec<PathBuf>,
+) -> Result<()> {
+    let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
+
+    // 1. GPT head metadata
+    vmdk.write_extent(
+        &gpt_files.head_path.display().to_string(),
+        gpt_files.head_sectors,
+        0,
+    )?;
+    info!(
+        sl!(),
+        "VMDK extent: GPT head ({} sectors) at {}",
+        gpt_files.head_sectors,
+        gpt_files.head_path.display()
+    );
+
+    // 2. Layer extents with padding gaps
+    // The head covers LBA 0..head_sectors, so the next extent is expected at
+    // LBA `head_sectors`. Tracking the next LBA (rather than the previous
+    // end) avoids an underflow when `head_sectors` is 0.
+    let mut next_lba = gpt_files.head_sectors;
+
+    let metadata_dir = gpt_files.head_path.parent().ok_or_else(|| {
+        anyhow!(
+            "GPT head file has no parent directory: {}",
+            gpt_files.head_path.display()
+        )
+    })?;
+
+    for (idx, part) in layout.partitions.iter().enumerate() {
+        // Extents are laid out back to back; a partition that starts before
+        // the previous extent ends cannot be described consistently.
+        if part.start_lba < next_lba {
+            return Err(anyhow!(
+                "partition {} starts at LBA {} but the previous extent ends at LBA {}",
+                part.partition_number,
+                part.start_lba,
+                next_lba
+            ));
+        }
+
+        let gap_sectors = part.start_lba - next_lba;
+        if gap_sectors > 0 {
+            let pad_path = metadata_dir.join(format!("pad-{}.img", idx));
+            pad_paths.push(pad_path.clone());
+
+            generate_padding_file(&pad_path, gap_sectors).with_context(|| {
+                format!("failed to generate padding file: {}", pad_path.display())
+            })?;
+
+            vmdk.write_extent(&pad_path.display().to_string(), gap_sectors, 0)?;
+        }
+
+        vmdk.write_extent_chunked(&part.layer.path, part.layer.size_sectors)?;
+        info!(
+            sl!(),
+            "VMDK extent: {} (partition {}, LBA {}-{}, {} sectors)",
+            part.layer.path,
+            part.partition_number,
+            part.start_lba,
+            part.end_lba,
+            part.layer.size_sectors
+        );
+
+        next_lba = part.end_lba + 1;
+    }
+
+    vmdk.finalize(layout.total_sectors)
+}
+
 fn extract_block_device_info(
     device_info: &DeviceType,
     read_only: bool,
@@ -338,10 +510,16 @@ fn extract_block_device_info(
 pub(crate) struct ErofsMultiLayerRootfs {
     guest_path: String,
     device_ids: Vec,
-    rwlayer_storage: Option, // Writable layer storage (upper layer), typically ext4
-    erofs_storage: Option,
-    /// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
+    // Writable layer storage (upper layer), typically ext4
+    rwlayer_storage: Option,
+    // Read-only EROFS layer storages (lower layers), one per partition in GPT mode
+    erofs_storages: Vec,
+    // Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
     vmdk_path: Option,
+    // Paths to generated GPT metadata files
(head, padding) for cleanup + gpt_metadata_paths: Vec, + // Container-scoped runtime directory that may only contain generated helper artifacts. + generated_artifacts_dir: PathBuf, } impl ErofsMultiLayerRootfs { @@ -360,8 +538,11 @@ impl ErofsMultiLayerRootfs { let mut device_ids = Vec::new(); let mut rwlayer_storage: Option = None; - let mut erofs_storage: Option = None; + let mut erofs_storages: Vec = Vec::new(); let mut vmdk_path: Option = None; + let mut gpt_metadata_paths: Vec = Vec::new(); + // Track whether GPT+VMDK erofs layers have already been processed in bulk. + let mut gpt_erofs_processed = false; // Directories to create (X-containerd.mkdir.path) let mut mkdir_dirs: Vec = Vec::new(); @@ -374,14 +555,33 @@ impl ErofsMultiLayerRootfs { .iter() .filter(|m| matches!(m.fs_type.as_str(), RW_LAYER_ROOTFS_TYPE | EROFS_ROOTFS_TYPE)) .count(); - if expected_device_count > MAX_VIRTIO_BLK_DEVICES { + + // TODO(Alex Lyn): fsmerge mode with single erofs mount and multiple device= options + // may require multiple block devices if containerd does not merge layers into one file. + // This is a fallback or default mode if fsmerge is not enabled. + if expected_device_count > MAX_ROOTFS_LAYER_DEVICES { return Err(anyhow!( "exceeded maximum block devices for multi-layer EROFS: {} > {}", expected_device_count, - MAX_VIRTIO_BLK_DEVICES + MAX_ROOTFS_LAYER_DEVICES )); } + // Pre-extract mkdir directives from overlay mounts before the main loop, + // so they are available regardless of mount ordering. 
+ for mount in rootfs_mounts { + if matches!( + mount.fs_type.as_str(), + "overlay" | "format/overlay" | "format/mkdir/overlay" + ) { + for opt in &mount.options { + if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) { + mkdir_dirs.push(mkdir_spec.to_string()); + } + } + } + } + // Process each mount in rootfs_mounts to set up devices and storages for mount in rootfs_mounts { match mount.fs_type.as_str() { @@ -407,8 +607,6 @@ impl ErofsMultiLayerRootfs { .await .context("failed to attach rw block device")?; - // let (device_id, guest_path, blk_driver) = - // extract_block_device_info(&device_info, &block_driver)?; let (mut rwlayer, device_id) = extract_block_device_info(&device_info, false) .context("failed to get block device for rw layer")?; @@ -441,110 +639,253 @@ impl ErofsMultiLayerRootfs { } fmt if fmt.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE) => { // Mount[1]: erofs layers -> virtio-blk via VMDK /dev/vdX2 - info!( - sl!(), - "multi-layer erofs: adding erofs layers: {}", mount.source - ); + // + // Two modes are supported: + // 1. fsmerge mode: Single erofs mount with `device=` options pointing to additional files. + // This is used when containerd has already merged layers into a single file. + // 2. GPT+VMDK mode: Multiple independent erofs mounts (each mount is a separate layer file). + // This is used when containerd does NOT use fsmerge, and we need to create GPT partitions. - // Collect all EROFS devices: source + `device=` options - let mut erofs_devices = vec![mount.source.clone()]; - for opt in &mount.options { - if let Some(device_path) = opt.strip_prefix("device=") { - erofs_devices.push(device_path.to_string()); - } + // In GPT mode, all erofs layers are processed in bulk on the first + // encounter. Skip subsequent erofs mounts but continue iterating + // so that later ext4 rw-layer and overlay mounts are still handled. 
+ if gpt_erofs_processed { + info!( + sl!(), + "multi-layer erofs: skipping already-processed erofs mount: {}", + mount.source + ); + continue; } - info!(sl!(), "EROFS devices count: {}", erofs_devices.len()); - - // Generate merged VMDK file from all EROFS devices - // Returns (path, format) - format is Vmdk for multiple devices, Raw for single device - let (erofs_path, erofs_format) = - generate_merged_erofs_vmdk(sid, cid, &erofs_devices) - .await - .context("failed to generate EROFS VMDK")?; - - // Track VMDK path for cleanup (only when VMDK is actually created) - if erofs_format == BlockDeviceFormat::Vmdk { - vmdk_path = Some(PathBuf::from(&erofs_path)); - } - - info!( - sl!(), - "EROFS block device config - path: {}, format: {:?}", - erofs_path, - erofs_format - ); - - let device_config = &mut BlockConfig { - driver_option: block_driver.clone(), - format: erofs_format, // Vmdk for multiple devices, Raw for single device - path_on_host: erofs_path, - is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors - blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), - ..Default::default() - }; - - let device_info = do_handle_device( - device_manager, - &DeviceConfig::BlockCfg(device_config.clone()), - ) - .await - .context("failed to attach erofs block device")?; - - let (mut rolayer, device_id) = extract_block_device_info(&device_info, true)?; - info!( - sl!(), - "erofs device attached - device_id: {} guest_path: {}", - device_id, - &rolayer.source - ); - - let mut options: Vec = mount - .options + // Collect all EROFS mounts once with their original indices. + let erofs_mounts_indexed: Vec<(usize, &Mount)> = rootfs_mounts .iter() - .filter(|o| { - // Filter out options that are not valid erofs mount parameters: - // 1. "loop" - not needed in VM, device is already /dev/vdX - // 2. "device=" prefix - used for VMDK generation only, not for mount - // 3. "X-kata." 
prefix - metadata markers for kata internals - *o != "loop" && !o.starts_with("device=") && !o.starts_with("X-kata.") - }) - .cloned() + .enumerate() + .filter(|(_, m)| m.fs_type.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE)) .collect(); + let total_erofs_mounts = erofs_mounts_indexed.len(); - // Erofs layers are read-only lower layers (marked with X-kata.overlay-lower) - options.push("X-kata.overlay-lower".to_string()); - options.push("X-kata.multi-layer=true".to_string()); + // GPT+VMDK mode: Multiple independent erofs layer files + if total_erofs_mounts > 1 { + info!( + sl!(), + "multi-layer erofs: using GPT+VMDK mode for {} independent layers", + total_erofs_mounts + ); - info!( - sl!(), - "erofs storage options filtered: {:?} -> {:?}", mount.options, options - ); + let mut erofs_layers = Vec::new(); - rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string(); - rolayer.mount_point = container_path.clone(); - rolayer.options = options; + for (_mount_idx, erofs_mount) in &erofs_mounts_indexed { + let layer_path = erofs_mount.source.clone(); + let size_bytes = get_erofs_layer_size(&layer_path).context(format!( + "gptdisk: failed to get size of EROFS layer: {}", + layer_path + ))?; - erofs_storage = Some(rolayer); - device_ids.push(device_id); + if size_bytes == 0 { + warn!( + sl!(), + "gptdisk: EROFS layer {} is zero-length, skipping", layer_path + ); + continue; + } + + let size_sectors = size_bytes.div_ceil(512); + let snapshot_id = extract_snapshot_id(&layer_path); + + erofs_layers.push(ErofsLayer { + path: layer_path, + size_sectors, + snapshot_id, + }); + } + + if erofs_layers.is_empty() { + return Err(anyhow!( + "gptdisk: no valid EROFS layers found for GPT VMDK" + )); + } + + // Generate GPT-partitioned VMDK and get layout information + let (erofs_path, erofs_format, layout, gpt_files) = + generate_gpt_vmdk_with_layout(sid, cid, erofs_layers) + .context("gptdisk: failed to generate GPT VMDK")?; + + // Track VMDK path for cleanup + vmdk_path = 
Some(PathBuf::from(&erofs_path)); + + // Track GPT metadata files (head + padding) for cleanup + gpt_metadata_paths.push(gpt_files.head_path.clone()); + gpt_metadata_paths.extend(gpt_files.pad_paths.iter().cloned()); + + info!( + sl!(), + "GPT VMDK created - path: {}, format: {:?}, {} partitions", + erofs_path, + erofs_format, + layout.partitions.len() + ); + + let device_config = &mut BlockConfig { + driver_option: block_driver.clone(), + format: erofs_format, + path_on_host: erofs_path, + is_readonly: true, + blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), + ..Default::default() + }; + + let device_info = do_handle_device( + device_manager, + &DeviceConfig::BlockCfg(device_config.clone()), + ) + .await + .context("failed to attach GPT VMDK block device")?; + + let (base_device, device_id) = + extract_block_device_info(&device_info, true)?; + info!( + sl!(), + "GPT VMDK device attached - device_id: {} guest_path: {}", + device_id, + &base_device.source + ); + + device_ids.push(device_id); + + // Create a storage entry for each GPT partition. + for (idx, part) in layout.partitions.iter().enumerate() { + let mut rolayer = base_device.clone(); + let options: Vec = vec![ + "X-kata.overlay-lower".to_string(), + "X-kata.multi-layer=true".to_string(), + "X-kata.gpt-partitioned=true".to_string(), + format!("X-kata.partition-number={}", part.partition_number), + ]; + + rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string(); + rolayer.mount_point = container_path.clone(); + rolayer.options = options; + rolayer.source = base_device.source.clone(); + + info!( + sl!(), + "Created storage for GPT partition {} (partition number {}, LBA {}-{})", + idx, part.partition_number, part.start_lba, part.end_lba + ); + + erofs_storages.push(rolayer); + } + + // Mark GPT erofs as processed so subsequent erofs mounts + // in the loop are skipped, while still allowing ext4 and + // overlay mounts to be visited. 
+ gpt_erofs_processed = true; + } else { + // fsmerge mode: Single erofs mount with device= options + info!( + sl!(), + "multi-layer erofs: using fsmerge mode for erofs layers: {}", + mount.source + ); + + // Collect all EROFS devices: source + `device=` options + let mut erofs_devices = vec![mount.source.clone()]; + for opt in &mount.options { + if let Some(device_path) = opt.strip_prefix("device=") { + erofs_devices.push(device_path.to_string()); + } + } + + info!(sl!(), "EROFS devices count: {}", erofs_devices.len()); + + // Generate merged VMDK file from all EROFS devices + // Returns (path, format) - format is Vmdk for multiple devices, Raw for single device + let (erofs_path, erofs_format) = + generate_merged_erofs_vmdk(sid, cid, &erofs_devices) + .await + .context("failed to generate EROFS VMDK")?; + + // Track VMDK path for cleanup (only when VMDK is actually created) + if erofs_format == BlockDeviceFormat::Vmdk { + vmdk_path = Some(PathBuf::from(&erofs_path)); + } + + info!( + sl!(), + "EROFS block device config - path: {}, format: {:?}", + erofs_path, + erofs_format + ); + + let device_config = &mut BlockConfig { + driver_option: block_driver.clone(), + format: erofs_format, // Vmdk for multiple devices, Raw for single device + path_on_host: erofs_path, + is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors + blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), + ..Default::default() + }; + + let device_info = do_handle_device( + device_manager, + &DeviceConfig::BlockCfg(device_config.clone()), + ) + .await + .context("failed to attach erofs block device")?; + + let (mut rolayer, device_id) = + extract_block_device_info(&device_info, true)?; + info!( + sl!(), + "erofs device attached - device_id: {} guest_path: {}", + device_id, + &rolayer.source + ); + + let mut options: Vec = mount + .options + .iter() + .filter(|o| { + // Filter out options that are not valid erofs mount parameters: + // 1. 
"loop" - not needed in VM, device is already /dev/vdX + // 2. "device=" prefix - used for VMDK generation only, not for mount + // 3. "X-kata." prefix - metadata markers for kata internals + *o != "loop" + && !o.starts_with("device=") + && !o.starts_with("X-kata.") + }) + .cloned() + .collect(); + + // Erofs layers are read-only lower layers (marked with X-kata.overlay-lower) + options.push("X-kata.overlay-lower".to_string()); + options.push("X-kata.multi-layer=true".to_string()); + + info!( + sl!(), + "erofs storage options filtered: {:?} -> {:?}", mount.options, options + ); + + rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string(); + rolayer.mount_point = container_path.clone(); + rolayer.options = options; + + erofs_storages.push(rolayer); + device_ids.push(device_id); + } } fmt if fmt.eq_ignore_ascii_case("overlay") || fmt.eq_ignore_ascii_case("format/overlay") || fmt.eq_ignore_ascii_case("format/mkdir/overlay") => { // Mount[2]: overlay to combine rwlayer (upper) + erofs (lower) + // mkdir directives already extracted before the main loop info!( sl!(), - "multi-layer erofs: parsing overlay mount, options: {:?}", mount.options + "multi-layer erofs: overlay mount (mkdir directives pre-extracted)" ); - - // Parse mkdir options (X-containerd.mkdir.path) - for opt in &mount.options { - if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) { - // Keep the full spec (path:mode or path:mode:uid:gid) for guest agent - mkdir_dirs.push(mkdir_spec.to_string()); - } - } } _ => { info!( @@ -572,8 +913,14 @@ impl ErofsMultiLayerRootfs { guest_path: container_path, device_ids, rwlayer_storage, - erofs_storage, + erofs_storages, vmdk_path, + gpt_metadata_paths, + generated_artifacts_dir: PathBuf::from( + kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS), + ) + .join(sid) + .join(cid), }) } } @@ -589,16 +936,18 @@ impl Rootfs for ErofsMultiLayerRootfs { } async fn get_storage(&self) -> Option> { - // Return all storages for multi-layer EROFS (rw layer + erofs 
layer) to guest agent. - // Guest agent needs both to create overlay mount + // Return all storages for multi-layer EROFS (rw layer + erofs layers) to guest agent. + // Guest agent needs all of them to create overlay mount. + // In GPT mode, each partition has its own storage entry. let mut storages = Vec::new(); if let Some(rwlayer) = self.rwlayer_storage.clone() { storages.push(rwlayer); } - if let Some(erofs) = self.erofs_storage.clone() { - storages.push(erofs); + // Add all EROFS layer storages (single storage in fsmerge mode, multiple in GPT mode) + for erofs in &self.erofs_storages { + storages.push(erofs.clone()); } if storages.is_empty() { @@ -613,23 +962,27 @@ impl Rootfs for ErofsMultiLayerRootfs { } async fn cleanup(&self, device_manager: &RwLock) -> Result<()> { + // Helper function to safely remove a file if it exists and is within the specified directory. + let safely_remove_file = |path: &Path, dir: &Path| -> Result<()> { + if path.starts_with(dir) && path.exists() { + fs::remove_file(path).context(format!("failed to remove file: {}", path.display()))?; + } + Ok(()) + }; + let mut dm = device_manager.write().await; for device_id in &self.device_ids { dm.try_remove_device(device_id).await?; } - // Clean up generated VMDK descriptor file if it exists (only for multi-device case) + // Clean up generated VMDK descriptor file if it exists. if let Some(ref vmdk) = self.vmdk_path { - if vmdk.exists() { - if let Err(e) = fs::remove_file(vmdk) { - warn!( - sl!(), - "failed to remove VMDK descriptor {}: {}", - vmdk.display(), - e - ); - } - } + safely_remove_file(vmdk, &self.generated_artifacts_dir)?; + } + + // Clean up GPT metadata files (head, padding). 
+ for metadata_path in &self.gpt_metadata_paths { + safely_remove_file(metadata_path, &self.generated_artifacts_dir)?; } Ok(()) diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs index c9ae3bb52f..b907658f3d 100644 --- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs +++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs @@ -91,7 +91,7 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P toml_utils::set_toml_value( configuration_file, ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", - "1", + "0", )?; Ok(())