Merge pull request #13085 from Apokleos/erofs-gpt-vmdk-only

runtime-rs: Support erofs snapshotter with gpt vmdk mode
This commit is contained in:
Fabiano Fidêncio
2026-05-25 16:29:59 +02:00
committed by GitHub
12 changed files with 1351 additions and 263 deletions

47
Cargo.lock generated
View File

@@ -139,7 +139,7 @@ version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -150,7 +150,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -1963,7 +1963,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -2458,6 +2458,18 @@ dependencies = [
"cfg-if 0.1.10",
]
[[package]]
name = "gpt"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3696fafb1ecdcc2ae3ce337de73e9202806068594b77d22fdf2f3573c5ec2219"
dependencies = [
"bitflags 2.11.1",
"crc",
"simple-bytes",
"uuid 1.23.1",
]
[[package]]
name = "h2"
version = "0.3.27"
@@ -3163,7 +3175,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
dependencies = [
"hermit-abi 0.5.2",
"libc",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -3632,8 +3644,10 @@ dependencies = [
"base64 0.13.1",
"bitmask-enum",
"byte-unit",
"crc",
"flate2",
"glob",
"gpt",
"lazy_static",
"nix 0.26.4",
"num_cpus",
@@ -3641,6 +3655,7 @@ dependencies = [
"regex",
"rstest 0.18.2",
"safe-path 0.1.0",
"scopeguard",
"serde",
"serde-enum-str",
"serde_json",
@@ -4390,7 +4405,7 @@ version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -5295,7 +5310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52"
dependencies = [
"libc",
"nix 0.23.2",
"nix 0.30.1",
]
[[package]]
@@ -6516,7 +6531,7 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys 0.12.1",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -6617,7 +6632,7 @@ dependencies = [
"security-framework",
"security-framework-sys",
"webpki-root-certs",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -7182,6 +7197,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
[[package]]
name = "simple-bytes"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c11532d9d241904f095185f35dcdaf930b1427a94d5b01d7002d74ba19b44cc4"
[[package]]
name = "siphasher"
version = "1.0.3"
@@ -7305,7 +7326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
dependencies = [
"libc",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -7546,7 +7567,7 @@ dependencies = [
"getrandom 0.4.2",
"once_cell",
"rustix 1.1.4",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -7555,7 +7576,7 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -8259,7 +8280,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e"
dependencies = [
"memoffset 0.9.1",
"tempfile",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@@ -8842,7 +8863,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]

View File

@@ -197,6 +197,7 @@ tracing-subscriber = "0.3.20"
ttrpc = "0.8.4"
url = "2.5.4"
which = "4.3.0"
gpt = "4.1.0"
# Per-package release profile overrides for kata-deploy. The kata-deploy
# binary runs once at pod start and then idles waiting for SIGTERM, so we

View File

@@ -90,7 +90,7 @@ version = 3
[plugins.'io.containerd.snapshotter.v1.erofs']
default_size = '<SIZE>' # SIZE=6G or 10G or other size
max_unmerged_layers = 1
max_unmerged_layers = 0
```
#### Verify the EROFS plugins are loaded

View File

@@ -173,7 +173,8 @@ pub struct VirtioBlkPciMatcher {
impl VirtioBlkPciMatcher {
pub fn new(relpath: &str, root_complex: &str) -> VirtioBlkPciMatcher {
let root_bus = create_pci_root_bus_path(root_complex);
let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/");
// The trailing [^/]+$ ensures the regex only matches the whole-disk uevent (e.g. block/vdx), not partition uevents (e.g. block/vdx/vdx1)
let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/[^/]+$");
VirtioBlkPciMatcher {
rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
@@ -259,6 +260,17 @@ mod tests {
assert!(matcher_b.is_match(&uev_b));
assert!(!matcher_b.is_match(&uev_a));
assert!(!matcher_a.is_match(&uev_b));
// Partition uevents must NOT match (only the whole-disk uevent should match)
let mut uev_part = uev_a.clone();
uev_part.devname = "vda1".to_string();
uev_part.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda1");
assert!(!matcher_a.is_match(&uev_part));
let mut uev_part91 = uev_a.clone();
uev_part91.devname = "vda91".to_string();
uev_part91.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda91");
assert!(!matcher_a.is_match(&uev_part91));
}
#[cfg(target_arch = "s390x")]

View File

@@ -17,6 +17,9 @@ use std::sync::Arc;
use tokio::sync::Mutex;
use tracing::instrument;
/// The path segment in the uevent devpath that separates the SCSI path and the block device name.
const BLOCK_SEGMENT: &str = "/block/";
#[derive(Debug)]
pub struct ScsiDeviceHandler {}
@@ -53,20 +56,41 @@ pub async fn get_scsi_device_name(
// SCSI host.
#[derive(Debug)]
pub struct ScsiBlockMatcher {
search: String,
/// Expected SCSI path suffix before `/block/`, e.g. `/0:0:2:0`
scsi_path_suffix: String,
}
impl ScsiBlockMatcher {
pub fn new(scsi_addr: &str) -> ScsiBlockMatcher {
let search = format!(r"/0:0:{scsi_addr}/block/");
ScsiBlockMatcher {
scsi_path_suffix: format!("/0:0:{scsi_addr}"),
}
}
ScsiBlockMatcher { search }
/// Splits `devpath` around the first `/block/` segment.
///
/// Returns `(before, after)`: the text preceding the segment and the text
/// following it. Returns `None` when `devpath` has no `/block/` segment.
fn split_block_devpath<'a>(&self, devpath: &'a str) -> Option<(&'a str, &'a str)> {
    devpath.split_once(BLOCK_SEGMENT)
}
}
impl UeventMatcher for ScsiBlockMatcher {
fn is_match(&self, uev: &Uevent) -> bool {
uev.subsystem == BLOCK && uev.devpath.contains(&self.search) && !uev.devname.is_empty()
if uev.action != U_EVENT_ACTION_ADD {
return false;
}
if uev.subsystem != BLOCK || uev.devname.is_empty() {
return false;
}
let (prefix, suffix) = match self.split_block_devpath(&uev.devpath) {
Some(parts) => parts,
None => return false,
};
prefix.ends_with(&self.scsi_path_suffix) && !suffix.contains('/') && suffix == uev.devname
}
}
@@ -106,6 +130,23 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
#[cfg(test)]
mod tests {
use super::*;
use crate::linux_abi::U_EVENT_ACTION_ADD;
/// Builds an "add" uevent for the block subsystem whose devpath is
/// `<pci root>/0000:00:00.0/virtio0/host0/target0:0:<target>/0:0:<addr>/block/<devpath_suffix>`.
///
/// `<target>` is the part of `addr` before the first `:` (or `"0"` when the
/// split yields nothing); `devname` is stored verbatim in the uevent.
fn make_scsi_block_uevent(addr: &str, devname: &str, devpath_suffix: &str) -> Uevent {
    let target = addr.split(':').next().unwrap_or("0");
    let root_bus = create_pci_root_bus_path("00");
    let devpath = format!(
        "{root_bus}/0000:00:00.0/virtio0/host0/target0:0:{target}/0:0:{addr}/block/{devpath_suffix}"
    );

    let mut event = Uevent::default();
    event.devpath = devpath;
    event.devname = devname.to_string();
    event.subsystem = BLOCK.to_string();
    event.action = U_EVENT_ACTION_ADD.to_string();
    event
}
#[tokio::test]
#[allow(clippy::redundant_clone)]
@@ -124,6 +165,7 @@ mod tests {
let mut uev_b = uev_a.clone();
let addr_b = "2:0";
uev_b.devname = "sdb".to_string();
uev_b.devpath =
format!("{root_bus}/0000:00:00.0/virtio0/host0/target0:0:2/0:0:{addr_b}/block/sdb");
let matcher_b = ScsiBlockMatcher::new(addr_b);
@@ -133,4 +175,21 @@ mod tests {
assert!(!matcher_b.is_match(&uev_a));
assert!(!matcher_a.is_match(&uev_b));
}
#[tokio::test]
async fn test_scsi_block_matcher_rejects_partitions() {
    let matcher = ScsiBlockMatcher::new("0:0");

    // Whole disk: the devpath ends in `/block/sda`.
    let whole_disk = make_scsi_block_uevent("0:0", "sda", "sda");
    assert!(
        matcher.is_match(&whole_disk),
        "whole disk uevent should match"
    );

    // Partition: the devpath ends in `/block/sda/sda1`.
    let partition = make_scsi_block_uevent("0:0", "sda1", "sda/sda1");
    assert!(
        !matcher.is_match(&partition),
        "partition uevent should not match"
    );
}
}

View File

@@ -22,6 +22,7 @@ use tracing::instrument;
use self::bind_watcher_handler::BindWatcherHandler;
use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler};
pub use self::ephemeral_handler::update_ephemeral_mounts;
use self::ephemeral_handler::EphemeralHandler;
use self::fs_handler::{OverlayfsHandler, VirtioFsHandler};
use self::image_pull_handler::ImagePullHandler;
@@ -30,15 +31,13 @@ use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_sto
use crate::mount::{baremount, is_mounted, remove_mounts};
use crate::sandbox::Sandbox;
pub use self::ephemeral_handler::update_ephemeral_mounts;
mod bind_watcher_handler;
mod block_handler;
mod ephemeral_handler;
mod fs_handler;
mod image_pull_handler;
mod local_handler;
mod multi_layer_erofs;
pub mod multi_layer_erofs;
const RW_MASK: u32 = 0o660;
const RO_MASK: u32 = 0o440;
@@ -168,6 +167,8 @@ struct MultiLayerProcessResult {
/// Temporary mount points (upper/lower) backing the overlay, needed for
/// container-scoped cleanup via `container_mounts`.
temp_mount_points: Vec<String>,
/// dm-verity device paths that need to be destroyed during cleanup
verity_devices: Vec<String>,
}
/// Handle multi-layer storage by creating the overlay device.
@@ -209,6 +210,7 @@ async fn handle_multi_layer_storage(
device,
processed_mount_points: result.processed_mount_points,
temp_mount_points: result.temp_mount_points,
verity_devices: result.verity_devices,
}))
}
@@ -303,6 +305,7 @@ pub async fn add_storages(
}
}
mount_list.extend(result.temp_mount_points);
mount_list.extend(result.verity_devices);
continue;
}

View File

@@ -10,10 +10,14 @@
//! - Storage with X-kata.overlay-lower: erofs layers (lowerdir)
//! - Creates overlay to combine them
//! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount
//! - Supports GPT-partitioned disks where each layer is a separate partition
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use tokio::time::sleep;
use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
use crate::device::scsi_device_handler::get_scsi_device_name;
@@ -44,7 +48,9 @@ pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer";
const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper";
const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower";
const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true";
const OPT_GPT_PARTITIONED: &str = "X-kata.gpt-partitioned=true";
const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path=";
const OPT_PARTITION_NUMBER: &str = "X-kata.partition-number=";
#[derive(Debug)]
pub struct MultiLayerErofsHandler {}
@@ -57,13 +63,19 @@ pub struct MultiLayerErofsResult {
/// overlay. These must be tracked so they are unmounted *after* the
/// overlay target during container teardown.
pub temp_mount_points: Vec<String>,
/// dm-verity device paths that need to be destroyed during cleanup
pub verity_devices: Vec<String>,
}
#[allow(dead_code)]
#[derive(Debug)]
struct MkdirDirective {
raw_path: String,
mode: Option<String>,
}
/// Helper struct to track layer mount information including dm-verity devices
#[derive(Debug)]
struct LayerMountInfo {
verity_device: Option<String>,
}
#[async_trait::async_trait]
@@ -122,6 +134,7 @@ pub async fn handle_multi_layer_erofs_group(
let mut ext4_storage: Option<&Storage> = None;
let mut erofs_storages: Vec<&Storage> = Vec::new();
let mut mkdir_dirs: Vec<MkdirDirective> = Vec::new();
let mut has_gpt_partition: bool = false;
for storage in &multi_layer_storages {
if is_upper_storage(storage) {
@@ -139,19 +152,33 @@ pub async fn handle_multi_layer_erofs_group(
}
}
} else if is_lower_storage(storage) {
// Each GPT partition is provided as a separate storage entry by the host
if !has_gpt_partition && is_gpt_partitioned(storage) {
has_gpt_partition = true;
}
erofs_storages.push(*storage);
}
}
let ext4 = ext4_storage
.ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
if erofs_storages.is_empty() {
return Err(anyhow!(
"multi-layer erofs missing erofs lower layer storage"
));
}
// Only sort erofs layers by partition number in GPT mode.
// In GPT mode, each storage carries X-kata.partition-number=N and layers
// must be ordered by partition number so that the overlay lowerdir
// precedence is correct (lower partition number = higher overlay priority).
// In non-GPT mode all partition numbers are None, so sorting would be a
// no-op that needlessly reorders elements.
if has_gpt_partition {
erofs_storages.sort_by_key(|storage| get_partition_number(storage).unwrap_or(u32::MAX));
}
let ext4 = ext4_storage
.ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
info!(
logger,
"Handling multi-layer erofs group";
@@ -178,7 +205,7 @@ pub async fn handle_multi_layer_erofs_group(
let upper_mount = temp_base.join("upper");
fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?;
wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger).await?;
wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger, None).await?;
for mkdir_dir in &mkdir_dirs {
// As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it.
@@ -201,6 +228,9 @@ pub async fn handle_multi_layer_erofs_group(
}
let mut lower_mounts = Vec::new();
let mut verity_devices = Vec::new();
let mut base_device_cache: HashMap<String, String> = HashMap::new();
for (index, erofs) in erofs_storages.iter().enumerate() {
let lower_mount = temp_base.join(format!("lower-{}", index));
fs::create_dir_all(&lower_mount).context(format!(
@@ -208,8 +238,25 @@ pub async fn handle_multi_layer_erofs_group(
lower_mount.display()
))?;
wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger).await?;
// Resolve the base device at most once per distinct `source`.
//
// `entry(..).or_insert(expr)` evaluates `expr` before the entry is inspected,
// so the device lookup would be awaited on every iteration even on a cache
// hit. Look the entry up explicitly and only resolve on a miss.
let base_dev_path = if is_gpt_partitioned(erofs) {
    let cached = base_device_cache.get(&erofs.source).cloned();
    let path = match cached {
        Some(path) => path,
        None => {
            let resolved = resolve_base_device_path(erofs, sandbox).await?;
            base_device_cache.insert(erofs.source.clone(), resolved.clone());
            resolved
        }
    };
    Some(path)
} else {
    // Non-GPT layers are resolved inside `wait_and_mount_layer`.
    None
};
let mount_info =
wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, base_dev_path).await?;
lower_mounts.push(lower_mount);
// Collect dm-verity device for cleanup
if let Some(verity_dev) = mount_info.verity_device {
verity_devices.push(verity_dev);
}
}
// If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount.
@@ -318,6 +365,7 @@ pub async fn handle_multi_layer_erofs_group(
mount_point: ext4.mount_point.clone(),
processed_mount_points,
temp_mount_points,
verity_devices,
})
}
@@ -407,7 +455,6 @@ fn parse_mkdir_directive(spec: &str) -> Result<MkdirDirective> {
Ok(MkdirDirective {
raw_path: raw_path.to_string(),
mode: parts.get(1).map(|s| s.to_string()),
})
}
@@ -467,7 +514,8 @@ async fn wait_and_mount_layer(
layer_mount: &Path,
sandbox: &Arc<Mutex<Sandbox>>,
logger: &Logger,
) -> Result<()> {
base_dev_path: Option<String>,
) -> Result<LayerMountInfo> {
info!(
logger,
"Waiting for layer device";
@@ -475,22 +523,41 @@ async fn wait_and_mount_layer(
"driver" => &layer.driver,
"mount-point" => layer_mount.display(),
);
let dev_path = match layer.driver.as_str() {
DRIVER_SCSI_TYPE => {
// For SCSI devices, we need to wait for the device to appear and get its path before mounting.
get_scsi_device_name(sandbox, &layer.source).await?
}
DRIVER_BLK_PCI_TYPE => {
let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?;
get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await?
}
_ => {
// For non-SCSI devices, we can assume the source is directly mountable.
let is_gpt = is_gpt_partitioned(layer);
let partition_num = get_partition_number(layer);
// Get the base device path
let dev_path = match base_dev_path {
Some(path) => path,
None => resolve_base_device_path(layer, sandbox).await?,
};
// For GPT-partitioned disks, use the partition device path
let dev_path = if is_gpt {
if let Some(part_num) = partition_num {
let path = get_partition_device_path(&dev_path, part_num);
info!(
logger,
"GPT-partitioned mode: using partition device";
"base-device" => &dev_path,
"partition-number" => part_num,
"partition-device" => &path,
);
// Wait for partition device node to appear
wait_for_partition_device(&path, logger).await?;
path
} else {
return Err(anyhow!(
"unsupported driver type '{}' for multi-layer erofs",
layer.driver
"GPT-partitioned storage missing partition number: {:?}",
layer
));
}
} else {
// Non-GPT mode: use base device directly
dev_path.clone()
};
info!(
@@ -500,6 +567,7 @@ async fn wait_and_mount_layer(
"fstype" => &layer.fstype,
"devname" => &dev_path,
"mount-point" => layer_mount.display(),
"gpt-mode" => is_gpt,
);
create_mount_destination(Path::new(&dev_path), layer_mount, "", &layer.fstype)
@@ -548,7 +616,106 @@ async fn wait_and_mount_layer(
// After successfully mounting the layer, we track the mount point for cleanup.
track_temporary_mount_for_cleanup(sandbox, layer_mount, logger).await?;
Ok(())
Ok(LayerMountInfo {
verity_device: None,
})
}
/// Resolves the guest device path that backs `layer`, dispatching on
/// `layer.driver`.
///
/// - `DRIVER_SCSI_TYPE`: `layer.source` is handed to `get_scsi_device_name`.
/// - `DRIVER_BLK_PCI_TYPE`: `layer.source` is parsed by
///   `pcipath_from_dev_tree_path` and the result is handed to
///   `get_virtio_blk_pci_device_name`.
///
/// # Errors
/// Propagates any failure from the helpers above, and rejects every other
/// driver type with an "unsupported driver type" error.
async fn resolve_base_device_path(
    layer: &Storage,
    sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<String> {
    let driver = layer.driver.as_str();

    let device_path = match driver {
        DRIVER_SCSI_TYPE => get_scsi_device_name(sandbox, &layer.source).await?,
        DRIVER_BLK_PCI_TYPE => {
            let (root_complex, pci_path) = pcipath_from_dev_tree_path(&layer.source)?;
            get_virtio_blk_pci_device_name(sandbox, root_complex, &pci_path).await?
        }
        _ => {
            // Only SCSI and virtio-blk PCI sources are handled here.
            return Err(anyhow!(
                "unsupported driver type '{}' for multi-layer erofs",
                driver
            ));
        }
    };

    Ok(device_path)
}
/// Reports whether `storage` carries the `X-kata.gpt-partitioned=true` option.
///
/// The comparison is an exact, whole-string match against each option.
fn is_gpt_partitioned(storage: &Storage) -> bool {
    storage
        .options
        .iter()
        .any(|opt| opt.as_str() == OPT_GPT_PARTITIONED)
}
/// Reads the partition number from the first `X-kata.partition-number=<N>`
/// option of `storage`.
///
/// Returns `None` when no such option exists, or when the first such option
/// does not parse as a `u32` (later options are not consulted).
fn get_partition_number(storage: &Storage) -> Option<u32> {
    storage
        .options
        .iter()
        .find_map(|opt| opt.strip_prefix(OPT_PARTITION_NUMBER))
        .and_then(|digits| digits.parse().ok())
}
/// Get the partition device path for a GPT-partitioned disk
///
/// For GPT mode: the storage.source contains the base disk path (e.g., "/dev/vda")
/// We need to append the partition number to get the partition path (e.g., "/dev/vda1")
///
/// Follows the kernel naming rule: if the base device name ends with a digit,
/// insert a 'p' separator before the partition number to avoid ambiguity.
/// "Digit" means an ASCII digit (`0`-`9`); other Unicode numeric characters
/// do not trigger the separator.
/// This correctly handles all device families:
/// - /dev/vda -> /dev/vda1 (no trailing digit, bare number)
/// - /dev/sda -> /dev/sda1
/// - /dev/nvme0n1 -> /dev/nvme0n1p1 (trailing digit, needs 'p')
/// - /dev/mmcblk0 -> /dev/mmcblk0p1
/// - /dev/loop0 -> /dev/loop0p1
///
/// An empty `base_path` has no trailing digit, so the result is just the
/// partition number.
fn get_partition_device_path(base_path: &str, partition_number: u32) -> String {
    // `char::is_numeric` would also accept characters such as '²' or '٣';
    // restrict the test to ASCII digits.
    if base_path.ends_with(|c: char| c.is_ascii_digit()) {
        format!("{}p{}", base_path, partition_number)
    } else {
        format!("{}{}", base_path, partition_number)
    }
}
/// Wait for partition device node to appear in /dev.
///
/// When a virtio-blk device with a GPT is hotplugged, the kernel automatically
/// scans the partition table and creates partition nodes. However, devtmpfs node
/// creation may lag slightly behind the uevent, so we poll briefly if needed.
#[allow(dead_code)]
async fn wait_for_partition_device(device_path: &str, logger: &Logger) -> Result<()> {
let device_path_buf = PathBuf::from(device_path);
if device_path_buf.exists() {
return Ok(());
}
const MAX_WAIT_MS: u64 = 1000;
const POLL_INTERVAL_MS: u64 = 50;
for attempt in 0..(MAX_WAIT_MS / POLL_INTERVAL_MS) {
sleep(Duration::from_millis(POLL_INTERVAL_MS)).await;
if device_path_buf.exists() {
info!(
logger,
"Partition device node appeared after polling: {} (attempt {})",
device_path,
attempt + 1
);
return Ok(());
}
}
Err(anyhow!(
"partition device {} did not appear within {} ms",
device_path,
MAX_WAIT_MS
))
}
#[cfg(test)]
@@ -603,27 +770,6 @@ mod tests {
// --- parse_mkdir_directive ---
#[rstest]
#[case("some/path", true, "some/path", None)]
#[case("some/path:0755", true, "some/path", Some("0755"))]
#[case("path:mode:extra", true, "path", Some("mode:extra"))]
#[case("", false, "", None)]
fn test_parse_mkdir_directive(
#[case] spec: &str,
#[case] should_pass: bool,
#[case] expected_path: &str,
#[case] expected_mode: Option<&str>,
) {
let result = parse_mkdir_directive(spec);
if should_pass {
let d = result.expect("expected Ok");
assert_eq!(d.raw_path, expected_path);
assert_eq!(d.mode.as_deref(), expected_mode);
} else {
assert!(result.is_err(), "expected Err for spec {:?}", spec);
}
}
#[test]
fn test_parse_mkdir_directive_rejects_null_bytes() {
assert!(parse_mkdir_directive("foo\0bar").is_err());
@@ -728,4 +874,29 @@ mod tests {
s.options
);
}
// --- get_partition_device_path ---
// Table-driven check: each case is (base device path, partition number,
// expected partition device path).
#[rstest]
// Base names ending in a letter: the partition number is appended directly.
#[case("/dev/vda", 1, "/dev/vda1")]
#[case("/dev/sda", 3, "/dev/sda3")]
#[case("/dev/hda", 2, "/dev/hda2")]
// Base names ending in a digit: a 'p' separator precedes the partition number.
#[case("/dev/nvme0n1", 1, "/dev/nvme0n1p1")]
#[case("/dev/nvme0n1", 2, "/dev/nvme0n1p2")]
#[case("/dev/mmcblk0", 1, "/dev/mmcblk0p1")]
#[case("/dev/loop0", 1, "/dev/loop0p1")]
#[case("/dev/nbd0", 3, "/dev/nbd0p3")]
fn test_get_partition_device_path(
#[case] base: &str,
#[case] part: u32,
#[case] expected: &str,
) {
assert_eq!(
get_partition_device_path(base, part),
expected,
"get_partition_device_path({}, {})",
base,
part
);
}
}

View File

@@ -31,7 +31,9 @@ sha2 = "0.10.8"
flate2 = "1.1"
nix = "0.26.4"
oci-spec = { version = "0.8.1", features = ["runtime"] }
gpt = "4.1.0"
scopeguard = "1.0.0"
crc = "3.4.0"
safe-path = { path = "../safe-path", optional = true }
[target.'cfg(target_os = "macos")'.dependencies]

View File

@@ -0,0 +1,463 @@
// Copyright (c) 2026 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
// GPT (GUID Partition Table) disk metadata generation for EROFS multi-layer rootfs.
//
// This module generates a GPT metadata file (gpt_meta_head.img) that is used
// in conjunction with VMDK descriptors to present multiple EROFS layers as a
// single virtual disk with multiple GPT partitions to the guest VM.
// Backup GPT structures are omitted — the virtual disk is ephemeral and
// read-only, so backup recovery serves no purpose.
//
// Key features:
// - Only includes read-only EROFS layers in GPT partitions (rw layer handled separately)
// - Preserves the original order of layers from rootfs_mounts
// - Generates minimal GPT metadata without copying layer data
// - Supports 1MiB alignment for partitions
// - Creates VMDK-compatible descriptor with head/layer/pad extents
use anyhow::{anyhow, Context, Result};
use crc::Crc;
use gpt::{disk::LogicalBlockSize, mbr::ProtectiveMBR, partition_types, GptConfig};
use scopeguard;
use std::convert::TryFrom;
use std::fs;
use std::io::{Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use crate::sl;
/// Logical sector size in bytes, used for all LBA arithmetic in this module.
/// Defined locally as a `u64` because the gpt crate's `DEFAULT_SECTOR_SIZE` is a
/// `LogicalBlockSize` enum value, not a `u64`.
const SECTOR_SIZE: u64 = 512;
/// 1 MiB alignment start
const FIRST_PARTITION_LBA: u64 = 2048;
/// 1 MiB alignment
const ALIGNMENT_LBA: u64 = 2048;
/// bytes per GPT partition entry (UEFI standard)
const GPT_ENTRY_SIZE: u64 = 128;
/// standard GPT partition entry count
const MAX_GPT_PARTITIONS: usize = 128;
/// 32 sectors for partition entries (128 entries * 128 bytes each / 512 bytes per sector)
const ENTRIES_SECTORS: u64 = (MAX_GPT_PARTITIONS as u64 * GPT_ENTRY_SIZE) / SECTOR_SIZE;
/// GPT header size in bytes (UEFI specification)
const GPT_HEADER_SIZE: usize = 92;
/// Offset (in bytes) of the GPT primary header within the head file (LBA 1)
const GPT_HEADER_FILE_OFFSET: u64 = SECTOR_SIZE;
/// CRC-32/ISO-HDLC — the same algorithm the `gpt` crate uses internally.
const CRC_32: Crc<u32> = Crc::<u32>::new(&crc::CRC_32_ISO_HDLC);
/// GPT head metadata file name
const GPT_META_HEAD_IMG: &str = "gpt_meta_head.img";
/// Temporary full GPT image used to synthesize head metadata
const GPT_META_FULL_IMG: &str = "gpt_meta_full.img";
/// Represents a read-only EROFS layer to be placed in a GPT partition
///
/// One `ErofsLayer` becomes exactly one partition in `calculate_gpt_layout`.
#[derive(Debug, Clone)]
pub struct ErofsLayer {
/// Path to the EROFS image file
pub path: String,
/// Size in sectors (ceiling division, sector = 512 bytes).
/// `calculate_gpt_layout` rejects a layer whose value is 0.
pub size_sectors: u64,
/// Snapshot ID extracted from path (for naming); it is embedded in the
/// generated partition name.
pub snapshot_id: String,
}
/// GPT partition layout information for a single layer
///
/// Produced by `calculate_gpt_layout`, one entry per input layer.
#[derive(Debug, Clone)]
pub struct PartitionLayout {
/// Layer information (a clone of the input layer)
pub layer: ErofsLayer,
/// Partition number (1-indexed): the layer's index in the input slice plus one
pub partition_number: u32,
/// First LBA of the partition (aligned to `ALIGNMENT_LBA`)
pub start_lba: u64,
/// Last LBA of the partition, inclusive: `start_lba + size_sectors - 1`
pub end_lba: u64,
/// Partition name, `erofs-{index}-s{snapshot_id}` truncated to at most 36 characters
pub name: String,
}
/// Complete GPT disk layout calculation result
#[derive(Debug, Clone)]
pub struct GptDiskLayout {
/// All partition layouts in order (same order as the input layers)
pub partitions: Vec<PartitionLayout>,
/// Total sectors in the virtual disk.
/// As returned by `calculate_gpt_layout` this includes the space reserved
/// for the backup entry array and backup header; `generate_gpt_metadata`
/// overwrites it with the last partition's end LBA plus one.
pub total_sectors: u64,
/// Logical block size in bytes (set to `SECTOR_SIZE` by `calculate_gpt_layout`)
pub lb_size: u64,
}
/// Result of GPT metadata file generation
#[derive(Debug)]
pub struct GptMetadataFiles {
/// Path to generated gpt_meta_head.img
pub head_path: PathBuf,
/// Size of head file in sectors (`FIRST_PARTITION_LBA` when produced by
/// `generate_gpt_metadata`)
pub head_sectors: u64,
/// Paths to generated padding files (between partitions).
/// NOTE(review): `generate_gpt_metadata` currently always returns this empty.
pub pad_paths: Vec<PathBuf>,
}
/// Derives a snapshot ID from a layer path by taking the name of the
/// directory that directly contains the file.
///
/// Examples:
/// ".../snapshots/35/layer.erofs" ---> "35"
///
/// Falls back to `"unknown"` when the path has no parent directory or the
/// parent has no final name component (e.g. `"layer.erofs"`, `"/layer.erofs"`,
/// or an empty string). Non-UTF-8 names are converted lossily.
pub fn extract_snapshot_id(source: &str) -> String {
    let containing_dir = Path::new(source).parent().and_then(Path::file_name);

    match containing_dir {
        Some(dir_name) => dir_name.to_string_lossy().into_owned(),
        None => String::from("unknown"),
    }
}
/// Get file size in bytes
pub fn get_erofs_layer_size(path: &str) -> Result<u64> {
let metadata = fs::metadata(path).context(format!("failed to stat EROFS file: {}", path))?;
Ok(metadata.len())
}
/// Rounds `lba` up to the nearest multiple of `alignment`.
///
/// A value that is already a multiple of `alignment` is returned unchanged.
fn align_up(lba: u64, alignment: u64) -> u64 {
    if lba.is_multiple_of(alignment) {
        return lba;
    }

    // Not aligned: jump to the start of the next alignment-sized window.
    let next_window = lba / alignment + 1;
    next_window * alignment
}
/// Computes where each EROFS layer lands on the virtual GPT disk.
///
/// Pure calculation, no file is touched. The resulting layout is:
/// - LBA 0: Protective MBR
/// - LBA 1: Primary GPT Header
/// - LBA 2-33: Primary Partition Entry Array
/// - LBA 34-2047: Reserved/padding
/// - LBA 2048+: Partitions (each start aligned to 1MiB), in input order
/// - End: Backup Partition Entry Array + Backup GPT Header
///
/// # Errors
/// Returns an error when `layers` is empty, when it holds more than
/// `MAX_GPT_PARTITIONS` entries, or when any layer has `size_sectors == 0`.
///
/// NOTE(review): LBA arithmetic is unchecked; sizes are assumed to be far
/// below `u64::MAX`.
pub fn calculate_gpt_layout(layers: &[ErofsLayer]) -> Result<GptDiskLayout> {
    if layers.is_empty() {
        return Err(anyhow!("no EROFS layers provided for GPT layout"));
    }

    // TODO: Fix the length of partitions exceeding GPT limits.
    // It should be addressed by splitting into multiple GPT disks if needed, but for now we enforce the limit.
    let layer_count = layers.len();
    if layer_count > MAX_GPT_PARTITIONS {
        return Err(anyhow!(
            "The layers for GPT: {} exceeds maximum {} partitions \
             (ENTRIES_SECTORS is sized for {} entries)",
            layer_count,
            MAX_GPT_PARTITIONS,
            MAX_GPT_PARTITIONS,
        ));
    }

    // An empty layer cannot back a partition; report the first one found.
    let first_empty = layers
        .iter()
        .enumerate()
        .find(|(_, layer)| layer.size_sectors == 0);
    if let Some((idx, layer)) = first_empty {
        return Err(anyhow!(
            "EROFS layer {} ({}) has size_sectors = 0, cannot generate GPT partition",
            idx,
            layer.path
        ));
    }

    // Walk the layers in order, placing each one at the next aligned LBA.
    let mut next_free_lba = FIRST_PARTITION_LBA;
    let mut partitions = Vec::with_capacity(layer_count);
    for (idx, layer) in layers.iter().enumerate() {
        let start_lba = align_up(next_free_lba, ALIGNMENT_LBA);
        let end_lba = start_lba + layer.size_sectors - 1;

        // Partition name: erofs-{index}-s{snapshot_id}, cut to at most 36
        // characters (whole characters, so UTF-8 is never split).
        let full_name = format!("erofs-{}-s{}", idx, layer.snapshot_id);
        let name: String = full_name.chars().take(36).collect();

        partitions.push(PartitionLayout {
            layer: layer.clone(),
            partition_number: (idx + 1) as u32,
            start_lba,
            end_lba,
            name,
        });

        next_free_lba = end_lba + 1;
    }

    // Backup structures sit after the last partition, on an aligned boundary:
    // the entry array first, then one sector for the backup header.
    let backup_entries_lba = align_up(next_free_lba, ALIGNMENT_LBA);
    let backup_header_lba = backup_entries_lba + ENTRIES_SECTORS;
    let total_sectors = backup_header_lba + 1;
    let last_usable_lba = backup_entries_lba - 1;

    // Defensive check that no partition runs into the backup area. With the
    // placement above this is not expected to trigger.
    let overlapping = partitions
        .iter()
        .enumerate()
        .find(|(_, part)| part.end_lba > last_usable_lba);
    if let Some((idx, part)) = overlapping {
        return Err(anyhow!(
            "partition {} (end_lba={}) exceeds last usable LBA ({})",
            idx,
            part.end_lba,
            last_usable_lba
        ));
    }

    Ok(GptDiskLayout {
        partitions,
        total_sectors,
        lb_size: SECTOR_SIZE,
    })
}
/// Generate GPT head metadata and return layout information
///
/// This is the main entry point for GPT metadata generation.
/// It creates a temporary full GPT image (needed by the gpt crate to
/// produce valid primary structures), extracts the head region, patches
/// the primary header to remove references to backup GPT, and discards
/// the rest.
///
/// Output:
/// - gpt_meta_head.img: Primary GPT structures (MBR + GPT header + partition entries + padding)
///
/// The temporary full image (`gpt_meta_full.img` in `container_dir`) is
/// removed when this function returns, on success and on every error path.
///
/// # Errors
/// Fails when `erofs_layers` is empty, when the layout cannot be calculated,
/// or when creating, extracting or patching the image files fails.
// `sid` and `cid` are currently not read by this function.
#[allow(unused_variables)]
pub fn generate_gpt_metadata(
    sid: &str,
    cid: &str,
    erofs_layers: Vec<ErofsLayer>,
    container_dir: &Path,
) -> Result<(GptDiskLayout, GptMetadataFiles)> {
    if erofs_layers.is_empty() {
        return Err(anyhow!(
            "no EROFS layers provided for GPT metadata generation"
        ));
    }

    let mut layout = calculate_gpt_layout(&erofs_layers)?;

    // Taking the last partition up front both validates that the layout is
    // non-empty and avoids an `unwrap()` further down.
    let last_partition_end = layout
        .partitions
        .last()
        .map(|part| part.end_lba)
        .ok_or_else(|| anyhow!("no partitions in layout, cannot generate GPT metadata"))?;

    let full_path = container_dir.join(GPT_META_FULL_IMG);

    // Arm the cleanup guard BEFORE the image is created, so a failure halfway
    // through generation does not leave the temporary file behind. Removing a
    // file that was never created is harmless; the error is ignored.
    let _cleanup = scopeguard::guard((), |_| {
        let _ = fs::remove_file(&full_path);
    });

    generate_full_gpt_image(&layout, &full_path).context("failed to generate full GPT image")?;

    // Extract head: LBA 0 to FIRST_PARTITION_LBA (2048 sectors = 1 MiB)
    let lb_size = layout.lb_size;
    let head_sectors = FIRST_PARTITION_LBA;
    let head_size = head_sectors * lb_size;
    let head_path = container_dir.join(GPT_META_HEAD_IMG);

    extract_file_range(&full_path, &head_path, 0, head_size)
        .context("failed to extract GPT head metadata")?;

    // Patch the primary GPT header so AlternateLBA / LastUsableLBA are
    // consistent with a virtual disk that ends at the last partition and
    // carries no backup GPT.
    patch_primary_gpt_header(&head_path, last_partition_end)
        .context("failed to patch primary GPT header")?;

    // Adjust the layout to reflect the virtual disk size (no backup).
    layout.total_sectors = last_partition_end + 1;

    info!(
        sl!(),
        "Generated GPT head file: {} ({} sectors, {} bytes, virtual disk {} sectors)",
        head_path.display(),
        head_sectors,
        head_size,
        layout.total_sectors
    );

    let metadata_files = GptMetadataFiles {
        head_path,
        head_sectors,
        pad_paths: Vec::new(),
    };

    Ok((layout, metadata_files))
}
fn generate_full_gpt_image(layout: &GptDiskLayout, output_path: &Path) -> Result<()> {
let lb_size = layout.lb_size;
let total_size = layout.total_sectors * lb_size;
let mut file = fs::OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true)
.open(output_path)
.context(format!(
"failed to create full GPT image: {}",
output_path.display()
))?;
file.set_len(total_size)
.context("failed to pre-allocate full GPT image")?;
let mbr =
ProtectiveMBR::with_lb_size(u32::try_from(layout.total_sectors - 1).unwrap_or(0xFFFF_FFFF));
mbr.overwrite_lba0(&mut file)
.context("failed to write Protective MBR")?;
let mut gdisk = GptConfig::new()
.writable(true)
.logical_block_size(LogicalBlockSize::Lb512)
.change_partition_count(true)
.create_from_device(file, None)
.context("failed to initialize GPT config")?;
for part_layout in &layout.partitions {
let part_size_bytes = (part_layout.end_lba - part_layout.start_lba + 1) * lb_size;
gdisk
.add_partition(
&part_layout.name,
part_size_bytes,
partition_types::LINUX_FS,
0,
Some(ALIGNMENT_LBA),
)
.context(format!("failed to add partition '{}'", part_layout.name))?;
}
let mut file = gdisk
.write()
.context("failed to write GPT partition table")?;
file.flush().context("failed to flush full GPT image")?;
Ok(())
}
/// Patch the primary GPT header in the extracted head file to remove
/// backup GPT references.
///
/// Sets `AlternateLBA` to one sector beyond the virtual disk (so the kernel
/// detects "no valid backup" and falls back to the primary) and
/// `LastUsableLBA` to the end of the last partition, then recomputes the
/// header CRC32.
fn patch_primary_gpt_header(head_path: &Path, last_partition_end_lba: u64) -> Result<()> {
let mut file = fs::OpenOptions::new()
.read(true)
.write(true)
.open(head_path)
.context("failed to open head file for patching")?;
// Read the 92-byte GPT header starting at LBA 1.
file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
let mut header = [0u8; GPT_HEADER_SIZE];
file.read_exact(&mut header)?;
// AlternateLBA (offset 32, 8 bytes LE) — point beyond virtual disk
let alternate_lba = last_partition_end_lba + 1;
header[32..40].copy_from_slice(&alternate_lba.to_le_bytes());
// LastUsableLBA (offset 48, 8 bytes LE) — last partition end
header[48..56].copy_from_slice(&last_partition_end_lba.to_le_bytes());
// Zero HeaderCRC32 (offset 16, 4 bytes LE) before computing new CRC
header[16..20].copy_from_slice(&0u32.to_le_bytes());
let new_crc = {
let mut digest = CRC_32.digest();
digest.update(&header);
digest.finalize()
};
header[16..20].copy_from_slice(&new_crc.to_le_bytes());
// Write patched header back
file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
file.write_all(&header)?;
file.flush()?;
info!(
sl!(),
"Patched primary GPT header: AlternateLBA={}, LastUsableLBA={}, CRC32={:#010x}",
alternate_lba,
last_partition_end_lba,
new_crc
);
Ok(())
}
fn extract_file_range(src: &Path, dst: &Path, offset: u64, size: u64) -> Result<()> {
let mut src_file = fs::OpenOptions::new()
.read(true)
.open(src)
.context(format!("failed to open source file: {}", src.display()))?;
src_file
.seek(SeekFrom::Start(offset))
.context("failed to seek source file")?;
let mut dst_file = fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(dst)
.context(format!("failed to create output file: {}", dst.display()))?;
dst_file
.set_len(size)
.context("failed to pre-allocate output file")?;
let mut limited = src_file.take(size);
std::io::copy(&mut limited, &mut dst_file).context("failed to copy file range")?;
dst_file.flush().context("failed to flush output file")?;
Ok(())
}
/// Generate padding file content (all zeros)
///
/// Returns the file path and size in sectors.
pub fn generate_padding_file(output_path: &Path, size_sectors: u64) -> Result<u64> {
let size_bytes = size_sectors * SECTOR_SIZE;
if size_bytes == 0 {
return Err(anyhow!("cannot create zero-size padding file"));
}
let mut file = fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(output_path)
.context(format!(
"failed to create padding file: {}",
output_path.display()
))?;
// Pre-allocate with zeros
file.set_len(size_bytes)
.context("failed to pre-allocate padding file")?;
file.flush().context("failed to flush padding file")?;
drop(file);
info!(
sl!(),
"Generated padding file: {} ({} sectors, {} bytes)",
output_path.display(),
size_sectors,
size_bytes
);
Ok(size_sectors)
}

View File

@@ -54,6 +54,9 @@ pub mod rootless;
/// machine type
pub mod machine_type;
/// GPT (GUID Partition Table) disk layout and metadata generation.
pub mod gpt_disk;
use std::path::Path;
use crate::rootless::{is_rootless, rootless_dir};

View File

@@ -24,6 +24,10 @@ use hypervisor::{
use kata_types::device::{
DRIVER_BLK_CCW_TYPE as KATA_CCW_DEV_TYPE, DRIVER_BLK_PCI_TYPE as KATA_BLK_DEV_TYPE,
};
use kata_types::gpt_disk::{
extract_snapshot_id, generate_gpt_metadata, generate_padding_file, get_erofs_layer_size,
ErofsLayer, GptDiskLayout, GptMetadataFiles,
};
use kata_types::mount::Mount;
use oci_spec::runtime as oci;
use std::fs;
@@ -39,14 +43,11 @@ pub(crate) const EROFS_ROOTFS_TYPE: &str = "erofs";
pub(crate) const RW_LAYER_ROOTFS_TYPE: &str = "ext4";
/// VMDK file extension for merged EROFS image
const EROFS_MERGED_VMDK: &str = "merged_fs.vmdk";
/// Maximum number of virtio-blk devices allowed for multi-layer EROFS rootfs.
///
/// This defensive limit prevents exhausting PCI slot resources, especially on
/// lightweight VMMs (Dragonball, Cloud Hypervisor) where the PCIe root bus has
/// only 32 slots (PCIE_ROOT_BUS_SLOTS_CAPACITY). For QEMU with PCI bridges
/// (30 slots/bridge), this limit is conservative but still applies as a uniform
/// safeguard across all hypervisor backends.
const MAX_VIRTIO_BLK_DEVICES: usize = 10;
/// Maximum number of rootfs layer devices (erofs + rw layer) allowed in multi-layer EROFS mode.
/// This is a pre-flight sanity check before VMDK merging, to prevent excessive block devices
/// when many layers are used without fsmerge.
const MAX_ROOTFS_LAYER_DEVICES: usize = 129; // 128 EROFS layers + 1 rw layer (129 total)
/// Maximum sectors per 2GB extent (2GB / 512 bytes per sector)
const MAX_2GB_EXTENT_SECTORS: u64 = 0x8000_0000 >> 9;
/// Sectors per track for VMDK geometry
@@ -60,12 +61,25 @@ const VMDK_ADAPTER_TYPE: &str = "ide";
/// VMDK hardware version
const VMDK_HW_VERSION: &str = "4";
/// Default shared directory for guest rootfs VMDK files (for multi-layer EROFS)
const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
pub(crate) const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
/// Template for mkdir option in overlay mount (X-containerd.mkdir.path)
const X_CONTAINERD_MKDIR_PATH: &str = "X-containerd.mkdir.path=";
/// Template for mkdir option passed to guest agent (X-kata.mkdir.path)
const X_KATA_MKDIR_PATH: &str = "X-kata.mkdir.path=";
/// Create the per-container directory under the shared filesystem root.
pub(crate) fn ensure_container_dir(sid: &str, cid: &str) -> Result<PathBuf> {
let dir = PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS))
.join(sid)
.join(cid);
fs::create_dir_all(&dir).context(format!(
"failed to create container directory: {}",
dir.display()
))?;
Ok(dir)
}
/// Generate merged VMDK file from multiple EROFS devices
///
/// Creates a VMDK descriptor that combines multiple EROFS images into a single
@@ -104,14 +118,7 @@ async fn generate_merged_erofs_vmdk(
}
// For multiple devices, create VMDK descriptor
let sandbox_dir =
PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS)).join(sid);
let container_dir = sandbox_dir.join(cid);
fs::create_dir_all(&container_dir).context(format!(
"failed to create container directory: {}",
container_dir.display()
))?;
let container_dir = ensure_container_dir(sid, cid)?;
let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
info!(
@@ -129,6 +136,105 @@ async fn generate_merged_erofs_vmdk(
Ok((vmdk_path.display().to_string(), BlockDeviceFormat::Vmdk))
}
/// Helper struct for writing VMDK descriptor files atomically.
///
/// Encapsulates the common VMDK descriptor format: header, extent descriptions,
/// DDB footer, and atomic write (temp file + rename). Used by both fsmerge mode
/// (`create_vmdk_descriptor`) and GPT mode (`create_gpt_vmdk_descriptor`).
struct VmdkDescriptorWriter {
writer: BufWriter<fs::File>,
temp_path: PathBuf,
final_path: PathBuf,
}
impl VmdkDescriptorWriter {
fn new(vmdk_path: &Path) -> Result<Self> {
let temp_path = vmdk_path.with_extension("vmdk.tmp");
if temp_path.components().any(|c| c == Component::ParentDir) {
return Err(anyhow!("Invalid input: {}", temp_path.display()));
}
let file = fs::File::create(&temp_path).context(format!(
"failed to create temp VMDK file: {}",
temp_path.display()
))?;
let mut writer = BufWriter::new(file);
writeln!(writer, "# Disk DescriptorFile")?;
writeln!(writer, "version=1")?;
writeln!(writer, "CID=fffffffe")?;
writeln!(writer, "parentCID=ffffffff")?;
writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
writeln!(writer)?;
writeln!(writer, "# Extent description")?;
Ok(Self {
writer,
temp_path,
final_path: vmdk_path.to_path_buf(),
})
}
// Write a single extent line (no 2GB chunking).
fn write_extent(&mut self, path: &str, sectors: u64, file_offset: u64) -> Result<()> {
writeln!(
self.writer,
"RW {} FLAT \"{}\" {}",
sectors, path, file_offset
)?;
Ok(())
}
// Write extent lines with 2GB chunking for large files.
fn write_extent_chunked(&mut self, path: &str, total_sectors: u64) -> Result<()> {
let mut remaining = total_sectors;
let mut file_offset: u64 = 0;
while remaining > 0 {
let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
self.write_extent(path, chunk, file_offset)?;
file_offset += chunk;
remaining -= chunk;
}
Ok(())
}
// Write DDB footer, flush, and atomically rename to final path.
fn finalize(mut self, total_sectors: u64) -> Result<()> {
writeln!(self.writer)?;
let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
writeln!(self.writer, "# The Disk Data Base")?;
writeln!(self.writer, "#DDB")?;
writeln!(self.writer)?;
writeln!(
self.writer,
"ddb.virtualHWVersion = \"{}\"",
VMDK_HW_VERSION
)?;
writeln!(self.writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
writeln!(self.writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
writeln!(
self.writer,
"ddb.geometry.sectors = \"{}\"",
SECTORS_PER_TRACK
)?;
writeln!(self.writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
self.writer
.flush()
.context("failed to flush VMDK descriptor")?;
drop(self.writer);
fs::rename(&self.temp_path, &self.final_path).context(format!(
"failed to rename temp VMDK {} -> {}",
self.temp_path.display(),
self.final_path.display()
))?;
Ok(())
}
}
/// Create VMDK descriptor for multiple EROFS extents (flatten device)
///
/// Generates a VMDK descriptor file (twoGbMaxExtentFlat format) that references
@@ -141,7 +247,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
));
}
// collect extent information without writing anything.
struct ExtentInfo {
path: String,
total_sectors: u64,
@@ -160,9 +265,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
continue;
}
// round up to whole sectors to avoid losing tail bytes on non-aligned files.
// VMDK extents are measured in 512-byte sectors; a file that is not sector-aligned
// still needs the last partial sector to be addressable by the VM.
let sectors = file_size.div_ceil(512);
if file_size % 512 != 0 {
@@ -197,43 +299,9 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
));
}
// write descriptor to a temp file, then atomically rename.
let tmp_path = vmdk_path.with_extension("vmdk.tmp");
// Prevent path traversal attacks by rejecting paths containing '..'.
if tmp_path.components().any(|c| c == Component::ParentDir) {
return Err(anyhow!("Invalid input: {}", tmp_path.display()));
}
let file = fs::File::create(&tmp_path).context(format!(
"failed to create temp VMDK file: {}",
tmp_path.display()
))?;
let mut writer = BufWriter::new(file);
// Header
writeln!(writer, "# Disk DescriptorFile")?;
writeln!(writer, "version=1")?;
writeln!(writer, "CID=fffffffe")?;
writeln!(writer, "parentCID=ffffffff")?;
writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
writeln!(writer)?;
// Extent descriptions
writeln!(writer, "# Extent description")?;
let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
for extent in &extents {
let mut remaining = extent.total_sectors;
let mut file_offset: u64 = 0;
while remaining > 0 {
let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
writeln!(
writer,
"RW {} FLAT \"{}\" {}",
chunk, extent.path, file_offset
)?;
file_offset += chunk;
remaining -= chunk;
}
vmdk.write_extent_chunked(&extent.path, extent.total_sectors)?;
info!(
sl!(),
"VMDK extent: {} ({} sectors, {} extent chunk(s))",
@@ -242,45 +310,149 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
extent.total_sectors.div_ceil(MAX_2GB_EXTENT_SECTORS)
);
}
writeln!(writer)?;
// Disk Data Base (DDB)
// Geometry: cylinders = ceil(total_sectors / (sectors_per_track * heads))
let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
writeln!(writer, "# The Disk Data Base")?;
writeln!(writer, "#DDB")?;
writeln!(writer)?;
writeln!(writer, "ddb.virtualHWVersion = \"{}\"", VMDK_HW_VERSION)?;
writeln!(writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
writeln!(writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
writeln!(writer, "ddb.geometry.sectors = \"{}\"", SECTORS_PER_TRACK)?;
writeln!(writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
// Flush the BufWriter to ensure all data is written before rename.
writer.flush().context("failed to flush VMDK descriptor")?;
// Explicitly drop to close the file handle before rename.
drop(writer);
// atomic rename: tmp -> final path.
fs::rename(&tmp_path, vmdk_path).context(format!(
"failed to rename temp VMDK {} -> {}",
tmp_path.display(),
vmdk_path.display()
))?;
vmdk.finalize(total_sectors)?;
info!(
sl!(),
"VMDK descriptor created: {} (total {} sectors, {} extents, {} cylinders)",
"VMDK descriptor created: {} (total {} sectors, {} extents)",
vmdk_path.display(),
total_sectors,
extents.len(),
cylinders
extents.len()
);
Ok(())
}
/// Generate GPT-partitioned VMDK and return layout information for per-partition storage creation
///
/// Returns: (vmdk_path, BlockDeviceFormat::Vmdk, GptDiskLayout, GptMetadataFiles)
fn generate_gpt_vmdk_with_layout(
sid: &str,
cid: &str,
erofs_layers: Vec<ErofsLayer>,
) -> Result<(String, BlockDeviceFormat, GptDiskLayout, GptMetadataFiles)> {
if erofs_layers.is_empty() {
return Err(anyhow!("no EROFS layers provided for GPT VMDK generation"));
}
// Validate all layer paths exist and are regular files
for layer in &erofs_layers {
let metadata = fs::metadata(&layer.path)
.context(format!("EROFS layer path not accessible: {}", layer.path))?;
if !metadata.is_file() {
return Err(anyhow!(
"EROFS layer path is not a regular file: {}",
layer.path
));
}
}
// Create container directory
let container_dir = ensure_container_dir(sid, cid)?;
let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
info!(
sl!(),
"creating GPT-partitioned VMDK for {} EROFS layers: {}",
erofs_layers.len(),
vmdk_path.display()
);
// Generate GPT metadata files
let (layout, mut gpt_files) = generate_gpt_metadata(sid, cid, erofs_layers, &container_dir)
.context("failed to generate GPT metadata")?;
// Create VMDK descriptor with GPT layout and collect generated padding paths
let pad_paths = create_gpt_vmdk_descriptor(&vmdk_path, &layout, &gpt_files)
.context("failed to create GPT VMDK descriptor")?;
gpt_files.pad_paths = pad_paths;
Ok((
vmdk_path.display().to_string(),
BlockDeviceFormat::Vmdk,
layout,
gpt_files,
))
}
/// Create VMDK descriptor for GPT-partitioned disk
///
/// Returns the list of generated padding file paths for cleanup tracking.
fn create_gpt_vmdk_descriptor(
vmdk_path: &Path,
layout: &GptDiskLayout,
gpt_files: &GptMetadataFiles,
) -> Result<Vec<PathBuf>> {
let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
let mut pad_paths: Vec<PathBuf> = Vec::new();
// 1. GPT head metadata
vmdk.write_extent(
&gpt_files.head_path.display().to_string(),
gpt_files.head_sectors,
0,
)?;
info!(
sl!(),
"VMDK extent: GPT head ({} sectors) at {}",
gpt_files.head_sectors,
gpt_files.head_path.display()
);
// 2. Layer extents with padding gaps
// head ends at LBA 2047, so first gap starts at LBA 2048.
let mut prev_end_lba = gpt_files.head_sectors - 1;
let metadata_dir = gpt_files.head_path.parent().ok_or_else(|| {
anyhow!(
"GPT head file has no parent directory: {}",
gpt_files.head_path.display()
)
})?;
for (idx, part) in layout.partitions.iter().enumerate() {
let gap_start_lba = prev_end_lba + 1;
if part.start_lba > gap_start_lba {
let gap_sectors = part.start_lba - gap_start_lba;
let pad_path = metadata_dir.join(format!("pad-{}.img", idx));
generate_padding_file(&pad_path, gap_sectors).context(format!(
"failed to generate padding file: {}",
pad_path.display()
))?;
vmdk.write_extent(&pad_path.display().to_string(), gap_sectors, 0)?;
pad_paths.push(pad_path);
}
vmdk.write_extent_chunked(&part.layer.path, part.layer.size_sectors)?;
info!(
sl!(),
"VMDK extent: {} (partition {}, LBA {}-{}, {} sectors)",
part.layer.path,
part.partition_number,
part.start_lba,
part.end_lba,
part.layer.size_sectors
);
prev_end_lba = part.end_lba;
}
vmdk.finalize(layout.total_sectors)?;
info!(
sl!(),
"GPT VMDK descriptor created: {} (total {} sectors, {} partitions)",
vmdk_path.display(),
layout.total_sectors,
layout.partitions.len()
);
Ok(pad_paths)
}
fn extract_block_device_info(
device_info: &DeviceType,
read_only: bool,
@@ -338,10 +510,16 @@ fn extract_block_device_info(
pub(crate) struct ErofsMultiLayerRootfs {
guest_path: String,
device_ids: Vec<String>,
rwlayer_storage: Option<Storage>, // Writable layer storage (upper layer), typically ext4
erofs_storage: Option<Storage>,
/// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
// Writable layer storage (upper layer), typically ext4
rwlayer_storage: Option<Storage>,
// Read-only EROFS layer storages (lower layers), one per partition in GPT mode
erofs_storages: Vec<Storage>,
// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
vmdk_path: Option<PathBuf>,
// Paths to generated GPT metadata files (head, padding) for cleanup
gpt_metadata_paths: Vec<PathBuf>,
// Container-scoped runtime directory that may only contain generated helper artifacts.
generated_artifacts_dir: PathBuf,
}
impl ErofsMultiLayerRootfs {
@@ -360,8 +538,11 @@ impl ErofsMultiLayerRootfs {
let mut device_ids = Vec::new();
let mut rwlayer_storage: Option<Storage> = None;
let mut erofs_storage: Option<Storage> = None;
let mut erofs_storages: Vec<Storage> = Vec::new();
let mut vmdk_path: Option<PathBuf> = None;
let mut gpt_metadata_paths: Vec<PathBuf> = Vec::new();
// Track whether GPT+VMDK erofs layers have already been processed in bulk.
let mut gpt_erofs_processed = false;
// Directories to create (X-containerd.mkdir.path)
let mut mkdir_dirs: Vec<String> = Vec::new();
@@ -374,14 +555,33 @@ impl ErofsMultiLayerRootfs {
.iter()
.filter(|m| matches!(m.fs_type.as_str(), RW_LAYER_ROOTFS_TYPE | EROFS_ROOTFS_TYPE))
.count();
if expected_device_count > MAX_VIRTIO_BLK_DEVICES {
// TODO(Alex Lyn): fsmerge mode with single erofs mount and multiple device= options
// may require multiple block devices if containerd does not merge layers into one file.
// This is a fallback or default mode if fsmerge is not enabled.
if expected_device_count > MAX_ROOTFS_LAYER_DEVICES {
return Err(anyhow!(
"exceeded maximum block devices for multi-layer EROFS: {} > {}",
expected_device_count,
MAX_VIRTIO_BLK_DEVICES
MAX_ROOTFS_LAYER_DEVICES
));
}
// Pre-extract mkdir directives from overlay mounts before the main loop,
// so they are available regardless of mount ordering.
for mount in rootfs_mounts {
if matches!(
mount.fs_type.as_str(),
"overlay" | "format/overlay" | "format/mkdir/overlay"
) {
for opt in &mount.options {
if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
mkdir_dirs.push(mkdir_spec.to_string());
}
}
}
}
// Process each mount in rootfs_mounts to set up devices and storages
for mount in rootfs_mounts {
match mount.fs_type.as_str() {
@@ -407,8 +607,6 @@ impl ErofsMultiLayerRootfs {
.await
.context("failed to attach rw block device")?;
// let (device_id, guest_path, blk_driver) =
// extract_block_device_info(&device_info, &block_driver)?;
let (mut rwlayer, device_id) =
extract_block_device_info(&device_info, false)
.context("failed to get block device for rw layer")?;
@@ -441,110 +639,253 @@ impl ErofsMultiLayerRootfs {
}
fmt if fmt.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE) => {
// Mount[1]: erofs layers -> virtio-blk via VMDK /dev/vdX2
info!(
sl!(),
"multi-layer erofs: adding erofs layers: {}", mount.source
);
//
// Two modes are supported:
// 1. fsmerge mode: Single erofs mount with `device=` options pointing to additional files.
// This is used when containerd has already merged layers into a single file.
// 2. GPT+VMDK mode: Multiple independent erofs mounts (each mount is a separate layer file).
// This is used when containerd does NOT use fsmerge, and we need to create GPT partitions.
// Collect all EROFS devices: source + `device=` options
let mut erofs_devices = vec![mount.source.clone()];
for opt in &mount.options {
if let Some(device_path) = opt.strip_prefix("device=") {
erofs_devices.push(device_path.to_string());
}
// In GPT mode, all erofs layers are processed in bulk on the first
// encounter. Skip subsequent erofs mounts but continue iterating
// so that later ext4 rw-layer and overlay mounts are still handled.
if gpt_erofs_processed {
info!(
sl!(),
"multi-layer erofs: skipping already-processed erofs mount: {}",
mount.source
);
continue;
}
info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
// Generate merged VMDK file from all EROFS devices
// Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
let (erofs_path, erofs_format) =
generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
.await
.context("failed to generate EROFS VMDK")?;
// Track VMDK path for cleanup (only when VMDK is actually created)
if erofs_format == BlockDeviceFormat::Vmdk {
vmdk_path = Some(PathBuf::from(&erofs_path));
}
info!(
sl!(),
"EROFS block device config - path: {}, format: {:?}",
erofs_path,
erofs_format
);
let device_config = &mut BlockConfig {
driver_option: block_driver.clone(),
format: erofs_format, // Vmdk for multiple devices, Raw for single device
path_on_host: erofs_path,
is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
..Default::default()
};
let device_info = do_handle_device(
device_manager,
&DeviceConfig::BlockCfg(device_config.clone()),
)
.await
.context("failed to attach erofs block device")?;
let (mut rolayer, device_id) = extract_block_device_info(&device_info, true)?;
info!(
sl!(),
"erofs device attached - device_id: {} guest_path: {}",
device_id,
&rolayer.source
);
let mut options: Vec<String> = mount
.options
// Collect all EROFS mounts once with their original indices.
let erofs_mounts_indexed: Vec<(usize, &Mount)> = rootfs_mounts
.iter()
.filter(|o| {
// Filter out options that are not valid erofs mount parameters:
// 1. "loop" - not needed in VM, device is already /dev/vdX
// 2. "device=" prefix - used for VMDK generation only, not for mount
// 3. "X-kata." prefix - metadata markers for kata internals
*o != "loop" && !o.starts_with("device=") && !o.starts_with("X-kata.")
})
.cloned()
.enumerate()
.filter(|(_, m)| m.fs_type.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE))
.collect();
let total_erofs_mounts = erofs_mounts_indexed.len();
// Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
options.push("X-kata.overlay-lower".to_string());
options.push("X-kata.multi-layer=true".to_string());
// GPT+VMDK mode: Multiple independent erofs layer files
if total_erofs_mounts > 1 {
info!(
sl!(),
"multi-layer erofs: using GPT+VMDK mode for {} independent layers",
total_erofs_mounts
);
info!(
sl!(),
"erofs storage options filtered: {:?} -> {:?}", mount.options, options
);
let mut erofs_layers = Vec::new();
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
rolayer.mount_point = container_path.clone();
rolayer.options = options;
for (_mount_idx, erofs_mount) in &erofs_mounts_indexed {
let layer_path = erofs_mount.source.clone();
let size_bytes = get_erofs_layer_size(&layer_path).context(format!(
"gptdisk: failed to get size of EROFS layer: {}",
layer_path
))?;
erofs_storage = Some(rolayer);
device_ids.push(device_id);
if size_bytes == 0 {
warn!(
sl!(),
"gptdisk: EROFS layer {} is zero-length, skipping", layer_path
);
continue;
}
let size_sectors = size_bytes.div_ceil(512);
let snapshot_id = extract_snapshot_id(&layer_path);
erofs_layers.push(ErofsLayer {
path: layer_path,
size_sectors,
snapshot_id,
});
}
if erofs_layers.is_empty() {
return Err(anyhow!(
"gptdisk: no valid EROFS layers found for GPT VMDK"
));
}
// Generate GPT-partitioned VMDK and get layout information
let (erofs_path, erofs_format, layout, gpt_files) =
generate_gpt_vmdk_with_layout(sid, cid, erofs_layers)
.context("gptdisk: failed to generate GPT VMDK")?;
// Track VMDK path for cleanup
vmdk_path = Some(PathBuf::from(&erofs_path));
// Track GPT metadata files (head + padding) for cleanup
gpt_metadata_paths.push(gpt_files.head_path.clone());
gpt_metadata_paths.extend(gpt_files.pad_paths.iter().cloned());
info!(
sl!(),
"GPT VMDK created - path: {}, format: {:?}, {} partitions",
erofs_path,
erofs_format,
layout.partitions.len()
);
let device_config = &mut BlockConfig {
driver_option: block_driver.clone(),
format: erofs_format,
path_on_host: erofs_path,
is_readonly: true,
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
..Default::default()
};
let device_info = do_handle_device(
device_manager,
&DeviceConfig::BlockCfg(device_config.clone()),
)
.await
.context("failed to attach GPT VMDK block device")?;
let (base_device, device_id) =
extract_block_device_info(&device_info, true)?;
info!(
sl!(),
"GPT VMDK device attached - device_id: {} guest_path: {}",
device_id,
&base_device.source
);
device_ids.push(device_id);
// Create a storage entry for each GPT partition.
for (idx, part) in layout.partitions.iter().enumerate() {
let mut rolayer = base_device.clone();
let options: Vec<String> = vec![
"X-kata.overlay-lower".to_string(),
"X-kata.multi-layer=true".to_string(),
"X-kata.gpt-partitioned=true".to_string(),
format!("X-kata.partition-number={}", part.partition_number),
];
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
rolayer.mount_point = container_path.clone();
rolayer.options = options;
rolayer.source = base_device.source.clone();
info!(
sl!(),
"Created storage for GPT partition {} (partition number {}, LBA {}-{})",
idx, part.partition_number, part.start_lba, part.end_lba
);
erofs_storages.push(rolayer);
}
// Mark GPT erofs as processed so subsequent erofs mounts
// in the loop are skipped, while still allowing ext4 and
// overlay mounts to be visited.
gpt_erofs_processed = true;
} else {
// fsmerge mode: Single erofs mount with device= options
info!(
sl!(),
"multi-layer erofs: using fsmerge mode for erofs layers: {}",
mount.source
);
// Collect all EROFS devices: source + `device=` options
let mut erofs_devices = vec![mount.source.clone()];
for opt in &mount.options {
if let Some(device_path) = opt.strip_prefix("device=") {
erofs_devices.push(device_path.to_string());
}
}
info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
// Generate merged VMDK file from all EROFS devices
// Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
let (erofs_path, erofs_format) =
generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
.await
.context("failed to generate EROFS VMDK")?;
// Track VMDK path for cleanup (only when VMDK is actually created)
if erofs_format == BlockDeviceFormat::Vmdk {
vmdk_path = Some(PathBuf::from(&erofs_path));
}
info!(
sl!(),
"EROFS block device config - path: {}, format: {:?}",
erofs_path,
erofs_format
);
let device_config = &mut BlockConfig {
driver_option: block_driver.clone(),
format: erofs_format, // Vmdk for multiple devices, Raw for single device
path_on_host: erofs_path,
is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
..Default::default()
};
let device_info = do_handle_device(
device_manager,
&DeviceConfig::BlockCfg(device_config.clone()),
)
.await
.context("failed to attach erofs block device")?;
let (mut rolayer, device_id) =
extract_block_device_info(&device_info, true)?;
info!(
sl!(),
"erofs device attached - device_id: {} guest_path: {}",
device_id,
&rolayer.source
);
let mut options: Vec<String> = mount
.options
.iter()
.filter(|o| {
// Filter out options that are not valid erofs mount parameters:
// 1. "loop" - not needed in VM, device is already /dev/vdX
// 2. "device=" prefix - used for VMDK generation only, not for mount
// 3. "X-kata." prefix - metadata markers for kata internals
*o != "loop"
&& !o.starts_with("device=")
&& !o.starts_with("X-kata.")
})
.cloned()
.collect();
// Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
options.push("X-kata.overlay-lower".to_string());
options.push("X-kata.multi-layer=true".to_string());
info!(
sl!(),
"erofs storage options filtered: {:?} -> {:?}", mount.options, options
);
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
rolayer.mount_point = container_path.clone();
rolayer.options = options;
erofs_storages.push(rolayer);
device_ids.push(device_id);
}
}
fmt if fmt.eq_ignore_ascii_case("overlay")
|| fmt.eq_ignore_ascii_case("format/overlay")
|| fmt.eq_ignore_ascii_case("format/mkdir/overlay") =>
{
// Mount[2]: overlay to combine rwlayer (upper) + erofs (lower)
// mkdir directives already extracted before the main loop
info!(
sl!(),
"multi-layer erofs: parsing overlay mount, options: {:?}", mount.options
"multi-layer erofs: overlay mount (mkdir directives pre-extracted)"
);
// Parse mkdir options (X-containerd.mkdir.path)
for opt in &mount.options {
if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
// Keep the full spec (path:mode or path:mode:uid:gid) for guest agent
mkdir_dirs.push(mkdir_spec.to_string());
}
}
}
_ => {
info!(
@@ -572,8 +913,14 @@ impl ErofsMultiLayerRootfs {
guest_path: container_path,
device_ids,
rwlayer_storage,
erofs_storage,
erofs_storages,
vmdk_path,
gpt_metadata_paths,
generated_artifacts_dir: PathBuf::from(
kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS),
)
.join(sid)
.join(cid),
})
}
}
@@ -589,16 +936,18 @@ impl Rootfs for ErofsMultiLayerRootfs {
}
async fn get_storage(&self) -> Option<Vec<Storage>> {
// Return all storages for multi-layer EROFS (rw layer + erofs layer) to guest agent.
// Guest agent needs both to create overlay mount
// Return all storages for multi-layer EROFS (rw layer + erofs layers) to guest agent.
// Guest agent needs all of them to create overlay mount.
// In GPT mode, each partition has its own storage entry.
let mut storages = Vec::new();
if let Some(rwlayer) = self.rwlayer_storage.clone() {
storages.push(rwlayer);
}
if let Some(erofs) = self.erofs_storage.clone() {
storages.push(erofs);
// Add all EROFS layer storages (single storage in fsmerge mode, multiple in GPT mode)
for erofs in &self.erofs_storages {
storages.push(erofs.clone());
}
if storages.is_empty() {
@@ -613,23 +962,27 @@ impl Rootfs for ErofsMultiLayerRootfs {
}
async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()> {
// Helper function to safely remove a file if it exists and is within the specified directory.
let safely_remove_file = |path: &Path, dir: &Path| -> Result<()> {
if path.starts_with(dir) && path.exists() {
fs::remove_file(path).context(format!("failed to remove file: {}", path.display()))?;
}
Ok(())
};
let mut dm = device_manager.write().await;
for device_id in &self.device_ids {
dm.try_remove_device(device_id).await?;
}
// Clean up generated VMDK descriptor file if it exists (only for multi-device case)
// Clean up generated VMDK descriptor file if it exists.
if let Some(ref vmdk) = self.vmdk_path {
if vmdk.exists() {
if let Err(e) = fs::remove_file(vmdk) {
warn!(
sl!(),
"failed to remove VMDK descriptor {}: {}",
vmdk.display(),
e
);
}
}
safely_remove_file(vmdk, &self.generated_artifacts_dir)?;
}
// Clean up GPT metadata files (head, padding).
for metadata_path in &self.gpt_metadata_paths {
safely_remove_file(metadata_path, &self.generated_artifacts_dir)?;
}
Ok(())

View File

@@ -91,7 +91,7 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P
toml_utils::set_toml_value(
configuration_file,
".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
"1",
"0",
)?;
Ok(())