mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 06:28:11 +00:00
Merge pull request #13085 from Apokleos/erofs-gpt-vmdk-only
runtime-rs: Support erofs snapshotter with gpt vmdk mode
This commit is contained in:
47
Cargo.lock
generated
47
Cargo.lock
generated
@@ -139,7 +139,7 @@ version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
||||
dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -150,7 +150,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"once_cell_polyfill",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1963,7 +1963,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2458,6 +2458,18 @@ dependencies = [
|
||||
"cfg-if 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gpt"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3696fafb1ecdcc2ae3ce337de73e9202806068594b77d22fdf2f3573c5ec2219"
|
||||
dependencies = [
|
||||
"bitflags 2.11.1",
|
||||
"crc",
|
||||
"simple-bytes",
|
||||
"uuid 1.23.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.27"
|
||||
@@ -3163,7 +3175,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
|
||||
dependencies = [
|
||||
"hermit-abi 0.5.2",
|
||||
"libc",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3632,8 +3644,10 @@ dependencies = [
|
||||
"base64 0.13.1",
|
||||
"bitmask-enum",
|
||||
"byte-unit",
|
||||
"crc",
|
||||
"flate2",
|
||||
"glob",
|
||||
"gpt",
|
||||
"lazy_static",
|
||||
"nix 0.26.4",
|
||||
"num_cpus",
|
||||
@@ -3641,6 +3655,7 @@ dependencies = [
|
||||
"regex",
|
||||
"rstest 0.18.2",
|
||||
"safe-path 0.1.0",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde-enum-str",
|
||||
"serde_json",
|
||||
@@ -4390,7 +4405,7 @@ version = "0.50.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
|
||||
dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5295,7 +5310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"nix 0.23.2",
|
||||
"nix 0.30.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6516,7 +6531,7 @@ dependencies = [
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.12.1",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6617,7 +6632,7 @@ dependencies = [
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"webpki-root-certs",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7182,6 +7197,12 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
|
||||
|
||||
[[package]]
|
||||
name = "simple-bytes"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c11532d9d241904f095185f35dcdaf930b1427a94d5b01d7002d74ba19b44cc4"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.3"
|
||||
@@ -7305,7 +7326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7546,7 +7567,7 @@ dependencies = [
|
||||
"getrandom 0.4.2",
|
||||
"once_cell",
|
||||
"rustix 1.1.4",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7555,7 +7576,7 @@ version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1"
|
||||
dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8259,7 +8280,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e"
|
||||
dependencies = [
|
||||
"memoffset 0.9.1",
|
||||
"tempfile",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8842,7 +8863,7 @@ version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
|
||||
dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -197,6 +197,7 @@ tracing-subscriber = "0.3.20"
|
||||
ttrpc = "0.8.4"
|
||||
url = "2.5.4"
|
||||
which = "4.3.0"
|
||||
gpt = "4.1.0"
|
||||
|
||||
# Per-package release profile overrides for kata-deploy. The kata-deploy
|
||||
# binary runs once at pod start and then idles waiting for SIGTERM, so we
|
||||
|
||||
@@ -90,7 +90,7 @@ version = 3
|
||||
|
||||
[plugins.'io.containerd.snapshotter.v1.erofs']
|
||||
default_size = '<SIZE>' # SIZE=6G or 10G or other size
|
||||
max_unmerged_layers = 1
|
||||
max_unmerged_layers = 0
|
||||
```
|
||||
|
||||
#### Verify the EROFS plugins are loaded
|
||||
|
||||
@@ -173,7 +173,8 @@ pub struct VirtioBlkPciMatcher {
|
||||
impl VirtioBlkPciMatcher {
|
||||
pub fn new(relpath: &str, root_complex: &str) -> VirtioBlkPciMatcher {
|
||||
let root_bus = create_pci_root_bus_path(root_complex);
|
||||
let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/");
|
||||
// [^/]+$ ensures the regex only matches the whole-disk uevent (e.g. block/vdx), not its partitions
|
||||
let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/[^/]+$");
|
||||
|
||||
VirtioBlkPciMatcher {
|
||||
rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
|
||||
@@ -259,6 +260,17 @@ mod tests {
|
||||
assert!(matcher_b.is_match(&uev_b));
|
||||
assert!(!matcher_b.is_match(&uev_a));
|
||||
assert!(!matcher_a.is_match(&uev_b));
|
||||
|
||||
// Partition uevents must NOT match (only the whole-disk uevent should match)
|
||||
let mut uev_part = uev_a.clone();
|
||||
uev_part.devname = "vda1".to_string();
|
||||
uev_part.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda1");
|
||||
assert!(!matcher_a.is_match(&uev_part));
|
||||
|
||||
let mut uev_part91 = uev_a.clone();
|
||||
uev_part91.devname = "vda91".to_string();
|
||||
uev_part91.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda91");
|
||||
assert!(!matcher_a.is_match(&uev_part91));
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "s390x")]
|
||||
|
||||
@@ -17,6 +17,9 @@ use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::instrument;
|
||||
|
||||
/// The path segment in the uevent devpath that separates the SCSI path and the block device name.
|
||||
const BLOCK_SEGMENT: &str = "/block/";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ScsiDeviceHandler {}
|
||||
|
||||
@@ -53,20 +56,41 @@ pub async fn get_scsi_device_name(
|
||||
// SCSI host.
|
||||
#[derive(Debug)]
|
||||
pub struct ScsiBlockMatcher {
|
||||
search: String,
|
||||
/// Expected SCSI path suffix before `/block/`, e.g. `/0:0:2:0`
|
||||
scsi_path_suffix: String,
|
||||
}
|
||||
|
||||
impl ScsiBlockMatcher {
|
||||
pub fn new(scsi_addr: &str) -> ScsiBlockMatcher {
|
||||
let search = format!(r"/0:0:{scsi_addr}/block/");
|
||||
ScsiBlockMatcher {
|
||||
scsi_path_suffix: format!("/0:0:{scsi_addr}"),
|
||||
}
|
||||
}
|
||||
|
||||
ScsiBlockMatcher { search }
|
||||
fn split_block_devpath<'a>(&self, devpath: &'a str) -> Option<(&'a str, &'a str)> {
|
||||
let idx = devpath.find(BLOCK_SEGMENT)?;
|
||||
let prefix = &devpath[..idx];
|
||||
let suffix = &devpath[idx + BLOCK_SEGMENT.len()..];
|
||||
Some((prefix, suffix))
|
||||
}
|
||||
}
|
||||
|
||||
impl UeventMatcher for ScsiBlockMatcher {
|
||||
fn is_match(&self, uev: &Uevent) -> bool {
|
||||
uev.subsystem == BLOCK && uev.devpath.contains(&self.search) && !uev.devname.is_empty()
|
||||
if uev.action != U_EVENT_ACTION_ADD {
|
||||
return false;
|
||||
}
|
||||
|
||||
if uev.subsystem != BLOCK || uev.devname.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let (prefix, suffix) = match self.split_block_devpath(&uev.devpath) {
|
||||
Some(parts) => parts,
|
||||
None => return false,
|
||||
};
|
||||
|
||||
prefix.ends_with(&self.scsi_path_suffix) && !suffix.contains('/') && suffix == uev.devname
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,6 +130,23 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::linux_abi::U_EVENT_ACTION_ADD;
|
||||
|
||||
fn make_scsi_block_uevent(addr: &str, devname: &str, devpath_suffix: &str) -> Uevent {
|
||||
let root_bus = create_pci_root_bus_path("00");
|
||||
|
||||
let mut uev = Uevent::default();
|
||||
uev.action = U_EVENT_ACTION_ADD.to_string();
|
||||
uev.subsystem = BLOCK.to_string();
|
||||
uev.devname = devname.to_string();
|
||||
uev.devpath = format!(
|
||||
"{root_bus}/0000:00:00.0/virtio0/host0/target0:0:{target}/0:0:{addr}/block/{devpath_suffix}",
|
||||
target = addr.split(':').next().unwrap_or("0"),
|
||||
addr = addr,
|
||||
devpath_suffix = devpath_suffix,
|
||||
);
|
||||
uev
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[allow(clippy::redundant_clone)]
|
||||
@@ -124,6 +165,7 @@ mod tests {
|
||||
|
||||
let mut uev_b = uev_a.clone();
|
||||
let addr_b = "2:0";
|
||||
uev_b.devname = "sdb".to_string();
|
||||
uev_b.devpath =
|
||||
format!("{root_bus}/0000:00:00.0/virtio0/host0/target0:0:2/0:0:{addr_b}/block/sdb");
|
||||
let matcher_b = ScsiBlockMatcher::new(addr_b);
|
||||
@@ -133,4 +175,21 @@ mod tests {
|
||||
assert!(!matcher_b.is_match(&uev_a));
|
||||
assert!(!matcher_a.is_match(&uev_b));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_scsi_block_matcher_rejects_partitions() {
|
||||
let uev_whole = make_scsi_block_uevent("0:0", "sda", "sda");
|
||||
let uev_part = make_scsi_block_uevent("0:0", "sda1", "sda/sda1");
|
||||
|
||||
let matcher = ScsiBlockMatcher::new("0:0");
|
||||
|
||||
assert!(
|
||||
matcher.is_match(&uev_whole),
|
||||
"whole disk uevent should match"
|
||||
);
|
||||
assert!(
|
||||
!matcher.is_match(&uev_part),
|
||||
"partition uevent should not match"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ use tracing::instrument;
|
||||
|
||||
use self::bind_watcher_handler::BindWatcherHandler;
|
||||
use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler};
|
||||
pub use self::ephemeral_handler::update_ephemeral_mounts;
|
||||
use self::ephemeral_handler::EphemeralHandler;
|
||||
use self::fs_handler::{OverlayfsHandler, VirtioFsHandler};
|
||||
use self::image_pull_handler::ImagePullHandler;
|
||||
@@ -30,15 +31,13 @@ use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_sto
|
||||
use crate::mount::{baremount, is_mounted, remove_mounts};
|
||||
use crate::sandbox::Sandbox;
|
||||
|
||||
pub use self::ephemeral_handler::update_ephemeral_mounts;
|
||||
|
||||
mod bind_watcher_handler;
|
||||
mod block_handler;
|
||||
mod ephemeral_handler;
|
||||
mod fs_handler;
|
||||
mod image_pull_handler;
|
||||
mod local_handler;
|
||||
mod multi_layer_erofs;
|
||||
pub mod multi_layer_erofs;
|
||||
|
||||
const RW_MASK: u32 = 0o660;
|
||||
const RO_MASK: u32 = 0o440;
|
||||
@@ -168,6 +167,8 @@ struct MultiLayerProcessResult {
|
||||
/// Temporary mount points (upper/lower) backing the overlay, needed for
|
||||
/// container-scoped cleanup via `container_mounts`.
|
||||
temp_mount_points: Vec<String>,
|
||||
/// dm-verity device paths that need to be destroyed during cleanup
|
||||
verity_devices: Vec<String>,
|
||||
}
|
||||
|
||||
/// Handle multi-layer storage by creating the overlay device.
|
||||
@@ -209,6 +210,7 @@ async fn handle_multi_layer_storage(
|
||||
device,
|
||||
processed_mount_points: result.processed_mount_points,
|
||||
temp_mount_points: result.temp_mount_points,
|
||||
verity_devices: result.verity_devices,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -303,6 +305,7 @@ pub async fn add_storages(
|
||||
}
|
||||
}
|
||||
mount_list.extend(result.temp_mount_points);
|
||||
mount_list.extend(result.verity_devices);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@@ -10,10 +10,14 @@
|
||||
//! - Storage with X-kata.overlay-lower: erofs layers (lowerdir)
|
||||
//! - Creates overlay to combine them
|
||||
//! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount
|
||||
//! - Supports GPT-partitioned disks where each layer is a separate partition
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::time::sleep;
|
||||
|
||||
use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
|
||||
use crate::device::scsi_device_handler::get_scsi_device_name;
|
||||
@@ -44,7 +48,9 @@ pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer";
|
||||
const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper";
|
||||
const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower";
|
||||
const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true";
|
||||
const OPT_GPT_PARTITIONED: &str = "X-kata.gpt-partitioned=true";
|
||||
const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path=";
|
||||
const OPT_PARTITION_NUMBER: &str = "X-kata.partition-number=";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MultiLayerErofsHandler {}
|
||||
@@ -57,13 +63,19 @@ pub struct MultiLayerErofsResult {
|
||||
/// overlay. These must be tracked so they are unmounted *after* the
|
||||
/// overlay target during container teardown.
|
||||
pub temp_mount_points: Vec<String>,
|
||||
/// dm-verity device paths that need to be destroyed during cleanup
|
||||
pub verity_devices: Vec<String>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug)]
|
||||
struct MkdirDirective {
|
||||
raw_path: String,
|
||||
mode: Option<String>,
|
||||
}
|
||||
|
||||
/// Helper struct to track layer mount information including dm-verity devices
|
||||
#[derive(Debug)]
|
||||
struct LayerMountInfo {
|
||||
verity_device: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -122,6 +134,7 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
let mut ext4_storage: Option<&Storage> = None;
|
||||
let mut erofs_storages: Vec<&Storage> = Vec::new();
|
||||
let mut mkdir_dirs: Vec<MkdirDirective> = Vec::new();
|
||||
let mut has_gpt_partition: bool = false;
|
||||
|
||||
for storage in &multi_layer_storages {
|
||||
if is_upper_storage(storage) {
|
||||
@@ -139,19 +152,33 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
}
|
||||
}
|
||||
} else if is_lower_storage(storage) {
|
||||
// Each GPT partition is provided as a separate storage entry by the host
|
||||
if !has_gpt_partition && is_gpt_partitioned(storage) {
|
||||
has_gpt_partition = true;
|
||||
}
|
||||
erofs_storages.push(*storage);
|
||||
}
|
||||
}
|
||||
|
||||
let ext4 = ext4_storage
|
||||
.ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
|
||||
|
||||
if erofs_storages.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"multi-layer erofs missing erofs lower layer storage"
|
||||
));
|
||||
}
|
||||
|
||||
// Only sort erofs layers by partition number in GPT mode.
|
||||
// In GPT mode, each storage carries X-kata.partition-number=N and layers
|
||||
// must be ordered by partition number so that the overlay lowerdir
|
||||
// precedence is correct (lower partition number = higher overlay priority).
|
||||
// In non-GPT mode all partition numbers are None, so sorting would be a
|
||||
// no-op that needlessly reorders elements.
|
||||
if has_gpt_partition {
|
||||
erofs_storages.sort_by_key(|storage| get_partition_number(storage).unwrap_or(u32::MAX));
|
||||
}
|
||||
|
||||
let ext4 = ext4_storage
|
||||
.ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
|
||||
|
||||
info!(
|
||||
logger,
|
||||
"Handling multi-layer erofs group";
|
||||
@@ -178,7 +205,7 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
let upper_mount = temp_base.join("upper");
|
||||
fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?;
|
||||
|
||||
wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger).await?;
|
||||
wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger, None).await?;
|
||||
|
||||
for mkdir_dir in &mkdir_dirs {
|
||||
// As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it.
|
||||
@@ -201,6 +228,9 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
}
|
||||
|
||||
let mut lower_mounts = Vec::new();
|
||||
let mut verity_devices = Vec::new();
|
||||
let mut base_device_cache: HashMap<String, String> = HashMap::new();
|
||||
|
||||
for (index, erofs) in erofs_storages.iter().enumerate() {
|
||||
let lower_mount = temp_base.join(format!("lower-{}", index));
|
||||
fs::create_dir_all(&lower_mount).context(format!(
|
||||
@@ -208,8 +238,25 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
lower_mount.display()
|
||||
))?;
|
||||
|
||||
wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger).await?;
|
||||
// Resolve the base device once per `source` and reuse it for every other
// partition that lives on the same disk. The lookup must only run on a
// cache miss: passing the awaited call directly to `or_insert` would
// evaluate it on every iteration and make the cache pointless.
let base_dev_path = if is_gpt_partitioned(erofs) {
    let cached = base_device_cache.get(&erofs.source).cloned();
    let resolved = match cached {
        Some(path) => path,
        None => {
            let path = resolve_base_device_path(erofs, sandbox).await?;
            base_device_cache.insert(erofs.source.clone(), path.clone());
            path
        }
    };
    Some(resolved)
} else {
    None
};
|
||||
|
||||
let mount_info =
|
||||
wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, base_dev_path).await?;
|
||||
lower_mounts.push(lower_mount);
|
||||
|
||||
// Collect dm-verity device for cleanup
|
||||
if let Some(verity_dev) = mount_info.verity_device {
|
||||
verity_devices.push(verity_dev);
|
||||
}
|
||||
}
|
||||
|
||||
// If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount.
|
||||
@@ -318,6 +365,7 @@ pub async fn handle_multi_layer_erofs_group(
|
||||
mount_point: ext4.mount_point.clone(),
|
||||
processed_mount_points,
|
||||
temp_mount_points,
|
||||
verity_devices,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -407,7 +455,6 @@ fn parse_mkdir_directive(spec: &str) -> Result<MkdirDirective> {
|
||||
|
||||
Ok(MkdirDirective {
|
||||
raw_path: raw_path.to_string(),
|
||||
mode: parts.get(1).map(|s| s.to_string()),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -467,7 +514,8 @@ async fn wait_and_mount_layer(
|
||||
layer_mount: &Path,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
logger: &Logger,
|
||||
) -> Result<()> {
|
||||
base_dev_path: Option<String>,
|
||||
) -> Result<LayerMountInfo> {
|
||||
info!(
|
||||
logger,
|
||||
"Waiting for layer device";
|
||||
@@ -475,22 +523,41 @@ async fn wait_and_mount_layer(
|
||||
"driver" => &layer.driver,
|
||||
"mount-point" => layer_mount.display(),
|
||||
);
|
||||
let dev_path = match layer.driver.as_str() {
|
||||
DRIVER_SCSI_TYPE => {
|
||||
// For SCSI devices, we need to wait for the device to appear and get its path before mounting.
|
||||
get_scsi_device_name(sandbox, &layer.source).await?
|
||||
}
|
||||
DRIVER_BLK_PCI_TYPE => {
|
||||
let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?;
|
||||
get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await?
|
||||
}
|
||||
_ => {
|
||||
// For non-SCSI devices, we can assume the source is directly mountable.
|
||||
|
||||
let is_gpt = is_gpt_partitioned(layer);
|
||||
let partition_num = get_partition_number(layer);
|
||||
|
||||
// Get the base device path
|
||||
let dev_path = match base_dev_path {
|
||||
Some(path) => path,
|
||||
None => resolve_base_device_path(layer, sandbox).await?,
|
||||
};
|
||||
|
||||
// For GPT-partitioned disks, use the partition device path
|
||||
let dev_path = if is_gpt {
|
||||
if let Some(part_num) = partition_num {
|
||||
let path = get_partition_device_path(&dev_path, part_num);
|
||||
info!(
|
||||
logger,
|
||||
"GPT-partitioned mode: using partition device";
|
||||
"base-device" => &dev_path,
|
||||
"partition-number" => part_num,
|
||||
"partition-device" => &path,
|
||||
);
|
||||
|
||||
// Wait for partition device node to appear
|
||||
wait_for_partition_device(&path, logger).await?;
|
||||
|
||||
path
|
||||
} else {
|
||||
return Err(anyhow!(
|
||||
"unsupported driver type '{}' for multi-layer erofs",
|
||||
layer.driver
|
||||
"GPT-partitioned storage missing partition number: {:?}",
|
||||
layer
|
||||
));
|
||||
}
|
||||
} else {
|
||||
// Non-GPT mode: use base device directly
|
||||
dev_path.clone()
|
||||
};
|
||||
|
||||
info!(
|
||||
@@ -500,6 +567,7 @@ async fn wait_and_mount_layer(
|
||||
"fstype" => &layer.fstype,
|
||||
"devname" => &dev_path,
|
||||
"mount-point" => layer_mount.display(),
|
||||
"gpt-mode" => is_gpt,
|
||||
);
|
||||
|
||||
create_mount_destination(Path::new(&dev_path), layer_mount, "", &layer.fstype)
|
||||
@@ -548,7 +616,106 @@ async fn wait_and_mount_layer(
|
||||
// After successfully mounting the layer, we track the mount point for cleanup.
|
||||
track_temporary_mount_for_cleanup(sandbox, layer_mount, logger).await?;
|
||||
|
||||
Ok(())
|
||||
Ok(LayerMountInfo {
|
||||
verity_device: None,
|
||||
})
|
||||
}
|
||||
|
||||
async fn resolve_base_device_path(
|
||||
layer: &Storage,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
) -> Result<String> {
|
||||
let base_dev_path = match layer.driver.as_str() {
|
||||
DRIVER_SCSI_TYPE => {
|
||||
// For SCSI devices, we need to wait for the device to appear and get its path before mounting.
|
||||
get_scsi_device_name(sandbox, &layer.source).await?
|
||||
}
|
||||
DRIVER_BLK_PCI_TYPE => {
|
||||
let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?;
|
||||
get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await?
|
||||
}
|
||||
_ => {
|
||||
// For non-SCSI devices, we can assume the source is directly mountable.
|
||||
return Err(anyhow!(
|
||||
"unsupported driver type '{}' for multi-layer erofs",
|
||||
layer.driver
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(base_dev_path)
|
||||
}
|
||||
|
||||
/// Check if the storage is GPT-partitioned
|
||||
fn is_gpt_partitioned(storage: &Storage) -> bool {
|
||||
storage.options.iter().any(|o| o == OPT_GPT_PARTITIONED)
|
||||
}
|
||||
|
||||
/// Extract partition number from storage options
|
||||
/// Returns None if not specified (non-GPT mode)
|
||||
fn get_partition_number(storage: &Storage) -> Option<u32> {
|
||||
for opt in &storage.options {
|
||||
if let Some(num_str) = opt.strip_prefix(OPT_PARTITION_NUMBER) {
|
||||
return num_str.parse::<u32>().ok();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Builds the device path of partition `partition_number` on the disk at
/// `base_path`.
///
/// Follows the kernel naming rule: if the base device name ends with a digit,
/// a 'p' separator is inserted before the partition number to avoid ambiguity.
/// This handles all device families:
/// - /dev/vda     -> /dev/vda1      (no trailing digit, bare number)
/// - /dev/sda     -> /dev/sda1
/// - /dev/nvme0n1 -> /dev/nvme0n1p1 (trailing digit, needs 'p')
/// - /dev/mmcblk0 -> /dev/mmcblk0p1
/// - /dev/loop0   -> /dev/loop0p1
fn get_partition_device_path(base_path: &str, partition_number: u32) -> String {
    // Only ASCII digits count as a trailing digit. `char::is_numeric` would
    // also accept non-ASCII numeric characters such as '²' or '½'.
    let separator = if base_path.ends_with(|c: char| c.is_ascii_digit()) {
        "p"
    } else {
        ""
    };

    format!("{base_path}{separator}{partition_number}")
}
|
||||
|
||||
/// Wait for partition device node to appear in /dev.
|
||||
///
|
||||
/// When a virtio-blk device with a GPT is hotplugged, the kernel automatically
|
||||
/// scans the partition table and creates partition nodes. However, devtmpfs node
|
||||
/// creation may lag slightly behind the uevent, so we poll briefly if needed.
|
||||
#[allow(dead_code)]
|
||||
async fn wait_for_partition_device(device_path: &str, logger: &Logger) -> Result<()> {
|
||||
let device_path_buf = PathBuf::from(device_path);
|
||||
if device_path_buf.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
const MAX_WAIT_MS: u64 = 1000;
|
||||
const POLL_INTERVAL_MS: u64 = 50;
|
||||
|
||||
for attempt in 0..(MAX_WAIT_MS / POLL_INTERVAL_MS) {
|
||||
sleep(Duration::from_millis(POLL_INTERVAL_MS)).await;
|
||||
if device_path_buf.exists() {
|
||||
info!(
|
||||
logger,
|
||||
"Partition device node appeared after polling: {} (attempt {})",
|
||||
device_path,
|
||||
attempt + 1
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!(
|
||||
"partition device {} did not appear within {} ms",
|
||||
device_path,
|
||||
MAX_WAIT_MS
|
||||
))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -603,27 +770,6 @@ mod tests {
|
||||
|
||||
// --- parse_mkdir_directive ---
|
||||
|
||||
#[rstest]
|
||||
#[case("some/path", true, "some/path", None)]
|
||||
#[case("some/path:0755", true, "some/path", Some("0755"))]
|
||||
#[case("path:mode:extra", true, "path", Some("mode:extra"))]
|
||||
#[case("", false, "", None)]
|
||||
fn test_parse_mkdir_directive(
|
||||
#[case] spec: &str,
|
||||
#[case] should_pass: bool,
|
||||
#[case] expected_path: &str,
|
||||
#[case] expected_mode: Option<&str>,
|
||||
) {
|
||||
let result = parse_mkdir_directive(spec);
|
||||
if should_pass {
|
||||
let d = result.expect("expected Ok");
|
||||
assert_eq!(d.raw_path, expected_path);
|
||||
assert_eq!(d.mode.as_deref(), expected_mode);
|
||||
} else {
|
||||
assert!(result.is_err(), "expected Err for spec {:?}", spec);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_mkdir_directive_rejects_null_bytes() {
|
||||
assert!(parse_mkdir_directive("foo\0bar").is_err());
|
||||
@@ -728,4 +874,29 @@ mod tests {
|
||||
s.options
|
||||
);
|
||||
}
|
||||
|
||||
// --- get_partition_device_path ---
|
||||
|
||||
#[rstest]
|
||||
#[case("/dev/vda", 1, "/dev/vda1")]
|
||||
#[case("/dev/sda", 3, "/dev/sda3")]
|
||||
#[case("/dev/hda", 2, "/dev/hda2")]
|
||||
#[case("/dev/nvme0n1", 1, "/dev/nvme0n1p1")]
|
||||
#[case("/dev/nvme0n1", 2, "/dev/nvme0n1p2")]
|
||||
#[case("/dev/mmcblk0", 1, "/dev/mmcblk0p1")]
|
||||
#[case("/dev/loop0", 1, "/dev/loop0p1")]
|
||||
#[case("/dev/nbd0", 3, "/dev/nbd0p3")]
|
||||
fn test_get_partition_device_path(
|
||||
#[case] base: &str,
|
||||
#[case] part: u32,
|
||||
#[case] expected: &str,
|
||||
) {
|
||||
assert_eq!(
|
||||
get_partition_device_path(base, part),
|
||||
expected,
|
||||
"get_partition_device_path({}, {})",
|
||||
base,
|
||||
part
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,9 @@ sha2 = "0.10.8"
|
||||
flate2 = "1.1"
|
||||
nix = "0.26.4"
|
||||
oci-spec = { version = "0.8.1", features = ["runtime"] }
|
||||
|
||||
gpt = "4.1.0"
|
||||
scopeguard = "1.0.0"
|
||||
crc = "3.4.0"
|
||||
safe-path = { path = "../safe-path", optional = true }
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
|
||||
463
src/libs/kata-types/src/gpt_disk.rs
Normal file
463
src/libs/kata-types/src/gpt_disk.rs
Normal file
@@ -0,0 +1,463 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// GPT (GUID Partition Table) disk metadata generation for EROFS multi-layer rootfs.
|
||||
//
|
||||
// This module generates a GPT metadata file (gpt_meta_head.img) that is used
|
||||
// in conjunction with VMDK descriptors to present multiple EROFS layers as a
|
||||
// single virtual disk with multiple GPT partitions to the guest VM.
|
||||
// Backup GPT structures are omitted — the virtual disk is ephemeral and
|
||||
// read-only, so backup recovery serves no purpose.
|
||||
//
|
||||
// Key features:
|
||||
// - Only includes read-only EROFS layers in GPT partitions (rw layer handled separately)
|
||||
// - Preserves the original order of layers from rootfs_mounts
|
||||
// - Generates minimal GPT metadata without copying layer data
|
||||
// - Supports 1MiB alignment for partitions
|
||||
// - Creates VMDK-compatible descriptor with head/layer/pad extents
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use crc::Crc;
|
||||
use gpt::{disk::LogicalBlockSize, mbr::ProtectiveMBR, partition_types, GptConfig};
|
||||
use scopeguard;
|
||||
use std::convert::TryFrom;
|
||||
use std::fs;
|
||||
use std::io::{Read, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use crate::sl;
|
||||
|
||||
// GPT disk parameters (using gpt crate constants where available).

/// Sector size in bytes. Defined locally because the gpt crate's
/// DEFAULT_SECTOR_SIZE is a `LogicalBlockSize` enum, not a `u64`.
|
||||
const SECTOR_SIZE: u64 = 512;
|
||||
/// 1 MiB alignment start
|
||||
const FIRST_PARTITION_LBA: u64 = 2048;
|
||||
/// 1 MiB alignment
|
||||
const ALIGNMENT_LBA: u64 = 2048;
|
||||
/// bytes per GPT partition entry (UEFI standard)
|
||||
const GPT_ENTRY_SIZE: u64 = 128;
|
||||
/// standard GPT partition entry count
|
||||
const MAX_GPT_PARTITIONS: usize = 128;
|
||||
/// 32 sectors for partition entries (128 entries * 128 bytes each / 512 bytes per sector)
|
||||
const ENTRIES_SECTORS: u64 = (MAX_GPT_PARTITIONS as u64 * GPT_ENTRY_SIZE) / SECTOR_SIZE;
|
||||
/// GPT header size in bytes (UEFI specification)
|
||||
const GPT_HEADER_SIZE: usize = 92;
|
||||
/// Offset (in bytes) of the GPT primary header within the head file (LBA 1)
|
||||
const GPT_HEADER_FILE_OFFSET: u64 = SECTOR_SIZE;
|
||||
/// CRC-32/ISO-HDLC — the same algorithm the `gpt` crate uses internally.
|
||||
const CRC_32: Crc<u32> = Crc::<u32>::new(&crc::CRC_32_ISO_HDLC);
|
||||
|
||||
/// GPT head metadata file name
|
||||
const GPT_META_HEAD_IMG: &str = "gpt_meta_head.img";
|
||||
/// Temporary full GPT image used to synthesize head metadata
|
||||
const GPT_META_FULL_IMG: &str = "gpt_meta_full.img";
|
||||
|
||||
/// Represents a read-only EROFS layer to be placed in a GPT partition
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ErofsLayer {
|
||||
/// Path to the EROFS image file
|
||||
pub path: String,
|
||||
/// Size in sectors (ceiling division, sector = 512 bytes)
|
||||
pub size_sectors: u64,
|
||||
/// Snapshot ID extracted from path (for naming)
|
||||
pub snapshot_id: String,
|
||||
}
|
||||
|
||||
/// GPT partition layout information for a single layer
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PartitionLayout {
|
||||
/// Layer information
|
||||
pub layer: ErofsLayer,
|
||||
/// Partition number (1-indexed)
|
||||
pub partition_number: u32,
|
||||
/// First LBA of the partition
|
||||
pub start_lba: u64,
|
||||
/// Last LBA of the partition
|
||||
pub end_lba: u64,
|
||||
/// Partition name
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
/// Complete GPT disk layout calculation result
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GptDiskLayout {
|
||||
/// All partition layouts in order
|
||||
pub partitions: Vec<PartitionLayout>,
|
||||
/// Total sectors in the virtual disk
|
||||
pub total_sectors: u64,
|
||||
/// Logical block size in bytes
|
||||
pub lb_size: u64,
|
||||
}
|
||||
|
||||
/// Files produced while generating GPT metadata.
#[derive(Debug)]
pub struct GptMetadataFiles {
    /// Location of the generated `gpt_meta_head.img`.
    pub head_path: PathBuf,
    /// Length of the head file in sectors.
    pub head_sectors: u64,
    /// Locations of the generated padding files that fill gaps between partitions.
    pub pad_paths: Vec<PathBuf>,
}
|
||||
|
||||
/// Derive the snapshot ID from a layer path.
///
/// The ID is the name of the directory that directly contains the file,
/// e.g. ".../snapshots/35/layer.erofs" yields "35". When the path has no
/// named parent directory, the literal "unknown" is returned.
pub fn extract_snapshot_id(source: &str) -> String {
    match Path::new(source).parent().and_then(Path::file_name) {
        Some(dir_name) => dir_name.to_string_lossy().into_owned(),
        None => String::from("unknown"),
    }
}
|
||||
|
||||
/// Get file size in bytes
|
||||
pub fn get_erofs_layer_size(path: &str) -> Result<u64> {
|
||||
let metadata = fs::metadata(path).context(format!("failed to stat EROFS file: {}", path))?;
|
||||
Ok(metadata.len())
|
||||
}
|
||||
|
||||
/// Round `lba` up to the next multiple of `alignment`.
///
/// An `lba` that is already a multiple of `alignment` is returned unchanged.
fn align_up(lba: u64, alignment: u64) -> u64 {
    if !lba.is_multiple_of(alignment) {
        // Advance to the start of the following alignment unit.
        return (lba / alignment + 1) * alignment;
    }
    lba
}
|
||||
|
||||
/// Calculate GPT disk layout from EROFS layers
|
||||
///
|
||||
/// This function computes the LBA positions for all partitions without
|
||||
/// modifying any files. It follows the layout:
|
||||
/// - LBA 0: Protective MBR
|
||||
/// - LBA 1: Primary GPT Header
|
||||
/// - LBA 2-33: Primary Partition Entry Array
|
||||
/// - LBA 34-2047: Reserved/padding
|
||||
/// - LBA 2048+: Partitions (1MiB aligned)
|
||||
/// - End: Backup Partition Entry Array + Backup GPT Header
|
||||
pub fn calculate_gpt_layout(layers: &[ErofsLayer]) -> Result<GptDiskLayout> {
|
||||
if layers.is_empty() {
|
||||
return Err(anyhow!("no EROFS layers provided for GPT layout"));
|
||||
}
|
||||
|
||||
// TODO: Fix the length of partitions exceeding GPT limits.
|
||||
// It should be addressed by splitting into multiple GPT disks if needed, but for now we enforce the limit.
|
||||
if layers.len() > MAX_GPT_PARTITIONS {
|
||||
return Err(anyhow!(
|
||||
"The layers for GPT: {} exceeds maximum {} partitions \
|
||||
(ENTRIES_SECTORS is sized for {} entries)",
|
||||
layers.len(),
|
||||
MAX_GPT_PARTITIONS,
|
||||
MAX_GPT_PARTITIONS,
|
||||
));
|
||||
}
|
||||
|
||||
// Validate that all layers have non-zero size
|
||||
for (idx, layer) in layers.iter().enumerate() {
|
||||
if layer.size_sectors == 0 {
|
||||
return Err(anyhow!(
|
||||
"EROFS layer {} ({}) has size_sectors = 0, cannot generate GPT partition",
|
||||
idx,
|
||||
layer.path
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let lb_size = SECTOR_SIZE;
|
||||
let first_usable_lba = FIRST_PARTITION_LBA;
|
||||
|
||||
// Calculate partition positions
|
||||
let mut partitions = Vec::with_capacity(layers.len());
|
||||
let mut current_lba = first_usable_lba;
|
||||
|
||||
for (idx, layer) in layers.iter().enumerate() {
|
||||
// Align start LBA to 1MiB boundary
|
||||
let start_lba = align_up(current_lba, ALIGNMENT_LBA);
|
||||
let end_lba = start_lba + layer.size_sectors - 1;
|
||||
|
||||
// Generate partition name: erofs-{index}-s{snapshot_id}
|
||||
let name = format!("erofs-{}-s{}", idx, layer.snapshot_id);
|
||||
// Truncate to fit GPT name limit without slicing through a UTF-8 codepoint.
|
||||
let name = match name.char_indices().nth(36) {
|
||||
Some((truncate_at, _)) => name[..truncate_at].to_string(),
|
||||
None => name,
|
||||
};
|
||||
|
||||
partitions.push(PartitionLayout {
|
||||
layer: layer.clone(),
|
||||
partition_number: (idx + 1) as u32,
|
||||
start_lba,
|
||||
end_lba,
|
||||
name,
|
||||
});
|
||||
|
||||
// Next partition starts after this one
|
||||
current_lba = end_lba + 1;
|
||||
}
|
||||
|
||||
// Calculate backup GPT position
|
||||
// Backup entries are placed after the last partition, aligned
|
||||
let backup_entries_lba = align_up(current_lba, ALIGNMENT_LBA);
|
||||
let backup_header_lba = backup_entries_lba + ENTRIES_SECTORS;
|
||||
let total_sectors = backup_header_lba + 1;
|
||||
|
||||
let last_usable_lba = backup_entries_lba - 1;
|
||||
|
||||
// Validate that all partitions fit in usable area
|
||||
for (idx, part) in partitions.iter().enumerate() {
|
||||
if part.end_lba > last_usable_lba {
|
||||
return Err(anyhow!(
|
||||
"partition {} (end_lba={}) exceeds last usable LBA ({})",
|
||||
idx,
|
||||
part.end_lba,
|
||||
last_usable_lba
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(GptDiskLayout {
|
||||
partitions,
|
||||
total_sectors,
|
||||
lb_size,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generate GPT head metadata and return layout information
|
||||
///
|
||||
/// This is the main entry point for GPT metadata generation.
|
||||
/// It creates a temporary full GPT image (needed by the gpt crate to
|
||||
/// produce valid primary structures), extracts the head region, patches
|
||||
/// the primary header to remove references to backup GPT, and discards
|
||||
/// the rest.
|
||||
///
|
||||
/// Output:
|
||||
/// - gpt_meta_head.img: Primary GPT structures (MBR + GPT header + partition entries + padding)
|
||||
#[allow(unused_variables)]
|
||||
pub fn generate_gpt_metadata(
|
||||
sid: &str,
|
||||
cid: &str,
|
||||
erofs_layers: Vec<ErofsLayer>,
|
||||
container_dir: &Path,
|
||||
) -> Result<(GptDiskLayout, GptMetadataFiles)> {
|
||||
if erofs_layers.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no EROFS layers provided for GPT metadata generation"
|
||||
));
|
||||
}
|
||||
|
||||
let mut layout = calculate_gpt_layout(&erofs_layers)?;
|
||||
if layout.partitions.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no partitions in layout, cannot generate GPT metadata"
|
||||
));
|
||||
}
|
||||
|
||||
let full_path = container_dir.join(GPT_META_FULL_IMG);
|
||||
generate_full_gpt_image(&layout, &full_path).context("failed to generate full GPT image")?;
|
||||
let _cleanup = scopeguard::guard((), |_| {
|
||||
let _ = fs::remove_file(&full_path);
|
||||
});
|
||||
|
||||
// Extract head: LBA 0 to FIRST_PARTITION_LBA (2048 sectors = 1 MiB)
|
||||
let lb_size = layout.lb_size;
|
||||
let head_sectors = FIRST_PARTITION_LBA;
|
||||
let head_size = head_sectors * lb_size;
|
||||
let head_path = container_dir.join(GPT_META_HEAD_IMG);
|
||||
extract_file_range(&full_path, &head_path, 0, head_size)
|
||||
.context("failed to extract GPT head metadata")?;
|
||||
|
||||
// Patch the primary GPT header so AlternateLBA / LastUsableLBA are
|
||||
let last_partition_end = layout.partitions.last().unwrap().end_lba;
|
||||
patch_primary_gpt_header(&head_path, last_partition_end)
|
||||
.context("failed to patch primary GPT header")?;
|
||||
|
||||
// Adjust the layout to reflect the virtual disk size (no backup).
|
||||
layout.total_sectors = last_partition_end + 1;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Generated GPT head file: {} ({} sectors, {} bytes, virtual disk {} sectors)",
|
||||
head_path.display(),
|
||||
head_sectors,
|
||||
head_size,
|
||||
layout.total_sectors
|
||||
);
|
||||
|
||||
let metadata_files = GptMetadataFiles {
|
||||
head_path,
|
||||
head_sectors,
|
||||
pad_paths: Vec::new(),
|
||||
};
|
||||
|
||||
Ok((layout, metadata_files))
|
||||
}
|
||||
|
||||
fn generate_full_gpt_image(layout: &GptDiskLayout, output_path: &Path) -> Result<()> {
|
||||
let lb_size = layout.lb_size;
|
||||
let total_size = layout.total_sectors * lb_size;
|
||||
|
||||
let mut file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(output_path)
|
||||
.context(format!(
|
||||
"failed to create full GPT image: {}",
|
||||
output_path.display()
|
||||
))?;
|
||||
|
||||
file.set_len(total_size)
|
||||
.context("failed to pre-allocate full GPT image")?;
|
||||
|
||||
let mbr =
|
||||
ProtectiveMBR::with_lb_size(u32::try_from(layout.total_sectors - 1).unwrap_or(0xFFFF_FFFF));
|
||||
mbr.overwrite_lba0(&mut file)
|
||||
.context("failed to write Protective MBR")?;
|
||||
|
||||
let mut gdisk = GptConfig::new()
|
||||
.writable(true)
|
||||
.logical_block_size(LogicalBlockSize::Lb512)
|
||||
.change_partition_count(true)
|
||||
.create_from_device(file, None)
|
||||
.context("failed to initialize GPT config")?;
|
||||
|
||||
for part_layout in &layout.partitions {
|
||||
let part_size_bytes = (part_layout.end_lba - part_layout.start_lba + 1) * lb_size;
|
||||
gdisk
|
||||
.add_partition(
|
||||
&part_layout.name,
|
||||
part_size_bytes,
|
||||
partition_types::LINUX_FS,
|
||||
0,
|
||||
Some(ALIGNMENT_LBA),
|
||||
)
|
||||
.context(format!("failed to add partition '{}'", part_layout.name))?;
|
||||
}
|
||||
|
||||
let mut file = gdisk
|
||||
.write()
|
||||
.context("failed to write GPT partition table")?;
|
||||
file.flush().context("failed to flush full GPT image")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Patch the primary GPT header in the extracted head file to remove
|
||||
/// backup GPT references.
|
||||
///
|
||||
/// Sets `AlternateLBA` to one sector beyond the virtual disk (so the kernel
|
||||
/// detects "no valid backup" and falls back to the primary) and
|
||||
/// `LastUsableLBA` to the end of the last partition, then recomputes the
|
||||
/// header CRC32.
|
||||
fn patch_primary_gpt_header(head_path: &Path, last_partition_end_lba: u64) -> Result<()> {
|
||||
let mut file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(head_path)
|
||||
.context("failed to open head file for patching")?;
|
||||
|
||||
// Read the 92-byte GPT header starting at LBA 1.
|
||||
file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
|
||||
let mut header = [0u8; GPT_HEADER_SIZE];
|
||||
file.read_exact(&mut header)?;
|
||||
|
||||
// AlternateLBA (offset 32, 8 bytes LE) — point beyond virtual disk
|
||||
let alternate_lba = last_partition_end_lba + 1;
|
||||
header[32..40].copy_from_slice(&alternate_lba.to_le_bytes());
|
||||
|
||||
// LastUsableLBA (offset 48, 8 bytes LE) — last partition end
|
||||
header[48..56].copy_from_slice(&last_partition_end_lba.to_le_bytes());
|
||||
|
||||
// Zero HeaderCRC32 (offset 16, 4 bytes LE) before computing new CRC
|
||||
header[16..20].copy_from_slice(&0u32.to_le_bytes());
|
||||
|
||||
let new_crc = {
|
||||
let mut digest = CRC_32.digest();
|
||||
digest.update(&header);
|
||||
digest.finalize()
|
||||
};
|
||||
header[16..20].copy_from_slice(&new_crc.to_le_bytes());
|
||||
|
||||
// Write patched header back
|
||||
file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
|
||||
file.write_all(&header)?;
|
||||
file.flush()?;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Patched primary GPT header: AlternateLBA={}, LastUsableLBA={}, CRC32={:#010x}",
|
||||
alternate_lba,
|
||||
last_partition_end_lba,
|
||||
new_crc
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn extract_file_range(src: &Path, dst: &Path, offset: u64, size: u64) -> Result<()> {
|
||||
let mut src_file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(src)
|
||||
.context(format!("failed to open source file: {}", src.display()))?;
|
||||
src_file
|
||||
.seek(SeekFrom::Start(offset))
|
||||
.context("failed to seek source file")?;
|
||||
|
||||
let mut dst_file = fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(dst)
|
||||
.context(format!("failed to create output file: {}", dst.display()))?;
|
||||
|
||||
dst_file
|
||||
.set_len(size)
|
||||
.context("failed to pre-allocate output file")?;
|
||||
|
||||
let mut limited = src_file.take(size);
|
||||
std::io::copy(&mut limited, &mut dst_file).context("failed to copy file range")?;
|
||||
dst_file.flush().context("failed to flush output file")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate padding file content (all zeros)
|
||||
///
|
||||
/// Returns the file path and size in sectors.
|
||||
pub fn generate_padding_file(output_path: &Path, size_sectors: u64) -> Result<u64> {
|
||||
let size_bytes = size_sectors * SECTOR_SIZE;
|
||||
|
||||
if size_bytes == 0 {
|
||||
return Err(anyhow!("cannot create zero-size padding file"));
|
||||
}
|
||||
|
||||
let mut file = fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(output_path)
|
||||
.context(format!(
|
||||
"failed to create padding file: {}",
|
||||
output_path.display()
|
||||
))?;
|
||||
|
||||
// Pre-allocate with zeros
|
||||
file.set_len(size_bytes)
|
||||
.context("failed to pre-allocate padding file")?;
|
||||
file.flush().context("failed to flush padding file")?;
|
||||
drop(file);
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Generated padding file: {} ({} sectors, {} bytes)",
|
||||
output_path.display(),
|
||||
size_sectors,
|
||||
size_bytes
|
||||
);
|
||||
|
||||
Ok(size_sectors)
|
||||
}
|
||||
@@ -54,6 +54,9 @@ pub mod rootless;
|
||||
/// machine type
|
||||
pub mod machine_type;
|
||||
|
||||
/// GPT (GUID Partition Table) disk layout and metadata generation.
|
||||
pub mod gpt_disk;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use crate::rootless::{is_rootless, rootless_dir};
|
||||
|
||||
@@ -24,6 +24,10 @@ use hypervisor::{
|
||||
use kata_types::device::{
|
||||
DRIVER_BLK_CCW_TYPE as KATA_CCW_DEV_TYPE, DRIVER_BLK_PCI_TYPE as KATA_BLK_DEV_TYPE,
|
||||
};
|
||||
use kata_types::gpt_disk::{
|
||||
extract_snapshot_id, generate_gpt_metadata, generate_padding_file, get_erofs_layer_size,
|
||||
ErofsLayer, GptDiskLayout, GptMetadataFiles,
|
||||
};
|
||||
use kata_types::mount::Mount;
|
||||
use oci_spec::runtime as oci;
|
||||
use std::fs;
|
||||
@@ -39,14 +43,11 @@ pub(crate) const EROFS_ROOTFS_TYPE: &str = "erofs";
|
||||
pub(crate) const RW_LAYER_ROOTFS_TYPE: &str = "ext4";
|
||||
/// VMDK file extension for merged EROFS image
|
||||
const EROFS_MERGED_VMDK: &str = "merged_fs.vmdk";
|
||||
/// Maximum number of virtio-blk devices allowed for multi-layer EROFS rootfs.
|
||||
///
|
||||
/// This defensive limit prevents exhausting PCI slot resources, especially on
|
||||
/// lightweight VMMs (Dragonball, Cloud Hypervisor) where the PCIe root bus has
|
||||
/// only 32 slots (PCIE_ROOT_BUS_SLOTS_CAPACITY). For QEMU with PCI bridges
|
||||
/// (30 slots/bridge), this limit is conservative but still applies as a uniform
|
||||
/// safeguard across all hypervisor backends.
|
||||
const MAX_VIRTIO_BLK_DEVICES: usize = 10;
|
||||
|
||||
/// Maximum number of rootfs layer devices (erofs + rw layer) allowed in multi-layer EROFS mode.
|
||||
/// This is a pre-flight sanity check before VMDK merging, to prevent excessive block devices
|
||||
/// when many layers are used without fsmerge.
|
||||
const MAX_ROOTFS_LAYER_DEVICES: usize = 129; // 128 EROFS layers + 1 rw layer (129 total)
|
||||
/// Maximum sectors per 2GB extent (2GB / 512 bytes per sector)
|
||||
const MAX_2GB_EXTENT_SECTORS: u64 = 0x8000_0000 >> 9;
|
||||
/// Sectors per track for VMDK geometry
|
||||
@@ -60,12 +61,25 @@ const VMDK_ADAPTER_TYPE: &str = "ide";
|
||||
/// VMDK hardware version
|
||||
const VMDK_HW_VERSION: &str = "4";
|
||||
/// Default shared directory for guest rootfs VMDK files (for multi-layer EROFS)
|
||||
const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
|
||||
pub(crate) const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
|
||||
/// Template for mkdir option in overlay mount (X-containerd.mkdir.path)
|
||||
const X_CONTAINERD_MKDIR_PATH: &str = "X-containerd.mkdir.path=";
|
||||
/// Template for mkdir option passed to guest agent (X-kata.mkdir.path)
|
||||
const X_KATA_MKDIR_PATH: &str = "X-kata.mkdir.path=";
|
||||
|
||||
/// Create the per-container directory under the shared filesystem root.
|
||||
pub(crate) fn ensure_container_dir(sid: &str, cid: &str) -> Result<PathBuf> {
|
||||
let dir = PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS))
|
||||
.join(sid)
|
||||
.join(cid);
|
||||
fs::create_dir_all(&dir).context(format!(
|
||||
"failed to create container directory: {}",
|
||||
dir.display()
|
||||
))?;
|
||||
|
||||
Ok(dir)
|
||||
}
|
||||
|
||||
/// Generate merged VMDK file from multiple EROFS devices
|
||||
///
|
||||
/// Creates a VMDK descriptor that combines multiple EROFS images into a single
|
||||
@@ -104,14 +118,7 @@ async fn generate_merged_erofs_vmdk(
|
||||
}
|
||||
|
||||
// For multiple devices, create VMDK descriptor
|
||||
let sandbox_dir =
|
||||
PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS)).join(sid);
|
||||
let container_dir = sandbox_dir.join(cid);
|
||||
fs::create_dir_all(&container_dir).context(format!(
|
||||
"failed to create container directory: {}",
|
||||
container_dir.display()
|
||||
))?;
|
||||
|
||||
let container_dir = ensure_container_dir(sid, cid)?;
|
||||
let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
|
||||
|
||||
info!(
|
||||
@@ -129,6 +136,105 @@ async fn generate_merged_erofs_vmdk(
|
||||
Ok((vmdk_path.display().to_string(), BlockDeviceFormat::Vmdk))
|
||||
}
|
||||
|
||||
/// Helper struct for writing VMDK descriptor files atomically.
|
||||
///
|
||||
/// Encapsulates the common VMDK descriptor format: header, extent descriptions,
|
||||
/// DDB footer, and atomic write (temp file + rename). Used by both fsmerge mode
|
||||
/// (`create_vmdk_descriptor`) and GPT mode (`create_gpt_vmdk_descriptor`).
|
||||
struct VmdkDescriptorWriter {
|
||||
writer: BufWriter<fs::File>,
|
||||
temp_path: PathBuf,
|
||||
final_path: PathBuf,
|
||||
}
|
||||
|
||||
impl VmdkDescriptorWriter {
|
||||
fn new(vmdk_path: &Path) -> Result<Self> {
|
||||
let temp_path = vmdk_path.with_extension("vmdk.tmp");
|
||||
if temp_path.components().any(|c| c == Component::ParentDir) {
|
||||
return Err(anyhow!("Invalid input: {}", temp_path.display()));
|
||||
}
|
||||
let file = fs::File::create(&temp_path).context(format!(
|
||||
"failed to create temp VMDK file: {}",
|
||||
temp_path.display()
|
||||
))?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
writeln!(writer, "# Disk DescriptorFile")?;
|
||||
writeln!(writer, "version=1")?;
|
||||
writeln!(writer, "CID=fffffffe")?;
|
||||
writeln!(writer, "parentCID=ffffffff")?;
|
||||
writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
|
||||
writeln!(writer)?;
|
||||
writeln!(writer, "# Extent description")?;
|
||||
|
||||
Ok(Self {
|
||||
writer,
|
||||
temp_path,
|
||||
final_path: vmdk_path.to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
// Write a single extent line (no 2GB chunking).
|
||||
fn write_extent(&mut self, path: &str, sectors: u64, file_offset: u64) -> Result<()> {
|
||||
writeln!(
|
||||
self.writer,
|
||||
"RW {} FLAT \"{}\" {}",
|
||||
sectors, path, file_offset
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write extent lines with 2GB chunking for large files.
|
||||
fn write_extent_chunked(&mut self, path: &str, total_sectors: u64) -> Result<()> {
|
||||
let mut remaining = total_sectors;
|
||||
let mut file_offset: u64 = 0;
|
||||
while remaining > 0 {
|
||||
let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
|
||||
self.write_extent(path, chunk, file_offset)?;
|
||||
file_offset += chunk;
|
||||
remaining -= chunk;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write DDB footer, flush, and atomically rename to final path.
|
||||
fn finalize(mut self, total_sectors: u64) -> Result<()> {
|
||||
writeln!(self.writer)?;
|
||||
|
||||
let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
|
||||
|
||||
writeln!(self.writer, "# The Disk Data Base")?;
|
||||
writeln!(self.writer, "#DDB")?;
|
||||
writeln!(self.writer)?;
|
||||
writeln!(
|
||||
self.writer,
|
||||
"ddb.virtualHWVersion = \"{}\"",
|
||||
VMDK_HW_VERSION
|
||||
)?;
|
||||
writeln!(self.writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
|
||||
writeln!(self.writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
|
||||
writeln!(
|
||||
self.writer,
|
||||
"ddb.geometry.sectors = \"{}\"",
|
||||
SECTORS_PER_TRACK
|
||||
)?;
|
||||
writeln!(self.writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
|
||||
|
||||
self.writer
|
||||
.flush()
|
||||
.context("failed to flush VMDK descriptor")?;
|
||||
drop(self.writer);
|
||||
|
||||
fs::rename(&self.temp_path, &self.final_path).context(format!(
|
||||
"failed to rename temp VMDK {} -> {}",
|
||||
self.temp_path.display(),
|
||||
self.final_path.display()
|
||||
))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create VMDK descriptor for multiple EROFS extents (flatten device)
|
||||
///
|
||||
/// Generates a VMDK descriptor file (twoGbMaxExtentFlat format) that references
|
||||
@@ -141,7 +247,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
|
||||
));
|
||||
}
|
||||
|
||||
// collect extent information without writing anything.
|
||||
struct ExtentInfo {
|
||||
path: String,
|
||||
total_sectors: u64,
|
||||
@@ -160,9 +265,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
|
||||
continue;
|
||||
}
|
||||
|
||||
// round up to whole sectors to avoid losing tail bytes on non-aligned files.
|
||||
// VMDK extents are measured in 512-byte sectors; a file that is not sector-aligned
|
||||
// still needs the last partial sector to be addressable by the VM.
|
||||
let sectors = file_size.div_ceil(512);
|
||||
|
||||
if file_size % 512 != 0 {
|
||||
@@ -197,43 +299,9 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
|
||||
));
|
||||
}
|
||||
|
||||
// write descriptor to a temp file, then atomically rename.
|
||||
let tmp_path = vmdk_path.with_extension("vmdk.tmp");
|
||||
// Prevent path traversal attacks by rejecting paths containing '..'.
|
||||
if tmp_path.components().any(|c| c == Component::ParentDir) {
|
||||
return Err(anyhow!("Invalid input: {}", tmp_path.display()));
|
||||
}
|
||||
let file = fs::File::create(&tmp_path).context(format!(
|
||||
"failed to create temp VMDK file: {}",
|
||||
tmp_path.display()
|
||||
))?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
// Header
|
||||
writeln!(writer, "# Disk DescriptorFile")?;
|
||||
writeln!(writer, "version=1")?;
|
||||
writeln!(writer, "CID=fffffffe")?;
|
||||
writeln!(writer, "parentCID=ffffffff")?;
|
||||
writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
|
||||
writeln!(writer)?;
|
||||
|
||||
// Extent descriptions
|
||||
writeln!(writer, "# Extent description")?;
|
||||
let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
|
||||
for extent in &extents {
|
||||
let mut remaining = extent.total_sectors;
|
||||
let mut file_offset: u64 = 0;
|
||||
|
||||
while remaining > 0 {
|
||||
let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
|
||||
writeln!(
|
||||
writer,
|
||||
"RW {} FLAT \"{}\" {}",
|
||||
chunk, extent.path, file_offset
|
||||
)?;
|
||||
file_offset += chunk;
|
||||
remaining -= chunk;
|
||||
}
|
||||
|
||||
vmdk.write_extent_chunked(&extent.path, extent.total_sectors)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK extent: {} ({} sectors, {} extent chunk(s))",
|
||||
@@ -242,45 +310,149 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
|
||||
extent.total_sectors.div_ceil(MAX_2GB_EXTENT_SECTORS)
|
||||
);
|
||||
}
|
||||
writeln!(writer)?;
|
||||
|
||||
// Disk Data Base (DDB)
|
||||
// Geometry: cylinders = ceil(total_sectors / (sectors_per_track * heads))
|
||||
let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
|
||||
|
||||
writeln!(writer, "# The Disk Data Base")?;
|
||||
writeln!(writer, "#DDB")?;
|
||||
writeln!(writer)?;
|
||||
writeln!(writer, "ddb.virtualHWVersion = \"{}\"", VMDK_HW_VERSION)?;
|
||||
writeln!(writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
|
||||
writeln!(writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
|
||||
writeln!(writer, "ddb.geometry.sectors = \"{}\"", SECTORS_PER_TRACK)?;
|
||||
writeln!(writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
|
||||
|
||||
// Flush the BufWriter to ensure all data is written before rename.
|
||||
writer.flush().context("failed to flush VMDK descriptor")?;
|
||||
// Explicitly drop to close the file handle before rename.
|
||||
drop(writer);
|
||||
|
||||
// atomic rename: tmp -> final path.
|
||||
fs::rename(&tmp_path, vmdk_path).context(format!(
|
||||
"failed to rename temp VMDK {} -> {}",
|
||||
tmp_path.display(),
|
||||
vmdk_path.display()
|
||||
))?;
|
||||
vmdk.finalize(total_sectors)?;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK descriptor created: {} (total {} sectors, {} extents, {} cylinders)",
|
||||
"VMDK descriptor created: {} (total {} sectors, {} extents)",
|
||||
vmdk_path.display(),
|
||||
total_sectors,
|
||||
extents.len(),
|
||||
cylinders
|
||||
extents.len()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate GPT-partitioned VMDK and return layout information for per-partition storage creation
|
||||
///
|
||||
/// Returns: (vmdk_path, BlockDeviceFormat::Vmdk, GptDiskLayout, GptMetadataFiles)
|
||||
fn generate_gpt_vmdk_with_layout(
|
||||
sid: &str,
|
||||
cid: &str,
|
||||
erofs_layers: Vec<ErofsLayer>,
|
||||
) -> Result<(String, BlockDeviceFormat, GptDiskLayout, GptMetadataFiles)> {
|
||||
if erofs_layers.is_empty() {
|
||||
return Err(anyhow!("no EROFS layers provided for GPT VMDK generation"));
|
||||
}
|
||||
|
||||
// Validate all layer paths exist and are regular files
|
||||
for layer in &erofs_layers {
|
||||
let metadata = fs::metadata(&layer.path)
|
||||
.context(format!("EROFS layer path not accessible: {}", layer.path))?;
|
||||
if !metadata.is_file() {
|
||||
return Err(anyhow!(
|
||||
"EROFS layer path is not a regular file: {}",
|
||||
layer.path
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Create container directory
|
||||
let container_dir = ensure_container_dir(sid, cid)?;
|
||||
let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"creating GPT-partitioned VMDK for {} EROFS layers: {}",
|
||||
erofs_layers.len(),
|
||||
vmdk_path.display()
|
||||
);
|
||||
|
||||
// Generate GPT metadata files
|
||||
let (layout, mut gpt_files) = generate_gpt_metadata(sid, cid, erofs_layers, &container_dir)
|
||||
.context("failed to generate GPT metadata")?;
|
||||
|
||||
// Create VMDK descriptor with GPT layout and collect generated padding paths
|
||||
let pad_paths = create_gpt_vmdk_descriptor(&vmdk_path, &layout, &gpt_files)
|
||||
.context("failed to create GPT VMDK descriptor")?;
|
||||
gpt_files.pad_paths = pad_paths;
|
||||
|
||||
Ok((
|
||||
vmdk_path.display().to_string(),
|
||||
BlockDeviceFormat::Vmdk,
|
||||
layout,
|
||||
gpt_files,
|
||||
))
|
||||
}
|
||||
|
||||
/// Create VMDK descriptor for GPT-partitioned disk
|
||||
///
|
||||
/// Returns the list of generated padding file paths for cleanup tracking.
|
||||
fn create_gpt_vmdk_descriptor(
|
||||
vmdk_path: &Path,
|
||||
layout: &GptDiskLayout,
|
||||
gpt_files: &GptMetadataFiles,
|
||||
) -> Result<Vec<PathBuf>> {
|
||||
let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
|
||||
let mut pad_paths: Vec<PathBuf> = Vec::new();
|
||||
|
||||
// 1. GPT head metadata
|
||||
vmdk.write_extent(
|
||||
&gpt_files.head_path.display().to_string(),
|
||||
gpt_files.head_sectors,
|
||||
0,
|
||||
)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK extent: GPT head ({} sectors) at {}",
|
||||
gpt_files.head_sectors,
|
||||
gpt_files.head_path.display()
|
||||
);
|
||||
|
||||
// 2. Layer extents with padding gaps
|
||||
// head ends at LBA 2047, so first gap starts at LBA 2048.
|
||||
let mut prev_end_lba = gpt_files.head_sectors - 1;
|
||||
|
||||
let metadata_dir = gpt_files.head_path.parent().ok_or_else(|| {
|
||||
anyhow!(
|
||||
"GPT head file has no parent directory: {}",
|
||||
gpt_files.head_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
for (idx, part) in layout.partitions.iter().enumerate() {
|
||||
let gap_start_lba = prev_end_lba + 1;
|
||||
if part.start_lba > gap_start_lba {
|
||||
let gap_sectors = part.start_lba - gap_start_lba;
|
||||
let pad_path = metadata_dir.join(format!("pad-{}.img", idx));
|
||||
|
||||
generate_padding_file(&pad_path, gap_sectors).context(format!(
|
||||
"failed to generate padding file: {}",
|
||||
pad_path.display()
|
||||
))?;
|
||||
|
||||
vmdk.write_extent(&pad_path.display().to_string(), gap_sectors, 0)?;
|
||||
pad_paths.push(pad_path);
|
||||
}
|
||||
|
||||
vmdk.write_extent_chunked(&part.layer.path, part.layer.size_sectors)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK extent: {} (partition {}, LBA {}-{}, {} sectors)",
|
||||
part.layer.path,
|
||||
part.partition_number,
|
||||
part.start_lba,
|
||||
part.end_lba,
|
||||
part.layer.size_sectors
|
||||
);
|
||||
|
||||
prev_end_lba = part.end_lba;
|
||||
}
|
||||
|
||||
vmdk.finalize(layout.total_sectors)?;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"GPT VMDK descriptor created: {} (total {} sectors, {} partitions)",
|
||||
vmdk_path.display(),
|
||||
layout.total_sectors,
|
||||
layout.partitions.len()
|
||||
);
|
||||
|
||||
Ok(pad_paths)
|
||||
}
|
||||
|
||||
fn extract_block_device_info(
|
||||
device_info: &DeviceType,
|
||||
read_only: bool,
|
||||
@@ -338,10 +510,16 @@ fn extract_block_device_info(
|
||||
pub(crate) struct ErofsMultiLayerRootfs {
|
||||
guest_path: String,
|
||||
device_ids: Vec<String>,
|
||||
rwlayer_storage: Option<Storage>, // Writable layer storage (upper layer), typically ext4
|
||||
erofs_storage: Option<Storage>,
|
||||
/// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
|
||||
// Writable layer storage (upper layer), typically ext4
|
||||
rwlayer_storage: Option<Storage>,
|
||||
// Read-only EROFS layer storages (lower layers), one per partition in GPT mode
|
||||
erofs_storages: Vec<Storage>,
|
||||
// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
|
||||
vmdk_path: Option<PathBuf>,
|
||||
// Paths to generated GPT metadata files (head, padding) for cleanup
|
||||
gpt_metadata_paths: Vec<PathBuf>,
|
||||
// Container-scoped runtime directory that may only contain generated helper artifacts.
|
||||
generated_artifacts_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl ErofsMultiLayerRootfs {
|
||||
@@ -360,8 +538,11 @@ impl ErofsMultiLayerRootfs {
|
||||
|
||||
let mut device_ids = Vec::new();
|
||||
let mut rwlayer_storage: Option<Storage> = None;
|
||||
let mut erofs_storage: Option<Storage> = None;
|
||||
let mut erofs_storages: Vec<Storage> = Vec::new();
|
||||
let mut vmdk_path: Option<PathBuf> = None;
|
||||
let mut gpt_metadata_paths: Vec<PathBuf> = Vec::new();
|
||||
// Track whether GPT+VMDK erofs layers have already been processed in bulk.
|
||||
let mut gpt_erofs_processed = false;
|
||||
|
||||
// Directories to create (X-containerd.mkdir.path)
|
||||
let mut mkdir_dirs: Vec<String> = Vec::new();
|
||||
@@ -374,14 +555,33 @@ impl ErofsMultiLayerRootfs {
|
||||
.iter()
|
||||
.filter(|m| matches!(m.fs_type.as_str(), RW_LAYER_ROOTFS_TYPE | EROFS_ROOTFS_TYPE))
|
||||
.count();
|
||||
if expected_device_count > MAX_VIRTIO_BLK_DEVICES {
|
||||
|
||||
// TODO(Alex Lyn): fsmerge mode with single erofs mount and multiple device= options
|
||||
// may require multiple block devices if containerd does not merge layers into one file.
|
||||
// This is a fallback or default mode if fsmerge is not enabled.
|
||||
if expected_device_count > MAX_ROOTFS_LAYER_DEVICES {
|
||||
return Err(anyhow!(
|
||||
"exceeded maximum block devices for multi-layer EROFS: {} > {}",
|
||||
expected_device_count,
|
||||
MAX_VIRTIO_BLK_DEVICES
|
||||
MAX_ROOTFS_LAYER_DEVICES
|
||||
));
|
||||
}
|
||||
|
||||
// Pre-extract mkdir directives from overlay mounts before the main loop,
|
||||
// so they are available regardless of mount ordering.
|
||||
for mount in rootfs_mounts {
|
||||
if matches!(
|
||||
mount.fs_type.as_str(),
|
||||
"overlay" | "format/overlay" | "format/mkdir/overlay"
|
||||
) {
|
||||
for opt in &mount.options {
|
||||
if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
|
||||
mkdir_dirs.push(mkdir_spec.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process each mount in rootfs_mounts to set up devices and storages
|
||||
for mount in rootfs_mounts {
|
||||
match mount.fs_type.as_str() {
|
||||
@@ -407,8 +607,6 @@ impl ErofsMultiLayerRootfs {
|
||||
.await
|
||||
.context("failed to attach rw block device")?;
|
||||
|
||||
// let (device_id, guest_path, blk_driver) =
|
||||
// extract_block_device_info(&device_info, &block_driver)?;
|
||||
let (mut rwlayer, device_id) =
|
||||
extract_block_device_info(&device_info, false)
|
||||
.context("failed to get block device for rw layer")?;
|
||||
@@ -441,110 +639,253 @@ impl ErofsMultiLayerRootfs {
|
||||
}
|
||||
fmt if fmt.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE) => {
|
||||
// Mount[1]: erofs layers -> virtio-blk via VMDK /dev/vdX2
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: adding erofs layers: {}", mount.source
|
||||
);
|
||||
//
|
||||
// Two modes are supported:
|
||||
// 1. fsmerge mode: Single erofs mount with `device=` options pointing to additional files.
|
||||
// This is used when containerd has already merged layers into a single file.
|
||||
// 2. GPT+VMDK mode: Multiple independent erofs mounts (each mount is a separate layer file).
|
||||
// This is used when containerd does NOT use fsmerge, and we need to create GPT partitions.
|
||||
|
||||
// Collect all EROFS devices: source + `device=` options
|
||||
let mut erofs_devices = vec![mount.source.clone()];
|
||||
for opt in &mount.options {
|
||||
if let Some(device_path) = opt.strip_prefix("device=") {
|
||||
erofs_devices.push(device_path.to_string());
|
||||
}
|
||||
// In GPT mode, all erofs layers are processed in bulk on the first
|
||||
// encounter. Skip subsequent erofs mounts but continue iterating
|
||||
// so that later ext4 rw-layer and overlay mounts are still handled.
|
||||
if gpt_erofs_processed {
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: skipping already-processed erofs mount: {}",
|
||||
mount.source
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
|
||||
|
||||
// Generate merged VMDK file from all EROFS devices
|
||||
// Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
|
||||
let (erofs_path, erofs_format) =
|
||||
generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
|
||||
.await
|
||||
.context("failed to generate EROFS VMDK")?;
|
||||
|
||||
// Track VMDK path for cleanup (only when VMDK is actually created)
|
||||
if erofs_format == BlockDeviceFormat::Vmdk {
|
||||
vmdk_path = Some(PathBuf::from(&erofs_path));
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"EROFS block device config - path: {}, format: {:?}",
|
||||
erofs_path,
|
||||
erofs_format
|
||||
);
|
||||
|
||||
let device_config = &mut BlockConfig {
|
||||
driver_option: block_driver.clone(),
|
||||
format: erofs_format, // Vmdk for multiple devices, Raw for single device
|
||||
path_on_host: erofs_path,
|
||||
is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = do_handle_device(
|
||||
device_manager,
|
||||
&DeviceConfig::BlockCfg(device_config.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to attach erofs block device")?;
|
||||
|
||||
let (mut rolayer, device_id) = extract_block_device_info(&device_info, true)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs device attached - device_id: {} guest_path: {}",
|
||||
device_id,
|
||||
&rolayer.source
|
||||
);
|
||||
|
||||
let mut options: Vec<String> = mount
|
||||
.options
|
||||
// Collect all EROFS mounts once with their original indices.
|
||||
let erofs_mounts_indexed: Vec<(usize, &Mount)> = rootfs_mounts
|
||||
.iter()
|
||||
.filter(|o| {
|
||||
// Filter out options that are not valid erofs mount parameters:
|
||||
// 1. "loop" - not needed in VM, device is already /dev/vdX
|
||||
// 2. "device=" prefix - used for VMDK generation only, not for mount
|
||||
// 3. "X-kata." prefix - metadata markers for kata internals
|
||||
*o != "loop" && !o.starts_with("device=") && !o.starts_with("X-kata.")
|
||||
})
|
||||
.cloned()
|
||||
.enumerate()
|
||||
.filter(|(_, m)| m.fs_type.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE))
|
||||
.collect();
|
||||
let total_erofs_mounts = erofs_mounts_indexed.len();
|
||||
|
||||
// Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
|
||||
options.push("X-kata.overlay-lower".to_string());
|
||||
options.push("X-kata.multi-layer=true".to_string());
|
||||
// GPT+VMDK mode: Multiple independent erofs layer files
|
||||
if total_erofs_mounts > 1 {
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: using GPT+VMDK mode for {} independent layers",
|
||||
total_erofs_mounts
|
||||
);
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs storage options filtered: {:?} -> {:?}", mount.options, options
|
||||
);
|
||||
let mut erofs_layers = Vec::new();
|
||||
|
||||
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
|
||||
rolayer.mount_point = container_path.clone();
|
||||
rolayer.options = options;
|
||||
for (_mount_idx, erofs_mount) in &erofs_mounts_indexed {
|
||||
let layer_path = erofs_mount.source.clone();
|
||||
let size_bytes = get_erofs_layer_size(&layer_path).context(format!(
|
||||
"gptdisk: failed to get size of EROFS layer: {}",
|
||||
layer_path
|
||||
))?;
|
||||
|
||||
erofs_storage = Some(rolayer);
|
||||
device_ids.push(device_id);
|
||||
if size_bytes == 0 {
|
||||
warn!(
|
||||
sl!(),
|
||||
"gptdisk: EROFS layer {} is zero-length, skipping", layer_path
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
let size_sectors = size_bytes.div_ceil(512);
|
||||
let snapshot_id = extract_snapshot_id(&layer_path);
|
||||
|
||||
erofs_layers.push(ErofsLayer {
|
||||
path: layer_path,
|
||||
size_sectors,
|
||||
snapshot_id,
|
||||
});
|
||||
}
|
||||
|
||||
if erofs_layers.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"gptdisk: no valid EROFS layers found for GPT VMDK"
|
||||
));
|
||||
}
|
||||
|
||||
// Generate GPT-partitioned VMDK and get layout information
|
||||
let (erofs_path, erofs_format, layout, gpt_files) =
|
||||
generate_gpt_vmdk_with_layout(sid, cid, erofs_layers)
|
||||
.context("gptdisk: failed to generate GPT VMDK")?;
|
||||
|
||||
// Track VMDK path for cleanup
|
||||
vmdk_path = Some(PathBuf::from(&erofs_path));
|
||||
|
||||
// Track GPT metadata files (head + padding) for cleanup
|
||||
gpt_metadata_paths.push(gpt_files.head_path.clone());
|
||||
gpt_metadata_paths.extend(gpt_files.pad_paths.iter().cloned());
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"GPT VMDK created - path: {}, format: {:?}, {} partitions",
|
||||
erofs_path,
|
||||
erofs_format,
|
||||
layout.partitions.len()
|
||||
);
|
||||
|
||||
let device_config = &mut BlockConfig {
|
||||
driver_option: block_driver.clone(),
|
||||
format: erofs_format,
|
||||
path_on_host: erofs_path,
|
||||
is_readonly: true,
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = do_handle_device(
|
||||
device_manager,
|
||||
&DeviceConfig::BlockCfg(device_config.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to attach GPT VMDK block device")?;
|
||||
|
||||
let (base_device, device_id) =
|
||||
extract_block_device_info(&device_info, true)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"GPT VMDK device attached - device_id: {} guest_path: {}",
|
||||
device_id,
|
||||
&base_device.source
|
||||
);
|
||||
|
||||
device_ids.push(device_id);
|
||||
|
||||
// Create a storage entry for each GPT partition.
|
||||
for (idx, part) in layout.partitions.iter().enumerate() {
|
||||
let mut rolayer = base_device.clone();
|
||||
let options: Vec<String> = vec![
|
||||
"X-kata.overlay-lower".to_string(),
|
||||
"X-kata.multi-layer=true".to_string(),
|
||||
"X-kata.gpt-partitioned=true".to_string(),
|
||||
format!("X-kata.partition-number={}", part.partition_number),
|
||||
];
|
||||
|
||||
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
|
||||
rolayer.mount_point = container_path.clone();
|
||||
rolayer.options = options;
|
||||
rolayer.source = base_device.source.clone();
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Created storage for GPT partition {} (partition number {}, LBA {}-{})",
|
||||
idx, part.partition_number, part.start_lba, part.end_lba
|
||||
);
|
||||
|
||||
erofs_storages.push(rolayer);
|
||||
}
|
||||
|
||||
// Mark GPT erofs as processed so subsequent erofs mounts
|
||||
// in the loop are skipped, while still allowing ext4 and
|
||||
// overlay mounts to be visited.
|
||||
gpt_erofs_processed = true;
|
||||
} else {
|
||||
// fsmerge mode: Single erofs mount with device= options
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: using fsmerge mode for erofs layers: {}",
|
||||
mount.source
|
||||
);
|
||||
|
||||
// Collect all EROFS devices: source + `device=` options
|
||||
let mut erofs_devices = vec![mount.source.clone()];
|
||||
for opt in &mount.options {
|
||||
if let Some(device_path) = opt.strip_prefix("device=") {
|
||||
erofs_devices.push(device_path.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
|
||||
|
||||
// Generate merged VMDK file from all EROFS devices
|
||||
// Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
|
||||
let (erofs_path, erofs_format) =
|
||||
generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
|
||||
.await
|
||||
.context("failed to generate EROFS VMDK")?;
|
||||
|
||||
// Track VMDK path for cleanup (only when VMDK is actually created)
|
||||
if erofs_format == BlockDeviceFormat::Vmdk {
|
||||
vmdk_path = Some(PathBuf::from(&erofs_path));
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"EROFS block device config - path: {}, format: {:?}",
|
||||
erofs_path,
|
||||
erofs_format
|
||||
);
|
||||
|
||||
let device_config = &mut BlockConfig {
|
||||
driver_option: block_driver.clone(),
|
||||
format: erofs_format, // Vmdk for multiple devices, Raw for single device
|
||||
path_on_host: erofs_path,
|
||||
is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = do_handle_device(
|
||||
device_manager,
|
||||
&DeviceConfig::BlockCfg(device_config.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to attach erofs block device")?;
|
||||
|
||||
let (mut rolayer, device_id) =
|
||||
extract_block_device_info(&device_info, true)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs device attached - device_id: {} guest_path: {}",
|
||||
device_id,
|
||||
&rolayer.source
|
||||
);
|
||||
|
||||
let mut options: Vec<String> = mount
|
||||
.options
|
||||
.iter()
|
||||
.filter(|o| {
|
||||
// Filter out options that are not valid erofs mount parameters:
|
||||
// 1. "loop" - not needed in VM, device is already /dev/vdX
|
||||
// 2. "device=" prefix - used for VMDK generation only, not for mount
|
||||
// 3. "X-kata." prefix - metadata markers for kata internals
|
||||
*o != "loop"
|
||||
&& !o.starts_with("device=")
|
||||
&& !o.starts_with("X-kata.")
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
// Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
|
||||
options.push("X-kata.overlay-lower".to_string());
|
||||
options.push("X-kata.multi-layer=true".to_string());
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs storage options filtered: {:?} -> {:?}", mount.options, options
|
||||
);
|
||||
|
||||
rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
|
||||
rolayer.mount_point = container_path.clone();
|
||||
rolayer.options = options;
|
||||
|
||||
erofs_storages.push(rolayer);
|
||||
device_ids.push(device_id);
|
||||
}
|
||||
}
|
||||
fmt if fmt.eq_ignore_ascii_case("overlay")
|
||||
|| fmt.eq_ignore_ascii_case("format/overlay")
|
||||
|| fmt.eq_ignore_ascii_case("format/mkdir/overlay") =>
|
||||
{
|
||||
// Mount[2]: overlay to combine rwlayer (upper) + erofs (lower)
|
||||
// mkdir directives already extracted before the main loop
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: parsing overlay mount, options: {:?}", mount.options
|
||||
"multi-layer erofs: overlay mount (mkdir directives pre-extracted)"
|
||||
);
|
||||
|
||||
// Parse mkdir options (X-containerd.mkdir.path)
|
||||
for opt in &mount.options {
|
||||
if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
|
||||
// Keep the full spec (path:mode or path:mode:uid:gid) for guest agent
|
||||
mkdir_dirs.push(mkdir_spec.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
info!(
|
||||
@@ -572,8 +913,14 @@ impl ErofsMultiLayerRootfs {
|
||||
guest_path: container_path,
|
||||
device_ids,
|
||||
rwlayer_storage,
|
||||
erofs_storage,
|
||||
erofs_storages,
|
||||
vmdk_path,
|
||||
gpt_metadata_paths,
|
||||
generated_artifacts_dir: PathBuf::from(
|
||||
kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS),
|
||||
)
|
||||
.join(sid)
|
||||
.join(cid),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -589,16 +936,18 @@ impl Rootfs for ErofsMultiLayerRootfs {
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
// Return all storages for multi-layer EROFS (rw layer + erofs layer) to guest agent.
|
||||
// Guest agent needs both to create overlay mount
|
||||
// Return all storages for multi-layer EROFS (rw layer + erofs layers) to guest agent.
|
||||
// Guest agent needs all of them to create overlay mount.
|
||||
// In GPT mode, each partition has its own storage entry.
|
||||
let mut storages = Vec::new();
|
||||
|
||||
if let Some(rwlayer) = self.rwlayer_storage.clone() {
|
||||
storages.push(rwlayer);
|
||||
}
|
||||
|
||||
if let Some(erofs) = self.erofs_storage.clone() {
|
||||
storages.push(erofs);
|
||||
// Add all EROFS layer storages (single storage in fsmerge mode, multiple in GPT mode)
|
||||
for erofs in &self.erofs_storages {
|
||||
storages.push(erofs.clone());
|
||||
}
|
||||
|
||||
if storages.is_empty() {
|
||||
@@ -613,23 +962,27 @@ impl Rootfs for ErofsMultiLayerRootfs {
|
||||
}
|
||||
|
||||
async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()> {
|
||||
// Helper function to safely remove a file if it exists and is within the specified directory.
|
||||
let safely_remove_file = |path: &Path, dir: &Path| -> Result<()> {
|
||||
if path.starts_with(dir) && path.exists() {
|
||||
fs::remove_file(path).context(format!("failed to remove file: {}", path.display()))?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let mut dm = device_manager.write().await;
|
||||
for device_id in &self.device_ids {
|
||||
dm.try_remove_device(device_id).await?;
|
||||
}
|
||||
|
||||
// Clean up generated VMDK descriptor file if it exists (only for multi-device case)
|
||||
// Clean up generated VMDK descriptor file if it exists.
|
||||
if let Some(ref vmdk) = self.vmdk_path {
|
||||
if vmdk.exists() {
|
||||
if let Err(e) = fs::remove_file(vmdk) {
|
||||
warn!(
|
||||
sl!(),
|
||||
"failed to remove VMDK descriptor {}: {}",
|
||||
vmdk.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
safely_remove_file(vmdk, &self.generated_artifacts_dir)?;
|
||||
}
|
||||
|
||||
// Clean up GPT metadata files (head, padding).
|
||||
for metadata_path in &self.gpt_metadata_paths {
|
||||
safely_remove_file(metadata_path, &self.generated_artifacts_dir)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -91,7 +91,7 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
|
||||
"1",
|
||||
"0",
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
Reference in New Issue
Block a user