From 56f05aa534ed4d81ebc389424a290285da2c83f7 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Tue, 19 May 2026 14:44:07 +0800
Subject: [PATCH 01/12] kata-agent: Enhance SCSI block device matcher to reject
 partition uevents

Refactor ScsiBlockMatcher to only match whole-disk uevents. This
prevents the matcher from incorrectly matching partition uevents
(e.g., block/sdd/sdd9) which is critical for partitioned disks
where partition uevents appear alongside whole-disk uevents.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/device/scsi_device_handler.rs | 67 +++++++++++++++++++--
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/agent/src/device/scsi_device_handler.rs b/src/agent/src/device/scsi_device_handler.rs
index 972fc7ee78..7a9e68acae 100644
--- a/src/agent/src/device/scsi_device_handler.rs
+++ b/src/agent/src/device/scsi_device_handler.rs
@@ -17,6 +17,9 @@ use std::sync::Arc;
 use tokio::sync::Mutex;
 use tracing::instrument;
 
+/// The path segment in the uevent devpath that separates the SCSI path and the block device name.
+const BLOCK_SEGMENT: &str = "/block/";
+
 #[derive(Debug)]
 pub struct ScsiDeviceHandler {}
 
@@ -53,20 +56,41 @@ pub async fn get_scsi_device_name(
 // SCSI host.
 #[derive(Debug)]
 pub struct ScsiBlockMatcher {
-    search: String,
+    /// Expected SCSI path suffix before `/block/`, e.g. `/0:0:2:0`
+    scsi_path_suffix: String,
 }
 
 impl ScsiBlockMatcher {
     pub fn new(scsi_addr: &str) -> ScsiBlockMatcher {
-        let search = format!(r"/0:0:{scsi_addr}/block/");
+        ScsiBlockMatcher {
+            scsi_path_suffix: format!("/0:0:{scsi_addr}"),
+        }
+    }
 
-        ScsiBlockMatcher { search }
+    fn split_block_devpath<'a>(&self, devpath: &'a str) -> Option<(&'a str, &'a str)> {
+        let idx = devpath.find(BLOCK_SEGMENT)?;
+        let prefix = &devpath[..idx];
+        let suffix = &devpath[idx + BLOCK_SEGMENT.len()..];
+        Some((prefix, suffix))
     }
 }
 
 impl UeventMatcher for ScsiBlockMatcher {
     fn is_match(&self, uev: &Uevent) -> bool {
-        uev.subsystem == BLOCK && uev.devpath.contains(&self.search) && !uev.devname.is_empty()
+        if uev.action != U_EVENT_ACTION_ADD {
+            return false;
+        }
+
+        if uev.subsystem != BLOCK || uev.devname.is_empty() {
+            return false;
+        }
+
+        let (prefix, suffix) = match self.split_block_devpath(&uev.devpath) {
+            Some(parts) => parts,
+            None => return false,
+        };
+
+        prefix.ends_with(&self.scsi_path_suffix) && !suffix.contains('/') && suffix == uev.devname
     }
 }
 
@@ -106,6 +130,23 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::linux_abi::U_EVENT_ACTION_ADD;
+
+    fn make_scsi_block_uevent(addr: &str, devname: &str, devpath_suffix: &str) -> Uevent {
+        let root_bus = create_pci_root_bus_path("00");
+
+        let mut uev = Uevent::default();
+        uev.action = U_EVENT_ACTION_ADD.to_string();
+        uev.subsystem = BLOCK.to_string();
+        uev.devname = devname.to_string();
+        uev.devpath = format!(
+            "{root_bus}/0000:00:00.0/virtio0/host0/target0:0:{target}/0:0:{addr}/block/{devpath_suffix}",
+            target = addr.split(':').next().unwrap_or("0"),
+            addr = addr,
+            devpath_suffix = devpath_suffix,
+        );
+        uev
+    }
 
     #[tokio::test]
     #[allow(clippy::redundant_clone)]
@@ -124,6 +165,7 @@ mod tests {
 
         let mut uev_b = uev_a.clone();
         let addr_b = "2:0";
+        uev_b.devname = "sdb".to_string();
         uev_b.devpath =
             format!("{root_bus}/0000:00:00.0/virtio0/host0/target0:0:2/0:0:{addr_b}/block/sdb");
         let matcher_b = ScsiBlockMatcher::new(addr_b);
@@ -133,4 +175,21 @@ mod tests {
         assert!(!matcher_b.is_match(&uev_a));
         assert!(!matcher_a.is_match(&uev_b));
     }
+
+    #[tokio::test]
+    async fn test_scsi_block_matcher_rejects_partitions() {
+        let uev_whole = make_scsi_block_uevent("0:0", "sda", "sda");
+        let uev_part = make_scsi_block_uevent("0:0", "sda1", "sda/sda1");
+
+        let matcher = ScsiBlockMatcher::new("0:0");
+
+        assert!(
+            matcher.is_match(&uev_whole),
+            "whole disk uevent should match"
+        );
+        assert!(
+            !matcher.is_match(&uev_part),
+            "partition uevent should not match"
+        );
+    }
 }

From 39c512bc3635abb7b365c408b11823f9474a804b Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 17:31:44 +0800
Subject: [PATCH 02/12] kata-agent: Enhance virtio block matcher to reject
 partition uevents

Enhance VirtioBlkPciMatcher to only match whole-disk uevents. This
prevents the matcher from incorrectly matching partition uevents
(e.g., /dev/vdaX) which is critical for partitioned disks where
partition uevents appear alongside whole-disk uevents.

This commit aims to eliminate such bad cases.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/device/block_device_handler.rs | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/agent/src/device/block_device_handler.rs b/src/agent/src/device/block_device_handler.rs
index 3d78c84201..5e22ac28d4 100644
--- a/src/agent/src/device/block_device_handler.rs
+++ b/src/agent/src/device/block_device_handler.rs
@@ -173,7 +173,8 @@ pub struct VirtioBlkPciMatcher {
 impl VirtioBlkPciMatcher {
     pub fn new(relpath: &str, root_complex: &str) -> VirtioBlkPciMatcher {
         let root_bus = create_pci_root_bus_path(root_complex);
-        let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/");
+        // `[^/]+$` ensures it only matches the whole-disk uevent (e.g. block/vdx)
+        let re = format!(r"^{root_bus}{relpath}/virtio[0-9]+/block/[^/]+$");
 
         VirtioBlkPciMatcher {
             rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
@@ -259,6 +260,17 @@ mod tests {
         assert!(matcher_b.is_match(&uev_b));
         assert!(!matcher_b.is_match(&uev_a));
         assert!(!matcher_a.is_match(&uev_b));
+
+        // Partition uevents must NOT match (only the whole-disk uevent should match)
+        let mut uev_part = uev_a.clone();
+        uev_part.devname = "vda1".to_string();
+        uev_part.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda1");
+        assert!(!matcher_a.is_match(&uev_part));
+
+        let mut uev_part91 = uev_a.clone();
+        uev_part91.devname = "vda91".to_string();
+        uev_part91.devpath = format!("{root_bus}{relpath_a}/virtio4/block/{devname}/vda91");
+        assert!(!matcher_a.is_match(&uev_part91));
     }
 
     #[cfg(target_arch = "s390x")]

From 7086caaddf78632e32296315ac7e03594feffa05 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 17:41:26 +0800
Subject: [PATCH 03/12] kata-agent: Remove unused mode field from
 MkdirDirective

The mode field of MkdirDirective was only kept alive by a dead_code
attribute and is never actually used, so remove it entirely.

This removes the mode field from the MkdirDirective structure and
drops the mode expectations from its relevant test cases.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/storage/multi_layer_erofs.rs | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs
index 2195bdfcd7..920141d5c7 100644
--- a/src/agent/src/storage/multi_layer_erofs.rs
+++ b/src/agent/src/storage/multi_layer_erofs.rs
@@ -59,11 +59,9 @@ pub struct MultiLayerErofsResult {
     pub temp_mount_points: Vec<String>,
 }
 
-#[allow(dead_code)]
 #[derive(Debug)]
 struct MkdirDirective {
     raw_path: String,
-    mode: Option<String>,
 }
 
 #[async_trait::async_trait]
@@ -407,7 +405,6 @@ fn parse_mkdir_directive(spec: &str) -> Result<MkdirDirective> {
 
     Ok(MkdirDirective {
         raw_path: raw_path.to_string(),
-        mode: parts.get(1).map(|s| s.to_string()),
     })
 }
 
@@ -604,21 +601,19 @@ mod tests {
     // --- parse_mkdir_directive ---
 
     #[rstest]
-    #[case("some/path", true, "some/path", None)]
-    #[case("some/path:0755", true, "some/path", Some("0755"))]
-    #[case("path:mode:extra", true, "path", Some("mode:extra"))]
-    #[case("", false, "", None)]
+    #[case("some/path", true, "some/path")]
+    #[case("some/path:0755", true, "some/path")]
+    #[case("path:mode:extra", true, "path")]
+    #[case("", false, "")]
     fn test_parse_mkdir_directive(
         #[case] spec: &str,
         #[case] should_pass: bool,
         #[case] expected_path: &str,
-        #[case] expected_mode: Option<&str>,
     ) {
         let result = parse_mkdir_directive(spec);
         if should_pass {
             let d = result.expect("expected Ok");
             assert_eq!(d.raw_path, expected_path);
-            assert_eq!(d.mode.as_deref(), expected_mode);
         } else {
             assert!(result.is_err(), "expected Err for spec {:?}", spec);
         }

From 148810312d35ea5ee85334847686a288d9c50c7c Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 17:56:44 +0800
Subject: [PATCH 04/12] runtime-rs: Refactor VMDK writer and erofs rootfs
 handling logic

Restructure the erofs rootfs handler to support multi-layer GPT+VMDK
mode where multiple EROFS layers are wrapped into a single virtual
disk with a GPT partition table.

Extract VmdkDescriptorWriter as a reusable struct for atomic VMDK
descriptor generation. Change erofs_storage from Option<Storage> to
Vec<Storage> to hold per-layer metadata, and add GPT metadata path
tracking for proper cleanup with path-traversal guards.

Replace MAX_VIRTIO_BLK_DEVICES (10) with MAX_ROOTFS_LAYER_DEVICES (129,
i.e. 128 EROFS layers + 1 rw layer) to accommodate GPT disks carrying
many partitions. Pre-extract mkdir directives from overlay mounts before
the main loop so they are available regardless of mount ordering.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 .../resource/src/rootfs/erofs_rootfs.rs       | 313 +++++++++++-------
 1 file changed, 192 insertions(+), 121 deletions(-)

diff --git a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
index 5c1093a171..0ec198544e 100644
--- a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
+++ b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
@@ -39,14 +39,11 @@ pub(crate) const EROFS_ROOTFS_TYPE: &str = "erofs";
 pub(crate) const RW_LAYER_ROOTFS_TYPE: &str = "ext4";
 /// VMDK file extension for merged EROFS image
 const EROFS_MERGED_VMDK: &str = "merged_fs.vmdk";
-/// Maximum number of virtio-blk devices allowed for multi-layer EROFS rootfs.
-///
-/// This defensive limit prevents exhausting PCI slot resources, especially on
-/// lightweight VMMs (Dragonball, Cloud Hypervisor) where the PCIe root bus has
-/// only 32 slots (PCIE_ROOT_BUS_SLOTS_CAPACITY). For QEMU with PCI bridges
-/// (30 slots/bridge), this limit is conservative but still applies as a uniform
-/// safeguard across all hypervisor backends.
-const MAX_VIRTIO_BLK_DEVICES: usize = 10;
+
+/// Maximum number of rootfs layer devices (erofs + rw layer) allowed in multi-layer EROFS mode.
+/// This is a pre-flight sanity check before VMDK merging, to prevent excessive block devices
+/// when many layers are used without fsmerge.
+const MAX_ROOTFS_LAYER_DEVICES: usize = 129; // 128 EROFS layers + 1 rw layer (129 total)
 /// Maximum sectors per 2GB extent (2GB / 512 bytes per sector)
 const MAX_2GB_EXTENT_SECTORS: u64 = 0x8000_0000 >> 9;
 /// Sectors per track for VMDK geometry
@@ -60,12 +57,25 @@ const VMDK_ADAPTER_TYPE: &str = "ide";
 /// VMDK hardware version
 const VMDK_HW_VERSION: &str = "4";
 /// Default shared directory for guest rootfs VMDK files (for multi-layer EROFS)
-const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
+pub(crate) const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
 /// Template for mkdir option in overlay mount (X-containerd.mkdir.path)
 const X_CONTAINERD_MKDIR_PATH: &str = "X-containerd.mkdir.path=";
 /// Template for mkdir option passed to guest agent (X-kata.mkdir.path)
 const X_KATA_MKDIR_PATH: &str = "X-kata.mkdir.path=";
 
+/// Create the per-container directory under the shared filesystem root.
+pub(crate) fn ensure_container_dir(sid: &str, cid: &str) -> Result<PathBuf> {
+    let dir = PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS))
+        .join(sid)
+        .join(cid);
+    fs::create_dir_all(&dir).context(format!(
+        "failed to create container directory: {}",
+        dir.display()
+    ))?;
+
+    Ok(dir)
+}
+
 /// Generate merged VMDK file from multiple EROFS devices
 ///
 /// Creates a VMDK descriptor that combines multiple EROFS images into a single
@@ -104,14 +114,7 @@ async fn generate_merged_erofs_vmdk(
     }
 
     // For multiple devices, create VMDK descriptor
-    let sandbox_dir =
-        PathBuf::from(kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS)).join(sid);
-    let container_dir = sandbox_dir.join(cid);
-    fs::create_dir_all(&container_dir).context(format!(
-        "failed to create container directory: {}",
-        container_dir.display()
-    ))?;
-
+    let container_dir = ensure_container_dir(sid, cid)?;
     let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
 
     info!(
@@ -129,6 +132,105 @@ async fn generate_merged_erofs_vmdk(
     Ok((vmdk_path.display().to_string(), BlockDeviceFormat::Vmdk))
 }
 
+/// Helper struct for writing VMDK descriptor files atomically.
+///
+/// Encapsulates the common VMDK descriptor format: header, extent descriptions,
+/// DDB footer, and atomic write (temp file + rename). Used by both fsmerge mode
+/// (`create_vmdk_descriptor`) and GPT mode (`create_gpt_vmdk_descriptor`).
+struct VmdkDescriptorWriter {
+    writer: BufWriter<fs::File>,
+    temp_path: PathBuf,
+    final_path: PathBuf,
+}
+
+impl VmdkDescriptorWriter {
+    fn new(vmdk_path: &Path) -> Result<Self> {
+        let temp_path = vmdk_path.with_extension("vmdk.tmp");
+        if temp_path.components().any(|c| c == Component::ParentDir) {
+            return Err(anyhow!("Invalid input: {}", temp_path.display()));
+        }
+        let file = fs::File::create(&temp_path).context(format!(
+            "failed to create temp VMDK file: {}",
+            temp_path.display()
+        ))?;
+        let mut writer = BufWriter::new(file);
+
+        writeln!(writer, "# Disk DescriptorFile")?;
+        writeln!(writer, "version=1")?;
+        writeln!(writer, "CID=fffffffe")?;
+        writeln!(writer, "parentCID=ffffffff")?;
+        writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
+        writeln!(writer)?;
+        writeln!(writer, "# Extent description")?;
+
+        Ok(Self {
+            writer,
+            temp_path,
+            final_path: vmdk_path.to_path_buf(),
+        })
+    }
+
+    /// Append one `RW <sectors> FLAT "<path>" <file_offset>` extent line (no 2GB chunking).
+    fn write_extent(&mut self, path: &str, sectors: u64, file_offset: u64) -> Result<()> {
+        writeln!(
+            self.writer,
+            "RW {} FLAT \"{}\" {}",
+            sectors, path, file_offset
+        )?;
+        Ok(())
+    }
+
+    /// Append extent lines for `path`, each covering at most `MAX_2GB_EXTENT_SECTORS` sectors.
+    fn write_extent_chunked(&mut self, path: &str, total_sectors: u64) -> Result<()> {
+        let mut remaining = total_sectors;
+        let mut file_offset: u64 = 0;
+        while remaining > 0 {
+            let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
+            self.write_extent(path, chunk, file_offset)?;
+            file_offset += chunk;
+            remaining -= chunk;
+        }
+        Ok(())
+    }
+
+    /// Write the DDB footer, flush, then rename the temp file onto the final path.
+    fn finalize(mut self, total_sectors: u64) -> Result<()> {
+        writeln!(self.writer)?;
+
+        let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
+
+        writeln!(self.writer, "# The Disk Data Base")?;
+        writeln!(self.writer, "#DDB")?;
+        writeln!(self.writer)?;
+        writeln!(
+            self.writer,
+            "ddb.virtualHWVersion = \"{}\"",
+            VMDK_HW_VERSION
+        )?;
+        writeln!(self.writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
+        writeln!(self.writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
+        writeln!(
+            self.writer,
+            "ddb.geometry.sectors = \"{}\"",
+            SECTORS_PER_TRACK
+        )?;
+        writeln!(self.writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
+
+        self.writer
+            .flush()
+            .context("failed to flush VMDK descriptor")?;
+        drop(self.writer);
+
+        fs::rename(&self.temp_path, &self.final_path).context(format!(
+            "failed to rename temp VMDK {} -> {}",
+            self.temp_path.display(),
+            self.final_path.display()
+        ))?;
+
+        Ok(())
+    }
+}
+
 /// Create VMDK descriptor for multiple EROFS extents (flatten device)
 ///
 /// Generates a VMDK descriptor file (twoGbMaxExtentFlat format) that references
@@ -141,7 +243,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
         ));
     }
 
-    // collect extent information without writing anything.
     struct ExtentInfo {
         path: String,
         total_sectors: u64,
@@ -160,9 +261,6 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
             continue;
         }
 
-        // round up to whole sectors to avoid losing tail bytes on non-aligned files.
-        // VMDK extents are measured in 512-byte sectors; a file that is not sector-aligned
-        // still needs the last partial sector to be addressable by the VM.
         let sectors = file_size.div_ceil(512);
 
         if file_size % 512 != 0 {
@@ -197,43 +295,9 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
         ));
     }
 
-    // write descriptor to a temp file, then atomically rename.
-    let tmp_path = vmdk_path.with_extension("vmdk.tmp");
-    // Prevent path traversal attacks by rejecting paths containing '..'.
-    if tmp_path.components().any(|c| c == Component::ParentDir) {
-        return Err(anyhow!("Invalid input: {}", tmp_path.display()));
-    }
-    let file = fs::File::create(&tmp_path).context(format!(
-        "failed to create temp VMDK file: {}",
-        tmp_path.display()
-    ))?;
-    let mut writer = BufWriter::new(file);
-
-    // Header
-    writeln!(writer, "# Disk DescriptorFile")?;
-    writeln!(writer, "version=1")?;
-    writeln!(writer, "CID=fffffffe")?;
-    writeln!(writer, "parentCID=ffffffff")?;
-    writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
-    writeln!(writer)?;
-
-    // Extent descriptions
-    writeln!(writer, "# Extent description")?;
+    let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
     for extent in &extents {
-        let mut remaining = extent.total_sectors;
-        let mut file_offset: u64 = 0;
-
-        while remaining > 0 {
-            let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
-            writeln!(
-                writer,
-                "RW {} FLAT \"{}\" {}",
-                chunk, extent.path, file_offset
-            )?;
-            file_offset += chunk;
-            remaining -= chunk;
-        }
-
+        vmdk.write_extent_chunked(&extent.path, extent.total_sectors)?;
         info!(
             sl!(),
             "VMDK extent: {} ({} sectors, {} extent chunk(s))",
@@ -242,40 +306,15 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
             extent.total_sectors.div_ceil(MAX_2GB_EXTENT_SECTORS)
         );
     }
-    writeln!(writer)?;
 
-    // Disk Data Base (DDB)
-    // Geometry: cylinders = ceil(total_sectors / (sectors_per_track * heads))
-    let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
-
-    writeln!(writer, "# The Disk Data Base")?;
-    writeln!(writer, "#DDB")?;
-    writeln!(writer)?;
-    writeln!(writer, "ddb.virtualHWVersion = \"{}\"", VMDK_HW_VERSION)?;
-    writeln!(writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
-    writeln!(writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
-    writeln!(writer, "ddb.geometry.sectors = \"{}\"", SECTORS_PER_TRACK)?;
-    writeln!(writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
-
-    // Flush the BufWriter to ensure all data is written before rename.
-    writer.flush().context("failed to flush VMDK descriptor")?;
-    // Explicitly drop to close the file handle before rename.
-    drop(writer);
-
-    // atomic rename: tmp -> final path.
-    fs::rename(&tmp_path, vmdk_path).context(format!(
-        "failed to rename temp VMDK {} -> {}",
-        tmp_path.display(),
-        vmdk_path.display()
-    ))?;
+    vmdk.finalize(total_sectors)?;
 
     info!(
         sl!(),
-        "VMDK descriptor created: {} (total {} sectors, {} extents, {} cylinders)",
+        "VMDK descriptor created: {} (total {} sectors, {} extents)",
         vmdk_path.display(),
         total_sectors,
-        extents.len(),
-        cylinders
+        extents.len()
     );
 
     Ok(())
@@ -338,10 +377,16 @@ fn extract_block_device_info(
 pub(crate) struct ErofsMultiLayerRootfs {
     guest_path: String,
     device_ids: Vec<String>,
-    rwlayer_storage: Option<Storage>, // Writable layer storage (upper layer), typically ext4
-    erofs_storage: Option<Storage>,
-    /// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
+    // Writable layer storage (upper layer), typically ext4
+    rwlayer_storage: Option<Storage>,
+    // Read-only EROFS layer storages (lower layers), one per partition in GPT mode
+    erofs_storages: Vec<Storage>,
+    // Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
     vmdk_path: Option<PathBuf>,
+    // Paths to generated GPT metadata files (head, tail, padding) for cleanup
+    gpt_metadata_paths: Vec<PathBuf>,
+    // Container-scoped runtime directory that may only contain generated helper artifacts.
+    generated_artifacts_dir: PathBuf,
 }
 
 impl ErofsMultiLayerRootfs {
@@ -360,8 +405,9 @@ impl ErofsMultiLayerRootfs {
 
         let mut device_ids = Vec::new();
         let mut rwlayer_storage: Option<Storage> = None;
-        let mut erofs_storage: Option<Storage> = None;
+        let mut erofs_storages: Vec<Storage> = Vec::new();
         let mut vmdk_path: Option<PathBuf> = None;
+        let gpt_metadata_paths: Vec<PathBuf> = Vec::new();
 
         // Directories to create (X-containerd.mkdir.path)
         let mut mkdir_dirs: Vec<String> = Vec::new();
@@ -374,14 +420,33 @@ impl ErofsMultiLayerRootfs {
             .iter()
             .filter(|m| matches!(m.fs_type.as_str(), RW_LAYER_ROOTFS_TYPE | EROFS_ROOTFS_TYPE))
             .count();
-        if expected_device_count > MAX_VIRTIO_BLK_DEVICES {
+
+        // TODO(Alex Lyn): fsmerge mode with single erofs mount and multiple device= options
+        // may require multiple block devices if containerd does not merge layers into one file.
+        // This is a fallback or default mode if fsmerge is not enabled.
+        if expected_device_count > MAX_ROOTFS_LAYER_DEVICES {
             return Err(anyhow!(
                 "exceeded maximum block devices for multi-layer EROFS: {} > {}",
                 expected_device_count,
-                MAX_VIRTIO_BLK_DEVICES
+                MAX_ROOTFS_LAYER_DEVICES
             ));
         }
 
+        // Pre-extract mkdir directives from overlay mounts before the main loop, so they are
+        // available regardless of mount ordering (case-insensitive, like the overlay arm below).
+        for mount in rootfs_mounts {
+            let is_overlay = ["overlay", "format/overlay", "format/mkdir/overlay"]
+                .iter()
+                .any(|t| mount.fs_type.eq_ignore_ascii_case(t));
+            if is_overlay {
+                for opt in &mount.options {
+                    if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
+                        mkdir_dirs.push(mkdir_spec.to_string());
+                    }
+                }
+            }
+        }
+
         // Process each mount in rootfs_mounts to set up devices and storages
         for mount in rootfs_mounts {
             match mount.fs_type.as_str() {
@@ -407,8 +472,6 @@ impl ErofsMultiLayerRootfs {
                     .await
                     .context("failed to attach rw block device")?;
 
-                    // let (device_id, guest_path, blk_driver) =
-                    //     extract_block_device_info(&device_info, &block_driver)?;
                     let (mut rwlayer, device_id) =
                         extract_block_device_info(&device_info, false)
                             .context("failed to get block device for rw layer")?;
@@ -491,7 +554,8 @@ impl ErofsMultiLayerRootfs {
                     .await
                     .context("failed to attach erofs block device")?;
 
-                    let (mut rolayer, device_id) = extract_block_device_info(&device_info, true)?;
+                    let (mut rolayer, device_id) =
+                        extract_block_device_info(&device_info, true)?;
                     info!(
                         sl!(),
                         "erofs device attached - device_id: {} guest_path: {}",
@@ -507,7 +571,9 @@ impl ErofsMultiLayerRootfs {
                             // 1. "loop" - not needed in VM, device is already /dev/vdX
                             // 2. "device=" prefix - used for VMDK generation only, not for mount
                             // 3. "X-kata." prefix - metadata markers for kata internals
-                            *o != "loop" && !o.starts_with("device=") && !o.starts_with("X-kata.")
+                            *o != "loop"
+                                && !o.starts_with("device=")
+                                && !o.starts_with("X-kata.")
                         })
                         .cloned()
                         .collect();
@@ -525,7 +591,7 @@ impl ErofsMultiLayerRootfs {
                     rolayer.mount_point = container_path.clone();
                     rolayer.options = options;
 
-                    erofs_storage = Some(rolayer);
+                    erofs_storages.push(rolayer);
                     device_ids.push(device_id);
                 }
                 fmt if fmt.eq_ignore_ascii_case("overlay")
@@ -533,18 +599,11 @@ impl ErofsMultiLayerRootfs {
                     || fmt.eq_ignore_ascii_case("format/mkdir/overlay") =>
                 {
                     // Mount[2]: overlay to combine rwlayer (upper) + erofs (lower)
+                    // mkdir directives already extracted before the main loop
                     info!(
                         sl!(),
-                        "multi-layer erofs: parsing overlay mount, options: {:?}", mount.options
+                        "multi-layer erofs: overlay mount (mkdir directives pre-extracted)"
                     );
-
-                    // Parse mkdir options (X-containerd.mkdir.path)
-                    for opt in &mount.options {
-                        if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
-                            // Keep the full spec (path:mode or path:mode:uid:gid) for guest agent
-                            mkdir_dirs.push(mkdir_spec.to_string());
-                        }
-                    }
                 }
                 _ => {
                     info!(
@@ -572,8 +631,14 @@ impl ErofsMultiLayerRootfs {
             guest_path: container_path,
             device_ids,
             rwlayer_storage,
-            erofs_storage,
+            erofs_storages,
             vmdk_path,
+            gpt_metadata_paths,
+            generated_artifacts_dir: PathBuf::from(
+                kata_types::build_path(DEFAULT_KATA_GUEST_ROOT_SHARED_FS),
+            )
+            .join(sid)
+            .join(cid),
         })
     }
 }
@@ -589,16 +654,18 @@ impl Rootfs for ErofsMultiLayerRootfs {
     }
 
     async fn get_storage(&self) -> Option<Vec<Storage>> {
-        // Return all storages for multi-layer EROFS (rw layer + erofs layer) to guest agent.
-        // Guest agent needs both to create overlay mount
+        // Return all storages for multi-layer EROFS (rw layer + erofs layers) to guest agent.
+        // Guest agent needs all of them to create overlay mount.
+        // In GPT mode, each partition has its own storage entry.
         let mut storages = Vec::new();
 
         if let Some(rwlayer) = self.rwlayer_storage.clone() {
             storages.push(rwlayer);
         }
 
-        if let Some(erofs) = self.erofs_storage.clone() {
-            storages.push(erofs);
+        // Add all EROFS layer storages (single storage in fsmerge mode, multiple in GPT mode)
+        for erofs in &self.erofs_storages {
+            storages.push(erofs.clone());
         }
 
         if storages.is_empty() {
@@ -613,23 +680,27 @@ impl Rootfs for ErofsMultiLayerRootfs {
     }
 
     async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()> {
+        // Helper function to safely remove a file if it exists and is within the specified directory.
+        let safely_remove_file = |path: &Path, dir: &Path| -> Result<()> {
+            if path.starts_with(dir) && path.exists() {
+                fs::remove_file(path).context(format!("failed to remove file: {}", path.display()))?;
+            }
+            Ok(())
+        };
+
         let mut dm = device_manager.write().await;
         for device_id in &self.device_ids {
             dm.try_remove_device(device_id).await?;
         }
 
-        // Clean up generated VMDK descriptor file if it exists (only for multi-device case)
+        // Clean up generated VMDK descriptor file if it exists.
         if let Some(ref vmdk) = self.vmdk_path {
-            if vmdk.exists() {
-                if let Err(e) = fs::remove_file(vmdk) {
-                    warn!(
-                        sl!(),
-                        "failed to remove VMDK descriptor {}: {}",
-                        vmdk.display(),
-                        e
-                    );
-                }
-            }
+            safely_remove_file(vmdk, &self.generated_artifacts_dir)?;
+        }
+
+        // Clean up GPT metadata files (head, tail, padding).
+        for metadata_path in &self.gpt_metadata_paths {
+            safely_remove_file(metadata_path, &self.generated_artifacts_dir)?;
         }
 
         Ok(())

From c3b06af4c7a9b8c17653078bbf72e109d57bbd11 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 18:51:28 +0800
Subject: [PATCH 05/12] kata-types: Add gpt_disk module for GPT metadata
 generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce gpt_disk.rs to compute GPT partition layouts and generate
metadata files for multi-layer EROFS rootfs. The module creates GPT
head metadata that is combined with EROFS layer images via VMDK
descriptors, presenting a single GPT-partitioned virtual disk to the
guest VM — each EROFS layer mapped to its own partition.

The layout engine calculates LBA positions for an arbitrary number of
EROFS layers, then writes a full protective-MBR + GPT image and extracts
the head (MBR + primary GPT table) segments as standalone files for
VMDK extent assembly.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 Cargo.lock                          |  47 ++-
 Cargo.toml                          |   1 +
 src/libs/kata-types/Cargo.toml      |   4 +-
 src/libs/kata-types/src/gpt_disk.rs | 463 ++++++++++++++++++++++++++++
 src/libs/kata-types/src/lib.rs      |   3 +
 5 files changed, 504 insertions(+), 14 deletions(-)
 create mode 100644 src/libs/kata-types/src/gpt_disk.rs

diff --git a/Cargo.lock b/Cargo.lock
index ad16dd5faf..6699601bd4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -139,7 +139,7 @@ version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -150,7 +150,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
  "once_cell_polyfill",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -1963,7 +1963,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2458,6 +2458,18 @@ dependencies = [
  "cfg-if 0.1.10",
 ]
 
+[[package]]
+name = "gpt"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3696fafb1ecdcc2ae3ce337de73e9202806068594b77d22fdf2f3573c5ec2219"
+dependencies = [
+ "bitflags 2.11.1",
+ "crc",
+ "simple-bytes",
+ "uuid 1.23.1",
+]
+
 [[package]]
 name = "h2"
 version = "0.3.27"
@@ -3163,7 +3175,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
 dependencies = [
  "hermit-abi 0.5.2",
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -3632,8 +3644,10 @@ dependencies = [
  "base64 0.13.1",
  "bitmask-enum",
  "byte-unit",
+ "crc",
  "flate2",
  "glob",
+ "gpt",
  "lazy_static",
  "nix 0.26.4",
  "num_cpus",
@@ -3641,6 +3655,7 @@ dependencies = [
  "regex",
  "rstest 0.18.2",
  "safe-path 0.1.0",
+ "scopeguard",
  "serde",
  "serde-enum-str",
  "serde_json",
@@ -4390,7 +4405,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5295,7 +5310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52"
 dependencies = [
  "libc",
- "nix 0.23.2",
+ "nix 0.30.1",
 ]
 
 [[package]]
@@ -6516,7 +6531,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys 0.12.1",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -6617,7 +6632,7 @@ dependencies = [
  "security-framework",
  "security-framework-sys",
  "webpki-root-certs",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7182,6 +7197,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
 
+[[package]]
+name = "simple-bytes"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c11532d9d241904f095185f35dcdaf930b1427a94d5b01d7002d74ba19b44cc4"
+
 [[package]]
 name = "siphasher"
 version = "1.0.3"
@@ -7305,7 +7326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7546,7 +7567,7 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix 1.1.4",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7555,7 +7576,7 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -8259,7 +8280,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e"
 dependencies = [
  "memoffset 0.9.1",
  "tempfile",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -8842,7 +8863,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index a1ad09f9fb..49e5c96ebb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -197,6 +197,7 @@ tracing-subscriber = "0.3.20"
 ttrpc = "0.8.4"
 url = "2.5.4"
 which = "4.3.0"
+gpt = "4.1.0"
 
 # Per-package release profile overrides for kata-deploy. The kata-deploy
 # binary runs once at pod start and then idles waiting for SIGTERM, so we
diff --git a/src/libs/kata-types/Cargo.toml b/src/libs/kata-types/Cargo.toml
index 3f2c682558..e6202e6486 100644
--- a/src/libs/kata-types/Cargo.toml
+++ b/src/libs/kata-types/Cargo.toml
@@ -31,7 +31,9 @@ sha2 = "0.10.8"
 flate2 = "1.1"
 nix = "0.26.4"
 oci-spec = { version = "0.8.1", features = ["runtime"] }
-
+gpt = "4.1.0"
+scopeguard = "1.0.0"
+crc = "3.4.0"
 safe-path = { path = "../safe-path", optional = true }
 
 [target.'cfg(target_os = "macos")'.dependencies]
diff --git a/src/libs/kata-types/src/gpt_disk.rs b/src/libs/kata-types/src/gpt_disk.rs
new file mode 100644
index 0000000000..50f2c5e578
--- /dev/null
+++ b/src/libs/kata-types/src/gpt_disk.rs
@@ -0,0 +1,463 @@
+// Copyright (c) 2026 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// GPT (GUID Partition Table) disk metadata generation for EROFS multi-layer rootfs.
+//
+// This module generates a GPT metadata file (gpt_meta_head.img) that is used
+// in conjunction with VMDK descriptors to present multiple EROFS layers as a
+// single virtual disk with multiple GPT partitions to the guest VM.
+// Backup GPT structures are omitted — the virtual disk is ephemeral and
+// read-only, so backup recovery serves no purpose.
+//
+// Key features:
+// - Only includes read-only EROFS layers in GPT partitions (rw layer handled separately)
+// - Preserves the original order of layers from rootfs_mounts
+// - Generates minimal GPT metadata without copying layer data
+// - Supports 1MiB alignment for partitions
+// - Creates VMDK-compatible descriptor with head/layer/pad extents
+
+use anyhow::{anyhow, Context, Result};
+use crc::Crc;
+use gpt::{disk::LogicalBlockSize, mbr::ProtectiveMBR, partition_types, GptConfig};
+use scopeguard;
+use std::convert::TryFrom;
+use std::fs;
+use std::io::{Read, Seek, SeekFrom, Write};
+use std::path::{Path, PathBuf};
+
+use crate::sl;
+
+/// Logical sector size in bytes. Defined locally because the gpt crate's
+/// DEFAULT_SECTOR_SIZE is a LogicalBlockSize enum, not a u64.
+const SECTOR_SIZE: u64 = 512;
+/// 1 MiB alignment start
+const FIRST_PARTITION_LBA: u64 = 2048;
+/// 1 MiB alignment
+const ALIGNMENT_LBA: u64 = 2048;
+/// bytes per GPT partition entry (UEFI standard)
+const GPT_ENTRY_SIZE: u64 = 128;
+/// standard GPT partition entry count
+const MAX_GPT_PARTITIONS: usize = 128;
+/// 32 sectors for partition entries (128 entries * 128 bytes each / 512 bytes per sector)
+const ENTRIES_SECTORS: u64 = (MAX_GPT_PARTITIONS as u64 * GPT_ENTRY_SIZE) / SECTOR_SIZE;
+/// GPT header size in bytes (UEFI specification)
+const GPT_HEADER_SIZE: usize = 92;
+/// Offset (in bytes) of the GPT primary header within the head file (LBA 1)
+const GPT_HEADER_FILE_OFFSET: u64 = SECTOR_SIZE;
+/// CRC-32/ISO-HDLC — the same algorithm the `gpt` crate uses internally.
+const CRC_32: Crc<u32> = Crc::<u32>::new(&crc::CRC_32_ISO_HDLC);
+
+/// GPT head metadata file name
+const GPT_META_HEAD_IMG: &str = "gpt_meta_head.img";
+/// Temporary full GPT image used to synthesize head metadata
+const GPT_META_FULL_IMG: &str = "gpt_meta_full.img";
+
+/// Represents a read-only EROFS layer to be placed in a GPT partition
+#[derive(Debug, Clone)]
+pub struct ErofsLayer {
+    /// Path to the EROFS image file
+    pub path: String,
+    /// Size in sectors (ceiling division, sector = 512 bytes)
+    pub size_sectors: u64,
+    /// Snapshot ID extracted from path (for naming)
+    pub snapshot_id: String,
+}
+
+/// GPT partition layout information for a single layer
+#[derive(Debug, Clone)]
+pub struct PartitionLayout {
+    /// Layer information
+    pub layer: ErofsLayer,
+    /// Partition number (1-indexed)
+    pub partition_number: u32,
+    /// First LBA of the partition
+    pub start_lba: u64,
+    /// Last LBA of the partition
+    pub end_lba: u64,
+    /// Partition name
+    pub name: String,
+}
+
+/// Complete GPT disk layout calculation result
+#[derive(Debug, Clone)]
+pub struct GptDiskLayout {
+    /// All partition layouts in order
+    pub partitions: Vec<PartitionLayout>,
+    /// Total sectors in the virtual disk
+    pub total_sectors: u64,
+    /// Logical block size in bytes
+    pub lb_size: u64,
+}
+
+/// Result of GPT metadata file generation
+#[derive(Debug)]
+pub struct GptMetadataFiles {
+    /// Path to generated gpt_meta_head.img
+    pub head_path: PathBuf,
+    /// Size of head file in sectors
+    pub head_sectors: u64,
+    /// Paths to generated padding files (between partitions)
+    pub pad_paths: Vec<PathBuf>,
+}
+
+/// Extract snapshot ID from a source path
+///
+/// Examples:
+///   ".../snapshots/35/layer.erofs" ---> "35"
+pub fn extract_snapshot_id(source: &str) -> String {
+    Path::new(source)
+        .parent()
+        .and_then(|p| p.file_name())
+        .map(|id| id.to_string_lossy().to_string())
+        .unwrap_or_else(|| "unknown".to_string())
+}
+
+/// Get file size in bytes
+pub fn get_erofs_layer_size(path: &str) -> Result<u64> {
+    let metadata = fs::metadata(path).context(format!("failed to stat EROFS file: {}", path))?;
+    Ok(metadata.len())
+}
+
+/// Align LBA up to the specified alignment
+fn align_up(lba: u64, alignment: u64) -> u64 {
+    if lba.is_multiple_of(alignment) {
+        lba
+    } else {
+        ((lba / alignment) + 1) * alignment
+    }
+}
+
+/// Calculate GPT disk layout from EROFS layers
+///
+/// This function computes the LBA positions for all partitions without
+/// modifying any files. It follows the layout:
+/// - LBA 0: Protective MBR
+/// - LBA 1: Primary GPT Header
+/// - LBA 2-33: Primary Partition Entry Array
+/// - LBA 34-2047: Reserved/padding
+/// - LBA 2048+: Partitions (1MiB aligned)
+/// - End: Backup Partition Entry Array + Backup GPT Header
+pub fn calculate_gpt_layout(layers: &[ErofsLayer]) -> Result<GptDiskLayout> {
+    if layers.is_empty() {
+        return Err(anyhow!("no EROFS layers provided for GPT layout"));
+    }
+
+    // TODO: Support layer counts above the GPT partition limit, e.g. by splitting the layers
+    // across multiple GPT disks. For now the limit is enforced and an error is returned.
+    if layers.len() > MAX_GPT_PARTITIONS {
+        return Err(anyhow!(
+            "The layers for GPT: {} exceeds maximum {} partitions \
+             (ENTRIES_SECTORS is sized for {} entries)",
+            layers.len(),
+            MAX_GPT_PARTITIONS,
+            MAX_GPT_PARTITIONS,
+        ));
+    }
+
+    // Validate that all layers have non-zero size
+    for (idx, layer) in layers.iter().enumerate() {
+        if layer.size_sectors == 0 {
+            return Err(anyhow!(
+                "EROFS layer {} ({}) has size_sectors = 0, cannot generate GPT partition",
+                idx,
+                layer.path
+            ));
+        }
+    }
+
+    let lb_size = SECTOR_SIZE;
+    let first_usable_lba = FIRST_PARTITION_LBA;
+
+    // Calculate partition positions
+    let mut partitions = Vec::with_capacity(layers.len());
+    let mut current_lba = first_usable_lba;
+
+    for (idx, layer) in layers.iter().enumerate() {
+        // Align start LBA to 1MiB boundary
+        let start_lba = align_up(current_lba, ALIGNMENT_LBA);
+        let end_lba = start_lba + layer.size_sectors - 1;
+
+        // Generate partition name: erofs-{index}-s{snapshot_id}
+        let name = format!("erofs-{}-s{}", idx, layer.snapshot_id);
+        // Truncate to fit GPT name limit without slicing through a UTF-8 codepoint.
+        let name = match name.char_indices().nth(36) {
+            Some((truncate_at, _)) => name[..truncate_at].to_string(),
+            None => name,
+        };
+
+        partitions.push(PartitionLayout {
+            layer: layer.clone(),
+            partition_number: (idx + 1) as u32,
+            start_lba,
+            end_lba,
+            name,
+        });
+
+        // Next partition starts after this one
+        current_lba = end_lba + 1;
+    }
+
+    // Calculate backup GPT position
+    // Backup entries are placed after the last partition, aligned
+    let backup_entries_lba = align_up(current_lba, ALIGNMENT_LBA);
+    let backup_header_lba = backup_entries_lba + ENTRIES_SECTORS;
+    let total_sectors = backup_header_lba + 1;
+
+    let last_usable_lba = backup_entries_lba - 1;
+
+    // Validate that all partitions fit in usable area
+    for (idx, part) in partitions.iter().enumerate() {
+        if part.end_lba > last_usable_lba {
+            return Err(anyhow!(
+                "partition {} (end_lba={}) exceeds last usable LBA ({})",
+                idx,
+                part.end_lba,
+                last_usable_lba
+            ));
+        }
+    }
+
+    Ok(GptDiskLayout {
+        partitions,
+        total_sectors,
+        lb_size,
+    })
+}
+
+/// Generate GPT head metadata and return layout information
+///
+/// Main entry point for GPT metadata generation: builds a temporary full GPT
+/// image (the gpt crate needs one to produce valid primary structures),
+/// extracts the head region and patches the primary header to drop backup
+/// GPT references. The temporary image is removed on every exit path.
+///
+/// Output:
+/// - gpt_meta_head.img: Primary GPT structures (MBR + GPT header + partition entries + padding)
+#[allow(unused_variables)]
+pub fn generate_gpt_metadata(
+    sid: &str,
+    cid: &str,
+    erofs_layers: Vec<ErofsLayer>,
+    container_dir: &Path,
+) -> Result<(GptDiskLayout, GptMetadataFiles)> {
+    if erofs_layers.is_empty() {
+        return Err(anyhow!(
+            "no EROFS layers provided for GPT metadata generation"
+        ));
+    }
+
+    let mut layout = calculate_gpt_layout(&erofs_layers)?;
+    if layout.partitions.is_empty() {
+        return Err(anyhow!(
+            "no partitions in layout, cannot generate GPT metadata"
+        ));
+    }
+
+    let full_path = container_dir.join(GPT_META_FULL_IMG);
+    // Arm the cleanup guard before generation so a failure cannot leak a partial image.
+    let _cleanup = scopeguard::guard((), |_| {
+        let _ = fs::remove_file(&full_path);
+    });
+    generate_full_gpt_image(&layout, &full_path).context("failed to generate full GPT image")?;
+
+    // Extract head: LBA 0 to FIRST_PARTITION_LBA (2048 sectors = 1 MiB)
+    let lb_size = layout.lb_size;
+    let head_sectors = FIRST_PARTITION_LBA;
+    let head_size = head_sectors * lb_size;
+    let head_path = container_dir.join(GPT_META_HEAD_IMG);
+    extract_file_range(&full_path, &head_path, 0, head_size)
+        .context("failed to extract GPT head metadata")?;
+
+    // Patch the primary GPT header so AlternateLBA / LastUsableLBA match the backup-less disk.
+    let last_partition_end = layout.partitions.last().unwrap().end_lba;
+    patch_primary_gpt_header(&head_path, last_partition_end)
+        .context("failed to patch primary GPT header")?;
+
+    // Adjust the layout to reflect the virtual disk size (no backup).
+    layout.total_sectors = last_partition_end + 1;
+
+    info!(
+        sl!(),
+        "Generated GPT head file: {} ({} sectors, {} bytes, virtual disk {} sectors)",
+        head_path.display(),
+        head_sectors,
+        head_size,
+        layout.total_sectors
+    );
+
+    let metadata_files = GptMetadataFiles {
+        head_path,
+        head_sectors,
+        pad_paths: Vec::new(),
+    };
+
+    Ok((layout, metadata_files))
+}
+
+fn generate_full_gpt_image(layout: &GptDiskLayout, output_path: &Path) -> Result<()> {
+    let lb_size = layout.lb_size;
+    let total_size = layout.total_sectors * lb_size;
+
+    let mut file = fs::OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(output_path)
+        .context(format!(
+            "failed to create full GPT image: {}",
+            output_path.display()
+        ))?;
+
+    file.set_len(total_size)
+        .context("failed to pre-allocate full GPT image")?;
+
+    let mbr =
+        ProtectiveMBR::with_lb_size(u32::try_from(layout.total_sectors - 1).unwrap_or(0xFFFF_FFFF));
+    mbr.overwrite_lba0(&mut file)
+        .context("failed to write Protective MBR")?;
+
+    let mut gdisk = GptConfig::new()
+        .writable(true)
+        .logical_block_size(LogicalBlockSize::Lb512)
+        .change_partition_count(true)
+        .create_from_device(file, None)
+        .context("failed to initialize GPT config")?;
+
+    for part_layout in &layout.partitions {
+        let part_size_bytes = (part_layout.end_lba - part_layout.start_lba + 1) * lb_size;
+        gdisk
+            .add_partition(
+                &part_layout.name,
+                part_size_bytes,
+                partition_types::LINUX_FS,
+                0,
+                Some(ALIGNMENT_LBA),
+            )
+            .context(format!("failed to add partition '{}'", part_layout.name))?;
+    }
+
+    let mut file = gdisk
+        .write()
+        .context("failed to write GPT partition table")?;
+    file.flush().context("failed to flush full GPT image")?;
+
+    Ok(())
+}
+
+/// Patch the primary GPT header in the extracted head file to remove
+/// backup GPT references.
+///
+/// Sets `AlternateLBA` to one sector beyond the virtual disk (so the kernel
+/// detects "no valid backup" and falls back to the primary) and
+/// `LastUsableLBA` to the end of the last partition, then recomputes the
+/// header CRC32.
+fn patch_primary_gpt_header(head_path: &Path, last_partition_end_lba: u64) -> Result<()> {
+    let mut file = fs::OpenOptions::new()
+        .read(true)
+        .write(true)
+        .open(head_path)
+        .context("failed to open head file for patching")?;
+
+    // Read the 92-byte GPT header starting at LBA 1.
+    file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
+    let mut header = [0u8; GPT_HEADER_SIZE];
+    file.read_exact(&mut header)?;
+
+    // AlternateLBA (offset 32, 8 bytes LE) — point beyond virtual disk
+    let alternate_lba = last_partition_end_lba + 1;
+    header[32..40].copy_from_slice(&alternate_lba.to_le_bytes());
+
+    // LastUsableLBA (offset 48, 8 bytes LE) — last partition end
+    header[48..56].copy_from_slice(&last_partition_end_lba.to_le_bytes());
+
+    // Zero HeaderCRC32 (offset 16, 4 bytes LE) before computing new CRC
+    header[16..20].copy_from_slice(&0u32.to_le_bytes());
+
+    let new_crc = {
+        let mut digest = CRC_32.digest();
+        digest.update(&header);
+        digest.finalize()
+    };
+    header[16..20].copy_from_slice(&new_crc.to_le_bytes());
+
+    // Write patched header back
+    file.seek(SeekFrom::Start(GPT_HEADER_FILE_OFFSET))?;
+    file.write_all(&header)?;
+    file.flush()?;
+
+    info!(
+        sl!(),
+        "Patched primary GPT header: AlternateLBA={}, LastUsableLBA={}, CRC32={:#010x}",
+        alternate_lba,
+        last_partition_end_lba,
+        new_crc
+    );
+
+    Ok(())
+}
+
+fn extract_file_range(src: &Path, dst: &Path, offset: u64, size: u64) -> Result<()> {
+    let mut src_file = fs::OpenOptions::new()
+        .read(true)
+        .open(src)
+        .context(format!("failed to open source file: {}", src.display()))?;
+    src_file
+        .seek(SeekFrom::Start(offset))
+        .context("failed to seek source file")?;
+
+    let mut dst_file = fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(dst)
+        .context(format!("failed to create output file: {}", dst.display()))?;
+
+    dst_file
+        .set_len(size)
+        .context("failed to pre-allocate output file")?;
+
+    let mut limited = src_file.take(size);
+    std::io::copy(&mut limited, &mut dst_file).context("failed to copy file range")?;
+    dst_file.flush().context("failed to flush output file")?;
+
+    Ok(())
+}
+
+/// Generate padding file content (all zeros)
+///
+/// Returns the size of the generated file in sectors.
+pub fn generate_padding_file(output_path: &Path, size_sectors: u64) -> Result<u64> {
+    let size_bytes = size_sectors * SECTOR_SIZE;
+
+    if size_bytes == 0 {
+        return Err(anyhow!("cannot create zero-size padding file"));
+    }
+
+    let mut file = fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(output_path)
+        .context(format!(
+            "failed to create padding file: {}",
+            output_path.display()
+        ))?;
+
+    // Pre-allocate with zeros
+    file.set_len(size_bytes)
+        .context("failed to pre-allocate padding file")?;
+    file.flush().context("failed to flush padding file")?;
+    drop(file);
+
+    info!(
+        sl!(),
+        "Generated padding file: {} ({} sectors, {} bytes)",
+        output_path.display(),
+        size_sectors,
+        size_bytes
+    );
+
+    Ok(size_sectors)
+}
diff --git a/src/libs/kata-types/src/lib.rs b/src/libs/kata-types/src/lib.rs
index 402b9ce54e..9049caf54d 100644
--- a/src/libs/kata-types/src/lib.rs
+++ b/src/libs/kata-types/src/lib.rs
@@ -54,6 +54,9 @@ pub mod rootless;
 /// machine type
 pub mod machine_type;
 
+/// GPT (GUID Partition Table) disk layout and metadata generation.
+pub mod gpt_disk;
+
 use std::path::Path;
 
 use crate::rootless::{is_rootless, rootless_dir};

From 0bd150e5f16b1f132020c56f968f3626cb2c4c72 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 19:07:24 +0800
Subject: [PATCH 06/12] runtime-rs: Integrate GPT+VMDK mode for multi-layer
 EROFS rootfs

When multiple EROFS layers are present, wrap them into a single
GPT-partitioned virtual disk delivered via one VMDK descriptor and a
single block device hotplug. This significantly reduces PCI bus slot
usage compared with the previous one-device-per-layer approach, which
exhausts virtio-blk slots for large layer counts.

The host detects multi-layer mounts, computes the GPT layout, generates
head metadata plus a VMDK descriptor referencing all EROFS images, and
hot-plugs the composite disk. Per-partition Storage entries are created
with X-kata.gpt-partitioned and X-kata.partition-number options so the
guest agent can resolve each layer to its partition device.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 .../resource/src/rootfs/erofs_rootfs.rs       | 448 ++++++++++++++----
 1 file changed, 365 insertions(+), 83 deletions(-)

diff --git a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
index 0ec198544e..36e03b2802 100644
--- a/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
+++ b/src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
@@ -24,6 +24,10 @@ use hypervisor::{
 use kata_types::device::{
     DRIVER_BLK_CCW_TYPE as KATA_CCW_DEV_TYPE, DRIVER_BLK_PCI_TYPE as KATA_BLK_DEV_TYPE,
 };
+use kata_types::gpt_disk::{
+    extract_snapshot_id, generate_gpt_metadata, generate_padding_file, get_erofs_layer_size,
+    ErofsLayer, GptDiskLayout, GptMetadataFiles,
+};
 use kata_types::mount::Mount;
 use oci_spec::runtime as oci;
 use std::fs;
@@ -320,6 +324,135 @@ fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()
     Ok(())
 }
 
+/// Generate GPT-partitioned VMDK and return layout information for per-partition storage creation
+///
+/// Returns: (vmdk_path, BlockDeviceFormat::Vmdk, GptDiskLayout, GptMetadataFiles)
+fn generate_gpt_vmdk_with_layout(
+    sid: &str,
+    cid: &str,
+    erofs_layers: Vec<ErofsLayer>,
+) -> Result<(String, BlockDeviceFormat, GptDiskLayout, GptMetadataFiles)> {
+    if erofs_layers.is_empty() {
+        return Err(anyhow!("no EROFS layers provided for GPT VMDK generation"));
+    }
+
+    // Validate all layer paths exist and are regular files
+    for layer in &erofs_layers {
+        let metadata = fs::metadata(&layer.path)
+            .context(format!("EROFS layer path not accessible: {}", layer.path))?;
+        if !metadata.is_file() {
+            return Err(anyhow!(
+                "EROFS layer path is not a regular file: {}",
+                layer.path
+            ));
+        }
+    }
+
+    // Create container directory
+    let container_dir = ensure_container_dir(sid, cid)?;
+    let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
+
+    info!(
+        sl!(),
+        "creating GPT-partitioned VMDK for {} EROFS layers: {}",
+        erofs_layers.len(),
+        vmdk_path.display()
+    );
+
+    // Generate GPT metadata files
+    let (layout, mut gpt_files) = generate_gpt_metadata(sid, cid, erofs_layers, &container_dir)
+        .context("failed to generate GPT metadata")?;
+
+    // Create VMDK descriptor with GPT layout and collect generated padding paths
+    let pad_paths = create_gpt_vmdk_descriptor(&vmdk_path, &layout, &gpt_files)
+        .context("failed to create GPT VMDK descriptor")?;
+    gpt_files.pad_paths = pad_paths;
+
+    Ok((
+        vmdk_path.display().to_string(),
+        BlockDeviceFormat::Vmdk,
+        layout,
+        gpt_files,
+    ))
+}
+
+/// Create VMDK descriptor for GPT-partitioned disk
+///
+/// Returns the list of generated padding file paths for cleanup tracking.
+fn create_gpt_vmdk_descriptor(
+    vmdk_path: &Path,
+    layout: &GptDiskLayout,
+    gpt_files: &GptMetadataFiles,
+) -> Result<Vec<PathBuf>> {
+    let mut vmdk = VmdkDescriptorWriter::new(vmdk_path)?;
+    let mut pad_paths: Vec<PathBuf> = Vec::new();
+
+    // 1. GPT head metadata
+    vmdk.write_extent(
+        &gpt_files.head_path.display().to_string(),
+        gpt_files.head_sectors,
+        0,
+    )?;
+    info!(
+        sl!(),
+        "VMDK extent: GPT head ({} sectors) at {}",
+        gpt_files.head_sectors,
+        gpt_files.head_path.display()
+    );
+
+    // 2. Layer extents with padding gaps
+    // head ends at LBA 2047, so first gap starts at LBA 2048.
+    let mut prev_end_lba = gpt_files.head_sectors - 1;
+
+    let metadata_dir = gpt_files.head_path.parent().ok_or_else(|| {
+        anyhow!(
+            "GPT head file has no parent directory: {}",
+            gpt_files.head_path.display()
+        )
+    })?;
+
+    for (idx, part) in layout.partitions.iter().enumerate() {
+        let gap_start_lba = prev_end_lba + 1;
+        if part.start_lba > gap_start_lba {
+            let gap_sectors = part.start_lba - gap_start_lba;
+            let pad_path = metadata_dir.join(format!("pad-{}.img", idx));
+
+            generate_padding_file(&pad_path, gap_sectors).context(format!(
+                "failed to generate padding file: {}",
+                pad_path.display()
+            ))?;
+
+            vmdk.write_extent(&pad_path.display().to_string(), gap_sectors, 0)?;
+            pad_paths.push(pad_path);
+        }
+
+        vmdk.write_extent_chunked(&part.layer.path, part.layer.size_sectors)?;
+        info!(
+            sl!(),
+            "VMDK extent: {} (partition {}, LBA {}-{}, {} sectors)",
+            part.layer.path,
+            part.partition_number,
+            part.start_lba,
+            part.end_lba,
+            part.layer.size_sectors
+        );
+
+        prev_end_lba = part.end_lba;
+    }
+
+    vmdk.finalize(layout.total_sectors)?;
+
+    info!(
+        sl!(),
+        "GPT VMDK descriptor created: {} (total {} sectors, {} partitions)",
+        vmdk_path.display(),
+        layout.total_sectors,
+        layout.partitions.len()
+    );
+
+    Ok(pad_paths)
+}
+
 fn extract_block_device_info(
     device_info: &DeviceType,
     read_only: bool,
@@ -383,7 +516,7 @@ pub(crate) struct ErofsMultiLayerRootfs {
     erofs_storages: Vec<Storage>,
     // Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
     vmdk_path: Option<PathBuf>,
-    // Paths to generated GPT metadata files (head, tail, padding) for cleanup
+    // Paths to generated GPT metadata files (head, padding) for cleanup
     gpt_metadata_paths: Vec<PathBuf>,
     // Container-scoped runtime directory that may only contain generated helper artifacts.
     generated_artifacts_dir: PathBuf,
@@ -407,7 +540,9 @@ impl ErofsMultiLayerRootfs {
         let mut rwlayer_storage: Option<Storage> = None;
         let mut erofs_storages: Vec<Storage> = Vec::new();
         let mut vmdk_path: Option<PathBuf> = None;
-        let gpt_metadata_paths: Vec<PathBuf> = Vec::new();
+        let mut gpt_metadata_paths: Vec<PathBuf> = Vec::new();
+        // Track whether GPT+VMDK erofs layers have already been processed in bulk.
+        let mut gpt_erofs_processed = false;
 
         // Directories to create (X-containerd.mkdir.path)
         let mut mkdir_dirs: Vec<String> = Vec::new();
@@ -504,95 +639,242 @@ impl ErofsMultiLayerRootfs {
                 }
                 fmt if fmt.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE) => {
                     // Mount[1]: erofs layers -> virtio-blk via VMDK /dev/vdX2
-                    info!(
-                        sl!(),
-                        "multi-layer erofs: adding erofs layers: {}", mount.source
-                    );
+                    //
+                    // Two modes are supported:
+                    // 1. fsmerge mode: Single erofs mount with `device=` options pointing to additional files.
+                    //    This is used when containerd has already merged layers into a single file.
+                    // 2. GPT+VMDK mode: Multiple independent erofs mounts (each mount is a separate layer file).
+                    //    This is used when containerd does NOT use fsmerge, and we need to create GPT partitions.
 
-                    // Collect all EROFS devices: source + `device=` options
-                    let mut erofs_devices = vec![mount.source.clone()];
-                    for opt in &mount.options {
-                        if let Some(device_path) = opt.strip_prefix("device=") {
-                            erofs_devices.push(device_path.to_string());
-                        }
+                    // In GPT mode, all erofs layers are processed in bulk on the first
+                    // encounter. Skip subsequent erofs mounts but continue iterating
+                    // so that later ext4 rw-layer and overlay mounts are still handled.
+                    if gpt_erofs_processed {
+                        info!(
+                            sl!(),
+                            "multi-layer erofs: skipping already-processed erofs mount: {}",
+                            mount.source
+                        );
+                        continue;
                     }
 
-                    info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
-
-                    // Generate merged VMDK file from all EROFS devices
-                    // Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
-                    let (erofs_path, erofs_format) =
-                        generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
-                            .await
-                            .context("failed to generate EROFS VMDK")?;
-
-                    // Track VMDK path for cleanup (only when VMDK is actually created)
-                    if erofs_format == BlockDeviceFormat::Vmdk {
-                        vmdk_path = Some(PathBuf::from(&erofs_path));
-                    }
-
-                    info!(
-                        sl!(),
-                        "EROFS block device config - path: {}, format: {:?}",
-                        erofs_path,
-                        erofs_format
-                    );
-
-                    let device_config = &mut BlockConfig {
-                        driver_option: block_driver.clone(),
-                        format: erofs_format, // Vmdk for multiple devices, Raw for single device
-                        path_on_host: erofs_path,
-                        is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
-                        blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
-                        ..Default::default()
-                    };
-
-                    let device_info = do_handle_device(
-                        device_manager,
-                        &DeviceConfig::BlockCfg(device_config.clone()),
-                    )
-                    .await
-                    .context("failed to attach erofs block device")?;
-
-                    let (mut rolayer, device_id) =
-                        extract_block_device_info(&device_info, true)?;
-                    info!(
-                        sl!(),
-                        "erofs device attached - device_id: {} guest_path: {}",
-                        device_id,
-                        &rolayer.source
-                    );
-
-                    let mut options: Vec<String> = mount
-                        .options
+                    // Collect all EROFS mounts once with their original indices.
+                    let erofs_mounts_indexed: Vec<(usize, &Mount)> = rootfs_mounts
                         .iter()
-                        .filter(|o| {
-                            // Filter out options that are not valid erofs mount parameters:
-                            // 1. "loop" - not needed in VM, device is already /dev/vdX
-                            // 2. "device=" prefix - used for VMDK generation only, not for mount
-                            // 3. "X-kata." prefix - metadata markers for kata internals
-                            *o != "loop"
-                                && !o.starts_with("device=")
-                                && !o.starts_with("X-kata.")
-                        })
-                        .cloned()
+                        .enumerate()
+                        .filter(|(_, m)| m.fs_type.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE))
                         .collect();
+                    let total_erofs_mounts = erofs_mounts_indexed.len();
 
-                    // Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
-                    options.push("X-kata.overlay-lower".to_string());
-                    options.push("X-kata.multi-layer=true".to_string());
+                    // GPT+VMDK mode: Multiple independent erofs layer files
+                    if total_erofs_mounts > 1 {
+                        info!(
+                            sl!(),
+                            "multi-layer erofs: using GPT+VMDK mode for {} independent layers",
+                            total_erofs_mounts
+                        );
 
-                    info!(
-                        sl!(),
-                        "erofs storage options filtered: {:?} -> {:?}", mount.options, options
-                    );
+                        let mut erofs_layers = Vec::new();
 
-                    rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
-                    rolayer.mount_point = container_path.clone();
-                    rolayer.options = options;
+                        for (_mount_idx, erofs_mount) in &erofs_mounts_indexed {
+                            let layer_path = erofs_mount.source.clone();
+                            let size_bytes = get_erofs_layer_size(&layer_path).context(format!(
+                                "gptdisk: failed to get size of EROFS layer: {}",
+                                layer_path
+                            ))?;
 
-                    erofs_storages.push(rolayer);
-                    device_ids.push(device_id);
+                            if size_bytes == 0 {
+                                warn!(
+                                    sl!(),
+                                    "gptdisk: EROFS layer {} is zero-length, skipping", layer_path
+                                );
+                                continue;
+                            }
+
+                            let size_sectors = size_bytes.div_ceil(512);
+                            let snapshot_id = extract_snapshot_id(&layer_path);
+
+                            erofs_layers.push(ErofsLayer {
+                                path: layer_path,
+                                size_sectors,
+                                snapshot_id,
+                            });
+                        }
+
+                        if erofs_layers.is_empty() {
+                            return Err(anyhow!(
+                                "gptdisk: no valid EROFS layers found for GPT VMDK"
+                            ));
+                        }
+
+                        // Generate GPT-partitioned VMDK and get layout information
+                        let (erofs_path, erofs_format, layout, gpt_files) =
+                            generate_gpt_vmdk_with_layout(sid, cid, erofs_layers)
+                                .context("gptdisk: failed to generate GPT VMDK")?;
+
+                        // Track VMDK path for cleanup
+                        vmdk_path = Some(PathBuf::from(&erofs_path));
+
+                        // Track GPT metadata files (head + padding) for cleanup
+                        gpt_metadata_paths.push(gpt_files.head_path.clone());
+                        gpt_metadata_paths.extend(gpt_files.pad_paths.iter().cloned());
+
+                        info!(
+                            sl!(),
+                            "GPT VMDK created - path: {}, format: {:?}, {} partitions",
+                            erofs_path,
+                            erofs_format,
+                            layout.partitions.len()
+                        );
+
+                        let device_config = &mut BlockConfig {
+                            driver_option: block_driver.clone(),
+                            format: erofs_format,
+                            path_on_host: erofs_path,
+                            is_readonly: true,
+                            blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
+                            ..Default::default()
+                        };
+
+                        let device_info = do_handle_device(
+                            device_manager,
+                            &DeviceConfig::BlockCfg(device_config.clone()),
+                        )
+                        .await
+                        .context("failed to attach GPT VMDK block device")?;
+
+                        let (base_device, device_id) =
+                            extract_block_device_info(&device_info, true)?;
+                        info!(
+                            sl!(),
+                            "GPT VMDK device attached - device_id: {} guest_path: {}",
+                            device_id,
+                            &base_device.source
+                        );
+
+                        device_ids.push(device_id);
+
+                        // Create a storage entry for each GPT partition.
+                        for (idx, part) in layout.partitions.iter().enumerate() {
+                            let mut rolayer = base_device.clone();
+                            let options: Vec<String> = vec![
+                                "X-kata.overlay-lower".to_string(),
+                                "X-kata.multi-layer=true".to_string(),
+                                "X-kata.gpt-partitioned=true".to_string(),
+                                format!("X-kata.partition-number={}", part.partition_number),
+                            ];
+
+                            rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
+                            rolayer.mount_point = container_path.clone();
+                            rolayer.options = options;
+                            rolayer.source = base_device.source.clone();
+
+                            info!(
+                                sl!(),
+                                "Created storage for GPT partition {} (partition number {}, LBA {}-{})",
+                                idx, part.partition_number, part.start_lba, part.end_lba
+                            );
+
+                            erofs_storages.push(rolayer);
+                        }
+
+                        // Mark GPT erofs as processed so subsequent erofs mounts
+                        // in the loop are skipped, while still allowing ext4 and
+                        // overlay mounts to be visited.
+                        gpt_erofs_processed = true;
+                    } else {
+                        // fsmerge mode: Single erofs mount with device= options
+                        info!(
+                            sl!(),
+                            "multi-layer erofs: using fsmerge mode for erofs layers: {}",
+                            mount.source
+                        );
+
+                        // Collect all EROFS devices: source + `device=` options
+                        let mut erofs_devices = vec![mount.source.clone()];
+                        for opt in &mount.options {
+                            if let Some(device_path) = opt.strip_prefix("device=") {
+                                erofs_devices.push(device_path.to_string());
+                            }
+                        }
+
+                        info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
+
+                        // Generate merged VMDK file from all EROFS devices
+                        // Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
+                        let (erofs_path, erofs_format) =
+                            generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
+                                .await
+                                .context("failed to generate EROFS VMDK")?;
+
+                        // Track VMDK path for cleanup (only when VMDK is actually created)
+                        if erofs_format == BlockDeviceFormat::Vmdk {
+                            vmdk_path = Some(PathBuf::from(&erofs_path));
+                        }
+
+                        info!(
+                            sl!(),
+                            "EROFS block device config - path: {}, format: {:?}",
+                            erofs_path,
+                            erofs_format
+                        );
+
+                        let device_config = &mut BlockConfig {
+                            driver_option: block_driver.clone(),
+                            format: erofs_format, // Vmdk for multiple devices, Raw for single device
+                            path_on_host: erofs_path,
+                            is_readonly: true, // EROFS layers are read-only, must set to avoid "resize" lock errors
+                            blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
+                            ..Default::default()
+                        };
+
+                        let device_info = do_handle_device(
+                            device_manager,
+                            &DeviceConfig::BlockCfg(device_config.clone()),
+                        )
+                        .await
+                        .context("failed to attach erofs block device")?;
+
+                        let (mut rolayer, device_id) =
+                            extract_block_device_info(&device_info, true)?;
+                        info!(
+                            sl!(),
+                            "erofs device attached - device_id: {} guest_path: {}",
+                            device_id,
+                            &rolayer.source
+                        );
+
+                        let mut options: Vec<String> = mount
+                            .options
+                            .iter()
+                            .filter(|o| {
+                                // Filter out options that are not valid erofs mount parameters:
+                                // 1. "loop" - not needed in VM, device is already /dev/vdX
+                                // 2. "device=" prefix - used for VMDK generation only, not for mount
+                                // 3. "X-kata." prefix - metadata markers for kata internals
+                                *o != "loop"
+                                    && !o.starts_with("device=")
+                                    && !o.starts_with("X-kata.")
+                            })
+                            .cloned()
+                            .collect();
+
+                        // Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
+                        options.push("X-kata.overlay-lower".to_string());
+                        options.push("X-kata.multi-layer=true".to_string());
+
+                        info!(
+                            sl!(),
+                            "erofs storage options filtered: {:?} -> {:?}", mount.options, options
+                        );
+
+                        rolayer.fs_type = EROFS_ROOTFS_TYPE.to_string();
+                        rolayer.mount_point = container_path.clone();
+                        rolayer.options = options;
+
+                        erofs_storages.push(rolayer);
+                        device_ids.push(device_id);
+                    }
                 }
                 fmt if fmt.eq_ignore_ascii_case("overlay")
                     || fmt.eq_ignore_ascii_case("format/overlay")
@@ -698,7 +980,7 @@ impl Rootfs for ErofsMultiLayerRootfs {
             safely_remove_file(vmdk, &self.generated_artifacts_dir)?;
         }
 
-        // Clean up GPT metadata files (head, tail, padding).
+        // Clean up GPT metadata files (head, padding).
         for metadata_path in &self.gpt_metadata_paths {
             safely_remove_file(metadata_path, &self.generated_artifacts_dir)?;
         }

From 8119a561aed79e443b65da5ca4d0ceaec37b9a00 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 19:37:52 +0800
Subject: [PATCH 07/12] kata-agent: Refactor wait_and_mount_layer to return
 LayerMountInfo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit makes no functional change — all callers pass None, so
every call still resolves the device via uevent exactly as before.

It only prepares the multi-layer EROFS handler for GPT partition and
dm-verity support by widening the wait_and_mount_layer() interface
without changing behavior.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/storage/mod.rs               | 10 ++-
 src/agent/src/storage/multi_layer_erofs.rs | 86 ++++++++++++----------
 2 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs
index 6b24bf1a9e..0b8c91cf17 100644
--- a/src/agent/src/storage/mod.rs
+++ b/src/agent/src/storage/mod.rs
@@ -22,6 +22,7 @@ use tracing::instrument;
 
 use self::bind_watcher_handler::BindWatcherHandler;
 use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler};
+pub use self::ephemeral_handler::update_ephemeral_mounts;
 use self::ephemeral_handler::EphemeralHandler;
 use self::fs_handler::{OverlayfsHandler, VirtioFsHandler};
 use self::image_pull_handler::ImagePullHandler;
@@ -30,15 +31,13 @@ use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_sto
 use crate::mount::{baremount, is_mounted, remove_mounts};
 use crate::sandbox::Sandbox;
 
-pub use self::ephemeral_handler::update_ephemeral_mounts;
-
 mod bind_watcher_handler;
 mod block_handler;
 mod ephemeral_handler;
 mod fs_handler;
 mod image_pull_handler;
 mod local_handler;
-mod multi_layer_erofs;
+pub mod multi_layer_erofs;
 
 const RW_MASK: u32 = 0o660;
 const RO_MASK: u32 = 0o440;
@@ -168,6 +167,9 @@ struct MultiLayerProcessResult {
     /// Temporary mount points (upper/lower) backing the overlay, needed for
     /// container-scoped cleanup via `container_mounts`.
     temp_mount_points: Vec<String>,
+    /// dm-verity device paths that need to be destroyed during cleanup
+    #[allow(dead_code)]
+    verity_devices: Vec<String>,
 }
 
 /// Handle multi-layer storage by creating the overlay device.
@@ -209,6 +211,7 @@ async fn handle_multi_layer_storage(
         device,
         processed_mount_points: result.processed_mount_points,
         temp_mount_points: result.temp_mount_points,
+        verity_devices: result.verity_devices,
     }))
 }
 
@@ -303,6 +306,7 @@ pub async fn add_storages(
                 }
             }
             mount_list.extend(result.temp_mount_points);
+            mount_list.extend(result.verity_devices);
             continue;
         }
 
diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs
index 920141d5c7..7abc3df786 100644
--- a/src/agent/src/storage/multi_layer_erofs.rs
+++ b/src/agent/src/storage/multi_layer_erofs.rs
@@ -57,6 +57,8 @@ pub struct MultiLayerErofsResult {
     /// overlay.  These must be tracked so they are unmounted *after* the
     /// overlay target during container teardown.
     pub temp_mount_points: Vec<String>,
+    /// dm-verity device paths that need to be destroyed during cleanup
+    pub verity_devices: Vec<String>,
 }
 
 #[derive(Debug)]
@@ -64,6 +66,13 @@ struct MkdirDirective {
     raw_path: String,
 }
 
+/// Helper struct to track layer mount information including dm-verity devices
+#[derive(Debug)]
+struct LayerMountInfo {
+    #[allow(dead_code)]
+    verity_device: Option<String>,
+}
+
 #[async_trait::async_trait]
 impl StorageHandler for MultiLayerErofsHandler {
     fn driver_types(&self) -> &[&str] {
@@ -176,7 +185,7 @@ pub async fn handle_multi_layer_erofs_group(
     let upper_mount = temp_base.join("upper");
     fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?;
 
-    wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger).await?;
+    wait_and_mount_layer(ext4, &upper_mount, sandbox, &logger, None).await?;
 
     for mkdir_dir in &mkdir_dirs {
         // As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it.
@@ -206,7 +215,7 @@ pub async fn handle_multi_layer_erofs_group(
             lower_mount.display()
         ))?;
 
-        wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger).await?;
+        let _mount_info = wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, None).await?;
         lower_mounts.push(lower_mount);
     }
 
@@ -316,6 +325,7 @@ pub async fn handle_multi_layer_erofs_group(
         mount_point: ext4.mount_point.clone(),
         processed_mount_points,
         temp_mount_points,
+        verity_devices: vec![],
     })
 }
 
@@ -464,7 +474,8 @@ async fn wait_and_mount_layer(
     layer_mount: &Path,
     sandbox: &Arc<Mutex<Sandbox>>,
     logger: &Logger,
-) -> Result<()> {
+    base_dev_path: Option<String>,
+) -> Result<LayerMountInfo> {
     info!(
         logger,
         "Waiting for layer device";
@@ -472,22 +483,11 @@ async fn wait_and_mount_layer(
         "driver" => &layer.driver,
         "mount-point" => layer_mount.display(),
     );
-    let dev_path = match layer.driver.as_str() {
-        DRIVER_SCSI_TYPE => {
-            // For SCSI devices, we need to wait for the device to appear and get its path before mounting.
-            get_scsi_device_name(sandbox, &layer.source).await?
-        }
-        DRIVER_BLK_PCI_TYPE => {
-            let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?;
-            get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await?
-        }
-        _ => {
-            // For non-SCSI devices, we can assume the source is directly mountable.
-            return Err(anyhow!(
-                "unsupported driver type '{}' for multi-layer erofs",
-                layer.driver
-            ));
-        }
+
+    // Get the base device path
+    let dev_path = match base_dev_path {
+        Some(path) => path,
+        None => resolve_base_device_path(layer, sandbox).await?,
     };
 
     info!(
@@ -545,7 +545,34 @@ async fn wait_and_mount_layer(
     // After successfully mounting the layer, we track the mount point for cleanup.
     track_temporary_mount_for_cleanup(sandbox, layer_mount, logger).await?;
 
-    Ok(())
+    Ok(LayerMountInfo {
+        verity_device: None,
+    })
+}
+
+async fn resolve_base_device_path(
+    layer: &Storage,
+    sandbox: &Arc<Mutex<Sandbox>>,
+) -> Result<String> {
+    let base_dev_path = match layer.driver.as_str() {
+        DRIVER_SCSI_TYPE => {
+            // For SCSI devices, we need to wait for the device to appear and get its path before mounting.
+            get_scsi_device_name(sandbox, &layer.source).await?
+        }
+        DRIVER_BLK_PCI_TYPE => {
+            let (root_complex, pcipath) = pcipath_from_dev_tree_path(&layer.source)?;
+            get_virtio_blk_pci_device_name(sandbox, root_complex, &pcipath).await?
+        }
+        _ => {
+            // For non-SCSI devices, we can assume the source is directly mountable.
+            return Err(anyhow!(
+                "unsupported driver type '{}' for multi-layer erofs",
+                layer.driver
+            ));
+        }
+    };
+
+    Ok(base_dev_path)
 }
 
 #[cfg(test)]
@@ -600,25 +627,6 @@ mod tests {
 
     // --- parse_mkdir_directive ---
 
-    #[rstest]
-    #[case("some/path", true, "some/path")]
-    #[case("some/path:0755", true, "some/path")]
-    #[case("path:mode:extra", true, "path")]
-    #[case("", false, "")]
-    fn test_parse_mkdir_directive(
-        #[case] spec: &str,
-        #[case] should_pass: bool,
-        #[case] expected_path: &str,
-    ) {
-        let result = parse_mkdir_directive(spec);
-        if should_pass {
-            let d = result.expect("expected Ok");
-            assert_eq!(d.raw_path, expected_path);
-        } else {
-            assert!(result.is_err(), "expected Err for spec {:?}", spec);
-        }
-    }
-
     #[test]
     fn test_parse_mkdir_directive_rejects_null_bytes() {
         assert!(parse_mkdir_directive("foo\0bar").is_err());

From 17fadde6d87ea0b5f0f11ad20d629d578a7bf933 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 19:48:35 +0800
Subject: [PATCH 08/12] kata-agent: Add GPT partition utility functions

The guest agent needs to resolve individual partition devices from a
single GPT-partitioned block device, but the kernel does not always
create partition nodes immediately after the base device appears,
especially when another fd holds the device open during hot-plug.

Add utility functions that handle two problems:
(1) Mapping a base device path to its partition path following the
kernel naming convention (bare suffix vs 'p' separator).
(2) Ensuring the partition node exists before mounting.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/storage/multi_layer_erofs.rs | 109 +++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs
index 7abc3df786..d8263b7155 100644
--- a/src/agent/src/storage/multi_layer_erofs.rs
+++ b/src/agent/src/storage/multi_layer_erofs.rs
@@ -10,10 +10,15 @@
 //! - Storage with X-kata.overlay-lower: erofs layers (lowerdir)
 //! - Creates overlay to combine them
 //! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount
+//! - Supports GPT-partitioned disks where each layer is a separate partition
 
+#[allow(unused_imports)]
+use std::collections::HashMap;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use std::time::Duration;
+use tokio::time::sleep;
 
 use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
 use crate::device::scsi_device_handler::get_scsi_device_name;
@@ -44,7 +49,11 @@ pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer";
 const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper";
 const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower";
 const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true";
+#[allow(dead_code)]
+const OPT_GPT_PARTITIONED: &str = "X-kata.gpt-partitioned=true";
 const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path=";
+#[allow(dead_code)]
+const OPT_PARTITION_NUMBER: &str = "X-kata.partition-number=";
 
 #[derive(Debug)]
 pub struct MultiLayerErofsHandler {}
@@ -575,6 +584,81 @@ async fn resolve_base_device_path(
     Ok(base_dev_path)
 }
 
+/// Check if the storage is GPT-partitioned
+#[allow(dead_code)]
+fn is_gpt_partitioned(storage: &Storage) -> bool {
+    storage.options.iter().any(|o| o == OPT_GPT_PARTITIONED)
+}
+
+/// Extract partition number from storage options
+/// Returns None if not specified (non-GPT mode)
+#[allow(dead_code)]
+fn get_partition_number(storage: &Storage) -> Option<u32> {
+    for opt in &storage.options {
+        if let Some(num_str) = opt.strip_prefix(OPT_PARTITION_NUMBER) {
+            return num_str.parse::<u32>().ok();
+        }
+    }
+    None
+}
+
+/// Get the partition device path for a GPT-partitioned disk
+///
+/// For GPT mode: the storage.source contains the base disk path (e.g., "/dev/vda")
+/// We need to append the partition number to get the partition path (e.g., "/dev/vda1")
+///
+/// Follows the kernel naming rule: if the base device name ends with a digit,
+/// insert a 'p' separator before the partition number to avoid ambiguity.
+/// This correctly handles all device families:
+/// - /dev/vda   -> /dev/vda1   (no trailing digit, bare number)
+/// - /dev/sda   -> /dev/sda1
+/// - /dev/nvme0n1 -> /dev/nvme0n1p1 (trailing digit, needs 'p')
+/// - /dev/mmcblk0 -> /dev/mmcblk0p1
+/// - /dev/loop0 -> /dev/loop0p1
+#[allow(dead_code)]
+fn get_partition_device_path(base_path: &str, partition_number: u32) -> String {
+    if base_path.ends_with(char::is_numeric) {
+        format!("{}p{}", base_path, partition_number)
+    } else {
+        format!("{}{}", base_path, partition_number)
+    }
+}
+
+/// Wait for partition device node to appear in /dev.
+///
+/// When a virtio-blk device with a GPT is hotplugged, the kernel automatically
+/// scans the partition table and creates partition nodes. However, devtmpfs node
+/// creation may lag slightly behind the uevent, so we poll briefly if needed.
+#[allow(dead_code)]
+async fn wait_for_partition_device(device_path: &str, logger: &Logger) -> Result<()> {
+    let device_path_buf = PathBuf::from(device_path);
+    if device_path_buf.exists() {
+        return Ok(());
+    }
+
+    const MAX_WAIT_MS: u64 = 1000;
+    const POLL_INTERVAL_MS: u64 = 50;
+
+    for attempt in 0..(MAX_WAIT_MS / POLL_INTERVAL_MS) {
+        sleep(Duration::from_millis(POLL_INTERVAL_MS)).await;
+        if device_path_buf.exists() {
+            info!(
+                logger,
+                "Partition device node appeared after polling: {} (attempt {})",
+                device_path,
+                attempt + 1
+            );
+            return Ok(());
+        }
+    }
+
+    Err(anyhow!(
+        "partition device {} did not appear within {} ms",
+        device_path,
+        MAX_WAIT_MS
+    ))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -731,4 +815,29 @@ mod tests {
             s.options
         );
     }
+
+    // --- get_partition_device_path ---
+
+    #[rstest]
+    #[case("/dev/vda", 1, "/dev/vda1")]
+    #[case("/dev/sda", 3, "/dev/sda3")]
+    #[case("/dev/hda", 2, "/dev/hda2")]
+    #[case("/dev/nvme0n1", 1, "/dev/nvme0n1p1")]
+    #[case("/dev/nvme0n1", 2, "/dev/nvme0n1p2")]
+    #[case("/dev/mmcblk0", 1, "/dev/mmcblk0p1")]
+    #[case("/dev/loop0", 1, "/dev/loop0p1")]
+    #[case("/dev/nbd0", 3, "/dev/nbd0p3")]
+    fn test_get_partition_device_path(
+        #[case] base: &str,
+        #[case] part: u32,
+        #[case] expected: &str,
+    ) {
+        assert_eq!(
+            get_partition_device_path(base, part),
+            expected,
+            "get_partition_device_path({}, {})",
+            base,
+            part
+        );
+    }
 }

From 2036e66bc34cf861240375252df9dab9899941d7 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Wed, 20 May 2026 20:05:24 +0800
Subject: [PATCH 09/12] kata-agent: Integrate GPT partition support into
 multi-layer handler

In GPT mode, all partitions share the same base block device, so
resolving it once per uevent source and caching the result avoids
redundant hotplug waits that would otherwise scale linearly with
layer count.

Layers are sorted by partition number before mounting to guarantee
correct overlay lowerdir precedence regardless of the order the host
emits Storage entries.

Also remove the dead_code attributes now that the code is in use.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 src/agent/src/storage/mod.rs               |  1 -
 src/agent/src/storage/multi_layer_erofs.rs | 83 ++++++++++++++++++----
 2 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs
index 0b8c91cf17..48eb4bfe37 100644
--- a/src/agent/src/storage/mod.rs
+++ b/src/agent/src/storage/mod.rs
@@ -168,7 +168,6 @@ struct MultiLayerProcessResult {
     /// container-scoped cleanup via `container_mounts`.
     temp_mount_points: Vec<String>,
     /// dm-verity device paths that need to be destroyed during cleanup
-    #[allow(dead_code)]
     verity_devices: Vec<String>,
 }
 
diff --git a/src/agent/src/storage/multi_layer_erofs.rs b/src/agent/src/storage/multi_layer_erofs.rs
index d8263b7155..088fd4edf1 100644
--- a/src/agent/src/storage/multi_layer_erofs.rs
+++ b/src/agent/src/storage/multi_layer_erofs.rs
@@ -12,7 +12,6 @@
 //! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount
 //! - Supports GPT-partitioned disks where each layer is a separate partition
 
-#[allow(unused_imports)]
 use std::collections::HashMap;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -49,10 +48,8 @@ pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer";
 const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper";
 const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower";
 const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true";
-#[allow(dead_code)]
 const OPT_GPT_PARTITIONED: &str = "X-kata.gpt-partitioned=true";
 const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path=";
-#[allow(dead_code)]
 const OPT_PARTITION_NUMBER: &str = "X-kata.partition-number=";
 
 #[derive(Debug)]
@@ -78,7 +75,6 @@ struct MkdirDirective {
 /// Helper struct to track layer mount information including dm-verity devices
 #[derive(Debug)]
 struct LayerMountInfo {
-    #[allow(dead_code)]
     verity_device: Option<String>,
 }
 
@@ -138,6 +134,7 @@ pub async fn handle_multi_layer_erofs_group(
     let mut ext4_storage: Option<&Storage> = None;
     let mut erofs_storages: Vec<&Storage> = Vec::new();
     let mut mkdir_dirs: Vec<MkdirDirective> = Vec::new();
+    let mut has_gpt_partition: bool = false;
 
     for storage in &multi_layer_storages {
         if is_upper_storage(storage) {
@@ -155,19 +152,33 @@ pub async fn handle_multi_layer_erofs_group(
                 }
             }
         } else if is_lower_storage(storage) {
+            // Each GPT partition is provided as a separate storage entry by the host
+            if !has_gpt_partition && is_gpt_partitioned(storage) {
+                has_gpt_partition = true;
+            }
             erofs_storages.push(*storage);
         }
     }
 
-    let ext4 = ext4_storage
-        .ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
-
     if erofs_storages.is_empty() {
         return Err(anyhow!(
             "multi-layer erofs missing erofs lower layer storage"
         ));
     }
 
+    // Only sort erofs layers by partition number in GPT mode.
+    // In GPT mode, each storage carries X-kata.partition-number=N and layers
+    // must be ordered by partition number so that the overlay lowerdir
+    // precedence is correct (lower partition number = higher overlay priority).
+    // In non-GPT mode all partition numbers are None, so the (stable) sort
+    // would leave the order unchanged; it is skipped as unnecessary work.
+    if has_gpt_partition {
+        erofs_storages.sort_by_key(|storage| get_partition_number(storage).unwrap_or(u32::MAX));
+    }
+
+    let ext4 = ext4_storage
+        .ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
+
     info!(
         logger,
         "Handling multi-layer erofs group";
@@ -217,6 +228,9 @@ pub async fn handle_multi_layer_erofs_group(
     }
 
     let mut lower_mounts = Vec::new();
+    let mut verity_devices = Vec::new();
+    let mut base_device_cache: HashMap<String, String> = HashMap::new();
+
     for (index, erofs) in erofs_storages.iter().enumerate() {
         let lower_mount = temp_base.join(format!("lower-{}", index));
         fs::create_dir_all(&lower_mount).context(format!(
@@ -224,8 +238,25 @@ pub async fn handle_multi_layer_erofs_group(
             lower_mount.display()
         ))?;
 
-        let _mount_info = wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, None).await?;
+        let base_dev_path = if !is_gpt_partitioned(erofs) {
+            None
+        } else if let Some(cached) = base_device_cache.get(&erofs.source) {
+            // Cache hit: reuse the base device already resolved for this source.
+            Some(cached.clone())
+        } else {
+            let resolved = resolve_base_device_path(erofs, sandbox).await?;
+            base_device_cache.insert(erofs.source.clone(), resolved.clone());
+            Some(resolved)
+        };
+
+        let mount_info =
+            wait_and_mount_layer(erofs, &lower_mount, sandbox, &logger, base_dev_path).await?;
         lower_mounts.push(lower_mount);
+
+        // Collect dm-verity device for cleanup
+        if let Some(verity_dev) = mount_info.verity_device {
+            verity_devices.push(verity_dev);
+        }
     }
 
     // If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount.
@@ -334,7 +365,7 @@ pub async fn handle_multi_layer_erofs_group(
         mount_point: ext4.mount_point.clone(),
         processed_mount_points,
         temp_mount_points,
-        verity_devices: vec![],
+        verity_devices,
     })
 }
 
@@ -493,12 +524,42 @@ async fn wait_and_mount_layer(
         "mount-point" => layer_mount.display(),
     );
 
+    let is_gpt = is_gpt_partitioned(layer);
+    let partition_num = get_partition_number(layer);
+
     // Get the base device path
     let dev_path = match base_dev_path {
         Some(path) => path,
         None => resolve_base_device_path(layer, sandbox).await?,
     };
 
+    // For GPT-partitioned disks, use the partition device path
+    let dev_path = if is_gpt {
+        if let Some(part_num) = partition_num {
+            let path = get_partition_device_path(&dev_path, part_num);
+            info!(
+                logger,
+                "GPT-partitioned mode: using partition device";
+                "base-device" => &dev_path,
+                "partition-number" => part_num,
+                "partition-device" => &path,
+            );
+
+            // Wait for partition device node to appear
+            wait_for_partition_device(&path, logger).await?;
+
+            path
+        } else {
+            return Err(anyhow!(
+                "GPT-partitioned storage missing partition number: {:?}",
+                layer
+            ));
+        }
+    } else {
+        // Non-GPT mode: use base device directly
+        dev_path.clone()
+    };
+
     info!(
         logger,
         "Mounting layer";
@@ -506,6 +567,7 @@ async fn wait_and_mount_layer(
         "fstype" => &layer.fstype,
         "devname" => &dev_path,
         "mount-point" => layer_mount.display(),
+        "gpt-mode" => is_gpt,
     );
 
     create_mount_destination(Path::new(&dev_path), layer_mount, "", &layer.fstype)
@@ -585,14 +647,12 @@ async fn resolve_base_device_path(
 }
 
 /// Check if the storage is GPT-partitioned
-#[allow(dead_code)]
 fn is_gpt_partitioned(storage: &Storage) -> bool {
     storage.options.iter().any(|o| o == OPT_GPT_PARTITIONED)
 }
 
 /// Extract partition number from storage options
 /// Returns None if not specified (non-GPT mode)
-#[allow(dead_code)]
 fn get_partition_number(storage: &Storage) -> Option<u32> {
     for opt in &storage.options {
         if let Some(num_str) = opt.strip_prefix(OPT_PARTITION_NUMBER) {
@@ -615,7 +675,6 @@ fn get_partition_number(storage: &Storage) -> Option<u32> {
 /// - /dev/nvme0n1 -> /dev/nvme0n1p1 (trailing digit, needs 'p')
 /// - /dev/mmcblk0 -> /dev/mmcblk0p1
 /// - /dev/loop0 -> /dev/loop0p1
-#[allow(dead_code)]
 fn get_partition_device_path(base_path: &str, partition_number: u32) -> String {
     if base_path.ends_with(char::is_numeric) {
         format!("{}p{}", base_path, partition_number)

From fd139a11436a3c40fedec862705a1fe387465c08 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Thu, 21 May 2026 10:20:10 +0800
Subject: [PATCH 10/12] kata-deploy: Reset max_unmerged_layers to "0" within
 erofs snapshotter

We should set max_unmerged_layers = 0 for the erofs snapshotter in
gpt-vmdk mode.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 .../packaging/kata-deploy/binary/src/artifacts/snapshotters.rs  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
index c9ae3bb52f..b907658f3d 100644
--- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
+++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
@@ -91,7 +91,7 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P
     toml_utils::set_toml_value(
         configuration_file,
         ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
-        "1",
+        "0",
     )?;
 
     Ok(())

From a359d13476fa426790438dd727a8373c345d95a6 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Mon, 25 May 2026 19:12:52 +0800
Subject: [PATCH 11/12] build: Validate measured-rootfs root hashes for all shims
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cached shim-v2 tarballs ship per-variant `root_hash_*.txt` files
embedded in the matching measured-rootfs image. Until now only
shim-v2-rust validated those hashes against the freshly built rootfs
images on a cache hit; shim-v2-go reused whatever was cached without
checking, even though its bundled configuration files contain the
`KERNELVERITYPARAMS_*` values baked in at build time.

When a PR changes the agent (and therefore the rootfs image and its
dm-verity hash) but does not touch `src/runtime`, the shim-v2-go cache
key stays the same and the stale tarball is reused. The resulting
guest cmdline carries a verity hash that no longer matches the new
rootfs image, so the VM panics very early in boot:

    device-mapper: verity: 254:1: metadata block 0 is corrupted
    erofs (device dm-0): cannot read erofs superblock
    Kernel panic - not syncing: VFS: Unable to mount root fs ...

Generalize the shim-v2-rust cache validation so it also runs for
shim-v2-go, push the per-variant root-hash sidecar files for both
shims, and fall back to a full rebuild whenever the cached hash is
missing or differs from the image one.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 .../local-build/kata-deploy-binaries.sh       | 40 +++++++++++++------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
index 0e491eea8d..896764f6ec 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
@@ -182,13 +182,18 @@ get_kernel_modules_dir() {
 	echo "${kernel_modules_dir}"
 }
 
-cleanup_and_fail_shim_v2_rust_specifics() {
+cleanup_and_fail_shim_v2_specifics() {
+	local component="${1:-}"
+	local component_tarball_path="${2:-}"
+	local extra_tarballs="${3:-}"
+	local tarball_dir="${repo_root_dir}/tools/packaging/kata-deploy/local-build/build"
+
 	for variant in confidential nvidia-gpu nvidia-gpu-confidential; do
-		local root_hash_file="${repo_root_dir}/tools/packaging/kata-deploy/local-build/build/shim-v2-rust-root_hash_${variant}.txt"
+		local root_hash_file="${tarball_dir}/${component}-root_hash_${variant}.txt"
 		[[ -f "${root_hash_file}" ]] && rm -f "${root_hash_file}"
 	done
 
-	cleanup_and_fail "${1:-}" "${2:-}"
+	cleanup_and_fail "${component_tarball_path}" "${extra_tarballs}"
 }
 
 cleanup_and_fail() {
@@ -229,15 +234,22 @@ install_cached_shim_v2_tarball_get_root_hash() {
 	return 0
 }
 
-install_cached_shim_v2_rust_tarball_compare_root_hashes() {
+install_cached_shim_v2_tarball_compare_root_hashes() {
+	local component="${1:-}"
 	local found_any=""
 	local tarball_dir="${repo_root_dir}/tools/packaging/kata-deploy/local-build/build"
 
 	for variant in confidential nvidia-gpu nvidia-gpu-confidential; do
-		# Skip if one or the other does not exist.
-		[[ ! -f "${tarball_dir}/root_hash_${variant}.txt" ]] && continue
+		local image_root_hash="${tarball_dir}/root_hash_${variant}.txt"
+		local cached_root_hash="${component}-root_hash_${variant}.txt"
 
-		diff "${tarball_dir}/root_hash_${variant}.txt" "shim-v2-rust-root_hash_${variant}.txt" || return 1
+		# Skip if the current image tarball did not ship a root hash for this variant.
+		[[ ! -f "${image_root_hash}" ]] && continue
+
+		if [[ ! -f "${cached_root_hash}" ]] || ! cmp -s "${image_root_hash}" "${cached_root_hash}"; then
+			info "Measured rootfs hash mismatch for ${component} variant ${variant}; rebuilding shim"
+			return 1
+		fi
 		found_any="yes"
 	done
 	[[ -z "${found_any}" ]] && return 0
@@ -260,7 +272,8 @@ install_cached_tarball_component() {
 	# "tarball1_name:tarball1_path tarball2_name:tarball2_path ... tarballN_name:tarballN_path"
 	local extra_tarballs="${6:-}"
 
-	if [[ "${component}" = "shim-v2-rust" ]]; then
+	if [[ "${MEASURED_ROOTFS}" = "yes" ]] && \
+		{ [[ "${component}" = "shim-v2-go" ]] || [[ "${component}" = "shim-v2-rust" ]]; }; then
 		install_cached_shim_v2_tarball_get_root_hash
 	fi
 
@@ -282,8 +295,9 @@ install_cached_tarball_component() {
 	fi
 	sha256sum -c "${component}-sha256sum" || { cleanup_and_fail "${component_tarball_path}" "${extra_tarballs}"; return 1; }
 
-	if [[ "${component}" = "shim-v2-rust" ]]; then
-		install_cached_shim_v2_rust_tarball_compare_root_hashes || { cleanup_and_fail_shim_v2_rust_specifics "${component_tarball_path}" "${extra_tarballs}"; return 1; }
+	if [[ "${MEASURED_ROOTFS}" = "yes" ]] && \
+		{ [[ "${component}" = "shim-v2-go" ]] || [[ "${component}" = "shim-v2-rust" ]]; }; then
+		install_cached_shim_v2_tarball_compare_root_hashes "${component}" || { cleanup_and_fail_shim_v2_specifics "${component}" "${component_tarball_path}" "${extra_tarballs}"; return 1; }
 	fi
 
 	info "Using cached tarball of ${component}"
@@ -1637,15 +1651,15 @@ handle_build() {
 					"kata-static-${build_target}-modules.tar.zst"
 				)
 				;;
-		shim-v2-rust)
+		shim-v2-go|shim-v2-rust)
 			if [[ "${MEASURED_ROOTFS}" == "yes" ]]; then
 				local found_any=""
 				for variant in confidential nvidia-gpu nvidia-gpu-confidential; do
 					# The variants could be built independently we need to check if
 					# they exist and then push them to the registry
-					[[ -f "${workdir}/shim-v2-rust-root_hash_${variant}.txt" ]] && files_to_push+=("shim-v2-rust-root_hash_${variant}.txt") && found_any="yes"
+					[[ -f "${workdir}/${build_target}-root_hash_${variant}.txt" ]] && files_to_push+=("${build_target}-root_hash_${variant}.txt") && found_any="yes"
 				done
-				[[ -z "${found_any}" ]] && die "No files to push for shim-v2-rust with MEASURED_ROOTFS support"
+				[[ -z "${found_any}" ]] && die "No files to push for ${build_target} with MEASURED_ROOTFS support"
 			fi
 			;;
 			*)

From 53699b0170492af751fd12d0db0ded324001d697 Mon Sep 17 00:00:00 2001
From: Alex Lyn <alex.lyn@antgroup.com>
Date: Mon, 25 May 2026 19:13:28 +0800
Subject: [PATCH 12/12] docs: Reset max_unmerged_layers = 0 for gpt+vmdk mode

max_unmerged_layers = 1 is only needed for fsmerge mode. Since containerd
temporarily does not support fsmerge, reset it to the default value of 0.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
---
 docs/how-to/how-to-use-erofs-snapshotter-with-kata.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md b/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md
index bc2405a5d0..2460f5a780 100644
--- a/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md
+++ b/docs/how-to/how-to-use-erofs-snapshotter-with-kata.md
@@ -90,7 +90,7 @@ version = 3
 
   [plugins.'io.containerd.snapshotter.v1.erofs']
     default_size = '<SIZE>' # SIZE=6G or 10G or other size
-    max_unmerged_layers = 1
+    max_unmerged_layers = 0
 ```
 
 #### Verify the EROFS plugins are loaded