Merge pull request #13196 from microsoft/cameronbaird/upstream/runtime-go-clh-templating

runtime: Enable VM Templating Support for CLH
2026-07-01 22:50:54 +00:00 · 2026-06-21 16:31:19 +02:00
parent ec55a74969 730307f32c
commit 374a867774
17 changed files with 945 additions and 86 deletions
--- a/.github/workflows/run-k8s-tests-on-free-runner.yaml
+++ b/.github/workflows/run-k8s-tests-on-free-runner.yaml
@@ -41,10 +41,12 @@ jobs:
      matrix:
        environment: [
          { vmm: clh, containerd_version: latest },
+          { vmm: clh, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged },
          { vmm: clh, containerd_version: minimum },
          { vmm: dragonball, containerd_version: latest },
          { vmm: dragonball, containerd_version: minimum },
          { vmm: qemu, containerd_version: latest },
+          { vmm: qemu, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged },
          { vmm: qemu, containerd_version: minimum },
          { vmm: qemu-runtime-rs, containerd_version: latest },
          { vmm: qemu-runtime-rs, containerd_version: minimum },
@@ -68,6 +70,9 @@ jobs:
      K8S_TEST_HOST_TYPE: baremetal-no-attestation
      CONTAINER_ENGINE: containerd
      CONTAINER_ENGINE_VERSION: ${{ matrix.environment.containerd_version }}
+      SNAPSHOTTER: ${{ matrix.environment.snapshotter }}
+      EROFS_SNAPSHOTTER_MODE: ${{ matrix.environment.erofs_mode }}
+      EROFS_MERGE_MODE: ${{ matrix.environment.erofs_merge_mode }}
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -2085,12 +2085,6 @@ func checkNetNsConfig(config oci.RuntimeConfig) error {

 // checkFactoryConfig ensures the VM factory configuration is valid.
 func checkFactoryConfig(config oci.RuntimeConfig) error {
-	if config.FactoryConfig.Template {
-		if config.HypervisorConfig.InitrdPath == "" {
-			return errors.New("Factory option enable_template requires an initrd image")
-		}
-	}
-
 	if config.FactoryConfig.VMCacheNumber > 0 {
 		if config.HypervisorType != vc.QemuHypervisor {
 			return errors.New("VM cache just support qemu")
--- a/src/runtime/pkg/katautils/config_test.go
+++ b/src/runtime/pkg/katautils/config_test.go
@@ -1696,7 +1696,7 @@ func TestCheckFactoryConfig(t *testing.T) {
 		{false, false, "", "initrd"},

 		{true, false, "", "initrd"},
-		{true, true, "image", ""},
+		{true, false, "image", ""},
 	}

 	for i, d := range data {
--- a/src/runtime/pkg/katautils/create.go
+++ b/src/runtime/pkg/katautils/create.go
@@ -82,12 +82,8 @@ func HandleFactory(ctx context.Context, vci vc.VC, runtimeConfig *oci.RuntimeCon
 	kataUtilsLogger.WithField("factory", factoryConfig).Info("load vm factory")

 	f, err := vf.NewFactory(ctx, factoryConfig, true)
-	if err != nil && !factoryConfig.VMCache {
-		kataUtilsLogger.WithError(err).Warn("load vm factory failed, about to create new one")
-		f, err = vf.NewFactory(ctx, factoryConfig, false)
-	}
 	if err != nil {
-		kataUtilsLogger.WithError(err).Warn("create vm factory failed")
+		kataUtilsLogger.WithError(err).Warn("load vm factory failed, will use direct boot")
 		return
 	}

--- a/src/runtime/virtcontainers/clh.go
+++ b/src/runtime/virtcontainers/clh.go
@@ -68,6 +68,7 @@ const (
 const (
 	clhStateCreated = "Created"
 	clhStateRunning = "Running"
+	clhStatePaused  = "Paused"
 )

 const (
@@ -112,8 +113,16 @@ type clhClient interface {
 	VmAddDevicePut(ctx context.Context, deviceConfig chclient.DeviceConfig) (chclient.PciDeviceInfo, *http.Response, error)
 	// Add a new disk device to the VM
 	VmAddDiskPut(ctx context.Context, diskConfig chclient.DiskConfig) (chclient.PciDeviceInfo, *http.Response, error)
+	// Pause the VM
+	VmPausePut(ctx context.Context) (*http.Response, error)
+	// Create a snapshot of the VM
+	VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error)
 	// Remove a device from the VM
 	VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error)
+	// Restore VM from a snapshot
+	VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error)
+	// Resume a paused VM
+	ResumeVM(ctx context.Context) (*http.Response, error)
 }

 type clhClientApi struct {
@@ -153,10 +162,26 @@ func (c *clhClientApi) VmAddDiskPut(ctx context.Context, diskConfig chclient.Dis
 	return c.ApiInternal.VmAddDiskPut(ctx).DiskConfig(diskConfig).Execute()
 }

+func (c *clhClientApi) VmPausePut(ctx context.Context) (*http.Response, error) {
+	return c.ApiInternal.PauseVM(ctx).Execute()
+}
+
+func (c *clhClientApi) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) {
+	return c.ApiInternal.VmSnapshotPut(ctx).VmSnapshotConfig(vmSnapshotConfig).Execute()
+}
+
 func (c *clhClientApi) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) {
 	return c.ApiInternal.VmRemoveDevicePut(ctx).VmRemoveDevice(vmRemoveDevice).Execute()
 }

+func (c *clhClientApi) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) {
+	return c.ApiInternal.VmRestorePut(ctx).RestoreConfig(restoreConfig).Execute()
+}
+
+func (c *clhClientApi) ResumeVM(ctx context.Context) (*http.Response, error) {
+	return c.ApiInternal.ResumeVM(ctx).Execute()
+}
+
 // This is done in order to be able to override such a function as part of
 // our unit tests, as when testing bootVM we're on a mocked scenario already.
 var vmAddNetPutRequest = func(clh *cloudHypervisor) ([]chclient.PciDeviceInfo, error) {
@@ -255,12 +280,14 @@ type CloudHypervisorState struct {
 	PID               int
 	VirtiofsDaemonPid int
 	state             clhState
+	isRestoring       bool
 }

 func (s *CloudHypervisorState) reset() {
 	s.PID = 0
 	s.VirtiofsDaemonPid = 0
 	s.state = clhNotReady
+	s.isRestoring = false
 }

 type cloudHypervisor struct {
@@ -501,7 +528,7 @@ func getNonUserDefinedKernelParams(rootfstype string, disableNvdimm bool, dax bo
 }

 // For cloudHypervisor this call only sets the internal structure up.
-// The VM will be created and started through StartVM().
+// The VM will be created and started through StartVM(), or restored from template if template files exist.
 func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error {
 	clh.ctx = ctx

@@ -559,29 +586,78 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
 		}
 	}

-	// Create the VM memory config via the constructor to ensure default values are properly assigned
-	clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes()))
-	// Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs
-	// or when using HugePages.
-	// If such features are disabled, turn off shared memory config.
-	if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages {
-		clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false)
-	} else {
-		clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true)
-	}
-	// Enable hugepages if needed
-	clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages)
-	if !clh.config.ConfidentialGuest {
-		hotplugSize := clh.config.DefaultMaxMemorySize
-		// OpenAPI only supports int64 values
-		clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes()))
-
-		if clh.config.ReclaimGuestFreedMemory {
-			// Create VM with a balloon config so we can enable free page reporting (size of the balloon can be set to zero)
-			clh.vmconfig.Balloon = chclient.NewBalloonConfig(0)
-			// Set the free page reporting flag for ballooning to be true
-			clh.vmconfig.Balloon.SetFreePageReporting(true)
+	// If the VM is booting from a template, or if the VM is going to be used as a template
+	// the memory is to be backed by a file, so we need to configure the memory zones accordingly.
+	if clh.config.BootFromTemplate || clh.config.BootToBeTemplate {
+		// VM templating is incompatible with virtio-fs because virtio-fs requires shared memory,
+		// while templating needs COW/private memory on restore.
+		if clh.config.SharedFS == config.VirtioFS || clh.config.SharedFS == config.VirtioFSNydus {
+			return errors.New("VM templating has been enabled with virtio-fs and this configuration will not work")
 		}
+
+		// Double-check that the clh.config.MemoryPath file is accessible before using it in the VM config, to avoid hitting a less clear error from cloud hypervisor when it tries to access the memory file.
+		if _, err := os.Stat(clh.config.MemoryPath); err != nil {
+			return fmt.Errorf("memory file %s is not accessible: %w", clh.config.MemoryPath, err)
+		}
+
+		// Set the size to be 0 since we are going to configure actual size via zones
+		clh.vmconfig.Memory = chclient.NewMemoryConfig(0)
+
+		memoryZoneConfig := chclient.NewMemoryZoneConfig("mem0", int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes()))
+		if clh.config.BootToBeTemplate {
+			// When BootToBeTemplate is true, the memory file backing the VM memory is shared between multiple VMs created from the same template.
+			// So we need to set shared to true in this case.
+			memoryZoneConfig.SetShared(true)
+			clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true)
+
+			if !clh.config.ConfidentialGuest {
+				// TODO: Remove this warning once memory hotplugging is supported
+				// for template VMs.
+				//
+				// Memory hotplug is intentionally not configured for template VMs.
+				// Resizing a memory zone requires the virtio-mem hotplug method
+				// (cloud-hypervisor rejects the default ACPI hotplug on a zone that
+				// carries a hotplug_size), which is not currently supported in the
+				// templating path. As a result, VMs restored from this template
+				// cannot grow their memory beyond the template's boot size.
+				clh.Logger().Warn("memory hotplugging is currently unsupported for template VMs")
+			}
+		} else {
+			// When BootFromTemplate is true, set shared=false to ensure Copy-On-Write is used for the memory file.
+			// So that the VM can have its own private memory.
+			memoryZoneConfig.SetShared(false)
+			clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false)
+		}
+		memoryZoneConfig.SetFile(clh.config.MemoryPath)
+		clh.vmconfig.Memory.Zones = &[]chclient.MemoryZoneConfig{
+			*memoryZoneConfig,
+		}
+	} else { // Normal (non-template) VM creation
+		// Create the VM memory config via the constructor to ensure default values are properly assigned
+		clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes()))
+		// Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs
+		// or when using HugePages.
+		// If such features are disabled, turn off shared memory config.
+		if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages {
+			clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false)
+		} else {
+			clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true)
+		}
+		// Enable hugepages if needed
+		clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages)
+		if !clh.config.ConfidentialGuest {
+			hotplugSize := clh.config.DefaultMaxMemorySize
+			// OpenAPI only supports int64 values
+			clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes()))
+		}
+	}
+
+	// Configure balloon device for free page reporting. This is set unconditionally
+	// (for both template and non-template paths) so that template VMs include the
+	// balloon in their snapshot, and VMs restored from a template inherit it.
+	if !clh.config.ConfidentialGuest && clh.config.ReclaimGuestFreedMemory {
+		clh.vmconfig.Balloon = chclient.NewBalloonConfig(0)
+		clh.vmconfig.Balloon.SetFreePageReporting(true)
 	}

 	// Set initial amount of cpu's for the virtual machine
@@ -700,9 +776,118 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
 		return err
 	}

+	// Check if we should restore from template instead of creating new VM
+	if clh.config.BootFromTemplate && clh.shouldRestoreFromTemplate() {
+		clh.Logger().Info("Template files found, will restore VM instead of creating new")
+		// Mark this as a restore operation for StartVM to use RestoreVM instead
+		clh.state.isRestoring = true
+		return nil
+	}
+
 	return nil
 }

+// shouldRestoreFromTemplate checks if template snapshot files exist and we should restore instead of creating new VM
+func (clh *cloudHypervisor) shouldRestoreFromTemplate() bool {
+	// For template restore, we need the snapshot directory to contain the necessary files
+	// The snapshotDir is derived from the MemoryPath directory
+	snapshotDir := filepath.Dir(clh.config.MemoryPath)
+
+	// Check for required template files (config.json, state.json, and memory file)
+	configFile := filepath.Join(snapshotDir, "config.json")
+	stateFile := filepath.Join(snapshotDir, "state.json")
+	memoryFile := clh.config.MemoryPath
+
+	if _, err := os.Stat(configFile); err != nil {
+		clh.Logger().WithError(err).WithField("configFile", configFile).Debug("Template config file not accessible")
+		return false
+	}
+
+	if _, err := os.Stat(stateFile); err != nil {
+		clh.Logger().WithError(err).WithField("stateFile", stateFile).Debug("Template state file not accessible")
+		return false
+	}
+
+	if _, err := os.Stat(memoryFile); err != nil {
+		clh.Logger().WithError(err).WithField("memoryFile", memoryFile).Debug("Template memory file not accessible")
+		return false
+	}
+
+	clh.Logger().WithFields(log.Fields{
+		"configFile": configFile,
+		"stateFile":  stateFile,
+		"memoryFile": memoryFile,
+	}).Info("Template files found, can restore VM from template")
+
+	return true
+}
+
+// copyFile copies a file from src to dst, preserving the source file's permissions.
+func (clh *cloudHypervisor) copyFile(src, dst string) error {
+	srcFile, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer srcFile.Close()
+
+	srcInfo, err := srcFile.Stat()
+	if err != nil {
+		return err
+	}
+
+	dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, srcInfo.Mode())
+	if err != nil {
+		return err
+	}
+	defer dstFile.Close()
+
+	_, err = io.Copy(dstFile, srcFile)
+	if err != nil {
+		return err
+	}
+
+	return dstFile.Sync()
+}
+
+// updateVsockSocketPath updates the vsock socket path in the config.json file
+func (clh *cloudHypervisor) updateVsockSocketPath(configPath, vmID string) error {
+	// Read the config file
+	configData, err := os.ReadFile(configPath)
+	if err != nil {
+		return err
+	}
+
+	var config map[string]interface{}
+	dec := json.NewDecoder(bytes.NewReader(configData))
+	dec.UseNumber()
+	if err := dec.Decode(&config); err != nil {
+		return err
+	}
+
+	// Update vsock socket path if vsock exists
+	if vsock, ok := config["vsock"].(map[string]interface{}); ok {
+		// Generate new vsock socket path for this VM
+		newVsockPath, err := clh.vsockSocketPath(vmID)
+		if err != nil {
+			return err
+		}
+		vsock["socket"] = newVsockPath
+
+		clh.Logger().WithFields(log.Fields{
+			"vmID":         vmID,
+			"newVsockPath": newVsockPath,
+		}).Debug("Updated vsock socket path in config.json")
+	}
+
+	// Write the updated config back to file
+	updatedConfig, err := json.Marshal(config)
+	if err != nil {
+		return err
+	}
+
+	return os.WriteFile(configPath, updatedConfig, 0600)
+}
+
 // setupInitdata prepares and attaches the initdata disk if present.
 func setupInitdata(clh *cloudHypervisor, hypervisorConfig *HypervisorConfig) error {
 	if len(hypervisorConfig.Initdata) == 0 {
@@ -771,8 +956,37 @@ func (clh *cloudHypervisor) StartVM(ctx context.Context, timeout int) error {
 	ctx, cancel := context.WithTimeout(ctx, bootTimeout*time.Second)
 	defer cancel()

-	if err := clh.bootVM(ctx); err != nil {
-		return err
+	// Check if we should restore from template or create new VM
+	if clh.state.isRestoring {
+		// Copy template files to VM directory
+		snapshotDir := filepath.Dir(clh.config.MemoryPath)
+
+		// Copy config.json from template to VM directory
+		srcConfig := filepath.Join(snapshotDir, "config.json")
+		dstConfig := filepath.Join(vmPath, "config.json")
+		if err := clh.copyFile(srcConfig, dstConfig); err != nil {
+			return fmt.Errorf("failed to copy config.json: %v", err)
+		}
+
+		// Copy state.json from template to VM directory
+		srcState := filepath.Join(snapshotDir, "state.json")
+		dstState := filepath.Join(vmPath, "state.json")
+		if err := clh.copyFile(srcState, dstState); err != nil {
+			return fmt.Errorf("failed to copy state.json: %v", err)
+		}
+
+		// Update vsock socket path in the copied config.json
+		if err := clh.updateVsockSocketPath(dstConfig, clh.id); err != nil {
+			return fmt.Errorf("failed to update vsock socket path: %v", err)
+		}
+
+		if err := clh.restoreVM(ctx); err != nil {
+			return err
+		}
+	} else {
+		if err := clh.bootVM(ctx); err != nil {
+			return err
+		}
 	}

 	clh.state.state = clhReady
@@ -1287,16 +1501,109 @@ func (clh *cloudHypervisor) Cleanup(ctx context.Context) error {

 func (clh *cloudHypervisor) PauseVM(ctx context.Context) error {
 	clh.Logger().WithField("function", "PauseVM").Info("Pause Sandbox")
+
+	cl := clh.client()
+	ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second)
+	defer cancel()
+
+	_, err := cl.VmPausePut(ctx)
+	if err != nil {
+		clh.Logger().WithError(err).Error("Failed to pause VM")
+		return openAPIClientError(err)
+	}
+
 	return nil
 }

 func (clh *cloudHypervisor) SaveVM() error {
-	clh.Logger().WithField("function", "saveSandboxC").Info("Save Sandbox")
+	clh.Logger().WithField("function", "SaveVM").Info("Save Sandbox")
+
+	cl := clh.client()
+	ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second)
+	defer cancel()
+
+	snapshotDir := filepath.Dir(clh.config.MemoryPath)
+	// Create snapshot config with file URL to template path
+	// Use MemoryPath as base for snapshot destination
+	// When creating a template, the MemoryPath is set to the template path, so we can use it to save the snapshot.
+	fileURL := "file://" + snapshotDir
+
+	vmSnapshotConfig := *chclient.NewVmSnapshotConfig()
+	vmSnapshotConfig.SetDestinationUrl(fileURL)
+
+	_, err := cl.VmSnapshotPut(ctx, vmSnapshotConfig)
+	if err != nil {
+		clh.Logger().WithError(err).Error("Failed to save VM snapshot")
+		return openAPIClientError(err)
+	}
+
+	if clh.config.BootToBeTemplate {
+		// Update the config.json file in the snapshotDir to set memory shared=false
+		snapshotConfigPath := filepath.Join(snapshotDir, "config.json")
+		snapshotConfig, err := os.ReadFile(snapshotConfigPath)
+		if err != nil {
+			clh.Logger().WithError(err).Error("Failed to read snapshot config")
+			return err
+		}
+
+		var snapshotConfigData map[string]interface{}
+		dec := json.NewDecoder(bytes.NewReader(snapshotConfig))
+		dec.UseNumber()
+		if err := dec.Decode(&snapshotConfigData); err != nil {
+			clh.Logger().WithError(err).Error("Failed to unmarshal snapshot config")
+			return err
+		}
+
+		// Access the memory section and cast it to a map
+		if memorySection, ok := snapshotConfigData["memory"].(map[string]interface{}); ok {
+			memorySection["shared"] = false
+			// Do the same update for each element for the "zones" array in the memorySection
+			if zones, ok := memorySection["zones"].([]interface{}); ok {
+				for _, zone := range zones {
+					if zoneMap, ok := zone.(map[string]interface{}); ok {
+						zoneMap["shared"] = false
+					} else {
+						clh.Logger().Error("Unable to access zone in snapshot config memory section")
+						return fmt.Errorf("invalid snapshot config structure: zone in memory section not found or invalid")
+					}
+				}
+			} else {
+				clh.Logger().Error("Unable to access zones array in snapshot config memory section")
+				return fmt.Errorf("invalid snapshot config structure: zones array in memory section not found or invalid")
+			}
+		} else {
+			clh.Logger().Error("Unable to access memory section in snapshot config")
+			return fmt.Errorf("invalid snapshot config structure: memory section not found or invalid")
+		}
+
+		// Write the modified config back to file
+		modifiedConfig, err := json.Marshal(snapshotConfigData)
+		if err != nil {
+			clh.Logger().WithError(err).Error("Failed to marshal modified snapshot config")
+			return err
+		}
+
+		if err := os.WriteFile(snapshotConfigPath, modifiedConfig, 0600); err != nil {
+			clh.Logger().WithError(err).Error("Failed to write modified snapshot config")
+			return err
+		}
+	}
+
 	return nil
 }

 func (clh *cloudHypervisor) ResumeVM(ctx context.Context) error {
 	clh.Logger().WithField("function", "ResumeVM").Info("Resume Sandbox")
+	cl := clh.client()
+	ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second)
+	defer cancel()
+
+	_, err := cl.ResumeVM(ctx)
+	if err != nil {
+		clh.Logger().WithError(err).Error("Failed to resume VM")
+		return openAPIClientError(err)
+	}
+
 	return nil
 }

@@ -1509,11 +1816,11 @@ func (clh *cloudHypervisor) clhPath() (string, error) {
 		p = defaultClhPath
 	}

-	if _, err = os.Stat(p); os.IsNotExist(err) {
-		return "", fmt.Errorf("Cloud-Hypervisor path (%s) does not exist", p)
+	if _, err = os.Stat(p); err != nil {
+		return "", fmt.Errorf("Cloud-Hypervisor path (%s) is not accessible: %w", p, err)
 	}

-	return p, err
+	return p, nil
 }

 func (clh *cloudHypervisor) launchClh() error {
@@ -1741,6 +2048,60 @@ func (clh *cloudHypervisor) bootVM(ctx context.Context) error {
 	return nil
 }

+// restoreVM restores a VM from a template snapshot. The restored VM will be in
+// Paused state. The caller (factory layer, via factory.GetVM → vm.Resume) is
+// responsible for resuming the VM, reseeding the RNG, and syncing the guest clock
+// before the VM is used. See factory_linux.go GetVM().
+func (clh *cloudHypervisor) restoreVM(ctx context.Context) error {
+	clh.Logger().Info("Restoring VM from template")
+
+	cl := clh.client()
+
+	// use the VMStorePath as the base for the restore source URL
+	vmPath := filepath.Join(clh.config.VMStorePath, clh.id)
+	sourceURL := "file://" + vmPath
+
+	// check if the snapshot directory contains the state.json and config.json files
+	// which contain the VM state and configuration respectively
+	stateFile := filepath.Join(vmPath, "state.json")
+	configFile := filepath.Join(vmPath, "config.json")
+
+	if _, err := os.Stat(stateFile); err != nil {
+		return fmt.Errorf("failed to access state file %s: %v", stateFile, err)
+	}
+
+	if _, err := os.Stat(configFile); err != nil {
+		return fmt.Errorf("failed to access config file %s: %v", configFile, err)
+	}
+
+	// Prepare restore configuration
+	restoreConfig := *chclient.NewRestoreConfig(sourceURL)
+
+	clh.Logger().WithField("sourceURL", sourceURL).Debug("Restore configuration")
+
+	// Restore VM from template (uses the caller's ctx, which already has the boot timeout)
+	_, err := cl.VmRestorePut(ctx, restoreConfig)
+	if err != nil {
+		clh.Logger().WithError(err).Error("failed to restore VM from template")
+		return openAPIClientError(err)
+	}
+
+	// Check VM state after restoration
+	info, err := clh.vmInfo()
+	if err != nil {
+		return err
+	}
+
+	clh.Logger().Debugf("VM state after restore: %#v", info)
+
+	if info.State != clhStatePaused {
+		clh.Logger().Warnf("VM state is '%s' after restore, expected 'Paused'", info.State)
+	}
+
+	clh.Logger().Info("Successfully restored VM from template")
+	return nil
+}
+
 func (clh *cloudHypervisor) addVSock(cid int64, path string) {
 	clh.Logger().WithFields(log.Fields{
 		"path": path,
--- a/src/runtime/virtcontainers/clh_test.go
+++ b/src/runtime/virtcontainers/clh_test.go
@@ -74,7 +74,9 @@ func newClhConfig() (HypervisorConfig, error) {
 }

 type clhClientMock struct {
-	vmInfo chclient.VmInfo
+	vmInfo          chclient.VmInfo
+	restoreRequest  *chclient.RestoreConfig
+	snapshotRequest *chclient.VmSnapshotConfig
 }

 func (c *clhClientMock) VmmPingGet(ctx context.Context) (chclient.VmmPingResponse, *http.Response, error) {
@@ -115,11 +117,35 @@ func (c *clhClientMock) VmAddDiskPut(ctx context.Context, diskConfig chclient.Di
 	return chclient.PciDeviceInfo{Bdf: "0000:00:0a.0"}, nil, nil
 }

+//nolint:golint
+func (c *clhClientMock) VmPausePut(ctx context.Context) (*http.Response, error) {
+	c.vmInfo.State = clhStatePaused
+	return nil, nil
+}
+
+//nolint:golint
+func (c *clhClientMock) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) {
+	c.snapshotRequest = &vmSnapshotConfig
+	return nil, nil
+}
+
 //nolint:golint
 func (c *clhClientMock) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) {
 	return nil, nil
 }

+func (c *clhClientMock) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) {
+	c.restoreRequest = &restoreConfig
+	// restoreVM() verifies Paused after restore.
+	c.vmInfo.State = clhStatePaused
+	return nil, nil
+}
+
+func (c *clhClientMock) ResumeVM(ctx context.Context) (*http.Response, error) {
+	c.vmInfo.State = clhStateRunning
+	return nil, nil
+}
+
 func TestCloudHypervisorAddVSock(t *testing.T) {
 	assert := assert.New(t)
 	clh := cloudHypervisor{}
@@ -431,7 +457,8 @@ func TestCloudHypervisorCleanupVM(t *testing.T) {
 	assert.NoError(err, "persist.GetDriver() unexpected error")

 	dir := filepath.Join(store.RunVMStoragePath(), clh.id)
-	os.MkdirAll(dir, os.ModePerm)
+	err = os.MkdirAll(dir, os.ModePerm)
+	assert.NoError(err, "failed to create dir %s", dir)

 	err = clh.cleanupVM(false)
 	assert.NoError(err, "persist.GetDriver() unexpected error")
@@ -516,6 +543,81 @@ func TestClhCreateVM(t *testing.T) {
 	}
 }

+func TestClhRestoreVM(t *testing.T) {
+	assert := assert.New(t)
+
+	store, err := persist.GetDriver()
+	assert.NoError(err)
+
+	clhConfig, err := newClhConfig()
+	assert.NoError(err)
+	clhConfig.VMStorePath = store.RunVMStoragePath()
+	clhConfig.RunStorePath = store.RunStoragePath()
+
+	mockClient := &clhClientMock{}
+	clh := &cloudHypervisor{
+		config:    clhConfig,
+		APIClient: mockClient,
+	}
+
+	// First call restoreVM without the VM snapshot files (state.json, config.json) present.
+	err = clh.restoreVM(context.Background())
+	// An error is expected because restoreVM expects the VM snapshot files to be present.
+	assert.Error(err)
+	assert.Contains(err.Error(), filepath.Join(clhConfig.VMStorePath, "state.json"))
+
+	// Now create the VM snapshot files and call restoreVM again.
+	err = os.MkdirAll(clhConfig.VMStorePath, os.ModePerm)
+	assert.NoError(err, "failed to create dir %s", clhConfig.VMStorePath)
+	stateFile := filepath.Join(clhConfig.VMStorePath, "state.json")
+	configFile := filepath.Join(clhConfig.VMStorePath, "config.json")
+	err = os.WriteFile(stateFile, []byte("{}"), 0o600)
+	assert.NoError(err)
+	err = os.WriteFile(configFile, []byte("{}"), 0o600)
+	assert.NoError(err)
+
+	// Call restoreVM again, this time it should succeed.
+	err = clh.restoreVM(context.Background())
+	assert.NoError(err)
+
+	if assert.NotNil(mockClient.restoreRequest) {
+		expectedSourceURL := "file://" + clhConfig.VMStorePath
+		assert.Equal(expectedSourceURL, mockClient.restoreRequest.GetSourceUrl())
+	}
+
+	info, err := clh.vmInfo()
+	assert.NoError(err)
+	assert.Equal(clhStatePaused, info.State)
+}
+
+func TestClhSaveVM(t *testing.T) {
+	assert := assert.New(t)
+
+	store, err := persist.GetDriver()
+	assert.NoError(err)
+
+	clhConfig, err := newClhConfig()
+	assert.NoError(err)
+	// For testing, assume the memory path is located within the VM store path.
+	clhConfig.MemoryPath = filepath.Join(store.RunVMStoragePath(), "memory")
+	clhConfig.VMStorePath = store.RunVMStoragePath()
+	clhConfig.RunStorePath = store.RunStoragePath()
+
+	mockClient := &clhClientMock{}
+	clh := &cloudHypervisor{
+		config:    clhConfig,
+		APIClient: mockClient,
+	}
+
+	err = clh.SaveVM()
+	assert.NoError(err)
+
+	if assert.NotNil(mockClient.snapshotRequest) {
+		expectedDestinationURL := "file://" + filepath.Dir(clhConfig.MemoryPath)
+		assert.Equal(expectedDestinationURL, mockClient.snapshotRequest.GetDestinationUrl())
+	}
+}
+
 func TestCloudHypervisorStartSandbox(t *testing.T) {
 	assert := assert.New(t)
 	clhConfig, err := newClhConfig()
--- a/src/runtime/virtcontainers/factory/factory_linux.go
+++ b/src/runtime/virtcontainers/factory/factory_linux.go
@@ -80,6 +80,8 @@ func resetHypervisorConfig(config *vc.VMConfig) {
 	config.HypervisorConfig.SharedPath = ""
 	config.HypervisorConfig.VMStorePath = ""
 	config.HypervisorConfig.RunStorePath = ""
+	config.HypervisorConfig.SandboxName = ""
+	config.HypervisorConfig.SandboxNamespace = ""
 }

 // It's important that baseConfig and newConfig are passed by value!
--- a/src/runtime/virtcontainers/factory/template/template_linux.go
+++ b/src/runtime/virtcontainers/factory/template/template_linux.go
@@ -11,6 +11,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"path/filepath"
 	"syscall"
 	"time"

@@ -115,6 +116,15 @@ func (t *template) prepareTemplateFiles() error {
 	}
 	f.Close()

+	// truncate the memory file to the exact size of the VM memory
+	memoryInBytes := int64(t.config.HypervisorConfig.MemorySize) * 1024 * 1024
+	t.Logger().Infof("truncating memory file %s to %d bytes", t.statePath+"/memory", memoryInBytes)
+	err = os.Truncate(t.statePath+"/memory", memoryInBytes)
+	if err != nil {
+		t.close()
+		return err
+	}
+
 	return nil
 }

@@ -124,7 +134,8 @@ func (t *template) createTemplateVM(ctx context.Context) error {
 	config.HypervisorConfig.BootToBeTemplate = true
 	config.HypervisorConfig.BootFromTemplate = false
 	config.HypervisorConfig.MemoryPath = t.statePath + "/memory"
-	config.HypervisorConfig.DevicesStatePath = t.statePath + "/state"
+	config.HypervisorConfig.DevicesStatePath = t.deviceStatePath()
+	config.HypervisorConfig.VMStorePath = t.statePath

 	vm, err := vc.NewVM(ctx, config)
 	if err != nil {
@@ -161,7 +172,7 @@ func (t *template) createFromTemplateVM(ctx context.Context, c vc.VMConfig) (*vc
 	config.HypervisorConfig.BootToBeTemplate = false
 	config.HypervisorConfig.BootFromTemplate = true
 	config.HypervisorConfig.MemoryPath = t.statePath + "/memory"
-	config.HypervisorConfig.DevicesStatePath = t.statePath + "/state"
+	config.HypervisorConfig.DevicesStatePath = t.deviceStatePath()
 	config.HypervisorConfig.SharedPath = c.HypervisorConfig.SharedPath
 	config.HypervisorConfig.VMStorePath = c.HypervisorConfig.VMStorePath
 	config.HypervisorConfig.RunStorePath = c.HypervisorConfig.RunStorePath
@@ -175,6 +186,15 @@ func (t *template) checkTemplateVM() error {
 		return err
 	}

-	_, err = os.Stat(t.statePath + "/state")
+	_, err = os.Stat(t.deviceStatePath())
 	return err
 }
+
+func (t *template) deviceStatePath() string {
+	stateFileName := "state"
+	if t.config.HypervisorType == vc.ClhHypervisor {
+		stateFileName = "state.json"
+	}
+
+	return filepath.Join(t.statePath, stateFileName)
+}
--- a/src/runtime/virtcontainers/factory/template/template_test.go
+++ b/src/runtime/virtcontainers/factory/template/template_test.go
@@ -57,15 +57,26 @@ func TestTemplateFactory(t *testing.T) {
 	assert.NoError(err)
 	defer hybridVSockTTRPCMock.Stop()

-	// New
+	// Create 2 sets of instance-specific directories for per-VM storage
+	runStorePath1 := t.TempDir()
+	vmStorePath1 := t.TempDir()
+	runStorePath2 := t.TempDir()
+	vmStorePath2 := t.TempDir()
+
+	// Create a new Template Factory
 	f, err := New(ctx, vmConfig, testDir)
 	assert.Nil(err)

 	// Config
 	assert.Equal(f.Config(), vmConfig)

-	// GetBaseVM
-	vm, err := f.GetBaseVM(ctx, vmConfig)
+	// GetBaseVM with first instance paths
+	vmConfig1 := vmConfig
+	vmConfig1.HypervisorConfig.RunStorePath = runStorePath1
+	vmConfig1.HypervisorConfig.VMStorePath = vmStorePath1
+
+	// Test the creation of a new VM from the template factory
+	vm, err := f.GetBaseVM(ctx, vmConfig1)
 	assert.Nil(err)

 	err = vm.Stop(ctx)
@@ -79,44 +90,59 @@ func TestTemplateFactory(t *testing.T) {

 	assert.Equal(tt.Config(), vmConfig)

+	// Checking that template VM check fails
+	// if the corresponding memory and state files are absent
 	err = tt.checkTemplateVM()
 	assert.Error(err)

-	_, err = os.Create(tt.statePath + "/memory")
+	memFile, err := os.Create(tt.statePath + "/memory")
 	assert.Nil(err)
+	memFile.Close()
 	err = tt.checkTemplateVM()
 	assert.Error(err)

-	_, err = os.Create(tt.statePath + "/state")
+	devFile, err := os.Create(tt.deviceStatePath())
 	assert.Nil(err)
+	devFile.Close()
+
+	// After creating state and memory files, checkTemplateVM should succeed
 	err = tt.checkTemplateVM()
 	assert.Nil(err)

+	// Recreate the template VM, which should succeed
 	err = tt.createTemplateVM(ctx)
 	assert.Nil(err)

-	vm, err = tt.GetBaseVM(ctx, vmConfig)
+	// Ensuring that directly calling template's GetBaseVM function
+	// returns a VM instance similar to the one returned by the factory's GetBaseVM function
+	vm, err = tt.GetBaseVM(ctx, vmConfig1)
 	assert.Nil(err)

 	err = vm.Stop(ctx)
 	assert.Nil(err)

-	vm, err = f.GetBaseVM(ctx, vmConfig)
+	vm, err = f.GetBaseVM(ctx, vmConfig1)
 	assert.Nil(err)

 	err = vm.Stop(ctx)
 	assert.Nil(err)

+	// Overwriting the template VM should succeed
 	err = tt.createTemplateVM(ctx)
 	assert.Nil(err)

-	vm, err = tt.GetBaseVM(ctx, vmConfig)
+	// Create second instance with different storage paths
+	vmConfig2 := vmConfig
+	vmConfig2.HypervisorConfig.RunStorePath = runStorePath2
+	vmConfig2.HypervisorConfig.VMStorePath = vmStorePath2
+
+	vm, err = tt.GetBaseVM(ctx, vmConfig2)
 	assert.Nil(err)

 	err = vm.Stop(ctx)
 	assert.Nil(err)

-	vm, err = f.GetBaseVM(ctx, vmConfig)
+	vm, err = f.GetBaseVM(ctx, vmConfig2)
 	assert.Nil(err)

 	err = vm.Stop(ctx)
--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -43,6 +43,7 @@ TEST_CLUSTER_NAMESPACE="${TEST_CLUSTER_NAMESPACE:-}"
 CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-containerd}"
 SNAPSHOTTER="${SNAPSHOTTER:-}"
 EROFS_SNAPSHOTTER_MODE="${EROFS_SNAPSHOTTER_MODE:-}"
+EROFS_MERGE_MODE="${EROFS_MERGE_MODE:-}"

 # Wait for the Kubernetes API to recover after kata-deploy uninstall, then
 # retry the uninstall to purge any stale helm release state. On k3s/rke2,
@@ -851,6 +852,26 @@ function helm_helper() {
 				yq -i '.containerd.userDropIn = strenv(HELM_CONTAINERD_USER_DROP_IN)' "${values_yaml}"
 		fi

+		# EROFS merge mode ("merged" default, or "unmerged"). This is orthogonal
+		# to EROFS_SNAPSHOTTER_MODE (which controls default_size): it controls
+		# whether containerd merges layers into a single fsmeta.erofs (merged,
+		# runtime-rs only) or keeps per-layer layer.erofs (unmerged, required by
+		# the Go runtime).
+		if [[ -n "${EROFS_MERGE_MODE}" ]]; then
+			if [[ "${SNAPSHOTTER}" != "erofs" ]]; then
+				die "EROFS_MERGE_MODE is only supported with SNAPSHOTTER=erofs"
+			fi
+
+			case "${EROFS_MERGE_MODE}" in
+				merged|unmerged) ;;
+				*)
+					die "Unsupported EROFS_MERGE_MODE: ${EROFS_MERGE_MODE}"
+					;;
+			esac
+
+			yq -i ".snapshotter.erofsMergeMode = \"${EROFS_MERGE_MODE}\"" "${values_yaml}"
+		fi
+
 		if [[ -z "${HELM_SHIMS}" ]]; then
 			die "A list of shims is expected but none was provided"
 		fi
--- a/tests/integration/kubernetes/k8s-vm-templating.bats
+++ b/tests/integration/kubernetes/k8s-vm-templating.bats
@@ -0,0 +1,114 @@
+#!/usr/bin/env bats
+#
+# Copyright (c) 2024 Kata Containers
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tests for Kata VM templating (factory) functionality in Kubernetes integration mode
+
+load "${BATS_TEST_DIRNAME}/lib.sh"
+load "${BATS_TEST_DIRNAME}/../../common.bash"
+load "${BATS_TEST_DIRNAME}/confidential_common.sh"
+load "${BATS_TEST_DIRNAME}/tests_common.sh"
+
+# Returns 0 if the current environment supports VM templating, non-zero
+# otherwise. VM templating is only supported on non-confidential clh/qemu
+# hypervisors, and because it uses shared_fs="none" it also requires a
+# block-device-based snapshotter (blockfile or erofs).
+vm_templating_supported() {
+	[[ "${KATA_HYPERVISOR}" == "clh" || "${KATA_HYPERVISOR}" == "qemu" ]] || return 1
+	is_confidential_runtime_class && return 1
+	[[ "${SNAPSHOTTER:-}" =~ ^(blockfile|erofs)$ ]] || return 1
+	return 0
+}
+
+setup() {
+	if ! vm_templating_supported; then
+		skip "VM templating requires a non-confidential clh/qemu hypervisor and a blockfile/erofs snapshotter (KATA_HYPERVISOR=${KATA_HYPERVISOR}, SNAPSHOTTER=${SNAPSHOTTER:-unset})"
+	fi
+
+	setup_common || die "setup_common failed"
+
+	# Build a Kata runtime config drop-in that enables VM templating and
+	# disables shared_fs (incompatible with templating).
+	# QEMU VM templating requires an initrd, CLH does not.
+	local rootfs_override=""
+	if [[ "${KATA_HYPERVISOR}" == "qemu" ]]; then
+		rootfs_override=$'image = ""\ninitrd = "/opt/kata/share/kata-containers/kata-containers-initrd.img"'
+	fi
+
+	local runtime_config_dropin_file="${BATS_TEST_TMPDIR}/99-k8s-vm-templating.toml"
+	cat > "${runtime_config_dropin_file}" <<DROPIN
+[hypervisor.${KATA_HYPERVISOR}]
+shared_fs = "none"
+default_vcpus = 1
+default_memory = 512
+${rootfs_override}
+
+[factory]
+enable_template = true
+template_path = "/run/vc/vm/template"
+DROPIN
+
+	# Install the drop-in on the node selected by setup_common and record the
+	# remote path so teardown can remove it.
+	dropin_path="$(set_kata_runtime_config_dropin_file "$node" "${runtime_config_dropin_file}")" \
+		|| die "Failed to install Kata runtime config drop-in on node $node"
+
+	# kata-runtime defaults to the QEMU config; point it at the active
+	# hypervisor so that factory init/destroy use the correct configuration.
+	kata_config_path="/opt/kata/share/defaults/kata-containers/runtimes/${KATA_HYPERVISOR}/configuration-${KATA_HYPERVISOR}.toml"
+}
+
+@test "Pod can be created with a templated VM" {
+	# Initialize the VM template on the target node.
+	exec_host "$node" "nsenter --mount=/proc/1/ns/mnt /opt/kata/bin/kata-runtime --config ${kata_config_path} factory init"
+
+	# The factory init above must have created the template directory. exec_host
+	# pipes the remote output through `tr`, so the pipeline's exit status is not
+	# the remote command's; assert on the output instead. Check inside PID 1's
+	# mount namespace, where the template tmpfs was actually mounted.
+	exec_host "$node" "nsenter --mount=/proc/1/ns/mnt test -f /run/vc/vm/template/memory && echo present" | grep -q present
+
+	pod_name="test-templated-pod"
+	ctr_name="test-container"
+
+	pod_config=$(mktemp --tmpdir pod_config.XXXXXX.yaml)
+	cp "$pod_config_dir/busybox-template.yaml" "$pod_config"
+
+	sed -i "s/POD_NAME/$pod_name/" "$pod_config"
+	sed -i "s/CTR_NAME/$ctr_name/" "$pod_config"
+
+	kubectl create -f "${pod_config}"
+	kubectl wait --for=condition=Ready --timeout="$timeout" "pod/${pod_name}"
+
+	grep_pod_exec_output "${pod_name}" "Hello from templated VM" sh -c "echo 'Hello from templated VM'"
+
+	# Confirm at least one VM sandbox under /run/vc/vm/ is a symlink, which
+	# proves the factory/template path was used. A non-templated VM creates a
+	# real directory at /run/vc/vm/<sandbox-id>/, whereas a factory-spawned VM
+	# stores its state under a generated UUID and /run/vc/vm/<sandbox-id> is a
+	# symlink pointing at it (see assignSandbox() in
+	# src/runtime/virtcontainers/vm.go). Inspect PID 1's mount namespace, where
+	# the shim creates these entries alongside the template tmpfs.
+	exec_host "$node" \
+		"nsenter --mount=/proc/1/ns/mnt find /run/vc/vm -maxdepth 1 -mindepth 1 -type l ! -name template | grep -q . && echo symlink" \
+		| grep -q symlink
+}
+
+teardown() {
+	vm_templating_supported || return 0
+
+	rm -f "${pod_config:-}"
+
+	# Destroy the VM template and remove the config drop-in on the target node.
+	# factory destroy must run in PID 1's mount namespace to unmount the template
+	# tmpfs that factory init created there (see the @test for details).
+	exec_host "$node" "nsenter --mount=/proc/1/ns/mnt /opt/kata/bin/kata-runtime --config ${kata_config_path} factory destroy" \
+		|| echo "Warning: Failed to destroy VM template on node $node"
+
+	remove_kata_runtime_config_dropin_file "$node" "${dropin_path:-}" \
+		|| echo "Warning: Failed to remove Kata runtime config drop-in on node $node"
+
+	teardown_common "${node:-}" "${node_start_time:-}"
+}
--- a/tests/integration/kubernetes/run_kubernetes_tests.sh
+++ b/tests/integration/kubernetes/run_kubernetes_tests.sh
@@ -104,6 +104,7 @@ else
 		"k8s-security-context.bats" \
 		"k8s-shared-volume.bats" \
 		"k8s-volume.bats" \
+		"k8s-vm-templating.bats" \
 		"k8s-nginx-connectivity.bats" \
 	)

--- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
+++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
@@ -15,36 +15,50 @@ use std::path::Path;
 pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &Path) -> Result<()> {
    info!("Configuring erofs-snapshotter");

+    // "unmerged" mode keeps each image layer as its own per-layer `layer.erofs`
+    // (containerd's default, non-fsmerged layout), which is the only layout the
+    // Go runtime can consume. In the default "merged" mode we force containerd
+    // to merge layers into a single `fsmeta.erofs`, which is runtime-rs only.
+    let unmerged = config.erofs_merge_mode.as_deref() == Some("unmerged");
+
    // The Go runtime does not support fsmerged EROFS (fsmeta.erofs).
    // If the snapshotter handler mapping explicitly pairs a Go shim with
-    // erofs, that is a hard misconfiguration — bail out so the operator
-    // fixes the mapping instead of hitting cryptic runtime errors later.
-    if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() {
-        let mut go_shims_on_erofs = Vec::new();
-        for entry in mapping.split(',') {
-            let parts: Vec<&str> = entry.split(':').collect();
-            if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) {
-                go_shims_on_erofs.push(parts[0].to_string());
+    // erofs in the (default) merged mode, that is a hard misconfiguration —
+    // bail out so the operator fixes the mapping instead of hitting cryptic
+    // runtime errors later. In "unmerged" mode the Go runtime is supported, so
+    // skip this guard.
+    if !unmerged {
+        if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() {
+            let mut go_shims_on_erofs = Vec::new();
+            for entry in mapping.split(',') {
+                let parts: Vec<&str> = entry.split(':').collect();
+                if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) {
+                    go_shims_on_erofs.push(parts[0].to_string());
+                }
            }
-        }
-        if !go_shims_on_erofs.is_empty() {
-            warn!("##########################################################################");
-            warn!("#                                                                        #");
-            warn!("#  Go runtime shim(s) mapped to the erofs snapshotter:                   #");
-            for s in &go_shims_on_erofs {
-                warn!("#    - {:<64} #", s);
+            if !go_shims_on_erofs.is_empty() {
+                warn!("##########################################################################");
+                warn!("#                                                                        #");
+                warn!("#  Go runtime shim(s) mapped to the erofs snapshotter:                   #");
+                for s in &go_shims_on_erofs {
+                    warn!("#    - {:<64} #", s);
+                }
+                warn!("#                                                                        #");
+                warn!(
+                    "#  The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs).         #"
+                );
+                warn!("#  Only runtime-rs shims are supported with merged erofs. Set            #");
+                warn!("#  EROFS_MERGE_MODE=unmerged to use the Go runtime with erofs.           #");
+                warn!("#                                                                        #");
+                warn!("##########################################################################");
+                return Err(anyhow::anyhow!(
+                    "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to merged erofs. \
+                     The Go runtime does not support fsmerged EROFS. \
+                     Set EROFS_MERGE_MODE=unmerged, remove these shims from \
+                     SNAPSHOTTER_HANDLER_MAPPING, or switch them to runtime-rs.",
+                    go_shims_on_erofs.join(", ")
+                ));
            }
-            warn!("#                                                                        #");
-            warn!("#  The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs).         #");
-            warn!("#  Only runtime-rs shims are supported with the erofs snapshotter.        #");
-            warn!("#                                                                        #");
-            warn!("##########################################################################");
-            return Err(anyhow::anyhow!(
-                "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to erofs. \
-                 The Go runtime does not support fsmerged EROFS. \
-                 Remove these shims from SNAPSHOTTER_HANDLER_MAPPING or switch them to runtime-rs.",
-                go_shims_on_erofs.join(", ")
-            ));
        }
    }

@@ -88,11 +102,27 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P
        ".plugins.\"io.containerd.snapshotter.v1.erofs\".default_size",
        "\"10G\"",
    )?;
-    toml_utils::set_toml_value(
-        configuration_file,
-        ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
-        "0",
-    )?;
+    // In the default "merged" mode, force containerd to merge all layers into a
+    // single fsmeta.erofs (max_unmerged_layers = 0). In "unmerged" mode we delete
+    // any previously-written value so each layer stays a separate layer.erofs,
+    // which the Go runtime requires.
+    //
+    // Because kata-deploy edits the containerd config in place, switching from
+    // merged to unmerged must actively remove the old `max_unmerged_layers = 0`
+    // left behind by a previous install. Otherwise the stale `0` would keep
+    // forcing the merged layout and break Go-runtime compatibility.
+    if !unmerged {
+        toml_utils::set_toml_value(
+            configuration_file,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+            "0",
+        )?;
+    } else {
+        toml_utils::delete_toml_value(
+            configuration_file,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        )?;
+    }

    Ok(())
 }
--- a/tools/packaging/kata-deploy/binary/src/config.rs
+++ b/tools/packaging/kata-deploy/binary/src/config.rs
@@ -178,6 +178,14 @@ pub struct Config {
    pub multi_install_suffix: Option<String>,
    pub helm_post_delete_hook: bool,
    pub experimental_setup_snapshotter: Option<Vec<String>>,
+    /// EROFS snapshotter merge mode: "merged" (default) or "unmerged".
+    ///
+    /// In "unmerged" mode kata-deploy does not force containerd's erofs
+    /// snapshotter to merge layers (it leaves `max_unmerged_layers` at the
+    /// containerd default), so each image layer is exposed as its own
+    /// per-layer `layer.erofs`. This is the only layout the Go runtime can
+    /// consume; the merged (`fsmeta.erofs`) layout is runtime-rs only.
+    pub erofs_merge_mode: Option<String>,
    pub experimental_force_guest_pull_for_arch: Vec<String>,
    pub dest_dir: String,
    pub host_install_dir: String,
@@ -307,6 +315,11 @@ impl Config {
            .filter(|s| !s.is_empty())
            .map(|s| s.split(',').map(|s| s.trim().to_string()).collect());

+        let erofs_merge_mode = env::var("EROFS_MERGE_MODE")
+            .ok()
+            .map(|s| s.trim().to_lowercase())
+            .filter(|s| !s.is_empty());
+
        // Only use arch-specific variable for experimental force guest pull
        let experimental_force_guest_pull_for_arch =
            get_arch_var("EXPERIMENTAL_FORCE_GUEST_PULL", "", &arch)
@@ -338,6 +351,7 @@ impl Config {
            multi_install_suffix,
            helm_post_delete_hook,
            experimental_setup_snapshotter,
+            erofs_merge_mode,
            experimental_force_guest_pull_for_arch,
            dest_dir,
            host_install_dir,
@@ -508,6 +522,17 @@ impl Config {
            _ => {}
        }

+        // Validate EROFS_MERGE_MODE
+        // Only "merged" (default) and "unmerged" are accepted.
+        if let Some(mode) = self.erofs_merge_mode.as_ref() {
+            if mode != "merged" && mode != "unmerged" {
+                return Err(anyhow::anyhow!(
+                    "EROFS_MERGE_MODE must be either 'merged' or 'unmerged', got '{}'",
+                    mode
+                ));
+            }
+        }
+
        // Validate EXPERIMENTAL_FORCE_GUEST_PULL_FOR_ARCH
        // This is a list of shim names
        for shim in &self.experimental_force_guest_pull_for_arch {
@@ -551,6 +576,7 @@ impl Config {
            "* EXPERIMENTAL_SETUP_SNAPSHOTTER: {:?}",
            self.experimental_setup_snapshotter
        );
+        info!("* EROFS_MERGE_MODE: {:?}", self.erofs_merge_mode);
        info!(
            "* EXPERIMENTAL_FORCE_GUEST_PULL: {}",
            self.experimental_force_guest_pull_for_arch.join(",")
--- a/tools/packaging/kata-deploy/binary/src/utils/toml.rs
+++ b/tools/packaging/kata-deploy/binary/src/utils/toml.rs
@@ -121,6 +121,47 @@ pub fn set_toml_value(file_path: &Path, path: &str, value: &str) -> Result<()> {
    Ok(())
 }

+/// Delete a TOML value (or table) at a given path.
+///
+/// Navigates to the parent table and removes the final key. This is a no-op if
+/// any path component (including the final key) does not exist, so callers can
+/// unconditionally remove a value that may or may not be present.
+pub fn delete_toml_value(file_path: &Path, path: &str) -> Result<()> {
+    let content = std::fs::read_to_string(file_path)
+        .with_context(|| format!("Failed to read TOML file: {file_path:?}"))?;
+
+    let (header, toml_content) = split_non_toml_header(&content);
+    let mut doc = toml_content
+        .parse::<DocumentMut>()
+        .context("Failed to parse TOML")?;
+
+    let parts = parse_toml_path(path)?;
+
+    let mut current_table = doc.as_table_mut();
+    for (i, part) in parts.iter().enumerate() {
+        let is_last = i == parts.len() - 1;
+
+        if is_last {
+            // Remove the value; absent key is fine (no-op).
+            current_table.remove(part.as_str());
+        } else {
+            // Navigate into the intermediate table. If it does not exist, there
+            // is nothing to delete.
+            match current_table
+                .get_mut(part.as_str())
+                .and_then(|item| item.as_table_mut())
+            {
+                Some(table) => current_table = table,
+                None => return Ok(()),
+            }
+        }
+    }
+
+    write_toml_with_header(file_path, header, &doc)?;
+
+    Ok(())
+}
+
 /// Get a TOML value at a given path
 pub fn get_toml_value(file_path: &Path, path: &str) -> Result<String> {
    let content = std::fs::read_to_string(file_path)
@@ -1714,4 +1755,100 @@ imports = ["/etc/containerd/conf.d/*.toml", "/opt/kata/containerd/config.d/kata-
        .unwrap();
        assert_eq!(runtime_type, "io.containerd.kata-qemu.v2");
    }
+
+    #[test]
+    fn test_delete_toml_value() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let temp_path = temp_file.path();
+        std::fs::write(
+            temp_path,
+            "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\nenable_fsverity = true\n",
+        )
+        .unwrap();
+
+        // Sanity check: value is present before deletion.
+        let before = get_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        )
+        .unwrap();
+        assert_eq!(before, "0");
+
+        delete_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        )
+        .unwrap();
+
+        // The deleted key is gone, but sibling keys remain.
+        let result = get_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        );
+        assert!(result.is_err(), "deleted key should no longer be found");
+
+        let sibling = get_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity",
+        )
+        .unwrap();
+        assert_eq!(sibling, "true", "sibling keys must be preserved");
+    }
+
+    #[test]
+    fn test_delete_toml_value_missing_key_is_noop() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let temp_path = temp_file.path();
+        let initial = "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nenable_fsverity = true\n";
+        std::fs::write(temp_path, initial).unwrap();
+
+        // Deleting a key that does not exist must succeed and leave the file usable.
+        delete_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        )
+        .unwrap();
+
+        // Deleting through a non-existent intermediate table is also a no-op.
+        delete_toml_value(temp_path, ".plugins.\"nonexistent.plugin\".some_key").unwrap();
+        let sibling = get_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity",
+        )
+        .unwrap();
+        assert_eq!(sibling, "true");
+    }
+
+    #[test]
+    fn test_delete_toml_value_preserves_k3s_header() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let temp_path = temp_file.path();
+        std::fs::write(
+            temp_path,
+            "{{ template \"base\" . }}\n[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\n",
+        )
+        .unwrap();
+
+        delete_toml_value(
+            temp_path,
+            ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
+        )
+        .unwrap();
+
+        let content = std::fs::read_to_string(temp_path).unwrap();
+        assert!(
+            content.starts_with("{{ template \"base\" . }}\n"),
+            "non-TOML header must be preserved"
+        );
+        assert!(
+            !content.contains("max_unmerged_layers"),
+            "value must be removed"
+        );
+    }
+
+    #[test]
+    fn test_delete_toml_value_nonexistent_file() {
+        let result = delete_toml_value(Path::new("/nonexistent/file.toml"), "some.path");
+        assert!(result.is_err());
+    }
 }
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
@@ -413,6 +413,13 @@ Get snapshotter setup list from structured config
 {{- join "," .Values.snapshotter.setup -}}
 {{- end -}}

+{{/*
+Get EROFS merge mode from structured config ("merged" or "unmerged")
+*/}}
+{{- define "kata-deploy.getErofsMergeMode" -}}
+{{- .Values.snapshotter.erofsMergeMode | default "" -}}
+{{- end -}}
+
 {{/*
 Get debug value from structured config
 */}}
@@ -569,6 +576,11 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`.
 - name: EXPERIMENTAL_SETUP_SNAPSHOTTER
  value: {{ $snapshotterSetup | quote }}
 {{- end }}
+{{- $erofsMergeMode := include "kata-deploy.getErofsMergeMode" . | trim -}}
+{{- if $erofsMergeMode }}
+- name: EROFS_MERGE_MODE
+  value: {{ $erofsMergeMode | quote }}
+{{- end }}
 {{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}}
 {{- if $forceGuestPullAmd64 }}
 - name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
@@ -271,6 +271,18 @@ health:

 snapshotter:
  setup: ["nydus"]  # ["nydus", "erofs"] or []
+  # EROFS merge mode: "merged" (default) or "unmerged".
+  #
+  # "merged" forces containerd's erofs snapshotter to merge all image layers
+  # into a single fsmeta.erofs (max_unmerged_layers = 0). This layout is only
+  # supported by runtime-rs shims.
+  #
+  # "unmerged" leaves max_unmerged_layers at the containerd default so each
+  # image layer is exposed as its own per-layer layer.erofs. This is the only
+  # layout the Go runtime can consume, so set this when mapping a Go shim to the
+  # erofs snapshotter. When empty, kata-deploy uses its built-in default
+  # (merged).
+  erofsMergeMode: ""

 # Shim configuration
 # By default (disableAll: false), all shims with enabled: ~ (null) are enabled.