From c0f9744225c0b58640892afd0c8f951bba709dac Mon Sep 17 00:00:00 2001 From: Cameron Baird Date: Tue, 9 Jun 2026 18:53:58 +0000 Subject: [PATCH 1/3] runtime: Implement support for VM Template factory in clh Add support for VM Template factory on the clh path. In order to support snapshot/restore-based VM templating, the following changes were needed: 1. For clh.go, implement SaveVM, PauseVM, restoreVM, ResumeVM 2. Remove initrd config check for VM Templating path. The root disk image (when using image mode) is created in memory and therefore captured in the VM snapshot. 3. Truncate the memory file to the size of the VM at factory VM create time. This allows CLH to use the memory file as the backing for the template VM memory, allowing O(1) snapshot times. 4. CLH uses memory zones as backing for its memory on the template paths 5. Update StartVM in CLH to use the restore path when template is configured and available Signed-off-by: Cameron Baird --- src/runtime/pkg/katautils/config.go | 6 - src/runtime/pkg/katautils/config_test.go | 2 +- src/runtime/virtcontainers/clh.go | 414 ++++++++++++++++-- src/runtime/virtcontainers/clh_test.go | 102 ++++- .../virtcontainers/factory/factory_linux.go | 3 + .../factory/template/template_linux.go | 26 +- .../factory/template/template_test.go | 44 +- 7 files changed, 548 insertions(+), 49 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index de0b525c59..e78e699e01 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -2085,12 +2085,6 @@ func checkNetNsConfig(config oci.RuntimeConfig) error { // checkFactoryConfig ensures the VM factory configuration is valid. func checkFactoryConfig(config oci.RuntimeConfig) error { - if config.FactoryConfig.Template { - if config.HypervisorConfig.InitrdPath == "" { - return errors.New("Factory option enable_template requires an initrd image") - } - } - if config.FactoryConfig.VMCacheNumber > 0 { if config.HypervisorType != vc.QemuHypervisor { return errors.New("VM cache just support qemu") diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index ef9e70f8ec..187af6cb70 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -1696,7 +1696,7 @@ func TestCheckFactoryConfig(t *testing.T) { {false, false, "", "initrd"}, {true, false, "", "initrd"}, - {true, true, "image", ""}, + {true, false, "image", ""}, } for i, d := range data { diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index bffaa34385..28d5ef63d5 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -68,6 +68,7 @@ const ( const ( clhStateCreated = "Created" clhStateRunning = "Running" + clhStatePaused = "Paused" ) const ( @@ -112,8 +113,16 @@ type clhClient interface { VmAddDevicePut(ctx context.Context, deviceConfig chclient.DeviceConfig) (chclient.PciDeviceInfo, *http.Response, error) // Add a new disk device to the VM VmAddDiskPut(ctx context.Context, diskConfig chclient.DiskConfig) (chclient.PciDeviceInfo, *http.Response, error) + // Pause the VM + VmPausePut(ctx context.Context) (*http.Response, error) + // Create a snapshot of the VM + VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) // Remove a device from the VM VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) + // Restore VM from a snapshot + VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) + // Resume a paused VM + ResumeVM(ctx context.Context) (*http.Response, error) } type clhClientApi struct { @@ -153,10 +162,26 @@ func (c *clhClientApi) VmAddDiskPut(ctx context.Context, diskConfig chclient.Dis return c.ApiInternal.VmAddDiskPut(ctx).DiskConfig(diskConfig).Execute() } +func (c *clhClientApi) VmPausePut(ctx context.Context) (*http.Response, error) { + return c.ApiInternal.PauseVM(ctx).Execute() +} + +func (c *clhClientApi) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) { + return c.ApiInternal.VmSnapshotPut(ctx).VmSnapshotConfig(vmSnapshotConfig).Execute() +} + func (c *clhClientApi) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) { return c.ApiInternal.VmRemoveDevicePut(ctx).VmRemoveDevice(vmRemoveDevice).Execute() } +func (c *clhClientApi) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) { + return c.ApiInternal.VmRestorePut(ctx).RestoreConfig(restoreConfig).Execute() +} + +func (c *clhClientApi) ResumeVM(ctx context.Context) (*http.Response, error) { + return c.ApiInternal.ResumeVM(ctx).Execute() +} + // This is done in order to be able to override such a function as part of // our unit tests, as when testing bootVM we're on a mocked scenario already. var vmAddNetPutRequest = func(clh *cloudHypervisor) ([]chclient.PciDeviceInfo, error) { @@ -255,12 +280,14 @@ type CloudHypervisorState struct { PID int VirtiofsDaemonPid int state clhState + isRestoring bool } func (s *CloudHypervisorState) reset() { s.PID = 0 s.VirtiofsDaemonPid = 0 s.state = clhNotReady + s.isRestoring = false } type cloudHypervisor struct { @@ -501,7 +528,7 @@ func getNonUserDefinedKernelParams(rootfstype string, disableNvdimm bool, dax bo } // For cloudHypervisor this call only sets the internal structure up. -// The VM will be created and started through StartVM(). +// The VM will be created and started through StartVM(), or restored from template if template files exist. func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { clh.ctx = ctx @@ -559,29 +586,78 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net } } - // Create the VM memory config via the constructor to ensure default values are properly assigned - clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) - // Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs - // or when using HugePages. - // If such features are disabled, turn off shared memory config. - if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages { - clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) - } else { - clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) - } - // Enable hugepages if needed - clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages) - if !clh.config.ConfidentialGuest { - hotplugSize := clh.config.DefaultMaxMemorySize - // OpenAPI only supports int64 values - clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes())) - - if clh.config.ReclaimGuestFreedMemory { - // Create VM with a balloon config so we can enable free page reporting (size of the balloon can be set to zero) - clh.vmconfig.Balloon = chclient.NewBalloonConfig(0) - // Set the free page reporting flag for ballooning to be true - clh.vmconfig.Balloon.SetFreePageReporting(true) + // If the VM is booting from a template, or if the VM is going to be used as a template + // the memory is to be backed by a file, so we need to configure the memory zones accordingly. + if clh.config.BootFromTemplate || clh.config.BootToBeTemplate { + // VM templating is incompatible with virtio-fs because virtio-fs requires shared memory, + // while templating needs COW/private memory on restore. + if clh.config.SharedFS == config.VirtioFS || clh.config.SharedFS == config.VirtioFSNydus { + return errors.New("VM templating has been enabled with virtio-fs and this configuration will not work") } + + // Double-check that the clh.config.MemoryPath file is accessible before using it in the VM config, to avoid hitting a less clear error from cloud hypervisor when it tries to access the memory file. + if _, err := os.Stat(clh.config.MemoryPath); err != nil { + return fmt.Errorf("memory file %s is not accessible: %w", clh.config.MemoryPath, err) + } + + // Set the size to be 0 since we are going to configure actual size via zones + clh.vmconfig.Memory = chclient.NewMemoryConfig(0) + + memoryZoneConfig := chclient.NewMemoryZoneConfig("mem0", int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) + if clh.config.BootToBeTemplate { + // When BootToBeTemplate is true, the memory file backing the VM memory is shared between multiple VMs created from the same template. + // So we need to set shared to true in this case. + memoryZoneConfig.SetShared(true) + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) + + if !clh.config.ConfidentialGuest { + // TODO: Remove this warning once memory hotplugging is supported + // for template VMs. + // + // Memory hotplug is intentionally not configured for template VMs. + // Resizing a memory zone requires the virtio-mem hotplug method + // (cloud-hypervisor rejects the default ACPI hotplug on a zone that + // carries a hotplug_size), which is not currently supported in the + // templating path. As a result, VMs restored from this template + // cannot grow their memory beyond the template's boot size. + clh.Logger().Warn("memory hotplugging is currently unsupported for template VMs") + } + } else { + // When BootFromTemplate is true, set shared=false to ensure Copy-On-Write is used for the memory file. + // So that the VM can have its own private memory. + memoryZoneConfig.SetShared(false) + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) + } + memoryZoneConfig.SetFile(clh.config.MemoryPath) + clh.vmconfig.Memory.Zones = &[]chclient.MemoryZoneConfig{ + *memoryZoneConfig, + } + } else { // Normal (non-template) VM creation + // Create the VM memory config via the constructor to ensure default values are properly assigned + clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) + // Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs + // or when using HugePages. + // If such features are disabled, turn off shared memory config. + if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages { + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) + } else { + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) + } + // Enable hugepages if needed + clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages) + if !clh.config.ConfidentialGuest { + hotplugSize := clh.config.DefaultMaxMemorySize + // OpenAPI only supports int64 values + clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes())) + } + } + + // Configure balloon device for free page reporting. This is set unconditionally + // (for both template and non-template paths) so that template VMs include the + // balloon in their snapshot, and VMs restored from a template inherit it. + if !clh.config.ConfidentialGuest && clh.config.ReclaimGuestFreedMemory { + clh.vmconfig.Balloon = chclient.NewBalloonConfig(0) + clh.vmconfig.Balloon.SetFreePageReporting(true) } // Set initial amount of cpu's for the virtual machine @@ -700,9 +776,113 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net return err } + // Check if we should restore from template instead of creating new VM + if clh.config.BootFromTemplate && clh.shouldRestoreFromTemplate() { + clh.Logger().Info("Template files found, will restore VM instead of creating new") + // Mark this as a restore operation for StartVM to use RestoreVM instead + clh.state.isRestoring = true + return nil + } + return nil } +// shouldRestoreFromTemplate checks if template snapshot files exist and we should restore instead of creating new VM +func (clh *cloudHypervisor) shouldRestoreFromTemplate() bool { + // For template restore, we need the snapshot directory to contain the necessary files + // The snapshotDir is derived from the MemoryPath directory + snapshotDir := filepath.Dir(clh.config.MemoryPath) + + // Check for required template files (config.json, state.json, and memory file) + configFile := filepath.Join(snapshotDir, "config.json") + stateFile := filepath.Join(snapshotDir, "state.json") + memoryFile := clh.config.MemoryPath + + if _, err := os.Stat(configFile); err != nil { + clh.Logger().WithError(err).WithField("configFile", configFile).Debug("Template config file not accessible") + return false + } + + if _, err := os.Stat(stateFile); err != nil { + clh.Logger().WithError(err).WithField("stateFile", stateFile).Debug("Template state file not accessible") + return false + } + + if _, err := os.Stat(memoryFile); err != nil { + clh.Logger().WithError(err).WithField("memoryFile", memoryFile).Debug("Template memory file not accessible") + return false + } + + clh.Logger().WithFields(log.Fields{ + "configFile": configFile, + "stateFile": stateFile, + "memoryFile": memoryFile, + }).Info("Template files found, can restore VM from template") + + return true +} + +// copyFile copies a file from src to dst +func (clh *cloudHypervisor) copyFile(src, dst string) error { + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + dstFile, err := os.Create(dst) + if err != nil { + return err + } + defer dstFile.Close() + + _, err = io.Copy(dstFile, srcFile) + if err != nil { + return err + } + + return dstFile.Sync() +} + +// updateVsockSocketPath updates the vsock socket path in the config.json file +func (clh *cloudHypervisor) updateVsockSocketPath(configPath, vmID string) error { + // Read the config file + configData, err := os.ReadFile(configPath) + if err != nil { + return err + } + + var config map[string]interface{} + dec := json.NewDecoder(bytes.NewReader(configData)) + dec.UseNumber() + if err := dec.Decode(&config); err != nil { + return err + } + + // Update vsock socket path if vsock exists + if vsock, ok := config["vsock"].(map[string]interface{}); ok { + // Generate new vsock socket path for this VM + newVsockPath, err := clh.vsockSocketPath(vmID) + if err != nil { + return err + } + vsock["socket"] = newVsockPath + + clh.Logger().WithFields(log.Fields{ + "vmID": vmID, + "newVsockPath": newVsockPath, + }).Debug("Updated vsock socket path in config.json") + } + + // Write the updated config back to file + updatedConfig, err := json.Marshal(config) + if err != nil { + return err + } + + return os.WriteFile(configPath, updatedConfig, 0644) +} + // setupInitdata prepares and attaches the initdata disk if present. func setupInitdata(clh *cloudHypervisor, hypervisorConfig *HypervisorConfig) error { if len(hypervisorConfig.Initdata) == 0 { @@ -771,8 +951,37 @@ func (clh *cloudHypervisor) StartVM(ctx context.Context, timeout int) error { ctx, cancel := context.WithTimeout(ctx, bootTimeout*time.Second) defer cancel() - if err := clh.bootVM(ctx); err != nil { - return err + // Check if we should restore from template or create new VM + if clh.state.isRestoring { + // Copy template files to VM directory + snapshotDir := filepath.Dir(clh.config.MemoryPath) + + // Copy config.json from template to VM directory + srcConfig := filepath.Join(snapshotDir, "config.json") + dstConfig := filepath.Join(vmPath, "config.json") + if err := clh.copyFile(srcConfig, dstConfig); err != nil { + return fmt.Errorf("failed to copy config.json: %v", err) + } + + // Copy state.json from template to VM directory + srcState := filepath.Join(snapshotDir, "state.json") + dstState := filepath.Join(vmPath, "state.json") + if err := clh.copyFile(srcState, dstState); err != nil { + return fmt.Errorf("failed to copy state.json: %v", err) + } + + // Update vsock socket path in the copied config.json + if err := clh.updateVsockSocketPath(dstConfig, clh.id); err != nil { + return fmt.Errorf("failed to update vsock socket path: %v", err) + } + + if err := clh.restoreVM(ctx); err != nil { + return err + } + } else { + if err := clh.bootVM(ctx); err != nil { + return err + } } clh.state.state = clhReady @@ -1287,16 +1496,109 @@ func (clh *cloudHypervisor) Cleanup(ctx context.Context) error { func (clh *cloudHypervisor) PauseVM(ctx context.Context) error { clh.Logger().WithField("function", "PauseVM").Info("Pause Sandbox") + + cl := clh.client() + ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second) + defer cancel() + + _, err := cl.VmPausePut(ctx) + if err != nil { + clh.Logger().WithError(err).Error("Failed to pause VM") + return openAPIClientError(err) + } + return nil } func (clh *cloudHypervisor) SaveVM() error { - clh.Logger().WithField("function", "saveSandboxC").Info("Save Sandbox") + clh.Logger().WithField("function", "SaveVM").Info("Save Sandbox") + + cl := clh.client() + ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second) + defer cancel() + + snapshotDir := filepath.Dir(clh.config.MemoryPath) + // Create snapshot config with file URL to template path + // Use MemoryPath as base for snapshot destination + // When creating a template, the MemoryPath is set to the template path, so we can use it to save the snapshot. + fileURL := "file://" + snapshotDir + + vmSnapshotConfig := *chclient.NewVmSnapshotConfig() + vmSnapshotConfig.SetDestinationUrl(fileURL) + + _, err := cl.VmSnapshotPut(ctx, vmSnapshotConfig) + if err != nil { + clh.Logger().WithError(err).Error("Failed to save VM snapshot") + return openAPIClientError(err) + } + + if clh.config.BootToBeTemplate { + // Update the config.json file in the snapshotDir to set memory shared=false + snapshotConfigPath := filepath.Join(snapshotDir, "config.json") + snapshotConfig, err := os.ReadFile(snapshotConfigPath) + if err != nil { + clh.Logger().WithError(err).Error("Failed to read snapshot config") + return err + } + + var snapshotConfigData map[string]interface{} + dec := json.NewDecoder(bytes.NewReader(snapshotConfig)) + dec.UseNumber() + if err := dec.Decode(&snapshotConfigData); err != nil { + clh.Logger().WithError(err).Error("Failed to unmarshal snapshot config") + return err + } + + // Access the memory section and cast it to a map + if memorySection, ok := snapshotConfigData["memory"].(map[string]interface{}); ok { + memorySection["shared"] = false + // Do the same update for each element for the "zones" array in the memorySection + if zones, ok := memorySection["zones"].([]interface{}); ok { + for _, zone := range zones { + if zoneMap, ok := zone.(map[string]interface{}); ok { + zoneMap["shared"] = false + } else { + clh.Logger().Error("Unable to access zone in snapshot config memory section") + return fmt.Errorf("invalid snapshot config structure: zone in memory section not found or invalid") + } + } + } else { + clh.Logger().Error("Unable to access zones array in snapshot config memory section") + return fmt.Errorf("invalid snapshot config structure: zones array in memory section not found or invalid") + } + } else { + clh.Logger().Error("Unable to access memory section in snapshot config") + return fmt.Errorf("invalid snapshot config structure: memory section not found or invalid") + } + + // Write the modified config back to file + modifiedConfig, err := json.Marshal(snapshotConfigData) + if err != nil { + clh.Logger().WithError(err).Error("Failed to marshal modified snapshot config") + return err + } + + if err := os.WriteFile(snapshotConfigPath, modifiedConfig, 0644); err != nil { + clh.Logger().WithError(err).Error("Failed to write modified snapshot config") + return err + } + } + return nil } func (clh *cloudHypervisor) ResumeVM(ctx context.Context) error { clh.Logger().WithField("function", "ResumeVM").Info("Resume Sandbox") + cl := clh.client() + ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second) + defer cancel() + + _, err := cl.ResumeVM(ctx) + if err != nil { + clh.Logger().WithError(err).Error("Failed to resume VM") + return openAPIClientError(err) + } + return nil } @@ -1509,11 +1811,11 @@ func (clh *cloudHypervisor) clhPath() (string, error) { p = defaultClhPath } - if _, err = os.Stat(p); os.IsNotExist(err) { - return "", fmt.Errorf("Cloud-Hypervisor path (%s) does not exist", p) + if _, err = os.Stat(p); err != nil { + return "", fmt.Errorf("Cloud-Hypervisor path (%s) is not accessible: %w", p, err) } - return p, err + return p, nil } func (clh *cloudHypervisor) launchClh() error { @@ -1741,6 +2043,60 @@ func (clh *cloudHypervisor) bootVM(ctx context.Context) error { return nil } +// restoreVM restores a VM from a template snapshot. The restored VM will be in +// Paused state. The caller (factory layer, via factory.GetVM → vm.Resume) is +// responsible for resuming the VM, reseeding the RNG, and syncing the guest clock +// before the VM is used. See factory_linux.go GetVM(). +func (clh *cloudHypervisor) restoreVM(ctx context.Context) error { + clh.Logger().Info("Restoring VM from template") + + cl := clh.client() + + // use the VMStorePath as the base for the restore source URL + vmPath := filepath.Join(clh.config.VMStorePath, clh.id) + sourceURL := "file://" + vmPath + + // check if the snapshot directory contains the state.json and config.json files + // which contain the VM state and configuration respectively + stateFile := filepath.Join(vmPath, "state.json") + configFile := filepath.Join(vmPath, "config.json") + + if _, err := os.Stat(stateFile); err != nil { + return fmt.Errorf("failed to access state file %s: %v", stateFile, err) + } + + if _, err := os.Stat(configFile); err != nil { + return fmt.Errorf("failed to access config file %s: %v", configFile, err) + } + + // Prepare restore configuration + restoreConfig := *chclient.NewRestoreConfig(sourceURL) + + clh.Logger().WithField("sourceURL", sourceURL).Debug("Restore configuration") + + // Restore VM from template (uses the caller's ctx, which already has the boot timeout) + _, err := cl.VmRestorePut(ctx, restoreConfig) + if err != nil { + clh.Logger().WithError(err).Error("failed to restore VM from template") + return openAPIClientError(err) + } + + // Check VM state after restoration + info, err := clh.vmInfo() + if err != nil { + return err + } + + clh.Logger().Debugf("VM state after restore: %#v", info) + + if info.State != clhStatePaused { + clh.Logger().Warnf("VM state is '%s' after restore, expected 'Paused'", info.State) + } + + clh.Logger().Info("Successfully restored VM from template") + return nil +} + func (clh *cloudHypervisor) addVSock(cid int64, path string) { clh.Logger().WithFields(log.Fields{ "path": path, diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index 7ddbcff206..93cf3b1a70 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -74,7 +74,9 @@ func newClhConfig() (HypervisorConfig, error) { } type clhClientMock struct { - vmInfo chclient.VmInfo + vmInfo chclient.VmInfo + restoreRequest *chclient.RestoreConfig + snapshotRequest *chclient.VmSnapshotConfig } func (c *clhClientMock) VmmPingGet(ctx context.Context) (chclient.VmmPingResponse, *http.Response, error) { @@ -115,11 +117,35 @@ func (c *clhClientMock) VmAddDiskPut(ctx context.Context, diskConfig chclient.Di return chclient.PciDeviceInfo{Bdf: "0000:00:0a.0"}, nil, nil } +//nolint:golint +func (c *clhClientMock) VmPausePut(ctx context.Context) (*http.Response, error) { + c.vmInfo.State = clhStatePaused + return nil, nil +} + +//nolint:golint +func (c *clhClientMock) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) { + c.snapshotRequest = &vmSnapshotConfig + return nil, nil +} + //nolint:golint func (c *clhClientMock) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) { return nil, nil } +func (c *clhClientMock) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) { + c.restoreRequest = &restoreConfig + // restoreVM() verifies Paused after restore. + c.vmInfo.State = clhStatePaused + return nil, nil +} + +func (c *clhClientMock) ResumeVM(ctx context.Context) (*http.Response, error) { + c.vmInfo.State = clhStateRunning + return nil, nil +} + func TestCloudHypervisorAddVSock(t *testing.T) { assert := assert.New(t) clh := cloudHypervisor{} @@ -516,6 +542,80 @@ func TestClhCreateVM(t *testing.T) { } } +func TestClhRestoreVM(t *testing.T) { + assert := assert.New(t) + + store, err := persist.GetDriver() + assert.NoError(err) + + clhConfig, err := newClhConfig() + assert.NoError(err) + clhConfig.VMStorePath = store.RunVMStoragePath() + clhConfig.RunStorePath = store.RunStoragePath() + + mockClient := &clhClientMock{} + clh := &cloudHypervisor{ + config: clhConfig, + APIClient: mockClient, + } + + // First call restoreVM without the VM snapshot files (state.json, config.json) present. + err = clh.restoreVM(context.Background()) + // An error is expected because restoreVM expects the VM snapshot files to be present. + assert.Error(err) + assert.Contains(err.Error(), filepath.Join(clhConfig.VMStorePath, "state.json")) + + // Now create the VM snapshot files and call restoreVM again. + os.MkdirAll(clhConfig.VMStorePath, os.ModePerm) + stateFile := filepath.Join(clhConfig.VMStorePath, "state.json") + configFile := filepath.Join(clhConfig.VMStorePath, "config.json") + err = os.WriteFile(stateFile, []byte("{}"), 0o600) + assert.NoError(err) + err = os.WriteFile(configFile, []byte("{}"), 0o600) + assert.NoError(err) + + // Call restoreVM again, this time it should succeed. + err = clh.restoreVM(context.Background()) + assert.NoError(err) + + if assert.NotNil(mockClient.restoreRequest) { + expectedSourceURL := "file://" + clhConfig.VMStorePath + assert.Equal(expectedSourceURL, mockClient.restoreRequest.GetSourceUrl()) + } + + info, err := clh.vmInfo() + assert.NoError(err) + assert.Equal(clhStatePaused, info.State) +} + +func TestClhSaveVM(t *testing.T) { + assert := assert.New(t) + + store, err := persist.GetDriver() + assert.NoError(err) + + clhConfig, err := newClhConfig() + assert.NoError(err) + // For testing, assume the memory path is located within the VM store path. + clhConfig.MemoryPath = filepath.Join(store.RunVMStoragePath(), "memory") + clhConfig.VMStorePath = store.RunVMStoragePath() + clhConfig.RunStorePath = store.RunStoragePath() + + mockClient := &clhClientMock{} + clh := &cloudHypervisor{ + config: clhConfig, + APIClient: mockClient, + } + + err = clh.SaveVM() + assert.NoError(err) + + if assert.NotNil(mockClient.snapshotRequest) { + expectedDestinationURL := "file://" + filepath.Dir(clhConfig.MemoryPath) + assert.Equal(expectedDestinationURL, mockClient.snapshotRequest.GetDestinationUrl()) + } +} + func TestCloudHypervisorStartSandbox(t *testing.T) { assert := assert.New(t) clhConfig, err := newClhConfig() diff --git a/src/runtime/virtcontainers/factory/factory_linux.go b/src/runtime/virtcontainers/factory/factory_linux.go index 4a0cfcfe38..c010916943 100644 --- a/src/runtime/virtcontainers/factory/factory_linux.go +++ b/src/runtime/virtcontainers/factory/factory_linux.go @@ -80,6 +80,9 @@ func resetHypervisorConfig(config *vc.VMConfig) { config.HypervisorConfig.SharedPath = "" config.HypervisorConfig.VMStorePath = "" config.HypervisorConfig.RunStorePath = "" + config.HypervisorConfig.SandboxName = "" + config.HypervisorConfig.SandboxNamespace = "" + config.HypervisorConfig.DefaultMaxVCPUs = 0 } // It's important that baseConfig and newConfig are passed by value! diff --git a/src/runtime/virtcontainers/factory/template/template_linux.go b/src/runtime/virtcontainers/factory/template/template_linux.go index d48ce5c50b..2a23cdbccd 100644 --- a/src/runtime/virtcontainers/factory/template/template_linux.go +++ b/src/runtime/virtcontainers/factory/template/template_linux.go @@ -11,6 +11,7 @@ import ( "context" "fmt" "os" + "path/filepath" "syscall" "time" @@ -115,6 +116,15 @@ func (t *template) prepareTemplateFiles() error { } f.Close() + // truncate the memory file to the exact size of the VM memory + memoryInBytes := int64(t.config.HypervisorConfig.MemorySize) * 1024 * 1024 + t.Logger().Infof("truncating memory file %s to %d bytes", t.statePath+"/memory", memoryInBytes) + err = os.Truncate(t.statePath+"/memory", memoryInBytes) + if err != nil { + t.close() + return err + } + return nil } @@ -124,7 +134,8 @@ func (t *template) createTemplateVM(ctx context.Context) error { config.HypervisorConfig.BootToBeTemplate = true config.HypervisorConfig.BootFromTemplate = false config.HypervisorConfig.MemoryPath = t.statePath + "/memory" - config.HypervisorConfig.DevicesStatePath = t.statePath + "/state" + config.HypervisorConfig.DevicesStatePath = t.deviceStatePath() + config.HypervisorConfig.VMStorePath = t.statePath vm, err := vc.NewVM(ctx, config) if err != nil { @@ -161,7 +172,7 @@ func (t *template) createFromTemplateVM(ctx context.Context, c vc.VMConfig) (*vc config.HypervisorConfig.BootToBeTemplate = false config.HypervisorConfig.BootFromTemplate = true config.HypervisorConfig.MemoryPath = t.statePath + "/memory" - config.HypervisorConfig.DevicesStatePath = t.statePath + "/state" + config.HypervisorConfig.DevicesStatePath = t.deviceStatePath() config.HypervisorConfig.SharedPath = c.HypervisorConfig.SharedPath config.HypervisorConfig.VMStorePath = c.HypervisorConfig.VMStorePath config.HypervisorConfig.RunStorePath = c.HypervisorConfig.RunStorePath @@ -175,6 +186,15 @@ func (t *template) checkTemplateVM() error { return err } - _, err = os.Stat(t.statePath + "/state") + _, err = os.Stat(t.deviceStatePath()) return err } + +func (t *template) deviceStatePath() string { + stateFileName := "state" + if t.config.HypervisorType == vc.ClhHypervisor { + stateFileName = "state.json" + } + + return filepath.Join(t.statePath, stateFileName) +} diff --git a/src/runtime/virtcontainers/factory/template/template_test.go b/src/runtime/virtcontainers/factory/template/template_test.go index c067c793e6..a9c78162b9 100644 --- a/src/runtime/virtcontainers/factory/template/template_test.go +++ b/src/runtime/virtcontainers/factory/template/template_test.go @@ -57,15 +57,26 @@ func TestTemplateFactory(t *testing.T) { assert.NoError(err) defer hybridVSockTTRPCMock.Stop() - // New + // Create 2 sets of instance-specific directories for per-VM storage + runStorePath1 := t.TempDir() + vmStorePath1 := t.TempDir() + runStorePath2 := t.TempDir() + vmStorePath2 := t.TempDir() + + // Create a new Template Factory f, err := New(ctx, vmConfig, testDir) assert.Nil(err) // Config assert.Equal(f.Config(), vmConfig) - // GetBaseVM - vm, err := f.GetBaseVM(ctx, vmConfig) + // GetBaseVM with first instance paths + vmConfig1 := vmConfig + vmConfig1.HypervisorConfig.RunStorePath = runStorePath1 + vmConfig1.HypervisorConfig.VMStorePath = vmStorePath1 + + // Test the creation of a new VM from the template factory + vm, err := f.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) @@ -79,44 +90,59 @@ func TestTemplateFactory(t *testing.T) { assert.Equal(tt.Config(), vmConfig) + // Checking that template VM check fails + // if the corresponding memory and state files are absent err = tt.checkTemplateVM() assert.Error(err) - _, err = os.Create(tt.statePath + "/memory") + memFile, err := os.Create(tt.statePath + "/memory") assert.Nil(err) + memFile.Close() err = tt.checkTemplateVM() assert.Error(err) - _, err = os.Create(tt.statePath + "/state") + devFile, err := os.Create(tt.deviceStatePath()) assert.Nil(err) + devFile.Close() + + // After creating state and memory files, checkTemplateVM should succeed err = tt.checkTemplateVM() assert.Nil(err) + // Recreate the template VM, which should succeed err = tt.createTemplateVM(ctx) assert.Nil(err) - vm, err = tt.GetBaseVM(ctx, vmConfig) + // Ensuring that directly calling template's GetBaseVM function + // returns a VM instance similar to the one returned by the factory's GetBaseVM function + vm, err = tt.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) - vm, err = f.GetBaseVM(ctx, vmConfig) + vm, err = f.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) + // Overwriting the template VM should succeed err = tt.createTemplateVM(ctx) assert.Nil(err) - vm, err = tt.GetBaseVM(ctx, vmConfig) + // Create second instance with different storage paths + vmConfig2 := vmConfig + vmConfig2.HypervisorConfig.RunStorePath = runStorePath2 + vmConfig2.HypervisorConfig.VMStorePath = vmStorePath2 + + vm, err = tt.GetBaseVM(ctx, vmConfig2) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) - vm, err = f.GetBaseVM(ctx, vmConfig) + vm, err = f.GetBaseVM(ctx, vmConfig2) assert.Nil(err) err = vm.Stop(ctx) From 65a5f272f85b660eb0d604f3c9c9a0580906e654 Mon Sep 17 00:00:00 2001 From: Cameron Baird Date: Tue, 9 Jun 2026 18:55:28 +0000 Subject: [PATCH 2/3] ci: Introduce tests for VM template factory Add k8s-vm-templating-test.bats which exercises pod create with the factory initialized on the target node. Signed-off-by: Cameron Baird --- .../run-k8s-tests-on-free-runner.yaml | 5 + src/runtime/virtcontainers/clh.go | 13 +- src/runtime/virtcontainers/clh_test.go | 6 +- .../virtcontainers/factory/factory_linux.go | 1 - tests/gha-run-k8s-common.sh | 21 +++ .../kubernetes/k8s-vm-templating.bats | 114 +++++++++++++++ .../kubernetes/run_kubernetes_tests.sh | 1 + .../binary/src/artifacts/snapshotters.rs | 92 ++++++++---- .../kata-deploy/binary/src/config.rs | 26 ++++ .../kata-deploy/binary/src/utils/toml.rs | 137 ++++++++++++++++++ .../kata-deploy/templates/_helpers.tpl | 12 ++ .../helm-chart/kata-deploy/values.yaml | 12 ++ 12 files changed, 402 insertions(+), 38 deletions(-) create mode 100644 tests/integration/kubernetes/k8s-vm-templating.bats diff --git a/.github/workflows/run-k8s-tests-on-free-runner.yaml b/.github/workflows/run-k8s-tests-on-free-runner.yaml index d8d9b4c636..87a86ac8b0 100644 --- a/.github/workflows/run-k8s-tests-on-free-runner.yaml +++ b/.github/workflows/run-k8s-tests-on-free-runner.yaml @@ -41,10 +41,12 @@ jobs: matrix: environment: [ { vmm: clh, containerd_version: latest }, + { vmm: clh, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged }, { vmm: clh, containerd_version: minimum }, { vmm: dragonball, containerd_version: latest }, { vmm: dragonball, containerd_version: minimum }, { vmm: qemu, containerd_version: latest }, + { vmm: qemu, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged }, { vmm: qemu, containerd_version: minimum }, { vmm: qemu-runtime-rs, containerd_version: latest }, { vmm: qemu-runtime-rs, containerd_version: minimum }, @@ -68,6 +70,9 @@ jobs: K8S_TEST_HOST_TYPE: baremetal-no-attestation CONTAINER_ENGINE: containerd CONTAINER_ENGINE_VERSION: ${{ matrix.environment.containerd_version }} + SNAPSHOTTER: ${{ matrix.environment.snapshotter }} + EROFS_SNAPSHOTTER_MODE: ${{ matrix.environment.erofs_mode }} + EROFS_MERGE_MODE: ${{ matrix.environment.erofs_merge_mode }} GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 28d5ef63d5..171dd2bf22 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -822,7 +822,7 @@ func (clh *cloudHypervisor) shouldRestoreFromTemplate() bool { return true } -// copyFile copies a file from src to dst +// copyFile copies a file from src to dst, preserving the source file's permissions. func (clh *cloudHypervisor) copyFile(src, dst string) error { srcFile, err := os.Open(src) if err != nil { @@ -830,7 +830,12 @@ func (clh *cloudHypervisor) copyFile(src, dst string) error { } defer srcFile.Close() - dstFile, err := os.Create(dst) + srcInfo, err := srcFile.Stat() + if err != nil { + return err + } + + dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, srcInfo.Mode()) if err != nil { return err } @@ -880,7 +885,7 @@ func (clh *cloudHypervisor) updateVsockSocketPath(configPath, vmID string) error return err } - return os.WriteFile(configPath, updatedConfig, 0644) + return os.WriteFile(configPath, updatedConfig, 0600) } // setupInitdata prepares and attaches the initdata disk if present. @@ -1578,7 +1583,7 @@ func (clh *cloudHypervisor) SaveVM() error { return err } - if err := os.WriteFile(snapshotConfigPath, modifiedConfig, 0644); err != nil { + if err := os.WriteFile(snapshotConfigPath, modifiedConfig, 0600); err != nil { clh.Logger().WithError(err).Error("Failed to write modified snapshot config") return err } diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index 93cf3b1a70..d8773a8dfd 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -457,7 +457,8 @@ func TestCloudHypervisorCleanupVM(t *testing.T) { assert.NoError(err, "persist.GetDriver() unexpected error") dir := filepath.Join(store.RunVMStoragePath(), clh.id) - os.MkdirAll(dir, os.ModePerm) + err = os.MkdirAll(dir, os.ModePerm) + assert.NoError(err, "failed to create dir %s", dir) err = clh.cleanupVM(false) assert.NoError(err, "persist.GetDriver() unexpected error") @@ -566,7 +567,8 @@ func TestClhRestoreVM(t *testing.T) { assert.Contains(err.Error(), filepath.Join(clhConfig.VMStorePath, "state.json")) // Now create the VM snapshot files and call restoreVM again. - os.MkdirAll(clhConfig.VMStorePath, os.ModePerm) + err = os.MkdirAll(clhConfig.VMStorePath, os.ModePerm) + assert.NoError(err, "failed to create dir %s", clhConfig.VMStorePath) stateFile := filepath.Join(clhConfig.VMStorePath, "state.json") configFile := filepath.Join(clhConfig.VMStorePath, "config.json") err = os.WriteFile(stateFile, []byte("{}"), 0o600) diff --git a/src/runtime/virtcontainers/factory/factory_linux.go b/src/runtime/virtcontainers/factory/factory_linux.go index c010916943..e2d28696ae 100644 --- a/src/runtime/virtcontainers/factory/factory_linux.go +++ b/src/runtime/virtcontainers/factory/factory_linux.go @@ -82,7 +82,6 @@ func resetHypervisorConfig(config *vc.VMConfig) { config.HypervisorConfig.RunStorePath = "" config.HypervisorConfig.SandboxName = "" config.HypervisorConfig.SandboxNamespace = "" - config.HypervisorConfig.DefaultMaxVCPUs = 0 } // It's important that baseConfig and newConfig are passed by value! diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh index c23f014573..5560e6497e 100644 --- a/tests/gha-run-k8s-common.sh +++ b/tests/gha-run-k8s-common.sh @@ -43,6 +43,7 @@ TEST_CLUSTER_NAMESPACE="${TEST_CLUSTER_NAMESPACE:-}" CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-containerd}" SNAPSHOTTER="${SNAPSHOTTER:-}" EROFS_SNAPSHOTTER_MODE="${EROFS_SNAPSHOTTER_MODE:-}" +EROFS_MERGE_MODE="${EROFS_MERGE_MODE:-}" # Wait for the Kubernetes API to recover after kata-deploy uninstall, then # retry the uninstall to purge any stale helm release state. On k3s/rke2, @@ -851,6 +852,26 @@ function helm_helper() { yq -i '.containerd.userDropIn = strenv(HELM_CONTAINERD_USER_DROP_IN)' "${values_yaml}" fi + # EROFS merge mode ("merged" default, or "unmerged"). This is orthogonal + # to EROFS_SNAPSHOTTER_MODE (which controls default_size): it controls + # whether containerd merges layers into a single fsmeta.erofs (merged, + # runtime-rs only) or keeps per-layer layer.erofs (unmerged, required by + # the Go runtime). + if [[ -n "${EROFS_MERGE_MODE}" ]]; then + if [[ "${SNAPSHOTTER}" != "erofs" ]]; then + die "EROFS_MERGE_MODE is only supported with SNAPSHOTTER=erofs" + fi + + case "${EROFS_MERGE_MODE}" in + merged|unmerged) ;; + *) + die "Unsupported EROFS_MERGE_MODE: ${EROFS_MERGE_MODE}" + ;; + esac + + yq -i ".snapshotter.erofsMergeMode = \"${EROFS_MERGE_MODE}\"" "${values_yaml}" + fi + if [[ -z "${HELM_SHIMS}" ]]; then die "A list of shims is expected but none was provided" fi diff --git a/tests/integration/kubernetes/k8s-vm-templating.bats b/tests/integration/kubernetes/k8s-vm-templating.bats new file mode 100644 index 0000000000..c3fb0e01fa --- /dev/null +++ b/tests/integration/kubernetes/k8s-vm-templating.bats @@ -0,0 +1,114 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2024 Kata Containers +# +# SPDX-License-Identifier: Apache-2.0 +# +# Tests for Kata VM templating (factory) functionality in Kubernetes integration mode + +load "${BATS_TEST_DIRNAME}/lib.sh" +load "${BATS_TEST_DIRNAME}/../../common.bash" +load "${BATS_TEST_DIRNAME}/confidential_common.sh" +load "${BATS_TEST_DIRNAME}/tests_common.sh" + +# Returns 0 if the current environment supports VM templating, non-zero +# otherwise. VM templating is only supported on non-confidential clh/qemu +# hypervisors, and because it uses shared_fs="none" it also requires a +# block-device-based snapshotter (blockfile or erofs). +vm_templating_supported() { + [[ "${KATA_HYPERVISOR}" == "clh" || "${KATA_HYPERVISOR}" == "qemu" ]] || return 1 + is_confidential_runtime_class && return 1 + [[ "${SNAPSHOTTER:-}" =~ ^(blockfile|erofs)$ ]] || return 1 + return 0 +} + +setup() { + if ! vm_templating_supported; then + skip "VM templating requires a non-confidential clh/qemu hypervisor and a blockfile/erofs snapshotter (KATA_HYPERVISOR=${KATA_HYPERVISOR}, SNAPSHOTTER=${SNAPSHOTTER:-unset})" + fi + + setup_common || die "setup_common failed" + + # Build a Kata runtime config drop-in that enables VM templating and + # disables shared_fs (incompatible with templating). + # QEMU VM templating requires an initrd, CLH does not. + local rootfs_override="" + if [[ "${KATA_HYPERVISOR}" == "qemu" ]]; then + rootfs_override=$'image = ""\ninitrd = "/opt/kata/share/kata-containers/kata-containers-initrd.img"' + fi + + local runtime_config_dropin_file="${BATS_TEST_TMPDIR}/99-k8s-vm-templating.toml" + cat > "${runtime_config_dropin_file}" </, whereas a factory-spawned VM + # stores its state under a generated UUID and /run/vc/vm/ is a + # symlink pointing at it (see assignSandbox() in + # src/runtime/virtcontainers/vm.go). Inspect PID 1's mount namespace, where + # the shim creates these entries alongside the template tmpfs. + exec_host "$node" \ + "nsenter --mount=/proc/1/ns/mnt find /run/vc/vm -maxdepth 1 -mindepth 1 -type l ! -name template | grep -q . && echo symlink" \ + | grep -q symlink +} + +teardown() { + vm_templating_supported || return 0 + + rm -f "${pod_config:-}" + + # Destroy the VM template and remove the config drop-in on the target node. + # factory destroy must run in PID 1's mount namespace to unmount the template + # tmpfs that factory init created there (see the @test for details). + exec_host "$node" "nsenter --mount=/proc/1/ns/mnt /opt/kata/bin/kata-runtime --config ${kata_config_path} factory destroy" \ + || echo "Warning: Failed to destroy VM template on node $node" + + remove_kata_runtime_config_dropin_file "$node" "${dropin_path:-}" \ + || echo "Warning: Failed to remove Kata runtime config drop-in on node $node" + + teardown_common "${node:-}" "${node_start_time:-}" +} diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index 6e4f64f45b..115c6fb949 100755 --- a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -104,6 +104,7 @@ else "k8s-security-context.bats" \ "k8s-shared-volume.bats" \ "k8s-volume.bats" \ + "k8s-vm-templating.bats" \ "k8s-nginx-connectivity.bats" \ ) diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs index bafec197ab..a2b7599d3e 100644 --- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs +++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs @@ -15,36 +15,50 @@ use std::path::Path; pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &Path) -> Result<()> { info!("Configuring erofs-snapshotter"); + // "unmerged" mode keeps each image layer as its own per-layer `layer.erofs` + // (containerd's default, non-fsmerged layout), which is the only layout the + // Go runtime can consume. In the default "merged" mode we force containerd + // to merge layers into a single `fsmeta.erofs`, which is runtime-rs only. + let unmerged = config.erofs_merge_mode.as_deref() == Some("unmerged"); + // The Go runtime does not support fsmerged EROFS (fsmeta.erofs). // If the snapshotter handler mapping explicitly pairs a Go shim with - // erofs, that is a hard misconfiguration — bail out so the operator - // fixes the mapping instead of hitting cryptic runtime errors later. - if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() { - let mut go_shims_on_erofs = Vec::new(); - for entry in mapping.split(',') { - let parts: Vec<&str> = entry.split(':').collect(); - if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) { - go_shims_on_erofs.push(parts[0].to_string()); + // erofs in the (default) merged mode, that is a hard misconfiguration — + // bail out so the operator fixes the mapping instead of hitting cryptic + // runtime errors later. In "unmerged" mode the Go runtime is supported, so + // skip this guard. + if !unmerged { + if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() { + let mut go_shims_on_erofs = Vec::new(); + for entry in mapping.split(',') { + let parts: Vec<&str> = entry.split(':').collect(); + if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) { + go_shims_on_erofs.push(parts[0].to_string()); + } } - } - if !go_shims_on_erofs.is_empty() { - warn!("##########################################################################"); - warn!("# #"); - warn!("# Go runtime shim(s) mapped to the erofs snapshotter: #"); - for s in &go_shims_on_erofs { - warn!("# - {:<64} #", s); + if !go_shims_on_erofs.is_empty() { + warn!("##########################################################################"); + warn!("# #"); + warn!("# Go runtime shim(s) mapped to the erofs snapshotter: #"); + for s in &go_shims_on_erofs { + warn!("# - {:<64} #", s); + } + warn!("# #"); + warn!( + "# The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs). #" + ); + warn!("# Only runtime-rs shims are supported with merged erofs. Set #"); + warn!("# EROFS_MERGE_MODE=unmerged to use the Go runtime with erofs. #"); + warn!("# #"); + warn!("##########################################################################"); + return Err(anyhow::anyhow!( + "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to merged erofs. \ + The Go runtime does not support fsmerged EROFS. \ + Set EROFS_MERGE_MODE=unmerged, remove these shims from \ + SNAPSHOTTER_HANDLER_MAPPING, or switch them to runtime-rs.", + go_shims_on_erofs.join(", ") + )); } - warn!("# #"); - warn!("# The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs). #"); - warn!("# Only runtime-rs shims are supported with the erofs snapshotter. #"); - warn!("# #"); - warn!("##########################################################################"); - return Err(anyhow::anyhow!( - "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to erofs. \ - The Go runtime does not support fsmerged EROFS. \ - Remove these shims from SNAPSHOTTER_HANDLER_MAPPING or switch them to runtime-rs.", - go_shims_on_erofs.join(", ") - )); } } @@ -88,11 +102,27 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P ".plugins.\"io.containerd.snapshotter.v1.erofs\".default_size", "\"10G\"", )?; - toml_utils::set_toml_value( - configuration_file, - ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", - "0", - )?; + // In the default "merged" mode, force containerd to merge all layers into a + // single fsmeta.erofs (max_unmerged_layers = 0). In "unmerged" mode we delete + // any previously-written value so each layer stays a separate layer.erofs, + // which the Go runtime requires. + // + // Because kata-deploy edits the containerd config in place, switching from + // merged to unmerged must actively remove the old `max_unmerged_layers = 0` + // left behind by a previous install. Otherwise the stale `0` would keep + // forcing the merged layout and break Go-runtime compatibility. + if !unmerged { + toml_utils::set_toml_value( + configuration_file, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + "0", + )?; + } else { + toml_utils::delete_toml_value( + configuration_file, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + )?; + } Ok(()) } diff --git a/tools/packaging/kata-deploy/binary/src/config.rs b/tools/packaging/kata-deploy/binary/src/config.rs index c3c7f68e8c..4700940740 100644 --- a/tools/packaging/kata-deploy/binary/src/config.rs +++ b/tools/packaging/kata-deploy/binary/src/config.rs @@ -178,6 +178,14 @@ pub struct Config { pub multi_install_suffix: Option, pub helm_post_delete_hook: bool, pub experimental_setup_snapshotter: Option>, + /// EROFS snapshotter merge mode: "merged" (default) or "unmerged". + /// + /// In "unmerged" mode kata-deploy does not force containerd's erofs + /// snapshotter to merge layers (it leaves `max_unmerged_layers` at the + /// containerd default), so each image layer is exposed as its own + /// per-layer `layer.erofs`. This is the only layout the Go runtime can + /// consume; the merged (`fsmeta.erofs`) layout is runtime-rs only. + pub erofs_merge_mode: Option, pub experimental_force_guest_pull_for_arch: Vec, pub dest_dir: String, pub host_install_dir: String, @@ -307,6 +315,11 @@ impl Config { .filter(|s| !s.is_empty()) .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()); + let erofs_merge_mode = env::var("EROFS_MERGE_MODE") + .ok() + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()); + // Only use arch-specific variable for experimental force guest pull let experimental_force_guest_pull_for_arch = get_arch_var("EXPERIMENTAL_FORCE_GUEST_PULL", "", &arch) @@ -338,6 +351,7 @@ impl Config { multi_install_suffix, helm_post_delete_hook, experimental_setup_snapshotter, + erofs_merge_mode, experimental_force_guest_pull_for_arch, dest_dir, host_install_dir, @@ -508,6 +522,17 @@ impl Config { _ => {} } + // Validate EROFS_MERGE_MODE + // Only "merged" (default) and "unmerged" are accepted. + if let Some(mode) = self.erofs_merge_mode.as_ref() { + if mode != "merged" && mode != "unmerged" { + return Err(anyhow::anyhow!( + "EROFS_MERGE_MODE must be either 'merged' or 'unmerged', got '{}'", + mode + )); + } + } + // Validate EXPERIMENTAL_FORCE_GUEST_PULL_FOR_ARCH // This is a list of shim names for shim in &self.experimental_force_guest_pull_for_arch { @@ -551,6 +576,7 @@ impl Config { "* EXPERIMENTAL_SETUP_SNAPSHOTTER: {:?}", self.experimental_setup_snapshotter ); + info!("* EROFS_MERGE_MODE: {:?}", self.erofs_merge_mode); info!( "* EXPERIMENTAL_FORCE_GUEST_PULL: {}", self.experimental_force_guest_pull_for_arch.join(",") diff --git a/tools/packaging/kata-deploy/binary/src/utils/toml.rs b/tools/packaging/kata-deploy/binary/src/utils/toml.rs index 080f678875..58477ffa46 100644 --- a/tools/packaging/kata-deploy/binary/src/utils/toml.rs +++ b/tools/packaging/kata-deploy/binary/src/utils/toml.rs @@ -121,6 +121,47 @@ pub fn set_toml_value(file_path: &Path, path: &str, value: &str) -> Result<()> { Ok(()) } +/// Delete a TOML value (or table) at a given path. +/// +/// Navigates to the parent table and removes the final key. This is a no-op if +/// any path component (including the final key) does not exist, so callers can +/// unconditionally remove a value that may or may not be present. +pub fn delete_toml_value(file_path: &Path, path: &str) -> Result<()> { + let content = std::fs::read_to_string(file_path) + .with_context(|| format!("Failed to read TOML file: {file_path:?}"))?; + + let (header, toml_content) = split_non_toml_header(&content); + let mut doc = toml_content + .parse::() + .context("Failed to parse TOML")?; + + let parts = parse_toml_path(path)?; + + let mut current_table = doc.as_table_mut(); + for (i, part) in parts.iter().enumerate() { + let is_last = i == parts.len() - 1; + + if is_last { + // Remove the value; absent key is fine (no-op). + current_table.remove(part.as_str()); + } else { + // Navigate into the intermediate table. If it does not exist, there + // is nothing to delete. + match current_table + .get_mut(part.as_str()) + .and_then(|item| item.as_table_mut()) + { + Some(table) => current_table = table, + None => return Ok(()), + } + } + } + + write_toml_with_header(file_path, header, &doc)?; + + Ok(()) +} + /// Get a TOML value at a given path pub fn get_toml_value(file_path: &Path, path: &str) -> Result { let content = std::fs::read_to_string(file_path) @@ -1714,4 +1755,100 @@ imports = ["/etc/containerd/conf.d/*.toml", "/opt/kata/containerd/config.d/kata- .unwrap(); assert_eq!(runtime_type, "io.containerd.kata-qemu.v2"); } + + #[test] + fn test_delete_toml_value() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + std::fs::write( + temp_path, + "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\nenable_fsverity = true\n", + ) + .unwrap(); + + // Sanity check: value is present before deletion. + let before = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + assert_eq!(before, "0"); + + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + // The deleted key is gone, but sibling keys remain. + let result = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ); + assert!(result.is_err(), "deleted key should no longer be found"); + + let sibling = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity", + ) + .unwrap(); + assert_eq!(sibling, "true", "sibling keys must be preserved"); + } + + #[test] + fn test_delete_toml_value_missing_key_is_noop() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + let initial = "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nenable_fsverity = true\n"; + std::fs::write(temp_path, initial).unwrap(); + + // Deleting a key that does not exist must succeed and leave the file usable. + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + // Deleting through a non-existent intermediate table is also a no-op. + delete_toml_value(temp_path, ".plugins.\"nonexistent.plugin\".some_key").unwrap(); + let sibling = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity", + ) + .unwrap(); + assert_eq!(sibling, "true"); + } + + #[test] + fn test_delete_toml_value_preserves_k3s_header() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + std::fs::write( + temp_path, + "{{ template \"base\" . }}\n[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\n", + ) + .unwrap(); + + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + let content = std::fs::read_to_string(temp_path).unwrap(); + assert!( + content.starts_with("{{ template \"base\" . }}\n"), + "non-TOML header must be preserved" + ); + assert!( + !content.contains("max_unmerged_layers"), + "value must be removed" + ); + } + + #[test] + fn test_delete_toml_value_nonexistent_file() { + let result = delete_toml_value(Path::new("/nonexistent/file.toml"), "some.path"); + assert!(result.is_err()); + } } diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl index 457cb00ab6..24f309ae3c 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl @@ -413,6 +413,13 @@ Get snapshotter setup list from structured config {{- join "," .Values.snapshotter.setup -}} {{- end -}} +{{/* +Get EROFS merge mode from structured config ("merged" or "unmerged") +*/}} +{{- define "kata-deploy.getErofsMergeMode" -}} +{{- .Values.snapshotter.erofsMergeMode | default "" -}} +{{- end -}} + {{/* Get debug value from structured config */}} @@ -569,6 +576,11 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`. - name: EXPERIMENTAL_SETUP_SNAPSHOTTER value: {{ $snapshotterSetup | quote }} {{- end }} +{{- $erofsMergeMode := include "kata-deploy.getErofsMergeMode" . | trim -}} +{{- if $erofsMergeMode }} +- name: EROFS_MERGE_MODE + value: {{ $erofsMergeMode | quote }} +{{- end }} {{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}} {{- if $forceGuestPullAmd64 }} - name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64 diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml index 120bfd5027..7832ddb995 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml @@ -271,6 +271,18 @@ health: snapshotter: setup: ["nydus"] # ["nydus", "erofs"] or [] + # EROFS merge mode: "merged" (default) or "unmerged". + # + # "merged" forces containerd's erofs snapshotter to merge all image layers + # into a single fsmeta.erofs (max_unmerged_layers = 0). This layout is only + # supported by runtime-rs shims. + # + # "unmerged" leaves max_unmerged_layers at the containerd default so each + # image layer is exposed as its own per-layer layer.erofs. This is the only + # layout the Go runtime can consume, so set this when mapping a Go shim to the + # erofs snapshotter. When empty, kata-deploy uses its built-in default + # (merged). + erofsMergeMode: "" # Shim configuration # By default (disableAll: false), all shims with enabled: ~ (null) are enabled. From 730307f32ccf1410af45c497ddbe1b2e62be98a2 Mon Sep 17 00:00:00 2001 From: Cameron Baird Date: Tue, 9 Jun 2026 18:59:20 +0000 Subject: [PATCH 3/3] factory: Default to normal sandbox boot path when factory init not done The behavior we had before was that, for a starting k8s pod, it sees enable_template=true and therefore: 1. Tries NewFactory with fetchOnly=true 2. When that fails (because template.Fetch fails to find the artifacts, we retry with fetchOnly=false. This creates a direct factory which creates the template from scratch (hence we pay a full pod sandbox boot time here) and then restores from that. Hence the boot times are strictly worse on this path. Now, even when enable_template=true, we don't try to force a direct factory. Instead we just revert to the standard sandbox boot path. Signed-off-by: Cameron Baird --- src/runtime/pkg/katautils/create.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/runtime/pkg/katautils/create.go b/src/runtime/pkg/katautils/create.go index b60b0bee14..4f6edcfbab 100644 --- a/src/runtime/pkg/katautils/create.go +++ b/src/runtime/pkg/katautils/create.go @@ -82,12 +82,8 @@ func HandleFactory(ctx context.Context, vci vc.VC, runtimeConfig *oci.RuntimeCon kataUtilsLogger.WithField("factory", factoryConfig).Info("load vm factory") f, err := vf.NewFactory(ctx, factoryConfig, true) - if err != nil && !factoryConfig.VMCache { - kataUtilsLogger.WithError(err).Warn("load vm factory failed, about to create new one") - f, err = vf.NewFactory(ctx, factoryConfig, false) - } if err != nil { - kataUtilsLogger.WithError(err).Warn("create vm factory failed") + kataUtilsLogger.WithError(err).Warn("load vm factory failed, will use direct boot") return }