diff --git a/.github/workflows/run-k8s-tests-on-free-runner.yaml b/.github/workflows/run-k8s-tests-on-free-runner.yaml index d8d9b4c636..87a86ac8b0 100644 --- a/.github/workflows/run-k8s-tests-on-free-runner.yaml +++ b/.github/workflows/run-k8s-tests-on-free-runner.yaml @@ -41,10 +41,12 @@ jobs: matrix: environment: [ { vmm: clh, containerd_version: latest }, + { vmm: clh, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged }, { vmm: clh, containerd_version: minimum }, { vmm: dragonball, containerd_version: latest }, { vmm: dragonball, containerd_version: minimum }, { vmm: qemu, containerd_version: latest }, + { vmm: qemu, containerd_version: latest, snapshotter: erofs, erofs_mode: disk, erofs_merge_mode: unmerged }, { vmm: qemu, containerd_version: minimum }, { vmm: qemu-runtime-rs, containerd_version: latest }, { vmm: qemu-runtime-rs, containerd_version: minimum }, @@ -68,6 +70,9 @@ jobs: K8S_TEST_HOST_TYPE: baremetal-no-attestation CONTAINER_ENGINE: containerd CONTAINER_ENGINE_VERSION: ${{ matrix.environment.containerd_version }} + SNAPSHOTTER: ${{ matrix.environment.snapshotter }} + EROFS_SNAPSHOTTER_MODE: ${{ matrix.environment.erofs_mode }} + EROFS_MERGE_MODE: ${{ matrix.environment.erofs_merge_mode }} GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index de0b525c59..e78e699e01 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -2085,12 +2085,6 @@ func checkNetNsConfig(config oci.RuntimeConfig) error { // checkFactoryConfig ensures the VM factory configuration is valid. func checkFactoryConfig(config oci.RuntimeConfig) error { - if config.FactoryConfig.Template { - if config.HypervisorConfig.InitrdPath == "" { - return errors.New("Factory option enable_template requires an initrd image") - } - } - if config.FactoryConfig.VMCacheNumber > 0 { if config.HypervisorType != vc.QemuHypervisor { return errors.New("VM cache just support qemu") diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index ef9e70f8ec..187af6cb70 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -1696,7 +1696,7 @@ func TestCheckFactoryConfig(t *testing.T) { {false, false, "", "initrd"}, {true, false, "", "initrd"}, - {true, true, "image", ""}, + {true, false, "image", ""}, } for i, d := range data { diff --git a/src/runtime/pkg/katautils/create.go b/src/runtime/pkg/katautils/create.go index b60b0bee14..4f6edcfbab 100644 --- a/src/runtime/pkg/katautils/create.go +++ b/src/runtime/pkg/katautils/create.go @@ -82,12 +82,8 @@ func HandleFactory(ctx context.Context, vci vc.VC, runtimeConfig *oci.RuntimeCon kataUtilsLogger.WithField("factory", factoryConfig).Info("load vm factory") f, err := vf.NewFactory(ctx, factoryConfig, true) - if err != nil && !factoryConfig.VMCache { - kataUtilsLogger.WithError(err).Warn("load vm factory failed, about to create new one") - f, err = vf.NewFactory(ctx, factoryConfig, false) - } if err != nil { - kataUtilsLogger.WithError(err).Warn("create vm factory failed") + kataUtilsLogger.WithError(err).Warn("load vm factory failed, will use direct boot") return } diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index bffaa34385..171dd2bf22 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -68,6 +68,7 @@ const ( const ( clhStateCreated = "Created" clhStateRunning = "Running" + clhStatePaused = "Paused" ) const ( @@ -112,8 +113,16 @@ type clhClient interface { VmAddDevicePut(ctx context.Context, deviceConfig chclient.DeviceConfig) (chclient.PciDeviceInfo, *http.Response, error) // Add a new disk device to the VM VmAddDiskPut(ctx context.Context, diskConfig chclient.DiskConfig) (chclient.PciDeviceInfo, *http.Response, error) + // Pause the VM + VmPausePut(ctx context.Context) (*http.Response, error) + // Create a snapshot of the VM + VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) // Remove a device from the VM VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) + // Restore VM from a snapshot + VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) + // Resume a paused VM + ResumeVM(ctx context.Context) (*http.Response, error) } type clhClientApi struct { @@ -153,10 +162,26 @@ func (c *clhClientApi) VmAddDiskPut(ctx context.Context, diskConfig chclient.Dis return c.ApiInternal.VmAddDiskPut(ctx).DiskConfig(diskConfig).Execute() } +func (c *clhClientApi) VmPausePut(ctx context.Context) (*http.Response, error) { + return c.ApiInternal.PauseVM(ctx).Execute() +} + +func (c *clhClientApi) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) { + return c.ApiInternal.VmSnapshotPut(ctx).VmSnapshotConfig(vmSnapshotConfig).Execute() +} + func (c *clhClientApi) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) { return c.ApiInternal.VmRemoveDevicePut(ctx).VmRemoveDevice(vmRemoveDevice).Execute() } +func (c *clhClientApi) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) { + return c.ApiInternal.VmRestorePut(ctx).RestoreConfig(restoreConfig).Execute() +} + +func (c *clhClientApi) ResumeVM(ctx context.Context) (*http.Response, error) { + return c.ApiInternal.ResumeVM(ctx).Execute() +} + // This is done in order to be able to override such a function as part of // our unit tests, as when testing bootVM we're on a mocked scenario already. var vmAddNetPutRequest = func(clh *cloudHypervisor) ([]chclient.PciDeviceInfo, error) { @@ -255,12 +280,14 @@ type CloudHypervisorState struct { PID int VirtiofsDaemonPid int state clhState + isRestoring bool } func (s *CloudHypervisorState) reset() { s.PID = 0 s.VirtiofsDaemonPid = 0 s.state = clhNotReady + s.isRestoring = false } type cloudHypervisor struct { @@ -501,7 +528,7 @@ func getNonUserDefinedKernelParams(rootfstype string, disableNvdimm bool, dax bo } // For cloudHypervisor this call only sets the internal structure up. -// The VM will be created and started through StartVM(). +// The VM will be created and started through StartVM(), or restored from template if template files exist. func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { clh.ctx = ctx @@ -559,29 +586,78 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net } } - // Create the VM memory config via the constructor to ensure default values are properly assigned - clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) - // Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs - // or when using HugePages. - // If such features are disabled, turn off shared memory config. - if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages { - clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) - } else { - clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) - } - // Enable hugepages if needed - clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages) - if !clh.config.ConfidentialGuest { - hotplugSize := clh.config.DefaultMaxMemorySize - // OpenAPI only supports int64 values - clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes())) - - if clh.config.ReclaimGuestFreedMemory { - // Create VM with a balloon config so we can enable free page reporting (size of the balloon can be set to zero) - clh.vmconfig.Balloon = chclient.NewBalloonConfig(0) - // Set the free page reporting flag for ballooning to be true - clh.vmconfig.Balloon.SetFreePageReporting(true) + // If the VM is booting from a template, or if the VM is going to be used as a template + // the memory is to be backed by a file, so we need to configure the memory zones accordingly. + if clh.config.BootFromTemplate || clh.config.BootToBeTemplate { + // VM templating is incompatible with virtio-fs because virtio-fs requires shared memory, + // while templating needs COW/private memory on restore. + if clh.config.SharedFS == config.VirtioFS || clh.config.SharedFS == config.VirtioFSNydus { + return errors.New("VM templating has been enabled with virtio-fs and this configuration will not work") } + + // Double-check that the clh.config.MemoryPath file is accessible before using it in the VM config, to avoid hitting a less clear error from cloud hypervisor when it tries to access the memory file. + if _, err := os.Stat(clh.config.MemoryPath); err != nil { + return fmt.Errorf("memory file %s is not accessible: %w", clh.config.MemoryPath, err) + } + + // Set the size to be 0 since we are going to configure actual size via zones + clh.vmconfig.Memory = chclient.NewMemoryConfig(0) + + memoryZoneConfig := chclient.NewMemoryZoneConfig("mem0", int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) + if clh.config.BootToBeTemplate { + // When BootToBeTemplate is true, the memory file backing the VM memory is shared between multiple VMs created from the same template. + // So we need to set shared to true in this case. + memoryZoneConfig.SetShared(true) + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) + + if !clh.config.ConfidentialGuest { + // TODO: Remove this warning once memory hotplugging is supported + // for template VMs. + // + // Memory hotplug is intentionally not configured for template VMs. + // Resizing a memory zone requires the virtio-mem hotplug method + // (cloud-hypervisor rejects the default ACPI hotplug on a zone that + // carries a hotplug_size), which is not currently supported in the + // templating path. As a result, VMs restored from this template + // cannot grow their memory beyond the template's boot size. + clh.Logger().Warn("memory hotplugging is currently unsupported for template VMs") + } + } else { + // When BootFromTemplate is true, set shared=false to ensure Copy-On-Write is used for the memory file. + // So that the VM can have its own private memory. + memoryZoneConfig.SetShared(false) + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) + } + memoryZoneConfig.SetFile(clh.config.MemoryPath) + clh.vmconfig.Memory.Zones = &[]chclient.MemoryZoneConfig{ + *memoryZoneConfig, + } + } else { // Normal (non-template) VM creation + // Create the VM memory config via the constructor to ensure default values are properly assigned + clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes())) + // Memory config shared is to be enabled when using vhost_user backends, ex. virtio-fs + // or when using HugePages. + // If such features are disabled, turn off shared memory config. + if clh.config.SharedFS == config.NoSharedFS && !clh.config.HugePages { + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(false) + } else { + clh.vmconfig.Memory.Shared = func(b bool) *bool { return &b }(true) + } + // Enable hugepages if needed + clh.vmconfig.Memory.Hugepages = func(b bool) *bool { return &b }(clh.config.HugePages) + if !clh.config.ConfidentialGuest { + hotplugSize := clh.config.DefaultMaxMemorySize + // OpenAPI only supports int64 values + clh.vmconfig.Memory.HotplugSize = func(i int64) *int64 { return &i }(int64((utils.MemUnit(hotplugSize) * utils.MiB).ToBytes())) + } + } + + // Configure balloon device for free page reporting. This is set unconditionally + // (for both template and non-template paths) so that template VMs include the + // balloon in their snapshot, and VMs restored from a template inherit it. + if !clh.config.ConfidentialGuest && clh.config.ReclaimGuestFreedMemory { + clh.vmconfig.Balloon = chclient.NewBalloonConfig(0) + clh.vmconfig.Balloon.SetFreePageReporting(true) } // Set initial amount of cpu's for the virtual machine @@ -700,9 +776,118 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net return err } + // Check if we should restore from template instead of creating new VM + if clh.config.BootFromTemplate && clh.shouldRestoreFromTemplate() { + clh.Logger().Info("Template files found, will restore VM instead of creating new") + // Mark this as a restore operation for StartVM to use RestoreVM instead + clh.state.isRestoring = true + return nil + } + return nil } +// shouldRestoreFromTemplate checks if template snapshot files exist and we should restore instead of creating new VM +func (clh *cloudHypervisor) shouldRestoreFromTemplate() bool { + // For template restore, we need the snapshot directory to contain the necessary files + // The snapshotDir is derived from the MemoryPath directory + snapshotDir := filepath.Dir(clh.config.MemoryPath) + + // Check for required template files (config.json, state.json, and memory file) + configFile := filepath.Join(snapshotDir, "config.json") + stateFile := filepath.Join(snapshotDir, "state.json") + memoryFile := clh.config.MemoryPath + + if _, err := os.Stat(configFile); err != nil { + clh.Logger().WithError(err).WithField("configFile", configFile).Debug("Template config file not accessible") + return false + } + + if _, err := os.Stat(stateFile); err != nil { + clh.Logger().WithError(err).WithField("stateFile", stateFile).Debug("Template state file not accessible") + return false + } + + if _, err := os.Stat(memoryFile); err != nil { + clh.Logger().WithError(err).WithField("memoryFile", memoryFile).Debug("Template memory file not accessible") + return false + } + + clh.Logger().WithFields(log.Fields{ + "configFile": configFile, + "stateFile": stateFile, + "memoryFile": memoryFile, + }).Info("Template files found, can restore VM from template") + + return true +} + +// copyFile copies a file from src to dst, preserving the source file's permissions. +func (clh *cloudHypervisor) copyFile(src, dst string) error { + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + srcInfo, err := srcFile.Stat() + if err != nil { + return err + } + + dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, srcInfo.Mode()) + if err != nil { + return err + } + defer dstFile.Close() + + _, err = io.Copy(dstFile, srcFile) + if err != nil { + return err + } + + return dstFile.Sync() +} + +// updateVsockSocketPath updates the vsock socket path in the config.json file +func (clh *cloudHypervisor) updateVsockSocketPath(configPath, vmID string) error { + // Read the config file + configData, err := os.ReadFile(configPath) + if err != nil { + return err + } + + var config map[string]interface{} + dec := json.NewDecoder(bytes.NewReader(configData)) + dec.UseNumber() + if err := dec.Decode(&config); err != nil { + return err + } + + // Update vsock socket path if vsock exists + if vsock, ok := config["vsock"].(map[string]interface{}); ok { + // Generate new vsock socket path for this VM + newVsockPath, err := clh.vsockSocketPath(vmID) + if err != nil { + return err + } + vsock["socket"] = newVsockPath + + clh.Logger().WithFields(log.Fields{ + "vmID": vmID, + "newVsockPath": newVsockPath, + }).Debug("Updated vsock socket path in config.json") + } + + // Write the updated config back to file + updatedConfig, err := json.Marshal(config) + if err != nil { + return err + } + + return os.WriteFile(configPath, updatedConfig, 0600) +} + // setupInitdata prepares and attaches the initdata disk if present. func setupInitdata(clh *cloudHypervisor, hypervisorConfig *HypervisorConfig) error { if len(hypervisorConfig.Initdata) == 0 { @@ -771,8 +956,37 @@ func (clh *cloudHypervisor) StartVM(ctx context.Context, timeout int) error { ctx, cancel := context.WithTimeout(ctx, bootTimeout*time.Second) defer cancel() - if err := clh.bootVM(ctx); err != nil { - return err + // Check if we should restore from template or create new VM + if clh.state.isRestoring { + // Copy template files to VM directory + snapshotDir := filepath.Dir(clh.config.MemoryPath) + + // Copy config.json from template to VM directory + srcConfig := filepath.Join(snapshotDir, "config.json") + dstConfig := filepath.Join(vmPath, "config.json") + if err := clh.copyFile(srcConfig, dstConfig); err != nil { + return fmt.Errorf("failed to copy config.json: %v", err) + } + + // Copy state.json from template to VM directory + srcState := filepath.Join(snapshotDir, "state.json") + dstState := filepath.Join(vmPath, "state.json") + if err := clh.copyFile(srcState, dstState); err != nil { + return fmt.Errorf("failed to copy state.json: %v", err) + } + + // Update vsock socket path in the copied config.json + if err := clh.updateVsockSocketPath(dstConfig, clh.id); err != nil { + return fmt.Errorf("failed to update vsock socket path: %v", err) + } + + if err := clh.restoreVM(ctx); err != nil { + return err + } + } else { + if err := clh.bootVM(ctx); err != nil { + return err + } } clh.state.state = clhReady @@ -1287,16 +1501,109 @@ func (clh *cloudHypervisor) Cleanup(ctx context.Context) error { func (clh *cloudHypervisor) PauseVM(ctx context.Context) error { clh.Logger().WithField("function", "PauseVM").Info("Pause Sandbox") + + cl := clh.client() + ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second) + defer cancel() + + _, err := cl.VmPausePut(ctx) + if err != nil { + clh.Logger().WithError(err).Error("Failed to pause VM") + return openAPIClientError(err) + } + return nil } func (clh *cloudHypervisor) SaveVM() error { - clh.Logger().WithField("function", "saveSandboxC").Info("Save Sandbox") + clh.Logger().WithField("function", "SaveVM").Info("Save Sandbox") + + cl := clh.client() + ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second) + defer cancel() + + snapshotDir := filepath.Dir(clh.config.MemoryPath) + // Create snapshot config with file URL to template path + // Use MemoryPath as base for snapshot destination + // When creating a template, the MemoryPath is set to the template path, so we can use it to save the snapshot. + fileURL := "file://" + snapshotDir + + vmSnapshotConfig := *chclient.NewVmSnapshotConfig() + vmSnapshotConfig.SetDestinationUrl(fileURL) + + _, err := cl.VmSnapshotPut(ctx, vmSnapshotConfig) + if err != nil { + clh.Logger().WithError(err).Error("Failed to save VM snapshot") + return openAPIClientError(err) + } + + if clh.config.BootToBeTemplate { + // Update the config.json file in the snapshotDir to set memory shared=false + snapshotConfigPath := filepath.Join(snapshotDir, "config.json") + snapshotConfig, err := os.ReadFile(snapshotConfigPath) + if err != nil { + clh.Logger().WithError(err).Error("Failed to read snapshot config") + return err + } + + var snapshotConfigData map[string]interface{} + dec := json.NewDecoder(bytes.NewReader(snapshotConfig)) + dec.UseNumber() + if err := dec.Decode(&snapshotConfigData); err != nil { + clh.Logger().WithError(err).Error("Failed to unmarshal snapshot config") + return err + } + + // Access the memory section and cast it to a map + if memorySection, ok := snapshotConfigData["memory"].(map[string]interface{}); ok { + memorySection["shared"] = false + // Do the same update for each element for the "zones" array in the memorySection + if zones, ok := memorySection["zones"].([]interface{}); ok { + for _, zone := range zones { + if zoneMap, ok := zone.(map[string]interface{}); ok { + zoneMap["shared"] = false + } else { + clh.Logger().Error("Unable to access zone in snapshot config memory section") + return fmt.Errorf("invalid snapshot config structure: zone in memory section not found or invalid") + } + } + } else { + clh.Logger().Error("Unable to access zones array in snapshot config memory section") + return fmt.Errorf("invalid snapshot config structure: zones array in memory section not found or invalid") + } + } else { + clh.Logger().Error("Unable to access memory section in snapshot config") + return fmt.Errorf("invalid snapshot config structure: memory section not found or invalid") + } + + // Write the modified config back to file + modifiedConfig, err := json.Marshal(snapshotConfigData) + if err != nil { + clh.Logger().WithError(err).Error("Failed to marshal modified snapshot config") + return err + } + + if err := os.WriteFile(snapshotConfigPath, modifiedConfig, 0600); err != nil { + clh.Logger().WithError(err).Error("Failed to write modified snapshot config") + return err + } + } + return nil } func (clh *cloudHypervisor) ResumeVM(ctx context.Context) error { clh.Logger().WithField("function", "ResumeVM").Info("Resume Sandbox") + cl := clh.client() + ctx, cancel := context.WithTimeout(ctx, clh.getClhAPITimeout()*time.Second) + defer cancel() + + _, err := cl.ResumeVM(ctx) + if err != nil { + clh.Logger().WithError(err).Error("Failed to resume VM") + return openAPIClientError(err) + } + return nil } @@ -1509,11 +1816,11 @@ func (clh *cloudHypervisor) clhPath() (string, error) { p = defaultClhPath } - if _, err = os.Stat(p); os.IsNotExist(err) { - return "", fmt.Errorf("Cloud-Hypervisor path (%s) does not exist", p) + if _, err = os.Stat(p); err != nil { + return "", fmt.Errorf("Cloud-Hypervisor path (%s) is not accessible: %w", p, err) } - return p, err + return p, nil } func (clh *cloudHypervisor) launchClh() error { @@ -1741,6 +2048,60 @@ func (clh *cloudHypervisor) bootVM(ctx context.Context) error { return nil } +// restoreVM restores a VM from a template snapshot. The restored VM will be in +// Paused state. The caller (factory layer, via factory.GetVM → vm.Resume) is +// responsible for resuming the VM, reseeding the RNG, and syncing the guest clock +// before the VM is used. See factory_linux.go GetVM(). +func (clh *cloudHypervisor) restoreVM(ctx context.Context) error { + clh.Logger().Info("Restoring VM from template") + + cl := clh.client() + + // use the VMStorePath as the base for the restore source URL + vmPath := filepath.Join(clh.config.VMStorePath, clh.id) + sourceURL := "file://" + vmPath + + // check if the snapshot directory contains the state.json and config.json files + // which contain the VM state and configuration respectively + stateFile := filepath.Join(vmPath, "state.json") + configFile := filepath.Join(vmPath, "config.json") + + if _, err := os.Stat(stateFile); err != nil { + return fmt.Errorf("failed to access state file %s: %v", stateFile, err) + } + + if _, err := os.Stat(configFile); err != nil { + return fmt.Errorf("failed to access config file %s: %v", configFile, err) + } + + // Prepare restore configuration + restoreConfig := *chclient.NewRestoreConfig(sourceURL) + + clh.Logger().WithField("sourceURL", sourceURL).Debug("Restore configuration") + + // Restore VM from template (uses the caller's ctx, which already has the boot timeout) + _, err := cl.VmRestorePut(ctx, restoreConfig) + if err != nil { + clh.Logger().WithError(err).Error("failed to restore VM from template") + return openAPIClientError(err) + } + + // Check VM state after restoration + info, err := clh.vmInfo() + if err != nil { + return err + } + + clh.Logger().Debugf("VM state after restore: %#v", info) + + if info.State != clhStatePaused { + clh.Logger().Warnf("VM state is '%s' after restore, expected 'Paused'", info.State) + } + + clh.Logger().Info("Successfully restored VM from template") + return nil +} + func (clh *cloudHypervisor) addVSock(cid int64, path string) { clh.Logger().WithFields(log.Fields{ "path": path, diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index 7ddbcff206..d8773a8dfd 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -74,7 +74,9 @@ func newClhConfig() (HypervisorConfig, error) { } type clhClientMock struct { - vmInfo chclient.VmInfo + vmInfo chclient.VmInfo + restoreRequest *chclient.RestoreConfig + snapshotRequest *chclient.VmSnapshotConfig } func (c *clhClientMock) VmmPingGet(ctx context.Context) (chclient.VmmPingResponse, *http.Response, error) { @@ -115,11 +117,35 @@ func (c *clhClientMock) VmAddDiskPut(ctx context.Context, diskConfig chclient.Di return chclient.PciDeviceInfo{Bdf: "0000:00:0a.0"}, nil, nil } +//nolint:golint +func (c *clhClientMock) VmPausePut(ctx context.Context) (*http.Response, error) { + c.vmInfo.State = clhStatePaused + return nil, nil +} + +//nolint:golint +func (c *clhClientMock) VmSnapshotPut(ctx context.Context, vmSnapshotConfig chclient.VmSnapshotConfig) (*http.Response, error) { + c.snapshotRequest = &vmSnapshotConfig + return nil, nil +} + //nolint:golint func (c *clhClientMock) VmRemoveDevicePut(ctx context.Context, vmRemoveDevice chclient.VmRemoveDevice) (*http.Response, error) { return nil, nil } +func (c *clhClientMock) VmRestorePut(ctx context.Context, restoreConfig chclient.RestoreConfig) (*http.Response, error) { + c.restoreRequest = &restoreConfig + // restoreVM() verifies Paused after restore. + c.vmInfo.State = clhStatePaused + return nil, nil +} + +func (c *clhClientMock) ResumeVM(ctx context.Context) (*http.Response, error) { + c.vmInfo.State = clhStateRunning + return nil, nil +} + func TestCloudHypervisorAddVSock(t *testing.T) { assert := assert.New(t) clh := cloudHypervisor{} @@ -431,7 +457,8 @@ func TestCloudHypervisorCleanupVM(t *testing.T) { assert.NoError(err, "persist.GetDriver() unexpected error") dir := filepath.Join(store.RunVMStoragePath(), clh.id) - os.MkdirAll(dir, os.ModePerm) + err = os.MkdirAll(dir, os.ModePerm) + assert.NoError(err, "failed to create dir %s", dir) err = clh.cleanupVM(false) assert.NoError(err, "persist.GetDriver() unexpected error") @@ -516,6 +543,81 @@ func TestClhCreateVM(t *testing.T) { } } +func TestClhRestoreVM(t *testing.T) { + assert := assert.New(t) + + store, err := persist.GetDriver() + assert.NoError(err) + + clhConfig, err := newClhConfig() + assert.NoError(err) + clhConfig.VMStorePath = store.RunVMStoragePath() + clhConfig.RunStorePath = store.RunStoragePath() + + mockClient := &clhClientMock{} + clh := &cloudHypervisor{ + config: clhConfig, + APIClient: mockClient, + } + + // First call restoreVM without the VM snapshot files (state.json, config.json) present. + err = clh.restoreVM(context.Background()) + // An error is expected because restoreVM expects the VM snapshot files to be present. + assert.Error(err) + assert.Contains(err.Error(), filepath.Join(clhConfig.VMStorePath, "state.json")) + + // Now create the VM snapshot files and call restoreVM again. + err = os.MkdirAll(clhConfig.VMStorePath, os.ModePerm) + assert.NoError(err, "failed to create dir %s", clhConfig.VMStorePath) + stateFile := filepath.Join(clhConfig.VMStorePath, "state.json") + configFile := filepath.Join(clhConfig.VMStorePath, "config.json") + err = os.WriteFile(stateFile, []byte("{}"), 0o600) + assert.NoError(err) + err = os.WriteFile(configFile, []byte("{}"), 0o600) + assert.NoError(err) + + // Call restoreVM again, this time it should succeed. + err = clh.restoreVM(context.Background()) + assert.NoError(err) + + if assert.NotNil(mockClient.restoreRequest) { + expectedSourceURL := "file://" + clhConfig.VMStorePath + assert.Equal(expectedSourceURL, mockClient.restoreRequest.GetSourceUrl()) + } + + info, err := clh.vmInfo() + assert.NoError(err) + assert.Equal(clhStatePaused, info.State) +} + +func TestClhSaveVM(t *testing.T) { + assert := assert.New(t) + + store, err := persist.GetDriver() + assert.NoError(err) + + clhConfig, err := newClhConfig() + assert.NoError(err) + // For testing, assume the memory path is located within the VM store path. + clhConfig.MemoryPath = filepath.Join(store.RunVMStoragePath(), "memory") + clhConfig.VMStorePath = store.RunVMStoragePath() + clhConfig.RunStorePath = store.RunStoragePath() + + mockClient := &clhClientMock{} + clh := &cloudHypervisor{ + config: clhConfig, + APIClient: mockClient, + } + + err = clh.SaveVM() + assert.NoError(err) + + if assert.NotNil(mockClient.snapshotRequest) { + expectedDestinationURL := "file://" + filepath.Dir(clhConfig.MemoryPath) + assert.Equal(expectedDestinationURL, mockClient.snapshotRequest.GetDestinationUrl()) + } +} + func TestCloudHypervisorStartSandbox(t *testing.T) { assert := assert.New(t) clhConfig, err := newClhConfig() diff --git a/src/runtime/virtcontainers/factory/factory_linux.go b/src/runtime/virtcontainers/factory/factory_linux.go index 4a0cfcfe38..e2d28696ae 100644 --- a/src/runtime/virtcontainers/factory/factory_linux.go +++ b/src/runtime/virtcontainers/factory/factory_linux.go @@ -80,6 +80,8 @@ func resetHypervisorConfig(config *vc.VMConfig) { config.HypervisorConfig.SharedPath = "" config.HypervisorConfig.VMStorePath = "" config.HypervisorConfig.RunStorePath = "" + config.HypervisorConfig.SandboxName = "" + config.HypervisorConfig.SandboxNamespace = "" } // It's important that baseConfig and newConfig are passed by value! diff --git a/src/runtime/virtcontainers/factory/template/template_linux.go b/src/runtime/virtcontainers/factory/template/template_linux.go index d48ce5c50b..2a23cdbccd 100644 --- a/src/runtime/virtcontainers/factory/template/template_linux.go +++ b/src/runtime/virtcontainers/factory/template/template_linux.go @@ -11,6 +11,7 @@ import ( "context" "fmt" "os" + "path/filepath" "syscall" "time" @@ -115,6 +116,15 @@ func (t *template) prepareTemplateFiles() error { } f.Close() + // truncate the memory file to the exact size of the VM memory + memoryInBytes := int64(t.config.HypervisorConfig.MemorySize) * 1024 * 1024 + t.Logger().Infof("truncating memory file %s to %d bytes", t.statePath+"/memory", memoryInBytes) + err = os.Truncate(t.statePath+"/memory", memoryInBytes) + if err != nil { + t.close() + return err + } + return nil } @@ -124,7 +134,8 @@ func (t *template) createTemplateVM(ctx context.Context) error { config.HypervisorConfig.BootToBeTemplate = true config.HypervisorConfig.BootFromTemplate = false config.HypervisorConfig.MemoryPath = t.statePath + "/memory" - config.HypervisorConfig.DevicesStatePath = t.statePath + "/state" + config.HypervisorConfig.DevicesStatePath = t.deviceStatePath() + config.HypervisorConfig.VMStorePath = t.statePath vm, err := vc.NewVM(ctx, config) if err != nil { @@ -161,7 +172,7 @@ func (t *template) createFromTemplateVM(ctx context.Context, c vc.VMConfig) (*vc config.HypervisorConfig.BootToBeTemplate = false config.HypervisorConfig.BootFromTemplate = true config.HypervisorConfig.MemoryPath = t.statePath + "/memory" - config.HypervisorConfig.DevicesStatePath = t.statePath + "/state" + config.HypervisorConfig.DevicesStatePath = t.deviceStatePath() config.HypervisorConfig.SharedPath = c.HypervisorConfig.SharedPath config.HypervisorConfig.VMStorePath = c.HypervisorConfig.VMStorePath config.HypervisorConfig.RunStorePath = c.HypervisorConfig.RunStorePath @@ -175,6 +186,15 @@ func (t *template) checkTemplateVM() error { return err } - _, err = os.Stat(t.statePath + "/state") + _, err = os.Stat(t.deviceStatePath()) return err } + +func (t *template) deviceStatePath() string { + stateFileName := "state" + if t.config.HypervisorType == vc.ClhHypervisor { + stateFileName = "state.json" + } + + return filepath.Join(t.statePath, stateFileName) +} diff --git a/src/runtime/virtcontainers/factory/template/template_test.go b/src/runtime/virtcontainers/factory/template/template_test.go index c067c793e6..a9c78162b9 100644 --- a/src/runtime/virtcontainers/factory/template/template_test.go +++ b/src/runtime/virtcontainers/factory/template/template_test.go @@ -57,15 +57,26 @@ func TestTemplateFactory(t *testing.T) { assert.NoError(err) defer hybridVSockTTRPCMock.Stop() - // New + // Create 2 sets of instance-specific directories for per-VM storage + runStorePath1 := t.TempDir() + vmStorePath1 := t.TempDir() + runStorePath2 := t.TempDir() + vmStorePath2 := t.TempDir() + + // Create a new Template Factory f, err := New(ctx, vmConfig, testDir) assert.Nil(err) // Config assert.Equal(f.Config(), vmConfig) - // GetBaseVM - vm, err := f.GetBaseVM(ctx, vmConfig) + // GetBaseVM with first instance paths + vmConfig1 := vmConfig + vmConfig1.HypervisorConfig.RunStorePath = runStorePath1 + vmConfig1.HypervisorConfig.VMStorePath = vmStorePath1 + + // Test the creation of a new VM from the template factory + vm, err := f.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) @@ -79,44 +90,59 @@ func TestTemplateFactory(t *testing.T) { assert.Equal(tt.Config(), vmConfig) + // Checking that template VM check fails + // if the corresponding memory and state files are absent err = tt.checkTemplateVM() assert.Error(err) - _, err = os.Create(tt.statePath + "/memory") + memFile, err := os.Create(tt.statePath + "/memory") assert.Nil(err) + memFile.Close() err = tt.checkTemplateVM() assert.Error(err) - _, err = os.Create(tt.statePath + "/state") + devFile, err := os.Create(tt.deviceStatePath()) assert.Nil(err) + devFile.Close() + + // After creating state and memory files, checkTemplateVM should succeed err = tt.checkTemplateVM() assert.Nil(err) + // Recreate the template VM, which should succeed err = tt.createTemplateVM(ctx) assert.Nil(err) - vm, err = tt.GetBaseVM(ctx, vmConfig) + // Ensuring that directly calling template's GetBaseVM function + // returns a VM instance similar to the one returned by the factory's GetBaseVM function + vm, err = tt.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) - vm, err = f.GetBaseVM(ctx, vmConfig) + vm, err = f.GetBaseVM(ctx, vmConfig1) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) + // Overwriting the template VM should succeed err = tt.createTemplateVM(ctx) assert.Nil(err) - vm, err = tt.GetBaseVM(ctx, vmConfig) + // Create second instance with different storage paths + vmConfig2 := vmConfig + vmConfig2.HypervisorConfig.RunStorePath = runStorePath2 + vmConfig2.HypervisorConfig.VMStorePath = vmStorePath2 + + vm, err = tt.GetBaseVM(ctx, vmConfig2) assert.Nil(err) err = vm.Stop(ctx) assert.Nil(err) - vm, err = f.GetBaseVM(ctx, vmConfig) + vm, err = f.GetBaseVM(ctx, vmConfig2) assert.Nil(err) err = vm.Stop(ctx) diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh index c23f014573..5560e6497e 100644 --- a/tests/gha-run-k8s-common.sh +++ b/tests/gha-run-k8s-common.sh @@ -43,6 +43,7 @@ TEST_CLUSTER_NAMESPACE="${TEST_CLUSTER_NAMESPACE:-}" CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-containerd}" SNAPSHOTTER="${SNAPSHOTTER:-}" EROFS_SNAPSHOTTER_MODE="${EROFS_SNAPSHOTTER_MODE:-}" +EROFS_MERGE_MODE="${EROFS_MERGE_MODE:-}" # Wait for the Kubernetes API to recover after kata-deploy uninstall, then # retry the uninstall to purge any stale helm release state. On k3s/rke2, @@ -851,6 +852,26 @@ function helm_helper() { yq -i '.containerd.userDropIn = strenv(HELM_CONTAINERD_USER_DROP_IN)' "${values_yaml}" fi + # EROFS merge mode ("merged" default, or "unmerged"). This is orthogonal + # to EROFS_SNAPSHOTTER_MODE (which controls default_size): it controls + # whether containerd merges layers into a single fsmeta.erofs (merged, + # runtime-rs only) or keeps per-layer layer.erofs (unmerged, required by + # the Go runtime). + if [[ -n "${EROFS_MERGE_MODE}" ]]; then + if [[ "${SNAPSHOTTER}" != "erofs" ]]; then + die "EROFS_MERGE_MODE is only supported with SNAPSHOTTER=erofs" + fi + + case "${EROFS_MERGE_MODE}" in + merged|unmerged) ;; + *) + die "Unsupported EROFS_MERGE_MODE: ${EROFS_MERGE_MODE}" + ;; + esac + + yq -i ".snapshotter.erofsMergeMode = \"${EROFS_MERGE_MODE}\"" "${values_yaml}" + fi + if [[ -z "${HELM_SHIMS}" ]]; then die "A list of shims is expected but none was provided" fi diff --git a/tests/integration/kubernetes/k8s-vm-templating.bats b/tests/integration/kubernetes/k8s-vm-templating.bats new file mode 100644 index 0000000000..c3fb0e01fa --- /dev/null +++ b/tests/integration/kubernetes/k8s-vm-templating.bats @@ -0,0 +1,114 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2024 Kata Containers +# +# SPDX-License-Identifier: Apache-2.0 +# +# Tests for Kata VM templating (factory) functionality in Kubernetes integration mode + +load "${BATS_TEST_DIRNAME}/lib.sh" +load "${BATS_TEST_DIRNAME}/../../common.bash" +load "${BATS_TEST_DIRNAME}/confidential_common.sh" +load "${BATS_TEST_DIRNAME}/tests_common.sh" + +# Returns 0 if the current environment supports VM templating, non-zero +# otherwise. VM templating is only supported on non-confidential clh/qemu +# hypervisors, and because it uses shared_fs="none" it also requires a +# block-device-based snapshotter (blockfile or erofs). +vm_templating_supported() { + [[ "${KATA_HYPERVISOR}" == "clh" || "${KATA_HYPERVISOR}" == "qemu" ]] || return 1 + is_confidential_runtime_class && return 1 + [[ "${SNAPSHOTTER:-}" =~ ^(blockfile|erofs)$ ]] || return 1 + return 0 +} + +setup() { + if ! vm_templating_supported; then + skip "VM templating requires a non-confidential clh/qemu hypervisor and a blockfile/erofs snapshotter (KATA_HYPERVISOR=${KATA_HYPERVISOR}, SNAPSHOTTER=${SNAPSHOTTER:-unset})" + fi + + setup_common || die "setup_common failed" + + # Build a Kata runtime config drop-in that enables VM templating and + # disables shared_fs (incompatible with templating). + # QEMU VM templating requires an initrd, CLH does not. + local rootfs_override="" + if [[ "${KATA_HYPERVISOR}" == "qemu" ]]; then + rootfs_override=$'image = ""\ninitrd = "/opt/kata/share/kata-containers/kata-containers-initrd.img"' + fi + + local runtime_config_dropin_file="${BATS_TEST_TMPDIR}/99-k8s-vm-templating.toml" + cat > "${runtime_config_dropin_file}" </, whereas a factory-spawned VM + # stores its state under a generated UUID and /run/vc/vm/ is a + # symlink pointing at it (see assignSandbox() in + # src/runtime/virtcontainers/vm.go). Inspect PID 1's mount namespace, where + # the shim creates these entries alongside the template tmpfs. + exec_host "$node" \ + "nsenter --mount=/proc/1/ns/mnt find /run/vc/vm -maxdepth 1 -mindepth 1 -type l ! -name template | grep -q . && echo symlink" \ + | grep -q symlink +} + +teardown() { + vm_templating_supported || return 0 + + rm -f "${pod_config:-}" + + # Destroy the VM template and remove the config drop-in on the target node. + # factory destroy must run in PID 1's mount namespace to unmount the template + # tmpfs that factory init created there (see the @test for details). + exec_host "$node" "nsenter --mount=/proc/1/ns/mnt /opt/kata/bin/kata-runtime --config ${kata_config_path} factory destroy" \ + || echo "Warning: Failed to destroy VM template on node $node" + + remove_kata_runtime_config_dropin_file "$node" "${dropin_path:-}" \ + || echo "Warning: Failed to remove Kata runtime config drop-in on node $node" + + teardown_common "${node:-}" "${node_start_time:-}" +} diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index 6e4f64f45b..115c6fb949 100755 --- a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -104,6 +104,7 @@ else "k8s-security-context.bats" \ "k8s-shared-volume.bats" \ "k8s-volume.bats" \ + "k8s-vm-templating.bats" \ "k8s-nginx-connectivity.bats" \ ) diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs index bafec197ab..a2b7599d3e 100644 --- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs +++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs @@ -15,36 +15,50 @@ use std::path::Path; pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &Path) -> Result<()> { info!("Configuring erofs-snapshotter"); + // "unmerged" mode keeps each image layer as its own per-layer `layer.erofs` + // (containerd's default, non-fsmerged layout), which is the only layout the + // Go runtime can consume. In the default "merged" mode we force containerd + // to merge layers into a single `fsmeta.erofs`, which is runtime-rs only. + let unmerged = config.erofs_merge_mode.as_deref() == Some("unmerged"); + // The Go runtime does not support fsmerged EROFS (fsmeta.erofs). // If the snapshotter handler mapping explicitly pairs a Go shim with - // erofs, that is a hard misconfiguration — bail out so the operator - // fixes the mapping instead of hitting cryptic runtime errors later. - if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() { - let mut go_shims_on_erofs = Vec::new(); - for entry in mapping.split(',') { - let parts: Vec<&str> = entry.split(':').collect(); - if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) { - go_shims_on_erofs.push(parts[0].to_string()); + // erofs in the (default) merged mode, that is a hard misconfiguration — + // bail out so the operator fixes the mapping instead of hitting cryptic + // runtime errors later. In "unmerged" mode the Go runtime is supported, so + // skip this guard. + if !unmerged { + if let Some(mapping) = config.snapshotter_handler_mapping_for_arch.as_ref() { + let mut go_shims_on_erofs = Vec::new(); + for entry in mapping.split(',') { + let parts: Vec<&str> = entry.split(':').collect(); + if parts.len() == 2 && parts[1] == "erofs" && !utils::is_rust_shim(parts[0]) { + go_shims_on_erofs.push(parts[0].to_string()); + } } - } - if !go_shims_on_erofs.is_empty() { - warn!("##########################################################################"); - warn!("# #"); - warn!("# Go runtime shim(s) mapped to the erofs snapshotter: #"); - for s in &go_shims_on_erofs { - warn!("# - {:<64} #", s); + if !go_shims_on_erofs.is_empty() { + warn!("##########################################################################"); + warn!("# #"); + warn!("# Go runtime shim(s) mapped to the erofs snapshotter: #"); + for s in &go_shims_on_erofs { + warn!("# - {:<64} #", s); + } + warn!("# #"); + warn!( + "# The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs). #" + ); + warn!("# Only runtime-rs shims are supported with merged erofs. Set #"); + warn!("# EROFS_MERGE_MODE=unmerged to use the Go runtime with erofs. #"); + warn!("# #"); + warn!("##########################################################################"); + return Err(anyhow::anyhow!( + "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to merged erofs. \ + The Go runtime does not support fsmerged EROFS. \ + Set EROFS_MERGE_MODE=unmerged, remove these shims from \ + SNAPSHOTTER_HANDLER_MAPPING, or switch them to runtime-rs.", + go_shims_on_erofs.join(", ") + )); } - warn!("# #"); - warn!("# The Go runtime does NOT support fsmerged EROFS (fsmeta.erofs). #"); - warn!("# Only runtime-rs shims are supported with the erofs snapshotter. #"); - warn!("# #"); - warn!("##########################################################################"); - return Err(anyhow::anyhow!( - "erofs snapshotter: Go runtime shim(s) [{}] cannot be mapped to erofs. \ - The Go runtime does not support fsmerged EROFS. \ - Remove these shims from SNAPSHOTTER_HANDLER_MAPPING or switch them to runtime-rs.", - go_shims_on_erofs.join(", ") - )); } } @@ -88,11 +102,27 @@ pub async fn configure_erofs_snapshotter(config: &Config, configuration_file: &P ".plugins.\"io.containerd.snapshotter.v1.erofs\".default_size", "\"10G\"", )?; - toml_utils::set_toml_value( - configuration_file, - ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", - "0", - )?; + // In the default "merged" mode, force containerd to merge all layers into a + // single fsmeta.erofs (max_unmerged_layers = 0). In "unmerged" mode we delete + // any previously-written value so each layer stays a separate layer.erofs, + // which the Go runtime requires. + // + // Because kata-deploy edits the containerd config in place, switching from + // merged to unmerged must actively remove the old `max_unmerged_layers = 0` + // left behind by a previous install. Otherwise the stale `0` would keep + // forcing the merged layout and break Go-runtime compatibility. + if !unmerged { + toml_utils::set_toml_value( + configuration_file, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + "0", + )?; + } else { + toml_utils::delete_toml_value( + configuration_file, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + )?; + } Ok(()) } diff --git a/tools/packaging/kata-deploy/binary/src/config.rs b/tools/packaging/kata-deploy/binary/src/config.rs index c3c7f68e8c..4700940740 100644 --- a/tools/packaging/kata-deploy/binary/src/config.rs +++ b/tools/packaging/kata-deploy/binary/src/config.rs @@ -178,6 +178,14 @@ pub struct Config { pub multi_install_suffix: Option, pub helm_post_delete_hook: bool, pub experimental_setup_snapshotter: Option>, + /// EROFS snapshotter merge mode: "merged" (default) or "unmerged". + /// + /// In "unmerged" mode kata-deploy does not force containerd's erofs + /// snapshotter to merge layers (it leaves `max_unmerged_layers` at the + /// containerd default), so each image layer is exposed as its own + /// per-layer `layer.erofs`. This is the only layout the Go runtime can + /// consume; the merged (`fsmeta.erofs`) layout is runtime-rs only. + pub erofs_merge_mode: Option, pub experimental_force_guest_pull_for_arch: Vec, pub dest_dir: String, pub host_install_dir: String, @@ -307,6 +315,11 @@ impl Config { .filter(|s| !s.is_empty()) .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()); + let erofs_merge_mode = env::var("EROFS_MERGE_MODE") + .ok() + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()); + // Only use arch-specific variable for experimental force guest pull let experimental_force_guest_pull_for_arch = get_arch_var("EXPERIMENTAL_FORCE_GUEST_PULL", "", &arch) @@ -338,6 +351,7 @@ impl Config { multi_install_suffix, helm_post_delete_hook, experimental_setup_snapshotter, + erofs_merge_mode, experimental_force_guest_pull_for_arch, dest_dir, host_install_dir, @@ -508,6 +522,17 @@ impl Config { _ => {} } + // Validate EROFS_MERGE_MODE + // Only "merged" (default) and "unmerged" are accepted. + if let Some(mode) = self.erofs_merge_mode.as_ref() { + if mode != "merged" && mode != "unmerged" { + return Err(anyhow::anyhow!( + "EROFS_MERGE_MODE must be either 'merged' or 'unmerged', got '{}'", + mode + )); + } + } + // Validate EXPERIMENTAL_FORCE_GUEST_PULL_FOR_ARCH // This is a list of shim names for shim in &self.experimental_force_guest_pull_for_arch { @@ -551,6 +576,7 @@ impl Config { "* EXPERIMENTAL_SETUP_SNAPSHOTTER: {:?}", self.experimental_setup_snapshotter ); + info!("* EROFS_MERGE_MODE: {:?}", self.erofs_merge_mode); info!( "* EXPERIMENTAL_FORCE_GUEST_PULL: {}", self.experimental_force_guest_pull_for_arch.join(",") diff --git a/tools/packaging/kata-deploy/binary/src/utils/toml.rs b/tools/packaging/kata-deploy/binary/src/utils/toml.rs index 080f678875..58477ffa46 100644 --- a/tools/packaging/kata-deploy/binary/src/utils/toml.rs +++ b/tools/packaging/kata-deploy/binary/src/utils/toml.rs @@ -121,6 +121,47 @@ pub fn set_toml_value(file_path: &Path, path: &str, value: &str) -> Result<()> { Ok(()) } +/// Delete a TOML value (or table) at a given path. +/// +/// Navigates to the parent table and removes the final key. This is a no-op if +/// any path component (including the final key) does not exist, so callers can +/// unconditionally remove a value that may or may not be present. +pub fn delete_toml_value(file_path: &Path, path: &str) -> Result<()> { + let content = std::fs::read_to_string(file_path) + .with_context(|| format!("Failed to read TOML file: {file_path:?}"))?; + + let (header, toml_content) = split_non_toml_header(&content); + let mut doc = toml_content + .parse::() + .context("Failed to parse TOML")?; + + let parts = parse_toml_path(path)?; + + let mut current_table = doc.as_table_mut(); + for (i, part) in parts.iter().enumerate() { + let is_last = i == parts.len() - 1; + + if is_last { + // Remove the value; absent key is fine (no-op). + current_table.remove(part.as_str()); + } else { + // Navigate into the intermediate table. If it does not exist, there + // is nothing to delete. + match current_table + .get_mut(part.as_str()) + .and_then(|item| item.as_table_mut()) + { + Some(table) => current_table = table, + None => return Ok(()), + } + } + } + + write_toml_with_header(file_path, header, &doc)?; + + Ok(()) +} + /// Get a TOML value at a given path pub fn get_toml_value(file_path: &Path, path: &str) -> Result { let content = std::fs::read_to_string(file_path) @@ -1714,4 +1755,100 @@ imports = ["/etc/containerd/conf.d/*.toml", "/opt/kata/containerd/config.d/kata- .unwrap(); assert_eq!(runtime_type, "io.containerd.kata-qemu.v2"); } + + #[test] + fn test_delete_toml_value() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + std::fs::write( + temp_path, + "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\nenable_fsverity = true\n", + ) + .unwrap(); + + // Sanity check: value is present before deletion. + let before = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + assert_eq!(before, "0"); + + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + // The deleted key is gone, but sibling keys remain. + let result = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ); + assert!(result.is_err(), "deleted key should no longer be found"); + + let sibling = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity", + ) + .unwrap(); + assert_eq!(sibling, "true", "sibling keys must be preserved"); + } + + #[test] + fn test_delete_toml_value_missing_key_is_noop() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + let initial = "[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nenable_fsverity = true\n"; + std::fs::write(temp_path, initial).unwrap(); + + // Deleting a key that does not exist must succeed and leave the file usable. + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + // Deleting through a non-existent intermediate table is also a no-op. + delete_toml_value(temp_path, ".plugins.\"nonexistent.plugin\".some_key").unwrap(); + let sibling = get_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity", + ) + .unwrap(); + assert_eq!(sibling, "true"); + } + + #[test] + fn test_delete_toml_value_preserves_k3s_header() { + let temp_file = NamedTempFile::new().unwrap(); + let temp_path = temp_file.path(); + std::fs::write( + temp_path, + "{{ template \"base\" . }}\n[plugins.\"io.containerd.snapshotter.v1.erofs\"]\nmax_unmerged_layers = 0\n", + ) + .unwrap(); + + delete_toml_value( + temp_path, + ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers", + ) + .unwrap(); + + let content = std::fs::read_to_string(temp_path).unwrap(); + assert!( + content.starts_with("{{ template \"base\" . }}\n"), + "non-TOML header must be preserved" + ); + assert!( + !content.contains("max_unmerged_layers"), + "value must be removed" + ); + } + + #[test] + fn test_delete_toml_value_nonexistent_file() { + let result = delete_toml_value(Path::new("/nonexistent/file.toml"), "some.path"); + assert!(result.is_err()); + } } diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl index 457cb00ab6..24f309ae3c 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl @@ -413,6 +413,13 @@ Get snapshotter setup list from structured config {{- join "," .Values.snapshotter.setup -}} {{- end -}} +{{/* +Get EROFS merge mode from structured config ("merged" or "unmerged") +*/}} +{{- define "kata-deploy.getErofsMergeMode" -}} +{{- .Values.snapshotter.erofsMergeMode | default "" -}} +{{- end -}} + {{/* Get debug value from structured config */}} @@ -569,6 +576,11 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`. - name: EXPERIMENTAL_SETUP_SNAPSHOTTER value: {{ $snapshotterSetup | quote }} {{- end }} +{{- $erofsMergeMode := include "kata-deploy.getErofsMergeMode" . | trim -}} +{{- if $erofsMergeMode }} +- name: EROFS_MERGE_MODE + value: {{ $erofsMergeMode | quote }} +{{- end }} {{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}} {{- if $forceGuestPullAmd64 }} - name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64 diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml index 120bfd5027..7832ddb995 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml @@ -271,6 +271,18 @@ health: snapshotter: setup: ["nydus"] # ["nydus", "erofs"] or [] + # EROFS merge mode: "merged" (default) or "unmerged". + # + # "merged" forces containerd's erofs snapshotter to merge all image layers + # into a single fsmeta.erofs (max_unmerged_layers = 0). This layout is only + # supported by runtime-rs shims. + # + # "unmerged" leaves max_unmerged_layers at the containerd default so each + # image layer is exposed as its own per-layer layer.erofs. This is the only + # layout the Go runtime can consume, so set this when mapping a Go shim to the + # erofs snapshotter. When empty, kata-deploy uses its built-in default + # (merged). + erofsMergeMode: "" # Shim configuration # By default (disableAll: false), all shims with enabled: ~ (null) are enabled.