From 7834f4127f9a2fd2710393bd98106336596110a9 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 14 Jun 2021 15:14:05 -0500 Subject: [PATCH 1/3] virtcontainers: change memory_offset to uint64 `memory_offset` is used to increase the maximum amount of memory supported in a VM, this offset is equal to the NVDIMM/PMEM device that is hot added, in real use case workloads such devices are bigger than 4G, which is the current limit (uint32). fixes #2006 Signed-off-by: Julio Montes --- src/runtime/pkg/katautils/config-settings.go.in | 2 +- src/runtime/pkg/katautils/config.go | 4 ++-- src/runtime/virtcontainers/hypervisor.go | 2 +- src/runtime/virtcontainers/persist/api/config.go | 2 +- src/runtime/virtcontainers/pkg/oci/utils.go | 4 ++-- src/runtime/virtcontainers/pkg/oci/utils_test.go | 2 +- src/runtime/virtcontainers/qemu.go | 4 ++-- src/runtime/virtcontainers/qemu_amd64_test.go | 4 ++-- src/runtime/virtcontainers/qemu_arch_base.go | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 470527998f..31a4828c46 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -26,7 +26,7 @@ const defaultVCPUCount uint32 = 1 const defaultMaxVCPUCount uint32 = 0 const defaultMemSize uint32 = 2048 // MiB const defaultMemSlots uint32 = 10 -const defaultMemOffset uint32 = 0 // MiB +const defaultMemOffset uint64 = 0 // MiB const defaultVirtioMem bool = false const defaultBridgesCount uint32 = 1 const defaultInterNetworkingModel = "tcfilter" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 3da633a24a..c6173234ef 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -114,7 +114,7 @@ type hypervisor struct { DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` MemorySize uint32 `toml:"default_memory"` MemSlots uint32 
`toml:"memory_slots"` - MemOffset uint32 `toml:"memory_offset"` + MemOffset uint64 `toml:"memory_offset"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` PCIeRootPort uint32 `toml:"pcie_root_port"` @@ -359,7 +359,7 @@ func (h hypervisor) defaultMemSlots() uint32 { return slots } -func (h hypervisor) defaultMemOffset() uint32 { +func (h hypervisor) defaultMemOffset() uint64 { offset := h.MemOffset if offset == 0 { offset = defaultMemOffset diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 9e6cabf7ff..d6bc97c118 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -242,7 +242,7 @@ type HypervisorConfig struct { MemSlots uint32 // MemOffset specifies memory space for nvdimm device - MemOffset uint32 + MemOffset uint64 // VirtioFSCacheSize is the DAX cache size in MiB VirtioFSCacheSize uint32 diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 3bd5567dbe..897aae3668 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -33,7 +33,7 @@ type HypervisorConfig struct { MemSlots uint32 // MemOffset specifies memory space for nvdimm device - MemOffset uint32 + MemOffset uint64 // VirtioFSCacheSize is the DAX cache size in MiB VirtioFSCacheSize uint32 diff --git a/src/runtime/virtcontainers/pkg/oci/utils.go b/src/runtime/virtcontainers/pkg/oci/utils.go index efaee4a969..b007f3ae31 100644 --- a/src/runtime/virtcontainers/pkg/oci/utils.go +++ b/src/runtime/virtcontainers/pkg/oci/utils.go @@ -576,13 +576,13 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig } if value, ok := ocispec.Annotations[vcAnnotations.MemOffset]; ok { - moffset, err := strconv.ParseUint(value, 10, 32) + moffset, err := strconv.ParseUint(value, 10, 64) if err != nil { return fmt.Errorf("Error parsing annotation 
for memory_offset: %v, please specify positive numeric value", err) } if moffset > 0 { - sbConfig.HypervisorConfig.MemOffset = uint32(moffset) + sbConfig.HypervisorConfig.MemOffset = moffset } } diff --git a/src/runtime/virtcontainers/pkg/oci/utils_test.go b/src/runtime/virtcontainers/pkg/oci/utils_test.go index 436510c47f..8e8a5a3ed3 100644 --- a/src/runtime/virtcontainers/pkg/oci/utils_test.go +++ b/src/runtime/virtcontainers/pkg/oci/utils_test.go @@ -870,7 +870,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1)) assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024)) assert.Equal(config.HypervisorConfig.MemSlots, uint32(20)) - assert.Equal(config.HypervisorConfig.MemOffset, uint32(512)) + assert.Equal(config.HypervisorConfig.MemOffset, uint64(512)) assert.Equal(config.HypervisorConfig.VirtioMem, true) assert.Equal(config.HypervisorConfig.MemPrealloc, true) assert.Equal(config.HypervisorConfig.Mlock, false) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 953b611242..8c478f0b4b 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -2116,12 +2116,12 @@ func genericBridges(number uint32, machineType string) []types.Bridge { } // nolint: unused, deadcode -func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint32) govmmQemu.Memory { +func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint64) govmmQemu.Memory { // image NVDIMM device needs memory space 1024MB // See https://github.com/clearcontainers/runtime/issues/380 memoryOffset += 1024 - memMax := fmt.Sprintf("%dM", hostMemoryMb+uint64(memoryOffset)) + memMax := fmt.Sprintf("%dM", hostMemoryMb+memoryOffset) mem := fmt.Sprintf("%dM", memoryMb) diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index b29b84a1b0..5016e58ad7 100644 --- 
a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -109,7 +109,7 @@ func TestQemuAmd64CPUModel(t *testing.T) { func TestQemuAmd64MemoryTopology(t *testing.T) { assert := assert.New(t) amd64 := newTestQemu(assert, QemuPC) - memoryOffset := 1024 + memoryOffset := uint64(1024) hostMem := uint64(100) mem := uint64(120) @@ -117,7 +117,7 @@ func TestQemuAmd64MemoryTopology(t *testing.T) { expectedMemory := govmmQemu.Memory{ Size: fmt.Sprintf("%dM", mem), Slots: slots, - MaxMem: fmt.Sprintf("%dM", hostMem+uint64(memoryOffset)), + MaxMem: fmt.Sprintf("%dM", hostMem+memoryOffset), } m := amd64.memoryTopology(mem, hostMem, slots) diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 6c35550ad3..ebdfc4a3b8 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -177,7 +177,7 @@ const ( type qemuArchBase struct { qemuMachine govmmQemu.Machine qemuExePath string - memoryOffset uint32 + memoryOffset uint64 nestedRun bool vhost bool disableNvdimm bool From 6be8bf5c6678478917f88b099a5c22d047db5c60 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 14 Jun 2021 15:18:48 -0500 Subject: [PATCH 2/3] docs: update annotations documentation update documentation to reflect value type of `memory_offset` Signed-off-by: Julio Montes --- docs/how-to/how-to-set-sandbox-config-kata.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md index d530a0c5ed..044e1576c0 100644 --- a/docs/how-to/how-to-set-sandbox-config-kata.md +++ b/docs/how-to/how-to-set-sandbox-config-kata.md @@ -79,7 +79,7 @@ There are several kinds of Kata configurations and they are listed below. 
| `io.katacontainers.config.hypervisor.kernel` | string | the kernel used to boot the container VM | | `io.katacontainers.config.hypervisor.machine_accelerators` | string | machine specific accelerators for the hypervisor | | `io.katacontainers.config.hypervisor.machine_type` | string | the type of machine being emulated by the hypervisor | -| `io.katacontainers.config.hypervisor.memory_offset` | uint32| the memory space used for `nvdimm` device by the hypervisor | +| `io.katacontainers.config.hypervisor.memory_offset` | uint64| the memory space used for `nvdimm` device by the hypervisor | | `io.katacontainers.config.hypervisor.memory_slots` | uint32| the memory slots assigned to the VM by the hypervisor | | `io.katacontainers.config.hypervisor.msize_9p` | uint32 | the `msize` for 9p shares | | `io.katacontainers.config.hypervisor.path` | string | the hypervisor that will run the container VM | From 361bee91f7edf40bccf4d44abb437a2a953f4136 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 14 Jun 2021 15:27:03 -0500 Subject: [PATCH 3/3] runtime/virtcontainers: fix alignment structures fix alignment of qemuArchBase and HypervisorConfig structures Signed-off-by: Julio Montes --- src/runtime/virtcontainers/hypervisor.go | 62 ++++++++++---------- src/runtime/virtcontainers/qemu_arch_base.go | 10 ++-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index d6bc97c118..53cebe7e35 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -222,6 +222,10 @@ type Param struct { // HypervisorConfig is the hypervisor configuration. type HypervisorConfig struct { + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device is used to hot-plug the PCIe device + PCIeRootPort uint32 + // NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32 @@ -241,9 +245,6 @@ type HypervisorConfig struct { // MemSlots specifies default memory slots the VM. MemSlots uint32 - // MemOffset specifies memory space for nvdimm device - MemOffset uint64 - // VirtioFSCacheSize is the DAX cache size in MiB VirtioFSCacheSize uint32 @@ -310,9 +311,6 @@ type HypervisorConfig struct { // entropy (/dev/random, /dev/urandom or real hardware RNG device) EntropySource string - // EntropySourceList is the list of valid entropy sources - EntropySourceList []string - // Shared file system type: // - virtio-9p (default) // - virtio-fs @@ -321,6 +319,12 @@ type HypervisorConfig struct { // VirtioFSDaemon is the virtio-fs vhost-user daemon path VirtioFSDaemon string + // File based memory backend root directory + FileBackedMemRootDir string + + // EntropySourceList is the list of valid entropy sources + EntropySourceList []string + // VirtioFSDaemonList is the list of valid virtiofs names for annotations VirtioFSDaemonList []string @@ -330,8 +334,8 @@ type HypervisorConfig struct { // VirtioFSExtraArgs passes options to virtiofsd daemon VirtioFSExtraArgs []string - // File based memory backend root directory - FileBackedMemRootDir string + // Enable annotations by name + EnableAnnotations []string // FileBackedMemRootList is the list of valid root directories values for annotations FileBackedMemRootList []string @@ -339,6 +343,9 @@ type HypervisorConfig struct { // PFlash image paths PFlash []string + // VhostUserStorePathList is the list of valid values for vhost-user paths + VhostUserStorePathList []string + // customAssets is a map of assets. // Each value in that map takes precedence over the configured assets. // For example, if there is a value for the "kernel" key in this map, @@ -401,9 +408,14 @@ type HypervisorConfig struct { // root bus instead of a bridge. 
HotplugVFIOOnRootBus bool - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 + // GuestMemoryDumpPaging is used to indicate if enable paging + // for QEMU dump-guest-memory command + GuestMemoryDumpPaging bool + + // Enable confidential guest support. + // Enable or disable different hardware features, ranging + // from memory encryption to both memory and CPU-state encryption and integrity. + ConfidentialGuest bool // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool @@ -421,8 +433,8 @@ type HypervisorConfig struct { // related folders, sockets and device nodes should be. VhostUserStorePath string - // VhostUserStorePathList is the list of valid values for vhost-user paths - VhostUserStorePathList []string + // GuestCoredumpPath is the path in host for saving guest memory dump + GuestMemoryDumpPath string // GuestHookPath is the path within the VM that will be used for 'drop-in' hooks GuestHookPath string @@ -434,30 +446,18 @@ type HypervisorConfig struct { // SELinux label for the VM SELinuxProcessLabel string + // SGXEPCSize specifies the size in bytes for the EPC Section. + // Enable SGX. Hardware-based isolation and memory encryption. + SGXEPCSize int64 + // RxRateLimiterMaxRate is used to control network I/O inbound bandwidth on VM level. RxRateLimiterMaxRate uint64 // TxRateLimiterMaxRate is used to control network I/O outbound bandwidth on VM level. TxRateLimiterMaxRate uint64 - // SGXEPCSize specifies the size in bytes for the EPC Section. - // Enable SGX. Hardware-based isolation and memory encryption. 
- SGXEPCSize int64 - - // Enable annotations by name - EnableAnnotations []string - - // GuestCoredumpPath is the path in host for saving guest memory dump - GuestMemoryDumpPath string - - // GuestMemoryDumpPaging is used to indicate if enable paging - // for QEMU dump-guest-memory command - GuestMemoryDumpPaging bool - - // Enable confidential guest support. - // Enable or disable different hardware features, ranging - // from memory encryption to both memory and CPU-state encryption and integrity. - ConfidentialGuest bool + // MemOffset specifies memory space for nvdimm device + MemOffset uint64 } // vcpu mapping from vcpu number to thread number diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index ebdfc4a3b8..0948401a62 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -175,20 +175,20 @@ const ( ) type qemuArchBase struct { - qemuMachine govmmQemu.Machine - qemuExePath string memoryOffset uint64 + networkIndex int nestedRun bool vhost bool disableNvdimm bool dax bool - networkIndex int + protection guestProtection + qemuMachine govmmQemu.Machine + qemuExePath string + PFlash []string kernelParamsNonDebug []Param kernelParamsDebug []Param kernelParams []Param Bridges []types.Bridge - PFlash []string - protection guestProtection } const (