From 0da03741579637f8e4d24265c35b83ebaba4fb83 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 16 Jul 2024 11:45:53 +0200
Subject: [PATCH 1/6] qemu: enable NUMA support

Link QEMU with libnuma and enable the NUMA feature flag.

Signed-off-by: Konstantin Khlebnikov
---
 tools/packaging/scripts/configure-hypervisor.sh | 3 +++
 tools/packaging/static-build/qemu/Dockerfile    | 1 +
 2 files changed, 4 insertions(+)

diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh
index 73f51817b2..f72e85a19c 100755
--- a/tools/packaging/scripts/configure-hypervisor.sh
+++ b/tools/packaging/scripts/configure-hypervisor.sh
@@ -445,6 +445,9 @@ generate_qemu_options() {
 	qemu_options+=(functionality:--enable-cap-ng)
 	qemu_options+=(functionality:--enable-seccomp)
 
+	# Support NUMA topology
+	qemu_options+=(functionality:--enable-numa)
+
 	# AVX2 is enabled by default by x86_64, make sure it's enabled only
 	# for that architecture
 	if [ "$arch" == x86_64 ]; then
diff --git a/tools/packaging/static-build/qemu/Dockerfile b/tools/packaging/static-build/qemu/Dockerfile
index a6d79f77b1..e0549e3672 100644
--- a/tools/packaging/static-build/qemu/Dockerfile
+++ b/tools/packaging/static-build/qemu/Dockerfile
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get upgrade -y && \
 	    libglib2.0-dev${DPKG_ARCH} git \
 	    libltdl-dev${DPKG_ARCH} \
 	    libmount-dev${DPKG_ARCH} \
+	    libnuma-dev${DPKG_ARCH} \
 	    libpixman-1-dev${DPKG_ARCH} \
 	    libselinux1-dev${DPKG_ARCH} \
 	    libtool${DPKG_ARCH} \

From a202741b8f06cd7dc727b2e16ee21b2326f2e09a Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 11:16:57 +0200
Subject: [PATCH 2/6] kernel: enable x86-64 ACPI NUMA detection

CONFIG_NUMA is already on, but without CONFIG_X86_64_ACPI_NUMA the kernel
cannot detect the NUMA topology.

Signed-off-by: Konstantin Khlebnikov
---
 tools/packaging/kernel/configs/fragments/x86_64/acpi.conf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
index 6c260c0bae..d41ff0fee6 100644
--- a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
+++ b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
@@ -4,6 +4,8 @@ CONFIG_X86_INTEL_PSTATE=y
 # Firecracker needs this to support `vcpu_count`
 CONFIG_X86_MPPARSE=y
 
+CONFIG_X86_64_ACPI_NUMA=y
+
 CONFIG_ACPI_CPU_FREQ_PSS=y
 CONFIG_ACPI_HOTPLUG_IOAPIC=y
 CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
From c7a56ab7116df1e4000f6327c055d7c2da5c85c4 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 16:54:28 +0200
Subject: [PATCH 3/6] runtime: add hypervisor options for NUMA topology

With enable_numa=true the hypervisor exposes the host NUMA topology as is:
VM NUMA nodes are mapped to host nodes 1:1 and vCPUs are bound to the
related host CPUs.

The "numa_mapping" option allows redefining the NUMA node mapping:
- map each VM node to a particular host node or to several host nodes
- emulate NUMA on a host without NUMA (useful for tests)

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/pkg/katautils/config.go           | 17 +++++++
 src/runtime/pkg/oci/utils.go                  |  8 +++
 src/runtime/virtcontainers/hypervisor.go      |  7 +++
 .../virtcontainers/hypervisor_config_linux.go |  9 ++++
 .../pkg/annotations/annotations.go            |  3 ++
 src/runtime/virtcontainers/types/sandbox.go   |  6 +++
 src/runtime/virtcontainers/utils/utils.go     | 49 +++++++++++++++++++
 .../virtcontainers/utils/utils_linux.go       | 23 +++++++++
 8 files changed, 122 insertions(+)

diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 648b40435d..5a9c0d80d3 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -25,6 +25,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
 	vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
 	exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
 	"github.com/pbnjay/memory"
 	"github.com/sirupsen/logrus"
@@ -63,6 +64,8 @@ const (
 	// the maximum valid loglevel for the hypervisor
 	maxHypervisorLoglevel uint32 = 3
 
+	// the maximum number of NUMA nodes in the Linux kernel: 1 << CONFIG_NODES_SHIFT, where CONFIG_NODES_SHIFT is at most 10.
+	maxNumNUMA uint32 = 1024
 	errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section"
 )
@@ -149,6 +152,8 @@ type hypervisor struct {
 	VirtioMem            bool     `toml:"enable_virtio_mem"`
 	IOMMU                bool     `toml:"enable_iommu"`
 	IOMMUPlatform        bool     `toml:"enable_iommu_platform"`
+	NUMA                 bool     `toml:"enable_numa"`
+	NUMAMapping          []string `toml:"numa_mapping"`
 	Debug                bool     `toml:"enable_debug"`
 	DisableNestingChecks bool     `toml:"disable_nesting_checks"`
 	EnableIOThreads      bool     `toml:"enable_iothreads"`
@@ -695,6 +700,18 @@ func (h hypervisor) getIOMMUPlatform() bool {
 	return h.IOMMUPlatform
 }
 
+func (h hypervisor) defaultNUMANodes() []types.NUMANode {
+	if !h.NUMA {
+		return nil
+	}
+	numaNodes, err := utils.GetNUMANodes(h.NUMAMapping)
+	if err != nil {
+		kataUtilsLogger.WithError(err).Warn("Cannot construct NUMA nodes.")
+		return nil
+	}
+	return numaNodes
+}
+
 func (h hypervisor) getRemoteHypervisorSocket() string {
 	if h.RemoteHypervisorSocket == "" {
 		return defaultRemoteHypervisorSocket
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index e4f1e562e4..cbe15ee1d8 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -727,6 +727,14 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 		return err
 	}
 
+	if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
+		numaNodes, err := vcutils.GetNUMANodes(strings.Fields(annotation))
+		if err != nil {
+			return err
+		}
+		sbConfig.HypervisorConfig.NUMANodes = numaNodes
+	}
+
 	return nil
 }
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index 8297301645..ad59a03f6f 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -616,6 +616,9 @@ type HypervisorConfig struct {
 	// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
 	IOMMUPlatform bool
 
+	// NUMANodes defines VM NUMA topology and mapping to host NUMA nodes and CPUs.
+	NUMANodes []types.NUMANode
+
 	// DisableNestingChecks is used to override customizations performed
 	// when running on top of another VMM.
 	DisableNestingChecks bool
@@ -879,6 +882,10 @@ func (conf HypervisorConfig) NumVCPUs() uint32 {
 	return RoundUpNumVCPUs(conf.NumVCPUsF)
 }
 
+func (conf HypervisorConfig) NumNUMA() uint32 {
+	return uint32(len(conf.NUMANodes))
+}
+
 func appendParam(params []Param, parameter string, value string) []Param {
 	return append(params, Param{parameter, value})
 }
diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go
index 1bcd47218c..d9ca3bac39 100644
--- a/src/runtime/virtcontainers/hypervisor_config_linux.go
+++ b/src/runtime/virtcontainers/hypervisor_config_linux.go
@@ -58,6 +58,15 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
 		conf.DefaultMaxVCPUs = defaultMaxVCPUs
 	}
 
+	if numNUMA := conf.NumNUMA(); numNUMA > 1 {
+		conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA
+	}
+
+	if conf.ConfidentialGuest && conf.NumVCPUs() != conf.DefaultMaxVCPUs {
+		hvLogger.Warnf("Confidential guests do not support hotplugging of vCPUs. Setting DefaultMaxVCPUs to NumVCPUs (%d)", conf.NumVCPUs())
+		conf.DefaultMaxVCPUs = conf.NumVCPUs()
+	}
+
 	if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS {
 		conf.Msize9p = defaultMsize9p
 	}
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index e71b0525c1..145c0ca865 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -179,6 +179,9 @@ const (
 	// FileBackedMemRootDir is a sandbox annotation to soecify file based memory backend root directory
 	FileBackedMemRootDir = kataAnnotHypervisorPrefix + "file_mem_backend"
 
+	// NUMAMapping is a sandbox annotation that specifies mapping VM NUMA nodes to host NUMA nodes.
+	NUMAMapping = kataAnnotHypervisorPrefix + "numa_mapping"
+
 	//
 	// Shared File System related annotations
 	//
diff --git a/src/runtime/virtcontainers/types/sandbox.go b/src/runtime/virtcontainers/types/sandbox.go
index 29c909c977..40db6a25c4 100644
--- a/src/runtime/virtcontainers/types/sandbox.go
+++ b/src/runtime/virtcontainers/types/sandbox.go
@@ -342,3 +342,9 @@ type Resources struct {
 	Memory      uint
 	MemorySlots uint8
 }
+
+// NUMANode defines VM NUMA node mapping to host NUMA nodes and CPUs.
+type NUMANode struct {
+	HostNodes string
+	HostCPUs  string
+}
diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go
index 4893a0fe26..ac71ebfa98 100644
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -21,6 +21,9 @@ import (
 	"golang.org/x/sys/unix"
 
 	pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 )
 
 const cpBinaryName = "cp"
@@ -507,3 +510,49 @@ func IsDockerContainer(spec *specs.Spec) bool {
 
 	return false
 }
+// GetNUMANodes constructs VM NUMA nodes mapping to host NUMA nodes and host CPUs.
+func GetNUMANodes(numaMapping []string) ([]types.NUMANode, error) {
+	// Add a VM NUMA node for each specified subset of host NUMA nodes.
+	if numNUMA := len(numaMapping); numNUMA > 0 {
+		numaNodes := make([]types.NUMANode, numNUMA)
+		for i, hostNodes := range numaMapping {
+			hostNodeIds, err := cpuset.Parse(hostNodes)
+			if err != nil {
+				return nil, err
+			}
+			numaNodes[i].HostNodes = hostNodes
+			for _, nodeId := range hostNodeIds.ToSlice() {
+				cpus, err := getHostNUMANodeCPUs(nodeId)
+				if err != nil {
+					return nil, err
+				}
+				if numaNodes[i].HostCPUs != "" {
+					numaNodes[i].HostCPUs += ","
+				}
+				numaNodes[i].HostCPUs += cpus
+			}
+		}
+		return numaNodes, nil
+	}
+
+	// Add a VM NUMA node for each host NUMA node.
+	nodeIds, err := getHostNUMANodes()
+	if err != nil {
+		return nil, err
+	}
+	if len(nodeIds) == 0 {
+		return nil, nil
+	}
+
+	numaNodes := make([]types.NUMANode, len(nodeIds))
+	for i, nodeId := range nodeIds {
+		cpus, err := getHostNUMANodeCPUs(nodeId)
+		if err != nil {
+			return nil, err
+		}
+		numaNodes[i].HostNodes = fmt.Sprintf("%d", nodeId)
+		numaNodes[i].HostCPUs = cpus
+	}
+
+	return numaNodes, nil
+}
diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go
index a31b8d3511..0ddb4dd5a9 100644
--- a/src/runtime/virtcontainers/utils/utils_linux.go
+++ b/src/runtime/virtcontainers/utils/utils_linux.go
@@ -19,6 +19,8 @@ import (
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 )
 
 var ioctlFunc = Ioctl
@@ -197,3 +199,24 @@ func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) b
 
 	return pidRunning
 }
+
+func getHostNUMANodes() ([]int, error) {
+	data, err := os.ReadFile("/sys/devices/system/node/online")
+	if err != nil {
+		return nil, err
+	}
+	nodes, err := cpuset.Parse(strings.TrimSuffix(string(data), "\n"))
+	if err != nil {
+		return nil, err
+	}
+	return nodes.ToSlice(), nil
+}
+
+func getHostNUMANodeCPUs(nodeId int) (string, error) {
+	fileName := fmt.Sprintf("/sys/devices/system/node/node%v/cpulist", nodeId)
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSuffix(string(data), "\n"), nil
+}
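(Illustrative usage sketch, not part of the series: with PATCH 3 applied, NUMA passthrough would presumably be switched on in the QEMU hypervisor section of Kata's configuration.toml, or redefined per sandbox through the new annotation. Only the option and annotation names come from the diffs above; the configuration section placement and the example values are assumptions.)

    # configuration.toml, QEMU hypervisor section (assumed location)
    enable_numa = true            # mirror the host NUMA topology 1:1
    numa_mapping = ["0", "1"]     # optional: two VM nodes backed by host nodes 0 and 1

    # per-sandbox override; the value is a whitespace-separated list with one
    # cpuset-style host-node list per VM NUMA node
    io.katacontainers.config.hypervisor.numa_mapping: "0 1"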
From 48b57cb2aeb54433859bb357cbf8a1e205ddcadf Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 17:00:31 +0200
Subject: [PATCH 4/6] runtime: enforce NUMA topology by VCPU threads affinity

For optimal performance vCPU threads must run only on NUMA-local host CPUs.

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/virtcontainers/hypervisor.go      |  3 +-
 src/runtime/virtcontainers/mock_hypervisor.go |  2 +-
 src/runtime/virtcontainers/sandbox.go         | 67 ++++++++++++++-----
 3 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index ad59a03f6f..3871a31c44 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -686,7 +686,8 @@ type HypervisorConfig struct {
 
 // vcpu mapping from vcpu number to thread number
 type VcpuThreadIDs struct {
-	vcpus map[int]int
+	vcpus        map[int]int
+	vcpuToNodeId map[int]uint32
 }
 
 func (conf *HypervisorConfig) CheckTemplateConfig() error {
diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go
index 7d6da561fa..c969a33273 100644
--- a/src/runtime/virtcontainers/mock_hypervisor.go
+++ b/src/runtime/virtcontainers/mock_hypervisor.go
@@ -113,7 +113,7 @@ func (m *mockHypervisor) Disconnect(ctx context.Context) {
 
 func (m *mockHypervisor) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
 	vcpus := map[int]int{0: os.Getpid()}
-	return VcpuThreadIDs{vcpus}, nil
+	return VcpuThreadIDs{vcpus, nil}, nil
 }
 
 func (m *mockHypervisor) Cleanup(ctx context.Context) error {
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index 3711da7f5e..a60c2b8c79 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -2852,11 +2852,12 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
 // is set to true. Then it fetches sandbox's number of vCPU threads
 // and number of CPUs in CPUSet. If the two are equal, each vCPU thread
 // is then pinned to one fixed CPU in CPUSet.
+// For enforcing NUMA topology, vCPU threads are pinned to the related host CPUs.
 func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	if s.config == nil {
 		return fmt.Errorf("no sandbox config found")
 	}
-	if !s.config.EnableVCPUsPinning {
+	if !s.config.EnableVCPUsPinning && s.config.HypervisorConfig.NumNUMA() == 0 {
 		return nil
 	}
 
@@ -2875,23 +2876,59 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	}
 	cpuSetSlice := cpuSet.ToSlice()
 
-	// check if vCPU thread numbers and CPU numbers are equal
-	numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
-	// if not equal, we should reset threads scheduling to random pattern
-	if numVCPUs != numCPUs {
-		if s.isVCPUsPinningOn {
-			s.isVCPUsPinningOn = false
-			return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
+	// Build the NUMA topology mapping, or fake a single node if NUMA is not enabled.
+	numNodes := max(s.config.HypervisorConfig.NumNUMA(), 1)
+
+	numaNodeVCPUs := make([][]int, numNodes)
+	for vcpuId := range vCPUThreadsMap.vcpus {
+		nodeId, ok := vCPUThreadsMap.vcpuToNodeId[vcpuId]
+		if !ok || nodeId >= numNodes {
+			nodeId = 0
 		}
-		return nil
+		numaNodeVCPUs[nodeId] = append(numaNodeVCPUs[nodeId], vcpuId)
 	}
-	// if equal, we can use vCPU thread pinning
-	for i, tid := range vCPUThreadsMap.vcpus {
-		if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
-			if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
-				return err
+
+	numaNodeCPUs := make([][]int, numNodes)
+	numaNodeCPUs[0] = cpuSetSlice
+	for i, numaNode := range s.config.HypervisorConfig.NUMANodes {
+		nodeHostCPUs, err := cpuset.Parse(numaNode.HostCPUs)
+		if err != nil {
+			return fmt.Errorf("failed to parse NUMA CPUSet string: %v", err)
+		}
+		if !cpuSet.IsEmpty() {
+			nodeHostCPUs = cpuSet.Intersection(nodeHostCPUs)
+		}
+		numaNodeCPUs[i] = nodeHostCPUs.ToSlice()
+	}
+
+	// check if vCPU threads have enough host CPUs in each NUMA node
+	// if not enough, we should reset threads affinity.
+	for nodeId := range numaNodeVCPUs {
+		numVCPUs, numCPUs := len(numaNodeVCPUs[nodeId]), len(numaNodeCPUs[nodeId])
+		if s.config.EnableVCPUsPinning && numVCPUs != numCPUs || numVCPUs > numCPUs {
+			if s.isVCPUsPinningOn {
+				s.isVCPUsPinningOn = false
+				return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
+			}
+			virtLog.Warningf("cannot pin vcpus in vm numa node %d", nodeId)
+			return nil
+		}
+	}
+
+	for nodeId := range numaNodeVCPUs {
+		nodeCpuSetSlice := numaNodeCPUs[nodeId]
+		for i, vcpuId := range numaNodeVCPUs[nodeId] {
+			tid := vCPUThreadsMap.vcpus[vcpuId]
+			affinity := nodeCpuSetSlice
+			if s.config.EnableVCPUsPinning {
+				affinity = affinity[i : i+1]
+			}
+			if err := resCtrl.SetThreadAffinity(tid, affinity); err != nil {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("failed to set vcpu thread %d cpu affinity to %v: %v", tid, affinity, err)
 			}
-			return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
 		}
 	}
 	s.isVCPUsPinningOn = true
From 1a2430a4d95c54c56d27a3583cb3a3c315b786a4 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 17:02:20 +0200
Subject: [PATCH 5/6] govmm: setup qemu VM NUMA topology for initial CPUs and memory

If NUMA topology is enabled:
- group CPUs into per-NUMA-node sockets
- split memory into per-NUMA-node modules
- report the NUMA node of each vCPU thread

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/pkg/govmm/qemu/qemu.go           | 88 ++++++++++++++++----
 src/runtime/pkg/govmm/qemu/qemu_test.go      | 20 ++++-
 src/runtime/pkg/katautils/config.go          |  1 +
 src/runtime/virtcontainers/qemu.go           | 32 +++++++
 src/runtime/virtcontainers/qemu_amd64.go     |  5 +-
 src/runtime/virtcontainers/qemu_arch_base.go | 13 ++-
 6 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index aefa1ffdf7..35d8b028ed 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -2590,8 +2590,13 @@ type SMP struct {
 	Sockets uint32
 
 	// MaxCPUs is the maximum number of VCPUs that a VM can have.
-	// This value, if non-zero, MUST BE equal to or greater than CPUs
+	// This value, if non-zero, MUST BE equal to or greater than CPUs,
+	// and must be equal to Sockets * Cores * Threads if all are non-zero.
 	MaxCPUs uint32
+
+	// NumNUMA is the number of NUMA nodes that the VM has.
+	// The value MUST NOT be greater than Sockets.
+	NumNUMA uint32
 }
 
 // Memory is the guest memory configuration structure.
@@ -2612,6 +2617,26 @@ type Memory struct {
 	// Path is the file path of the memory device. It points to a local
 	// file path used by FileBackedMem.
 	Path string
+
+	// MemoryModules describes memory topology and allocation policy.
+	MemoryModules []MemoryModule
 }
+
+// MemoryModule represents a single module of guest memory.
+type MemoryModule struct {
+	// Size of memory module.
+	// It should be suffixed with M or G for sizes in megabytes or
+	// gigabytes respectively.
+	Size string
+
+	// NodeId is the guest NUMA node this module belongs to.
+	NodeId uint32
+
+	// HostNodes defines host NUMA nodes mask for binding memory allocation.
+	HostNodes string
+
+	// MemoryPolicy defines host NUMA memory allocation policy.
+	MemoryPolicy string
+}
 
 // Kernel is the guest kernel configuration structure.
@@ -2997,11 +3022,25 @@ func (config *Config) appendCPUs() error {
 				return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs)
 			}
+			topologyCPUs := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
+			if topologyCPUs != 0 && config.SMP.MaxCPUs != topologyCPUs {
+				return fmt.Errorf("MaxCPUs %d must match CPU topology: sockets %d * cores %d * threads %d",
+					config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads)
+			}
 			SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
 		}
 
 		config.qemuParams = append(config.qemuParams, "-smp")
 		config.qemuParams = append(config.qemuParams, strings.Join(SMPParams, ","))
+
+		if config.SMP.NumNUMA > 1 {
+			// Interleave CPU sockets over NUMA nodes.
+			for socketId := uint32(0); socketId < config.SMP.Sockets; socketId++ {
+				nodeId := socketId % config.SMP.NumNUMA
+				config.qemuParams = append(config.qemuParams, "-numa",
+					fmt.Sprintf("cpu,node-id=%d,socket-id=%d", nodeId, socketId))
+			}
+		}
 	}
 
 	return nil
 }
@@ -3070,34 +3109,49 @@ func (config *Config) appendMemoryKnobs() {
 	if config.Memory.Size == "" {
 		return
 	}
-	var objMemParam, numaMemParam string
-	dimmName := "dimm1"
+	if len(config.Memory.MemoryModules) == 0 {
+		config.appendMemoryModule("dimm1", MemoryModule{Size: config.Memory.Size})
+	}
+	for i, memModule := range config.Memory.MemoryModules {
+		config.appendMemoryModule(fmt.Sprintf("dimm%d", i), memModule)
+	}
+}
+
+func (config *Config) appendMemoryModule(memoryId string, memoryModule MemoryModule) {
+	var objMemParams []string
+
 	if config.Knobs.HugePages {
-		objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages"
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-file", "mem-path=/dev/hugepages")
 	} else if config.Knobs.FileBackedMem && config.Memory.Path != "" {
-		objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-file", "mem-path="+config.Memory.Path)
 	} else {
-		objMemParam = "memory-backend-ram,id=" + dimmName + ",size=" + config.Memory.Size
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-ram")
+	}
+
+	objMemParams = append(objMemParams, "id="+memoryId, "size="+memoryModule.Size)
+
+	if memoryModule.MemoryPolicy != "" {
+		objMemParams = append(objMemParams, "policy="+memoryModule.MemoryPolicy)
+	}
+
+	if memoryModule.HostNodes != "" {
+		objMemParams = append(objMemParams, "host-nodes="+memoryModule.HostNodes)
 	}
 
 	if config.Knobs.MemShared {
-		objMemParam += ",share=on"
+		objMemParams = append(objMemParams, "share=on")
 	}
 
 	if config.Knobs.MemPrealloc {
-		objMemParam += ",prealloc=on"
+		objMemParams = append(objMemParams, "prealloc=on")
 	}
-	config.qemuParams = append(config.qemuParams, "-object")
-	config.qemuParams = append(config.qemuParams, objMemParam)
+
+	config.qemuParams = append(config.qemuParams, "-object", strings.Join(objMemParams, ","))
 
 	if isDimmSupported(config) {
-		config.qemuParams = append(config.qemuParams, "-numa")
-		config.qemuParams = append(config.qemuParams, numaMemParam)
+		config.qemuParams = append(config.qemuParams, "-numa",
+			fmt.Sprintf("node,nodeid=%d,memdev=%s", memoryModule.NodeId, memoryId))
 	} else {
-		config.qemuParams = append(config.qemuParams, "-machine")
-		config.qemuParams = append(config.qemuParams, "memory-backend="+dimmName)
+		config.qemuParams = append(config.qemuParams, "-machine", "memory-backend="+memoryId)
 	}
 }
diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go
index 181eb6506d..4c1c771dd2 100644
--- a/src/runtime/pkg/govmm/qemu/qemu_test.go
+++ b/src/runtime/pkg/govmm/qemu/qemu_test.go
@@ -666,7 +666,7 @@ func TestAppendMemory(t *testing.T) {
 	testAppend(memory, memoryString, t)
 }
 
-var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=6"
+var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=4"
 
 func TestAppendCPUs(t *testing.T) {
 	smp := SMP{
 		CPUs:    2,
 		Sockets: 2,
 		Cores:   1,
 		Threads: 2,
-		MaxCPUs: 6,
+		MaxCPUs: 4,
 	}
 
 	testAppend(smp, cpusString, t)
@@ -696,6 +696,22 @@ func TestFailToAppendCPUs(t *testing.T) {
 	}
 }
 
+func TestFailToAppendCPUsWrongTopology(t *testing.T) {
+	config := Config{
+		SMP: SMP{
+			CPUs:    2,
+			Sockets: 2,
+			Cores:   1,
+			Threads: 2,
+			MaxCPUs: 6,
+		},
+	}
+
+	if err := config.appendCPUs(); err == nil {
+		t.Fatalf("Expected appendCPUs to fail")
+	}
+}
+
 var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off"
 
 var qmpSingleSocketString = "-qmp unix:path=cc-qmp"
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 5a9c0d80d3..8b451376df 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -965,6 +965,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		HugePages:             h.HugePages,
 		IOMMU:                 h.IOMMU,
 		IOMMUPlatform:         h.getIOMMUPlatform(),
+		NUMANodes:             h.defaultNUMANodes(),
 		FileBackedMemRootDir:  h.FileBackedMemRootDir,
 		FileBackedMemRootList: h.FileBackedMemRootList,
 		Debug:                 h.Debug,
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index ba86c3d63a..8465087594 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -2538,6 +2538,36 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
 	return memory
 }
 
+func genericNUMAMemoryModules(memoryMb, memoryAlign uint64, numaNodes []types.NUMANode) []govmmQemu.MemoryModule {
+	if len(numaNodes) == 0 {
+		return nil
+	}
+
+	memoryModules := make([]govmmQemu.MemoryModule, 0, len(numaNodes))
+
+	// Divide memory among NUMA nodes.
+	memoryPerNode := memoryMb / uint64(len(numaNodes))
+	memoryPerNode -= memoryPerNode % memoryAlign
+
+	// First NUMA node gets more if memory is not divided evenly.
+	moduleSize := memoryMb - memoryPerNode*uint64(len(numaNodes)-1)
+
+	for nodeId, numaNode := range numaNodes {
+		memoryModules = append(memoryModules, govmmQemu.MemoryModule{
+			Size:         fmt.Sprintf("%dM", moduleSize),
+			NodeId:       uint32(nodeId),
+			HostNodes:    numaNode.HostNodes,
+			MemoryPolicy: "interleave",
+		})
+		moduleSize = memoryPerNode
+		if moduleSize == 0 {
+			break
+		}
+	}
+
+	return memoryModules
+}
+
 // genericAppendPCIeRootPort appends to devices the given pcie-root-port
 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
 	var (
@@ -2662,9 +2692,11 @@ func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
 	}
 
 	tid.vcpus = make(map[int]int, len(cpuInfos))
+	tid.vcpuToNodeId = make(map[int]uint32, len(cpuInfos))
 	for _, i := range cpuInfos {
 		if i.ThreadID > 0 {
 			tid.vcpus[i.CPUIndex] = i.ThreadID
+			tid.vcpuToNodeId[i.CPUIndex] = uint32(i.Props.Node)
 		}
 	}
 	return tid, nil
diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go
index ade7356eb6..e426062fd8 100644
--- a/src/runtime/virtcontainers/qemu_amd64.go
+++ b/src/runtime/virtcontainers/qemu_amd64.go
@@ -116,6 +116,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
 		qemuArchBase: qemuArchBase{
 			qemuMachine:          *mp,
 			qemuExePath:          defaultQemuPath,
+			numaNodes:            config.NUMANodes,
 			memoryOffset:         config.MemOffset,
 			kernelParamsNonDebug: kernelParamsNonDebug,
 			kernelParamsDebug:    kernelParamsDebug,
@@ -196,7 +197,9 @@ func (q *qemuAmd64) cpuModel() string {
 }
 
 func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
-	return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
+	memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
+	memory.MemoryModules = genericNUMAMemoryModules(memoryMb, 4, q.numaNodes)
+	return memory
 }
 
 // Is Memory Hotplug supported by this architecture/machine type combination?
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index fd92be7724..c1fc3e5f45 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -183,6 +183,7 @@ type qemuArchBase struct {
 	kernelParamsDebug    []Param
 	kernelParams         []Param
 	Bridges              []types.Bridge
+	numaNodes            []types.NUMANode
 	memoryOffset         uint64
 	networkIndex         int
 	// Exclude from lint checking for it is ultimately only used in architecture-specific code
@@ -321,12 +322,20 @@ func (q *qemuArchBase) bridges(number uint32) {
 }
 
 func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
+	numNUMA := uint32(len(q.numaNodes))
+
+	numSockets := numNUMA
+	if numSockets == 0 {
+		numSockets = maxvcpus
+	}
+
 	smp := govmmQemu.SMP{
 		CPUs:    vcpus,
-		Sockets: maxvcpus,
-		Cores:   defaultCores,
+		Sockets: numSockets,
+		Cores:   maxvcpus / numSockets / defaultThreads,
 		Threads: defaultThreads,
 		MaxCPUs: maxvcpus,
+		NumNUMA: numNUMA,
 	}
 
 	return smp
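(Illustrative note, not part of the series: for a guest with two NUMA nodes backed by host nodes 0 and 1, 2 boot vCPUs, 8 maximum vCPUs and 2048 MiB of memory, the code above would generate QEMU arguments roughly like the lines below. The example assumes the default of one thread per core and a DIMM-capable machine type; the concrete numbers are only for illustration.)

    -smp 2,cores=4,threads=1,sockets=2,maxcpus=8
    -numa cpu,node-id=0,socket-id=0
    -numa cpu,node-id=1,socket-id=1
    -object memory-backend-ram,id=dimm0,size=1024M,policy=interleave,host-nodes=0
    -numa node,nodeid=0,memdev=dimm0
    -object memory-backend-ram,id=dimm1,size=1024M,policy=interleave,host-nodes=1
    -numa node,nodeid=1,memdev=dimm1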
From 086b4708fc235e0d535ea8c14370a58572226f21 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 16:00:12 +0200
Subject: [PATCH 6/6] runtime: add annotation default_maxmemory

There is currently no annotation for disabling VM memory hotplug.
This one should be useful at least until memory hotplug becomes fully
NUMA-aware.

Signed-off-by: Konstantin Khlebnikov
---
 docs/how-to/how-to-set-sandbox-config-kata.md         |  1 +
 src/runtime/pkg/oci/utils.go                          | 10 ++++++++++
 src/runtime/virtcontainers/hypervisor_config_linux.go |  4 ++++
 .../virtcontainers/pkg/annotations/annotations.go     |  3 +++
 4 files changed, 18 insertions(+)

diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md
index 776a7e02f3..8957aceb56 100644
--- a/docs/how-to/how-to-set-sandbox-config-kata.md
+++ b/docs/how-to/how-to-set-sandbox-config-kata.md
@@ -47,6 +47,7 @@ There are several kinds of Kata configurations and they are listed below.
 | `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`|
 | `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) |
 | `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
+| `io.katacontainers.config.hypervisor.default_maxmemory` | uint32| the maximum memory assigned for a VM by the hypervisor in `MiB` |
 | `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
 | `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor |
 | `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used |
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index cbe15ee1d8..535934f925 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -662,6 +662,16 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 		return err
 	}
 
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxMemory).setUintWithCheck(func(memorySz uint64) error {
+		if memorySz < vc.MinHypervisorMemory && sbConfig.HypervisorType != vc.RemoteHypervisor {
+			return fmt.Errorf("Memory specified in annotation %s is less than minimum required %d, please specify a larger value", vcAnnotations.DefaultMaxMemory, vc.MinHypervisorMemory)
+		}
+		sbConfig.HypervisorConfig.DefaultMaxMemorySize = memorySz
+		return nil
+	}); err != nil {
+		return err
+	}
+
 	if err := newAnnotationConfiguration(ocispec, vcAnnotations.MemSlots).setUint(func(mslots uint64) {
 		if mslots > 0 {
 			sbConfig.HypervisorConfig.MemSlots = uint32(mslots)
diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go
index d9ca3bac39..12c89ae467 100644
--- a/src/runtime/virtcontainers/hypervisor_config_linux.go
+++ b/src/runtime/virtcontainers/hypervisor_config_linux.go
@@ -44,6 +44,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
 		conf.MemorySize = defaultMemSzMiB
 	}
 
+	if uint64(conf.MemorySize) > conf.DefaultMaxMemorySize {
+		conf.MemorySize = uint32(conf.DefaultMaxMemorySize)
+	}
+
 	if conf.DefaultBridges == 0 {
 		conf.DefaultBridges = defaultBridges
 	}
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index 145c0ca865..41cf536775 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -155,6 +155,9 @@ const (
 	// DefaultMemory is a sandbox annotation for the memory assigned for a VM by the hypervisor.
 	DefaultMemory = kataAnnotHypervisorPrefix + "default_memory"
 
+	// DefaultMaxMemory is a sandbox annotation for the maximum memory assigned for a VM by the hypervisor.
+	DefaultMaxMemory = kataAnnotHypervisorPrefix + "default_maxmemory"
+
 	// MemSlots is a sandbox annotation to specify the memory slots assigned to the VM by the hypervisor.
 	MemSlots = kataAnnotHypervisorPrefix + "memory_slots"
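(Illustrative note, not part of the series: the new annotation takes the maximum VM memory in MiB, mirroring the existing default_maxmemory configuration option; setting it equal to default_memory would presumably cap the VM at its initial memory size, which is the hotplug-disabling effect the commit message describes. Example value only.)

    io.katacontainers.config.hypervisor.default_maxmemory: "2048"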