From 0da03741579637f8e4d24265c35b83ebaba4fb83 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 16 Jul 2024 11:45:53 +0200
Subject: [PATCH 1/6] qemu: enable NUMA support

Link QEMU with libnuma and enable the NUMA feature flag.

Signed-off-by: Konstantin Khlebnikov
---
 tools/packaging/scripts/configure-hypervisor.sh | 3 +++
 tools/packaging/static-build/qemu/Dockerfile    | 1 +
 2 files changed, 4 insertions(+)

diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh
index 73f51817b2..f72e85a19c 100755
--- a/tools/packaging/scripts/configure-hypervisor.sh
+++ b/tools/packaging/scripts/configure-hypervisor.sh
@@ -445,6 +445,9 @@ generate_qemu_options() {
 	qemu_options+=(functionality:--enable-cap-ng)
 	qemu_options+=(functionality:--enable-seccomp)
 
+	# Support NUMA topology
+	qemu_options+=(functionality:--enable-numa)
+
 	# AVX2 is enabled by default by x86_64, make sure it's enabled only
 	# for that architecture
 	if [ "$arch" == x86_64 ]; then
diff --git a/tools/packaging/static-build/qemu/Dockerfile b/tools/packaging/static-build/qemu/Dockerfile
index a6d79f77b1..e0549e3672 100644
--- a/tools/packaging/static-build/qemu/Dockerfile
+++ b/tools/packaging/static-build/qemu/Dockerfile
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get upgrade -y && \
 	    libglib2.0-dev${DPKG_ARCH} git \
 	    libltdl-dev${DPKG_ARCH} \
 	    libmount-dev${DPKG_ARCH} \
+	    libnuma-dev${DPKG_ARCH} \
 	    libpixman-1-dev${DPKG_ARCH} \
 	    libselinux1-dev${DPKG_ARCH} \
 	    libtool${DPKG_ARCH} \

From a202741b8f06cd7dc727b2e16ee21b2326f2e09a Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 11:16:57 +0200
Subject: [PATCH 2/6] kernel: enable x86-64 ACPI NUMA detection

CONFIG_NUMA is already on, but without CONFIG_X86_64_ACPI_NUMA the kernel
cannot detect the NUMA topology.

Signed-off-by: Konstantin Khlebnikov
---
 tools/packaging/kernel/configs/fragments/x86_64/acpi.conf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
index 6c260c0bae..d41ff0fee6 100644
--- a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
+++ b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf
@@ -4,6 +4,8 @@ CONFIG_X86_INTEL_PSTATE=y
 # Firecracker needs this to support `vcpu_count`
 CONFIG_X86_MPPARSE=y
 
+CONFIG_X86_64_ACPI_NUMA=y
+
 CONFIG_ACPI_CPU_FREQ_PSS=y
 CONFIG_ACPI_HOTPLUG_IOAPIC=y
 CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
From c7a56ab7116df1e4000f6327c055d7c2da5c85c4 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 16:54:28 +0200
Subject: [PATCH 3/6] runtime: add hypervisor options for NUMA topology

With enable_numa=true the hypervisor exposes the host NUMA topology as is:
VM NUMA nodes are mapped to host nodes 1:1 and vCPUs are bound to the
related host CPUs.

The "numa_mapping" option allows redefining the NUMA node mapping:
- map each VM node to a particular host node or to several host nodes
- emulate NUMA on a host without NUMA (useful for tests)

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/pkg/katautils/config.go           | 17 +++++++
 src/runtime/pkg/oci/utils.go                  |  8 +++
 src/runtime/virtcontainers/hypervisor.go      |  7 +++
 .../virtcontainers/hypervisor_config_linux.go |  9 ++++
 .../pkg/annotations/annotations.go            |  3 ++
 src/runtime/virtcontainers/types/sandbox.go   |  6 +++
 src/runtime/virtcontainers/utils/utils.go     | 49 +++++++++++++++++++
 .../virtcontainers/utils/utils_linux.go       | 23 +++++++++
 8 files changed, 122 insertions(+)

diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 648b40435d..5a9c0d80d3 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -25,6 +25,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
 	vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
 	exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
 	"github.com/pbnjay/memory"
 	"github.com/sirupsen/logrus"
@@ -63,6 +64,8 @@ const (
 	// the maximum valid loglevel for the hypervisor
 	maxHypervisorLoglevel uint32 = 3
 
+	// the maximum number of NUMA nodes in the Linux kernel: 1 << CONFIG_NODES_SHIFT, where CONFIG_NODES_SHIFT is at most 10.
+	maxNumNUMA uint32 = 1024
 	errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section"
 )
@@ -149,6 +152,8 @@ type hypervisor struct {
 	VirtioMem            bool     `toml:"enable_virtio_mem"`
 	IOMMU                bool     `toml:"enable_iommu"`
 	IOMMUPlatform        bool     `toml:"enable_iommu_platform"`
+	NUMA                 bool     `toml:"enable_numa"`
+	NUMAMapping          []string `toml:"numa_mapping"`
 	Debug                bool     `toml:"enable_debug"`
 	DisableNestingChecks bool     `toml:"disable_nesting_checks"`
 	EnableIOThreads      bool     `toml:"enable_iothreads"`
@@ -695,6 +700,18 @@ func (h hypervisor) getIOMMUPlatform() bool {
 	return h.IOMMUPlatform
 }
 
+func (h hypervisor) defaultNUMANodes() []types.NUMANode {
+	if !h.NUMA {
+		return nil
+	}
+	numaNodes, err := utils.GetNUMANodes(h.NUMAMapping)
+	if err != nil {
+		kataUtilsLogger.WithError(err).Warn("Cannot construct NUMA nodes.")
+		return nil
+	}
+	return numaNodes
+}
+
 func (h hypervisor) getRemoteHypervisorSocket() string {
 	if h.RemoteHypervisorSocket == "" {
 		return defaultRemoteHypervisorSocket
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index e4f1e562e4..cbe15ee1d8 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -727,6 +727,14 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 		return err
 	}
 
+	if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
+		numaNodes, err := vcutils.GetNUMANodes(strings.Fields(annotation))
+		if err != nil {
+			return err
+		}
+		sbConfig.HypervisorConfig.NUMANodes = numaNodes
+	}
+
 	return nil
 }
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index 8297301645..ad59a03f6f 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -616,6 +616,9 @@ type HypervisorConfig struct {
 	// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
 	IOMMUPlatform bool
 
+	// NUMANodes defines VM NUMA topology and mapping to host NUMA nodes and CPUs.
+	NUMANodes []types.NUMANode
+
 	// DisableNestingChecks is used to override customizations performed
 	// when running on top of another VMM.
 	DisableNestingChecks bool
@@ -879,6 +882,10 @@ func (conf HypervisorConfig) NumVCPUs() uint32 {
 	return RoundUpNumVCPUs(conf.NumVCPUsF)
 }
 
+func (conf HypervisorConfig) NumNUMA() uint32 {
+	return uint32(len(conf.NUMANodes))
+}
+
 func appendParam(params []Param, parameter string, value string) []Param {
 	return append(params, Param{parameter, value})
 }
diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go
index 1bcd47218c..d9ca3bac39 100644
--- a/src/runtime/virtcontainers/hypervisor_config_linux.go
+++ b/src/runtime/virtcontainers/hypervisor_config_linux.go
@@ -58,6 +58,15 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
 		conf.DefaultMaxVCPUs = defaultMaxVCPUs
 	}
 
+	if numNUMA := conf.NumNUMA(); numNUMA > 1 {
+		conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA
+	}
+
+	if conf.ConfidentialGuest && conf.NumVCPUs() != conf.DefaultMaxVCPUs {
+		hvLogger.Warnf("Confidential guests do not support hotplugging of vCPUs. Setting DefaultMaxVCPUs to NumVCPUs (%d)", conf.NumVCPUs())
+		conf.DefaultMaxVCPUs = conf.NumVCPUs()
+	}
+
 	if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS {
 		conf.Msize9p = defaultMsize9p
 	}
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index e71b0525c1..145c0ca865 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -179,6 +179,9 @@ const (
 	// FileBackedMemRootDir is a sandbox annotation to soecify file based memory backend root directory
 	FileBackedMemRootDir = kataAnnotHypervisorPrefix + "file_mem_backend"
 
+	// NUMAMapping is a sandbox annotation that specifies mapping VM NUMA nodes to host NUMA nodes.
+	NUMAMapping = kataAnnotHypervisorPrefix + "numa_mapping"
+
 	//
 	// Shared File System related annotations
 	//
diff --git a/src/runtime/virtcontainers/types/sandbox.go b/src/runtime/virtcontainers/types/sandbox.go
index 29c909c977..40db6a25c4 100644
--- a/src/runtime/virtcontainers/types/sandbox.go
+++ b/src/runtime/virtcontainers/types/sandbox.go
@@ -342,3 +342,9 @@ type Resources struct {
 	Memory      uint
 	MemorySlots uint8
 }
+
+// NUMANode defines VM NUMA node mapping to host NUMA nodes and CPUs.
+type NUMANode struct {
+	HostNodes string
+	HostCPUs  string
+}
diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go
index 4893a0fe26..ac71ebfa98 100644
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -21,6 +21,9 @@ import (
 	"golang.org/x/sys/unix"
 
 	pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 )
 
 const cpBinaryName = "cp"
@@ -507,3 +510,49 @@ func IsDockerContainer(spec *specs.Spec) bool {
 
 	return false
 }
+// GetNUMANodes constructs VM NUMA nodes mapping to host NUMA nodes and host CPUs.
+func GetNUMANodes(numaMapping []string) ([]types.NUMANode, error) {
+	// Add a VM NUMA node for each specified subset of host NUMA nodes.
+	if numNUMA := len(numaMapping); numNUMA > 0 {
+		numaNodes := make([]types.NUMANode, numNUMA)
+		for i, hostNodes := range numaMapping {
+			hostNodeIds, err := cpuset.Parse(hostNodes)
+			if err != nil {
+				return nil, err
+			}
+			numaNodes[i].HostNodes = hostNodes
+			for _, nodeId := range hostNodeIds.ToSlice() {
+				cpus, err := getHostNUMANodeCPUs(nodeId)
+				if err != nil {
+					return nil, err
+				}
+				if numaNodes[i].HostCPUs != "" {
+					numaNodes[i].HostCPUs += ","
+				}
+				numaNodes[i].HostCPUs += cpus
+			}
+		}
+		return numaNodes, nil
+	}
+
+	// Add a VM NUMA node for each host NUMA node.
+	nodeIds, err := getHostNUMANodes()
+	if err != nil {
+		return nil, err
+	}
+	if len(nodeIds) == 0 {
+		return nil, nil
+	}
+
+	numaNodes := make([]types.NUMANode, len(nodeIds))
+	for i, nodeId := range nodeIds {
+		cpus, err := getHostNUMANodeCPUs(nodeId)
+		if err != nil {
+			return nil, err
+		}
+		numaNodes[i].HostNodes = fmt.Sprintf("%d", nodeId)
+		numaNodes[i].HostCPUs = cpus
+	}
+
+	return numaNodes, nil
+}
diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go
index a31b8d3511..0ddb4dd5a9 100644
--- a/src/runtime/virtcontainers/utils/utils_linux.go
+++ b/src/runtime/virtcontainers/utils/utils_linux.go
@@ -19,6 +19,8 @@ import (
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 )
 
 var ioctlFunc = Ioctl
@@ -197,3 +199,24 @@ func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) b
 
 	return pidRunning
 }
+
+func getHostNUMANodes() ([]int, error) {
+	data, err := os.ReadFile("/sys/devices/system/node/online")
+	if err != nil {
+		return nil, err
+	}
+	nodes, err := cpuset.Parse(strings.TrimSuffix(string(data), "\n"))
+	if err != nil {
+		return nil, err
+	}
+	return nodes.ToSlice(), nil
+}
+
+func getHostNUMANodeCPUs(nodeId int) (string, error) {
+	fileName := fmt.Sprintf("/sys/devices/system/node/node%v/cpulist", nodeId)
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSuffix(string(data), "\n"), nil
+}
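(Illustrative usage sketch, not part of the series: with PATCH 3 applied, NUMA passthrough would presumably be switched on in the QEMU hypervisor section of Kata's configuration.toml, or redefined per sandbox through the new annotation. Only the option and annotation names come from the diffs above; the configuration section placement and the example values are assumptions.)

    # configuration.toml, QEMU hypervisor section (assumed location)
    enable_numa = true            # mirror the host NUMA topology 1:1
    numa_mapping = ["0", "1"]     # optional: two VM nodes backed by host nodes 0 and 1

    # per-sandbox override; the value is a whitespace-separated list with one
    # cpuset-style host-node list per VM NUMA node
    io.katacontainers.config.hypervisor.numa_mapping: "0 1"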
From 48b57cb2aeb54433859bb357cbf8a1e205ddcadf Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 17:00:31 +0200
Subject: [PATCH 4/6] runtime: enforce NUMA topology by VCPU threads affinity

For optimal performance vCPU threads must run only on NUMA-local host CPUs.

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/virtcontainers/hypervisor.go      |  3 +-
 src/runtime/virtcontainers/mock_hypervisor.go |  2 +-
 src/runtime/virtcontainers/sandbox.go         | 67 ++++++++++++++-----
 3 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index ad59a03f6f..3871a31c44 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -686,7 +686,8 @@ type HypervisorConfig struct {
 
 // vcpu mapping from vcpu number to thread number
 type VcpuThreadIDs struct {
-	vcpus map[int]int
+	vcpus        map[int]int
+	vcpuToNodeId map[int]uint32
 }
 
 func (conf *HypervisorConfig) CheckTemplateConfig() error {
diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go
index 7d6da561fa..c969a33273 100644
--- a/src/runtime/virtcontainers/mock_hypervisor.go
+++ b/src/runtime/virtcontainers/mock_hypervisor.go
@@ -113,7 +113,7 @@ func (m *mockHypervisor) Disconnect(ctx context.Context) {
 
 func (m *mockHypervisor) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
 	vcpus := map[int]int{0: os.Getpid()}
-	return VcpuThreadIDs{vcpus}, nil
+	return VcpuThreadIDs{vcpus, nil}, nil
 }
 
 func (m *mockHypervisor) Cleanup(ctx context.Context) error {
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index 3711da7f5e..a60c2b8c79 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -2852,11 +2852,12 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
 // is set to true. Then it fetches sandbox's number of vCPU threads
 // and number of CPUs in CPUSet. If the two are equal, each vCPU thread
 // is then pinned to one fixed CPU in CPUSet.
+// For enforcing NUMA topology, vCPU threads are pinned to the related host CPUs.
 func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	if s.config == nil {
 		return fmt.Errorf("no sandbox config found")
 	}
-	if !s.config.EnableVCPUsPinning {
+	if !s.config.EnableVCPUsPinning && s.config.HypervisorConfig.NumNUMA() == 0 {
 		return nil
 	}
 
@@ -2875,23 +2876,59 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	}
 	cpuSetSlice := cpuSet.ToSlice()
 
-	// check if vCPU thread numbers and CPU numbers are equal
-	numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
-	// if not equal, we should reset threads scheduling to random pattern
-	if numVCPUs != numCPUs {
-		if s.isVCPUsPinningOn {
-			s.isVCPUsPinningOn = false
-			return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
+	// Build the NUMA topology mapping, or fake a single node if NUMA is not enabled.
+	numNodes := max(s.config.HypervisorConfig.NumNUMA(), 1)
+
+	numaNodeVCPUs := make([][]int, numNodes)
+	for vcpuId := range vCPUThreadsMap.vcpus {
+		nodeId, ok := vCPUThreadsMap.vcpuToNodeId[vcpuId]
+		if !ok || nodeId >= numNodes {
+			nodeId = 0
 		}
-		return nil
+		numaNodeVCPUs[nodeId] = append(numaNodeVCPUs[nodeId], vcpuId)
 	}
-	// if equal, we can use vCPU thread pinning
-	for i, tid := range vCPUThreadsMap.vcpus {
-		if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
-			if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
-				return err
+
+	numaNodeCPUs := make([][]int, numNodes)
+	numaNodeCPUs[0] = cpuSetSlice
+	for i, numaNode := range s.config.HypervisorConfig.NUMANodes {
+		nodeHostCPUs, err := cpuset.Parse(numaNode.HostCPUs)
+		if err != nil {
+			return fmt.Errorf("failed to parse NUMA CPUSet string: %v", err)
+		}
+		if !cpuSet.IsEmpty() {
+			nodeHostCPUs = cpuSet.Intersection(nodeHostCPUs)
+		}
+		numaNodeCPUs[i] = nodeHostCPUs.ToSlice()
+	}
+
+	// check if vCPU threads have enough host CPUs in each NUMA node
+	// if not enough, we should reset threads affinity.
+	for nodeId := range numaNodeVCPUs {
+		numVCPUs, numCPUs := len(numaNodeVCPUs[nodeId]), len(numaNodeCPUs[nodeId])
+		if s.config.EnableVCPUsPinning && numVCPUs != numCPUs || numVCPUs > numCPUs {
+			if s.isVCPUsPinningOn {
+				s.isVCPUsPinningOn = false
+				return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
+			}
+			virtLog.Warningf("cannot pin vcpus in vm numa node %d", nodeId)
+			return nil
+		}
+	}
+
+	for nodeId := range numaNodeVCPUs {
+		nodeCpuSetSlice := numaNodeCPUs[nodeId]
+		for i, vcpuId := range numaNodeVCPUs[nodeId] {
+			tid := vCPUThreadsMap.vcpus[vcpuId]
+			affinity := nodeCpuSetSlice
+			if s.config.EnableVCPUsPinning {
+				affinity = affinity[i : i+1]
+			}
+			if err := resCtrl.SetThreadAffinity(tid, affinity); err != nil {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("failed to set vcpu thread %d cpu affinity to %v: %v", tid, affinity, err)
 			}
-			return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
 		}
 	}
 	s.isVCPUsPinningOn = true
From 1a2430a4d95c54c56d27a3583cb3a3c315b786a4 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 17:02:20 +0200
Subject: [PATCH 5/6] govmm: setup qemu VM NUMA topology for initial CPUs and memory

If NUMA topology is enabled:
- group CPUs into per-NUMA-node sockets
- split memory into per-NUMA-node modules
- report the NUMA node of each vCPU thread

Signed-off-by: Konstantin Khlebnikov
---
 src/runtime/pkg/govmm/qemu/qemu.go           | 88 ++++++++++++++++----
 src/runtime/pkg/govmm/qemu/qemu_test.go      | 20 ++++-
 src/runtime/pkg/katautils/config.go          |  1 +
 src/runtime/virtcontainers/qemu.go           | 32 +++++++
 src/runtime/virtcontainers/qemu_amd64.go     |  5 +-
 src/runtime/virtcontainers/qemu_arch_base.go | 13 ++-
 6 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index aefa1ffdf7..35d8b028ed 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -2590,8 +2590,13 @@ type SMP struct {
 	Sockets uint32
 
 	// MaxCPUs is the maximum number of VCPUs that a VM can have.
-	// This value, if non-zero, MUST BE equal to or greater than CPUs
+	// This value, if non-zero, MUST BE equal to or greater than CPUs,
+	// and must be equal to Sockets * Cores * Threads if all are non-zero.
 	MaxCPUs uint32
+
+	// NumNUMA is the number of NUMA nodes that the VM has.
+	// The value MUST NOT be greater than Sockets.
+	NumNUMA uint32
 }
 
 // Memory is the guest memory configuration structure.
@@ -2612,6 +2617,26 @@ type Memory struct {
 	// Path is the file path of the memory device. It points to a local
 	// file path used by FileBackedMem.
 	Path string
+
+	// MemoryModules describes memory topology and allocation policy.
+	MemoryModules []MemoryModule
 }
+
+// MemoryModule represents a single module of guest memory.
+type MemoryModule struct {
+	// Size of memory module.
+	// It should be suffixed with M or G for sizes in megabytes or
+	// gigabytes respectively.
+	Size string
+
+	// NodeId is the guest NUMA node this module belongs to.
+	NodeId uint32
+
+	// HostNodes defines host NUMA nodes mask for binding memory allocation.
+	HostNodes string
+
+	// MemoryPolicy defines host NUMA memory allocation policy.
+	MemoryPolicy string
+}
 
 // Kernel is the guest kernel configuration structure.
@@ -2997,11 +3022,25 @@ func (config *Config) appendCPUs() error {
 				return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs)
 			}
+			topologyCPUs := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
+			if topologyCPUs != 0 && config.SMP.MaxCPUs != topologyCPUs {
+				return fmt.Errorf("MaxCPUs %d must match CPU topology: sockets %d * cores %d * threads %d",
+					config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads)
+			}
 			SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
 		}
 
 		config.qemuParams = append(config.qemuParams, "-smp")
 		config.qemuParams = append(config.qemuParams, strings.Join(SMPParams, ","))
+
+		if config.SMP.NumNUMA > 1 {
+			// Interleave CPU sockets over NUMA nodes.
+			for socketId := uint32(0); socketId < config.SMP.Sockets; socketId++ {
+				nodeId := socketId % config.SMP.NumNUMA
+				config.qemuParams = append(config.qemuParams, "-numa",
+					fmt.Sprintf("cpu,node-id=%d,socket-id=%d", nodeId, socketId))
+			}
+		}
 	}
 
 	return nil
 }
@@ -3070,34 +3109,49 @@ func (config *Config) appendMemoryKnobs() {
 	if config.Memory.Size == "" {
 		return
 	}
-	var objMemParam, numaMemParam string
-	dimmName := "dimm1"
+	if len(config.Memory.MemoryModules) == 0 {
+		config.appendMemoryModule("dimm1", MemoryModule{Size: config.Memory.Size})
+	}
+	for i, memModule := range config.Memory.MemoryModules {
+		config.appendMemoryModule(fmt.Sprintf("dimm%d", i), memModule)
+	}
+}
+
+func (config *Config) appendMemoryModule(memoryId string, memoryModule MemoryModule) {
+	var objMemParams []string
+
 	if config.Knobs.HugePages {
-		objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages"
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-file", "mem-path=/dev/hugepages")
 	} else if config.Knobs.FileBackedMem && config.Memory.Path != "" {
-		objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-file", "mem-path="+config.Memory.Path)
 	} else {
-		objMemParam = "memory-backend-ram,id=" + dimmName + ",size=" + config.Memory.Size
-		numaMemParam = "node,memdev=" + dimmName
+		objMemParams = append(objMemParams, "memory-backend-ram")
+	}
+
+	objMemParams = append(objMemParams, "id="+memoryId, "size="+memoryModule.Size)
+
+	if memoryModule.MemoryPolicy != "" {
+		objMemParams = append(objMemParams, "policy="+memoryModule.MemoryPolicy)
+	}
+
+	if memoryModule.HostNodes != "" {
+		objMemParams = append(objMemParams, "host-nodes="+memoryModule.HostNodes)
 	}
 
 	if config.Knobs.MemShared {
-		objMemParam += ",share=on"
+		objMemParams = append(objMemParams, "share=on")
 	}
 
 	if config.Knobs.MemPrealloc {
-		objMemParam += ",prealloc=on"
+		objMemParams = append(objMemParams, "prealloc=on")
 	}
-	config.qemuParams = append(config.qemuParams, "-object")
-	config.qemuParams = append(config.qemuParams, objMemParam)
+
+	config.qemuParams = append(config.qemuParams, "-object", strings.Join(objMemParams, ","))
 
 	if isDimmSupported(config) {
-		config.qemuParams = append(config.qemuParams, "-numa")
-		config.qemuParams = append(config.qemuParams, numaMemParam)
+		config.qemuParams = append(config.qemuParams, "-numa",
+			fmt.Sprintf("node,nodeid=%d,memdev=%s", memoryModule.NodeId, memoryId))
 	} else {
-		config.qemuParams = append(config.qemuParams, "-machine")
-		config.qemuParams = append(config.qemuParams, "memory-backend="+dimmName)
+		config.qemuParams = append(config.qemuParams, "-machine", "memory-backend="+memoryId)
 	}
 }
diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go
index 181eb6506d..4c1c771dd2 100644
--- a/src/runtime/pkg/govmm/qemu/qemu_test.go
+++ b/src/runtime/pkg/govmm/qemu/qemu_test.go
@@ -666,7 +666,7 @@ func TestAppendMemory(t *testing.T) {
 	testAppend(memory, memoryString, t)
 }
 
-var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=6"
+var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=4"
 
 func TestAppendCPUs(t *testing.T) {
 	smp := SMP{
 		CPUs:    2,
 		Sockets: 2,
 		Cores:   1,
 		Threads: 2,
-		MaxCPUs: 6,
+		MaxCPUs: 4,
 	}
 
 	testAppend(smp, cpusString, t)
@@ -696,6 +696,22 @@ func TestFailToAppendCPUs(t *testing.T) {
 	}
 }
 
+func TestFailToAppendCPUsWrongTopology(t *testing.T) {
+	config := Config{
+		SMP: SMP{
+			CPUs:    2,
+			Sockets: 2,
+			Cores:   1,
+			Threads: 2,
+			MaxCPUs: 6,
+		},
+	}
+
+	if err := config.appendCPUs(); err == nil {
+		t.Fatalf("Expected appendCPUs to fail")
+	}
+}
+
 var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off"
 
 var qmpSingleSocketString = "-qmp unix:path=cc-qmp"
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 5a9c0d80d3..8b451376df 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -965,6 +965,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		HugePages:             h.HugePages,
 		IOMMU:                 h.IOMMU,
 		IOMMUPlatform:         h.getIOMMUPlatform(),
+		NUMANodes:             h.defaultNUMANodes(),
 		FileBackedMemRootDir:  h.FileBackedMemRootDir,
 		FileBackedMemRootList: h.FileBackedMemRootList,
 		Debug:                 h.Debug,
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index ba86c3d63a..8465087594 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -2538,6 +2538,36 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
 	return memory
 }
 
+func genericNUMAMemoryModules(memoryMb, memoryAlign uint64, numaNodes []types.NUMANode) []govmmQemu.MemoryModule {
+	if len(numaNodes) == 0 {
+		return nil
+	}
+
+	memoryModules := make([]govmmQemu.MemoryModule, 0, len(numaNodes))
+
+	// Divide memory among NUMA nodes.
+	memoryPerNode := memoryMb / uint64(len(numaNodes))
+	memoryPerNode -= memoryPerNode % memoryAlign
+
+	// First NUMA node gets more if memory is not divided evenly.
+	moduleSize := memoryMb - memoryPerNode*uint64(len(numaNodes)-1)
+
+	for nodeId, numaNode := range numaNodes {
+		memoryModules = append(memoryModules, govmmQemu.MemoryModule{
+			Size:         fmt.Sprintf("%dM", moduleSize),
+			NodeId:       uint32(nodeId),
+			HostNodes:    numaNode.HostNodes,
+			MemoryPolicy: "interleave",
+		})
+		moduleSize = memoryPerNode
+		if moduleSize == 0 {
+			break
+		}
+	}
+
+	return memoryModules
+}
+
 // genericAppendPCIeRootPort appends to devices the given pcie-root-port
 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
 	var (
@@ -2662,9 +2692,11 @@ func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
 	}
 
 	tid.vcpus = make(map[int]int, len(cpuInfos))
+	tid.vcpuToNodeId = make(map[int]uint32, len(cpuInfos))
 	for _, i := range cpuInfos {
 		if i.ThreadID > 0 {
 			tid.vcpus[i.CPUIndex] = i.ThreadID
+			tid.vcpuToNodeId[i.CPUIndex] = uint32(i.Props.Node)
 		}
 	}
 	return tid, nil
diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go
index ade7356eb6..e426062fd8 100644
--- a/src/runtime/virtcontainers/qemu_amd64.go
+++ b/src/runtime/virtcontainers/qemu_amd64.go
@@ -116,6 +116,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
 		qemuArchBase: qemuArchBase{
 			qemuMachine:          *mp,
 			qemuExePath:          defaultQemuPath,
+			numaNodes:            config.NUMANodes,
 			memoryOffset:         config.MemOffset,
 			kernelParamsNonDebug: kernelParamsNonDebug,
 			kernelParamsDebug:    kernelParamsDebug,
@@ -196,7 +197,9 @@ func (q *qemuAmd64) cpuModel() string {
 }
 
 func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
-	return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
+	memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
+	memory.MemoryModules = genericNUMAMemoryModules(memoryMb, 4, q.numaNodes)
+	return memory
 }
 
 // Is Memory Hotplug supported by this architecture/machine type combination?
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index fd92be7724..c1fc3e5f45 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -183,6 +183,7 @@ type qemuArchBase struct {
 	kernelParamsDebug    []Param
 	kernelParams         []Param
 	Bridges              []types.Bridge
+	numaNodes            []types.NUMANode
 	memoryOffset         uint64
 	networkIndex         int
 	// Exclude from lint checking for it is ultimately only used in architecture-specific code
@@ -321,12 +322,20 @@ func (q *qemuArchBase) bridges(number uint32) {
 }
 
 func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
+	numNUMA := uint32(len(q.numaNodes))
+
+	numSockets := numNUMA
+	if numSockets == 0 {
+		numSockets = maxvcpus
+	}
+
 	smp := govmmQemu.SMP{
 		CPUs:    vcpus,
-		Sockets: maxvcpus,
-		Cores:   defaultCores,
+		Sockets: numSockets,
+		Cores:   maxvcpus / numSockets / defaultThreads,
 		Threads: defaultThreads,
 		MaxCPUs: maxvcpus,
+		NumNUMA: numNUMA,
 	}
 
 	return smp
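(Illustrative note, not part of the series: for a guest with two NUMA nodes backed by host nodes 0 and 1, 2 boot vCPUs, 8 maximum vCPUs and 2048 MiB of memory, the code above would generate QEMU arguments roughly like the lines below. The example assumes the default of one thread per core and a DIMM-capable machine type; the concrete numbers are only for illustration.)

    -smp 2,cores=4,threads=1,sockets=2,maxcpus=8
    -numa cpu,node-id=0,socket-id=0
    -numa cpu,node-id=1,socket-id=1
    -object memory-backend-ram,id=dimm0,size=1024M,policy=interleave,host-nodes=0
    -numa node,nodeid=0,memdev=dimm0
    -object memory-backend-ram,id=dimm1,size=1024M,policy=interleave,host-nodes=1
    -numa node,nodeid=1,memdev=dimm1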
From 086b4708fc235e0d535ea8c14370a58572226f21 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 18 Jul 2024 16:00:12 +0200
Subject: [PATCH 6/6] runtime: add annotation default_maxmemory

There is currently no annotation for disabling VM memory hotplug.
This one should be useful at least until memory hotplug becomes fully
NUMA-aware.

Signed-off-by: Konstantin Khlebnikov
---
 docs/how-to/how-to-set-sandbox-config-kata.md         |  1 +
 src/runtime/pkg/oci/utils.go                          | 10 ++++++++++
 src/runtime/virtcontainers/hypervisor_config_linux.go |  4 ++++
 .../virtcontainers/pkg/annotations/annotations.go     |  3 +++
 4 files changed, 18 insertions(+)

diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md
index 776a7e02f3..8957aceb56 100644
--- a/docs/how-to/how-to-set-sandbox-config-kata.md
+++ b/docs/how-to/how-to-set-sandbox-config-kata.md
@@ -47,6 +47,7 @@ There are several kinds of Kata configurations and they are listed below.
 | `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`|
 | `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) |
 | `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
+| `io.katacontainers.config.hypervisor.default_maxmemory` | uint32| the maximum memory assigned for a VM by the hypervisor in `MiB` |
 | `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
 | `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor |
 | `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used |
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index cbe15ee1d8..535934f925 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -662,6 +662,16 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 		return err
 	}
 
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxMemory).setUintWithCheck(func(memorySz uint64) error {
+		if memorySz < vc.MinHypervisorMemory && sbConfig.HypervisorType != vc.RemoteHypervisor {
+			return fmt.Errorf("Memory specified in annotation %s is less than minimum required %d, please specify a larger value", vcAnnotations.DefaultMaxMemory, vc.MinHypervisorMemory)
+		}
+		sbConfig.HypervisorConfig.DefaultMaxMemorySize = memorySz
+		return nil
+	}); err != nil {
+		return err
+	}
+
 	if err := newAnnotationConfiguration(ocispec, vcAnnotations.MemSlots).setUint(func(mslots uint64) {
 		if mslots > 0 {
 			sbConfig.HypervisorConfig.MemSlots = uint32(mslots)
diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go
index d9ca3bac39..12c89ae467 100644
--- a/src/runtime/virtcontainers/hypervisor_config_linux.go
+++ b/src/runtime/virtcontainers/hypervisor_config_linux.go
@@ -44,6 +44,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
 		conf.MemorySize = defaultMemSzMiB
 	}
 
+	if uint64(conf.MemorySize) > conf.DefaultMaxMemorySize {
+		conf.MemorySize = uint32(conf.DefaultMaxMemorySize)
+	}
+
 	if conf.DefaultBridges == 0 {
 		conf.DefaultBridges = defaultBridges
 	}
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index 145c0ca865..41cf536775 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -155,6 +155,9 @@ const (
 	// DefaultMemory is a sandbox annotation for the memory assigned for a VM by the hypervisor.
 	DefaultMemory = kataAnnotHypervisorPrefix + "default_memory"
 
+	// DefaultMaxMemory is a sandbox annotation for the maximum memory assigned for a VM by the hypervisor.
+	DefaultMaxMemory = kataAnnotHypervisorPrefix + "default_maxmemory"
+
 	// MemSlots is a sandbox annotation to specify the memory slots assigned to the VM by the hypervisor.
 	MemSlots = kataAnnotHypervisorPrefix + "memory_slots"
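(Illustrative note, not part of the series: the new annotation takes the maximum VM memory in MiB, mirroring the existing default_maxmemory configuration option; setting it equal to default_memory would presumably cap the VM at its initial memory size, which is the hotplug-disabling effect the commit message describes. Example value only.)

    io.katacontainers.config.hypervisor.default_maxmemory: "2048"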