From 8d2ecaabb58675455151aef7560d20106ebb4c66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 30 Apr 2026 12:26:31 +0200 Subject: [PATCH 01/14] versions: Bump QEMU to v11.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For more details see QEMU's release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/ GPU experimental variants are also using v11.0.0 plus one patch to solve issues related to NUMA mapping. Signed-off-by: Fabiano Fidêncio --- .../qemu/patches/11.0.x/no_patches.txt | 0 ...m_convert_memory-calls-crossing-memo.patch | 94 +++++++++++++++++++ ...m_convert_memory-calls-crossing-memo.patch | 94 +++++++++++++++++++ .../tag_patches/v11.0.0/no_patches.txt | 0 versions.yaml | 8 +- 5 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 tools/packaging/qemu/patches/11.0.x/no_patches.txt create mode 100644 tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch create mode 100644 tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch create mode 100644 tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt diff --git a/tools/packaging/qemu/patches/11.0.x/no_patches.txt b/tools/packaging/qemu/patches/11.0.x/no_patches.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch new file mode 100644 index 0000000000..b80adaa58f --- /dev/null +++ b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch @@ -0,0 +1,94 @@ +From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001 +From: Ashish Kalra +Date: Thu, 
18 Sep 2025 22:10:35 +0000 +Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory + regions + +Page conversion call can span multiple memory regions, potentially +resulting in a conversion failure if the memory range being converted +extends beyond the boundaries of the referenced memory region. + +Handle the case of page conversion call straddling across memory +regions. + +Signed-off-by: Ashish Kalra +Signed-off-by: Michael Roth +--- + accel/kvm/kvm-all.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 63230743d0..a1b2c3e5f4 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu) + int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + { + MemoryRegionSection section; ++ hwaddr convert_size; + ram_addr_t offset; + MemoryRegion *mr; + RAMBlock *rb; +@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + return ret; + } + ++ /* ++ * Page conversions can span multiple memory regions, for example, if two ++ * memory backends are added to support two different NUMA nodes/policies. ++ */ ++next_memory_region: + section = memory_region_find(get_system_memory(), start, size); + mr = section.mr; + if (!mr) { +@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + goto out_unref; + } + ++ convert_size = (section.offset_within_region + size > mr->size) ? 
++ mr->size - section.offset_within_region : size; ++ + if (to_private) { +- ret = kvm_set_memory_attributes_private(start, size); ++ ret = kvm_set_memory_attributes_private(start, convert_size); + } else { +- ret = kvm_set_memory_attributes_shared(start, size); ++ ret = kvm_set_memory_attributes_shared(start, convert_size); + } + if (ret) { + goto out_unref; +@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + rb = qemu_ram_block_from_host(addr, false, &offset); + + ret = ram_block_attributes_state_change(rb->attributes, +- offset, size, to_private); ++ offset, convert_size, to_private); + if (ret) { + error_report("Failed to notify the listener the state change of " + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", +- start, size, to_private ? "private" : "shared"); ++ start, convert_size, to_private ? "private" : "shared"); + goto out_unref; + } + +@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + */ + goto out_unref; + } +- ret = ram_block_discard_range(rb, offset, size); ++ ret = ram_block_discard_range(rb, offset, convert_size); + } else { +- ret = ram_block_discard_guest_memfd_range(rb, offset, size); ++ ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size); ++ } ++ ++ if (size - convert_size) { ++ start += convert_size; ++ size -= convert_size; ++ goto next_memory_region; + } + + out_unref: +-- +2.43.0 + diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch new file mode 100644 index 0000000000..b80adaa58f --- /dev/null +++ b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch @@ -0,0 +1,94 @@ +From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001 +From: Ashish Kalra +Date: 
Thu, 18 Sep 2025 22:10:35 +0000 +Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory + regions + +Page conversion call can span multiple memory regions, potentially +resulting in a conversion failure if the memory range being converted +extends beyond the boundaries of the referenced memory region. + +Handle the case of page conversion call straddling across memory +regions. + +Signed-off-by: Ashish Kalra +Signed-off-by: Michael Roth +--- + accel/kvm/kvm-all.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 63230743d0..a1b2c3e5f4 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu) + int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + { + MemoryRegionSection section; ++ hwaddr convert_size; + ram_addr_t offset; + MemoryRegion *mr; + RAMBlock *rb; +@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + return ret; + } + ++ /* ++ * Page conversions can span multiple memory regions, for example, if two ++ * memory backends are added to support two different NUMA nodes/policies. ++ */ ++next_memory_region: + section = memory_region_find(get_system_memory(), start, size); + mr = section.mr; + if (!mr) { +@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + goto out_unref; + } + ++ convert_size = (section.offset_within_region + size > mr->size) ? 
++ mr->size - section.offset_within_region : size; ++ + if (to_private) { +- ret = kvm_set_memory_attributes_private(start, size); ++ ret = kvm_set_memory_attributes_private(start, convert_size); + } else { +- ret = kvm_set_memory_attributes_shared(start, size); ++ ret = kvm_set_memory_attributes_shared(start, convert_size); + } + if (ret) { + goto out_unref; +@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + rb = qemu_ram_block_from_host(addr, false, &offset); + + ret = ram_block_attributes_state_change(rb->attributes, +- offset, size, to_private); ++ offset, convert_size, to_private); + if (ret) { + error_report("Failed to notify the listener the state change of " + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", +- start, size, to_private ? "private" : "shared"); ++ start, convert_size, to_private ? "private" : "shared"); + goto out_unref; + } + +@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + */ + goto out_unref; + } +- ret = ram_block_discard_range(rb, offset, size); ++ ret = ram_block_discard_range(rb, offset, convert_size); + } else { +- ret = ram_block_discard_guest_memfd_range(rb, offset, size); ++ ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size); ++ } ++ ++ if (size - convert_size) { ++ start += convert_size; ++ size -= convert_size; ++ goto next_memory_region; + } + + out_unref: +-- +2.43.0 + diff --git a/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt b/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/versions.yaml b/versions.yaml index 93cad9e9f1..69fb34b0ba 100644 --- a/versions.yaml +++ b/versions.yaml @@ -88,8 +88,8 @@ assets: qemu: description: "VMM that uses KVM" url: "https://github.com/qemu/qemu" - version: "v10.2.1" - tag: "v10.2.1" + version: "v11.0.0" + tag: "v11.0.0" # Do not include any non-full release versions # Break the line *without CR or 
space being appended*, to appease # yamllint, and note the deliberate ' ' at the end of the expression. @@ -107,12 +107,12 @@ assets: qemu-snp-experimental: description: "QEMU with GPU+SNP support" url: "https://github.com/confidential-containers/qemu.git" - tag: "gpu-snp-20260107" + tag: "gpu-snp-20260430" qemu-tdx-experimental: description: "QEMU with GPU+TDX support" url: "https://github.com/confidential-containers/qemu.git" - tag: "gpu-tdx-20260107" + tag: "gpu-tdx-20260430" stratovirt: description: "StratoVirt is an lightweight opensource VMM" From 1e9da61d483f50bb809b2b2b60f57bea9e78fae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 14:58:11 +0200 Subject: [PATCH 02/14] govmm: Add multi-NUMA memory backend and distance matrix support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce NUMANode and NUMADist types, add NUMANodes/NUMADists fields to Config, and implement appendMultiNUMAMemoryKnobs() to generate per-node memory-backend objects with host-nodes/policy=bind, -numa node entries with cpus= ranges, and -numa dist entries for the distance matrix. Gate the multi-NUMA path in appendMemoryKnobs() behind isDimmSupported() to ensure architectures without DIMM support (s390x, riscv64) fall back to the single-node path. Drop 386 from isDimmSupported since 32-bit x86 is not a supported Kata target. 
Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/govmm/qemu/qemu.go | 129 +++++++++++++++-- .../pkg/govmm/qemu/qemu_arch_base_test.go | 8 +- src/runtime/pkg/govmm/qemu/qemu_test.go | 135 ++++++++++++++++++ 3 files changed, 258 insertions(+), 14 deletions(-) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 7cf6915df9..dc7501c87b 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -152,7 +152,7 @@ const ( func isDimmSupported(config *Config) bool { switch runtime.GOARCH { - case "amd64", "386", "ppc64le", "arm64": + case "amd64", "ppc64le", "arm64": if config != nil && config.Machine.Type == MachineTypeMicrovm { // microvm does not support NUMA return false @@ -1586,8 +1586,13 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + // Pin to pcie.0 so pxb-pcie (when present) doesn't capture + // this leaf device as the default bus. 
+ deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-netdev") @@ -1612,8 +1617,11 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1637,8 +1645,11 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, "size=512M") deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1674,8 +1685,11 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { } deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if 
vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -2689,7 +2703,8 @@ type SMP struct { Sockets uint32 // MaxCPUs is the maximum number of VCPUs that a VM can have. - // This value, if non-zero, MUST BE equal to or greater than CPUs + // This value, if non-zero, MUST BE equal to or greater than CPUs, + // and must be equal to Sockets * Cores * Threads if all are non-zero. MaxCPUs uint32 } @@ -2775,6 +2790,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string { return qemuParams } +// NUMANode describes a guest NUMA node and its mapping to host resources. +type NUMANode struct { + // NodeID is the guest NUMA node identifier (0-based). + NodeID uint32 + + // CPUs is the guest vCPU range assigned to this node (e.g. "0-3"). + CPUs string + + // MemSize is the amount of memory for this node (e.g. "512M", "1G"). + MemSize string + + // HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1"). + HostNodes string + + // MemBackendType selects the QEMU memory backend object type. + // Typical values: "memory-backend-ram" or "memory-backend-file". + MemBackendType string + + // MemBackendPath is the mem-path for file-backed memory (hugepages, file-backed). + // Empty when using memory-backend-ram. + MemBackendPath string +} + +// NUMADist describes a NUMA distance entry for `-numa dist`. +type NUMADist struct { + Src uint32 + Dst uint32 + Val uint32 +} + // Knobs regroups a set of qemu boolean settings type Knobs struct { // NoUserConfig prevents qemu from loading user config files. @@ -2922,6 +2967,14 @@ type Config struct { IOThreads []IOThread + // NUMANodes defines multi-NUMA guest topology. When non-empty, + // appendMemoryKnobs creates per-node memory backends and -numa entries + // instead of a single flat memory region. 
+ NUMANodes []NUMANode + + // NUMADists defines inter-node distance entries emitted as -numa dist. + NUMADists []NUMADist + // PidFile is the -pidfile parameter PidFile string @@ -3096,6 +3149,13 @@ func (config *Config) appendCPUs() error { return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs) } + if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 { + expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads + if config.SMP.MaxCPUs != expected { + return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d", + config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected) + } + } SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs)) } @@ -3169,6 +3229,12 @@ func (config *Config) appendMemoryKnobs() { if config.Memory.Size == "" { return } + + if len(config.NUMANodes) > 0 && isDimmSupported(config) { + config.appendMultiNUMAMemoryKnobs() + return + } + var objMemParam, numaMemParam string dimmName := "dimm1" if config.Knobs.HugePages { @@ -3200,6 +3266,49 @@ func (config *Config) appendMemoryKnobs() { } } +func (config *Config) appendMultiNUMAMemoryKnobs() { + for _, node := range config.NUMANodes { + memID := fmt.Sprintf("numa-mem%d", node.NodeID) + + backendType := node.MemBackendType + if backendType == "" { + backendType = "memory-backend-ram" + } + + objMemParam := fmt.Sprintf("%s,id=%s,size=%s", backendType, memID, node.MemSize) + + if node.MemBackendPath != "" { + objMemParam += ",mem-path=" + node.MemBackendPath + } + + if node.HostNodes != "" { + objMemParam += ",host-nodes=" + node.HostNodes + ",policy=bind" + } + + if config.Knobs.MemShared { + objMemParam += ",share=on" + } + if config.Knobs.MemPrealloc { + objMemParam += ",prealloc=on" + } + + config.qemuParams = append(config.qemuParams, "-object") + config.qemuParams = append(config.qemuParams, objMemParam) + 
+ numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID) + if node.CPUs != "" { + numaParam += ",cpus=" + node.CPUs + } + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, numaParam) + } + + for _, dist := range config.NUMADists { + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val)) + } +} + func (config *Config) appendKnobs() { if config.Knobs.NoUserConfig { config.qemuParams = append(config.qemuParams, "-no-user-config") diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a14e0fb032..a15e77c184 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -14,8 +14,8 @@ var ( deviceNetworkString = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom" deviceNetworkStringMq = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom" deviceSerialString = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2" - deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom" - deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom" + deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device 
virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom" + deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom" deviceVFIOString = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom" devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off" devicePCIeRootPortFullString = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom" @@ -23,8 +23,8 @@ var ( deviceVFIOPCIeFullString = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1" deviceSCSIControllerStr = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom" deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom" - deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom" - deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom" + deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom" + deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom" deviceBlockString = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive 
id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on" devicePCIBridgeString = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom" devicePCIBridgeStringReserved = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m" diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 5d4c15ed9d..8be4d0d779 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "reflect" + "runtime" "strings" "testing" ) @@ -1117,6 +1118,140 @@ func TestBadMemoryKnobs(t *testing.T) { } } +func TestAppendMultiNUMAMemoryKnobs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-3", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-ram", + }, + { + NodeID: 1, + CPUs: "4-7", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-ram", + }, + }, + Knobs: Knobs{ + MemShared: true, + MemPrealloc: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3", + "-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func 
TestAppendMultiNUMAHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-1", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + { + NodeID: 1, + CPUs: "2-3", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + }, + Knobs: Knobs{ + MemShared: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1", + "-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func TestAppendNUMADist(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + }, + NUMANodes: []NUMANode{ + {NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + {NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + }, + NUMADists: []NUMADist{ + {Src: 0, Dst: 1, Val: 20}, + {Src: 1, Dst: 0, Val: 20}, + }, + } + + c.appendMemoryKnobs() + + expectedDist := []string{ + "-numa", "dist,src=0,dst=1,val=20", + "-numa", "dist,src=1,dst=0,val=20", + } + params := c.qemuParams + distParams := params[len(params)-4:] 
+ for i, p := range expectedDist { + if distParams[i] != p { + t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i]) + } + } +} + func TestBadBios(t *testing.T) { c := &Config{} c.appendBios() From 1ee8bb574010a269bd7b50dae29b425334b5cc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:02:01 +0200 Subject: [PATCH 03/14] runtime: Add NUMA-aware SMP topology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make cpuTopology() NUMA-aware by accepting a numNUMANodes parameter. When multiple NUMA nodes are configured, restructure the SMP topology so that Sockets=numNUMA and Cores=ceil(maxvcpus/numNUMA), grouping vCPUs by socket per NUMA node. Use ceiling division so that uneven vCPU counts (e.g. the +1 VMM overhead vCPU that Kata adds) produce a QEMU-valid SMP topology where MaxCPUs == Sockets * Cores * Threads. When numNUMANodes <= 1, the existing flat topology (Sockets=maxvcpus, Cores=1) is preserved. 
Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- .../virtcontainers/hypervisor_config_linux.go | 4 -- src/runtime/virtcontainers/qemu.go | 2 +- src/runtime/virtcontainers/qemu_arch_base.go | 26 +++++++++--- .../virtcontainers/qemu_arch_base_test.go | 41 ++++++++++++++++++- 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go index 381f3c8f07..1e21e4867c 100644 --- a/src/runtime/virtcontainers/hypervisor_config_linux.go +++ b/src/runtime/virtcontainers/hypervisor_config_linux.go @@ -63,10 +63,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { conf.DefaultMaxVCPUs = defaultMaxVCPUs } - if numNUMA := conf.NumGuestNUMANodes(); numNUMA > 1 { - conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA - } - if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS { conf.Msize9p = defaultMsize9p } diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 4066c85e48..d13e1209c4 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -326,7 +326,7 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso } func (q *qemu) cpuTopology() govmmQemu.SMP { - return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs) + return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, q.config.NumGuestNUMANodes()) } func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index aacb97b7cc..f3bba704ca 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -61,8 +61,9 @@ type qemuArch interface { // bridges sets the number bridges for the machine type bridges(number uint32) - // cpuTopology returns the CPU topology for the given amount of vcpus - 
cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP + // cpuTopology returns the CPU topology for the given amount of vcpus. + // numNUMANodes > 1 restructures the topology so vCPUs are grouped by socket per NUMA node. + cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP // cpuModel returns the CPU model for the machine type cpuModel() string @@ -324,16 +325,29 @@ func (q *qemuArchBase) bridges(number uint32) { } } -func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP { - smp := govmmQemu.SMP{ +func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP { + if numNUMANodes > 1 { + coresPerSocket := (maxvcpus + numNUMANodes - 1) / numNUMANodes + if coresPerSocket == 0 { + coresPerSocket = 1 + } + smpMaxCPUs := numNUMANodes * coresPerSocket * defaultThreads + return govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMANodes, + Cores: coresPerSocket, + Threads: defaultThreads, + MaxCPUs: smpMaxCPUs, + } + } + + return govmmQemu.SMP{ CPUs: vcpus, Sockets: maxvcpus, Cores: defaultCores, Threads: defaultThreads, MaxCPUs: maxvcpus, } - - return smp } func (q *qemuArchBase) cpuModel() string { diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index dfaebb8dab..c177ee44a8 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -189,7 +189,46 @@ func TestQemuArchBaseCPUTopology(t *testing.T) { MaxCPUs: defaultMaxVCPUs, } - smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs) + smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs, 0) + assert.Equal(expectedSMP, smp) +} + +func TestQemuArchBaseCPUTopologyNUMA(t *testing.T) { + assert := assert.New(t) + qemuArchBase := newQemuArchBase() + vcpus := uint32(2) + maxvcpus := uint32(8) + numNUMA := uint32(2) + + expectedSMP := govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMA, + Cores: maxvcpus / numNUMA, + Threads: defaultThreads, + 
MaxCPUs: maxvcpus, + } + + smp := qemuArchBase.cpuTopology(vcpus, maxvcpus, numNUMA) + assert.Equal(expectedSMP, smp) +} + +func TestQemuArchBaseCPUTopologyNUMAUneven(t *testing.T) { + assert := assert.New(t) + qemuArchBase := newQemuArchBase() + vcpus := uint32(2) + maxvcpus := uint32(5) + numNUMA := uint32(2) + + coresPerSocket := (maxvcpus + numNUMA - 1) / numNUMA + expectedSMP := govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMA, + Cores: coresPerSocket, + Threads: defaultThreads, + MaxCPUs: numNUMA * coresPerSocket * defaultThreads, + } + + smp := qemuArchBase.cpuTopology(vcpus, maxvcpus, numNUMA) + assert.Equal(expectedSMP, smp) } From 447e2a3faf52c5c73e3980cad5df1f009b2f9e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:16:15 +0200 Subject: [PATCH 04/14] runtime: Add VFIO device NUMA node detection and placement validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add PCISysFsDevicesNUMANode property and GetPCIDeviceNUMANode() helper to read /sys/bus/pci/devices/<BDF>/numa_node when discovering VFIO devices. Store the result in the new NUMANode field on VFIODev (-1 for unknown/no affinity). Wire NUMA node detection into both GetAllVFIODevicesFromIOMMUGroup() (legacy VFIO path) and GetDeviceFromVFIODev() (IOMMUFD path) so every discovered VFIO device carries its host NUMA node. Add validateVFIODeviceNUMAPlacement() which runs at the end of buildNUMATopology(). It checks every cold-plugged VFIO device's host NUMA node against the guest NUMA topology and logs a warning if a device is on a host NUMA node not covered by any guest NUMA node (indicating potential cross-NUMA memory access overhead), or an info message confirming correct placement.
Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 4 + src/runtime/pkg/device/drivers/utils.go | 19 +- src/runtime/virtcontainers/qemu.go | 400 ++++++++++++++++++++++++ 3 files changed, 422 insertions(+), 1 deletion(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index b912fc6377..f41263975b 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -418,6 +418,10 @@ type VFIODev struct { // Type of VFIO device Type VFIODeviceType + // NUMANode is the host NUMA node this device is attached to. + // -1 means no affinity or unknown. + NUMANode int + // IsPCIe specifies device is PCIe or PCI IsPCIe bool diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 1e7ba5f118..d111b9e2bb 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -46,6 +46,7 @@ var ( PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed PCISysFsDevicesVendor PCISysFsProperty = "vendor" // /sys/bus/pci/devices/xxx/vendor PCISysFsDevicesDevice PCISysFsProperty = "device" // /sys/bus/pci/devices/xxx/device + PCISysFsDevicesNUMANode PCISysFsProperty = "numa_node" // /sys/bus/pci/devices/xxx/numa_node ) func deviceLogger() *logrus.Entry { @@ -85,6 +86,20 @@ func GetPCIDeviceProperty(bdf string, property PCISysFsProperty) string { return rlt } +// GetPCIDeviceNUMANode returns the host NUMA node for a PCI device. +// Returns -1 if the device has no NUMA affinity or the value cannot be read. 
+func GetPCIDeviceNUMANode(bdf string) int { + raw := GetPCIDeviceProperty(bdf, PCISysFsDevicesNUMANode) + if raw == "" { + return -1 + } + n, err := strconv.Atoi(raw) + if err != nil { + return -1 + } + return n +} + func readPCIProperty(propertyPath string) (string, error) { var ( buf []byte @@ -240,6 +255,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) { Class: pciClass, VendorID: vendorID, DeviceID: deviceID, + NUMANode: GetPCIDeviceNUMANode(deviceBDF), Port: device.Port, HostPath: device.HostPath, } @@ -291,7 +307,6 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe vendorID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor) deviceID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice) - // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, Type: vfioDeviceType, @@ -301,6 +316,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe Class: pciClass, VendorID: vendorID, DeviceID: deviceID, + NUMANode: GetPCIDeviceNUMANode(deviceBDF), Port: device.Port, HostPath: device.HostPath, } @@ -315,6 +331,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, APDevices: devices, + NUMANode: -1, Port: device.Port, } default: diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d13e1209c4..e2a3e446d2 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -44,6 +44,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" 
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) @@ -339,6 +340,405 @@ func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { return q.arch.memoryTopology(memMb, 0, 0), nil } +// vfioHostNUMANodes walks the given VFIO devices and returns the set of +// host NUMA node IDs that contain at least one of them. Devices for which +// the NUMA node cannot be determined (returned as -1 by the kernel when +// the device is not bound to any node) are skipped silently. Resolution +// failures are logged as warnings and treated as "no constraint" for that +// device. The function is a free function (not a method) so it can be +// invoked before q.config is populated, e.g. during pre-setConfig +// right-sizing. +func vfioHostNUMANodes(devices []config.DeviceInfo, log *logrus.Entry) map[int]struct{} { + nodes := make(map[int]struct{}) + for _, dev := range devices { + hostPath, err := config.GetHostPath(dev, false, "") + if err != nil { + log.WithError(err).WithField("device", dev.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement") + continue + } + dev.HostPath = hostPath + var vfioDevs []*config.VFIODev + if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) { + vfioDevs, err = drivers.GetDeviceFromVFIODev(dev) + } else { + vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + } + if err != nil { + log.WithError(err).WithField("device", dev.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement") + continue + } + for _, vd := range vfioDevs { + if vd.NUMANode >= 0 { + nodes[vd.NUMANode] = struct{}{} + } + } + } + return nodes +} + +// guestNodeCoversAny reports whether the HostNodes of guestNode references +// any host NUMA ID present in the given set. 
+func guestNodeCoversAny(guestNode types.GuestNUMANode, hostSet map[int]struct{}) bool {
+	// Nothing can be covered when no device-bearing host node is known.
+	if len(hostSet) == 0 {
+		return false
+	}
+	// Reuse guestNodeHostIDs so HostNodes is parsed in exactly one place; it
+	// yields nil for an unparsable HostNodes string, so a malformed entry
+	// covers nothing.
+	for _, id := range guestNodeHostIDs(guestNode) {
+		if _, ok := hostSet[id]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+// guestNodeHostIDs returns the host NUMA IDs of a guest node, or nil if HostNodes fails to parse.
+func guestNodeHostIDs(gn types.GuestNUMANode) []int {
+	parsed, err := cpuset.Parse(gn.HostNodes)
+	if err != nil {
+		return nil
+	}
+	return parsed.ToSlice()
+}
+
+// hostNUMACapFn returns the (cpu_count, mem_mb) capacity of a host NUMA
+// node. Used to inject sysfs reads for testability.
+type hostNUMACapFn func(nodeID int) (cpus int, memMB uint64, err error)
+
+// realHostNUMACapFn is the production capacity provider, backed by sysfs.
+func realHostNUMACapFn(nodeID int) (int, uint64, error) {
+	c, err := utils.GetHostNUMANodeCapacity(nodeID)
+	if err != nil {
+		return 0, 0, err
+	}
+	return c.CPUs, c.MemMB, nil
+}
+
+// sumNUMACapacity returns the (cpu_count, mem_mb) sum of the unique host
+// NUMA nodes referenced by the given guest NUMA nodes. Nodes whose capacity
+// can't be queried are skipped silently.
+func sumNUMACapacity(nodes []types.GuestNUMANode, capFn hostNUMACapFn) (int, uint64) { + seen := make(map[int]struct{}) + var totalCPUs int + var totalMemMB uint64 + for _, gn := range nodes { + for _, hid := range guestNodeHostIDs(gn) { + if _, ok := seen[hid]; ok { + continue + } + seen[hid] = struct{}{} + cpus, memMB, err := capFn(hid) + if err != nil { + continue + } + totalCPUs += cpus + totalMemMB += memMB + } + } + return totalCPUs, totalMemMB +} + +// selectNUMANodes is the pure right-sizing decision: given an auto-derived +// guest NUMA topology, the sandbox's CPU/memory budget, the set of host +// NUMA nodes containing an attached VFIO device, and a capacity oracle, +// return the smallest subset of numaNodes that satisfies the constraints. +// +// Heuristic, in order: +// +// 1. If a VFIO device is attached, keep the guest nodes covering host +// nodes that contain a device. If their combined capacity fits the +// sandbox, return only that subset. +// 2. With no VFIO devices, if the smallest single host node has enough +// CPU+memory for the sandbox, return the first guest node. +// 3. Otherwise, return the input unchanged. +// +// The function is pure (no I/O), so it is unit-testable. Callers must pass +// a capFn that resolves host NUMA capacity; production code uses +// realHostNUMACapFn. +func selectNUMANodes( + numaNodes []types.GuestNUMANode, + vcpus uint32, + memMB uint64, + vfioHostSet map[int]struct{}, + capFn hostNUMACapFn, + log *logrus.Entry, +) []types.GuestNUMANode { + if len(numaNodes) <= 1 { + return numaNodes + } + + // 1) VFIO-aware: keep the guest nodes covering device-bearing host nodes. + if len(vfioHostSet) > 0 { + var covered []types.GuestNUMANode + for _, gn := range numaNodes { + if guestNodeCoversAny(gn, vfioHostSet) { + covered = append(covered, gn) + } + } + if len(covered) == 0 { + log.WithField("vfio-host-nodes", vfioHostSet). 
+ Warn("No guest NUMA node covers VFIO device host nodes; keeping full topology") + return numaNodes + } + cpus, memCap := sumNUMACapacity(covered, capFn) + if uint32(cpus) >= vcpus && memCap >= memMB { + log.WithFields(logrus.Fields{ + "selected-nodes": len(covered), + "input-nodes": len(numaNodes), + "vfio-host-nodes": vfioHostSet, + "vcpus": vcpus, + "mem-mb": memMB, + }).Info("Right-sized NUMA topology to VFIO-aligned subset") + return covered + } + log.WithFields(logrus.Fields{ + "vfio-host-nodes": vfioHostSet, + "covered-cpus": cpus, + "covered-mem-mb": memCap, + "requested-vcpus": vcpus, + "requested-mem-mb": memMB, + }).Info("VFIO-aligned NUMA subset too small for sandbox; keeping full topology") + return numaNodes + } + + // 2) No VFIO constraints: collapse if the sandbox fits in a single + // (smallest) host node. + var smallestCPUs int = -1 + var smallestMem uint64 = math.MaxUint64 + for _, gn := range numaNodes { + cpus, memCap := sumNUMACapacity([]types.GuestNUMANode{gn}, capFn) + if smallestCPUs < 0 || cpus < smallestCPUs { + smallestCPUs = cpus + } + if memCap < smallestMem { + smallestMem = memCap + } + } + if smallestCPUs > 0 && uint32(smallestCPUs) >= vcpus && smallestMem >= memMB { + log.WithFields(logrus.Fields{ + "input-nodes": len(numaNodes), + "vcpus": vcpus, + "mem-mb": memMB, + "smallest-node-cpus": smallestCPUs, + "smallest-node-memMB": smallestMem, + }).Info("Right-sized NUMA topology: sandbox fits in a single host node") + return numaNodes[:1] + } + + // 3) Sandbox spans multiple nodes; preserve the auto-derived topology. + return numaNodes +} + +// maybeRightSizeAutoNUMA right-sizes an auto-derived guest NUMA topology +// in place on the given HypervisorConfig. It is a no-op when the user +// configured an explicit numa_mapping (TOML or annotation), or when the +// topology has at most one node. 
+// +// This must run before the config is consumed by the rest of the runtime +// (sandbox vCPU pinning, cpuset.mems forwarding, QEMU command-line build), +// so callers should invoke it on the *shared* HypervisorConfig pointer +// owned by the sandbox, not on a local copy. +func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) { + if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 { + return + } + hc.GuestNUMANodes = selectNUMANodes( + hc.GuestNUMANodes, + hc.DefaultMaxVCPUs, + uint64(hc.MemorySize), + vfioHostNUMANodes(hc.VFIODevices, log), + realHostNUMACapFn, + log, + ) +} + +func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) { + // q.config.GuestNUMANodes has already been right-sized (when applicable) + // by maybeRightSizeAutoNUMA() at hypervisor setup time, so a length + // of 1 here means "no NUMA topology"; fall through to a flat memdev. + numaNodes := q.config.GuestNUMANodes + if len(numaNodes) <= 1 { + return nil, nil, nil + } + + switch goruntime.GOARCH { + case "amd64", "arm64": + default: + return nil, nil, fmt.Errorf("multi-NUMA not supported on architecture %s", goruntime.GOARCH) + } + + // NUMA requires static_sandbox_resource_mgmt=true, which guarantees + // NumVCPUs == DefaultMaxVCPUs (set in oci/utils.go). All boot vCPUs + // are present at VM start, so the per-node CPU ranges below are valid. + // + // cpuTopology() rounds MaxCPUs up to (numNUMANodes * coresPerSocket) + // so that QEMU's SMP topology is consistent. We must cover all CPU + // slots in the NUMA map, otherwise QEMU warns about CPUs not present + // in any NUMA node. Apply the same ceiling here. 
+ numNodes := uint32(len(numaNodes)) + if q.config.DefaultMaxVCPUs < numNodes { + hvLogger.WithFields(logrus.Fields{ + "vcpus": q.config.DefaultMaxVCPUs, + "numa-nodes": numNodes, + }).Warn("DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology") + return nil, nil, nil + } + coresPerSocket := (q.config.DefaultMaxVCPUs + numNodes - 1) / numNodes + maxVCPUs := numNodes * coresPerSocket + + vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, maxVCPUs) + if err != nil { + return nil, nil, fmt.Errorf("failed to distribute vCPUs across NUMA nodes: %w", err) + } + + memMb := uint64(q.config.MemorySize) + + var memAlign uint64 = 1 + if q.config.HugePages { + memAlign = 2 + } + + backendType := "memory-backend-ram" + backendPath := "" + if q.config.HugePages { + backendType = "memory-backend-file" + backendPath = "/dev/hugepages" + } else if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus || + q.config.FileBackedMemRootDir != "" { + backendType = "memory-backend-file" + if q.config.FileBackedMemRootDir != "" { + backendPath = q.config.FileBackedMemRootDir + } else { + backendPath = fallbackFileBackedMemDir + } + } + if backendPath != "" { + if _, err := os.Stat(backendPath); err != nil { + return nil, nil, fmt.Errorf("NUMA memory backend path %q does not exist: %w", backendPath, err) + } + } + + // Distribute memory proportionally to vCPU counts, aligned to memAlign. + memPerNode := make([]uint64, numNodes) + var memAssigned uint64 + for i := uint32(0); i < numNodes; i++ { + raw := memMb * uint64(vcpusPerNode[i]) / uint64(maxVCPUs) + memPerNode[i] = (raw / memAlign) * memAlign + if memPerNode[i] == 0 { + memPerNode[i] = memAlign + } + memAssigned += memPerNode[i] + } + // Give the remainder to the last node (must also be aligned). 
+ if memAssigned < memMb { + remainder := memMb - memAssigned + if remainder%memAlign != 0 { + return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment", + memMb, numNodes, memAlign) + } + memPerNode[numNodes-1] += remainder + } else if memAssigned > memMb { + return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment", + memMb, numNodes, memAlign) + } + + var nodes []govmmQemu.NUMANode + var cpuOffset uint32 + for i, gn := range numaNodes { + startCPU := cpuOffset + endCPU := startCPU + vcpusPerNode[i] - 1 + cpuOffset = endCPU + 1 + cpuRange := fmt.Sprintf("%d-%d", startCPU, endCPU) + + nodes = append(nodes, govmmQemu.NUMANode{ + NodeID: uint32(i), + CPUs: cpuRange, + MemSize: fmt.Sprintf("%dM", memPerNode[i]), + HostNodes: gn.HostNodes, + MemBackendType: backendType, + MemBackendPath: backendPath, + }) + } + + var dists []govmmQemu.NUMADist + hostDists := utils.GetHostNUMADistances(numaNodes) + for _, hd := range hostDists { + dists = append(dists, govmmQemu.NUMADist{ + Src: hd.Src, + Dst: hd.Dst, + Val: hd.Val, + }) + } + + q.validateVFIODeviceNUMAPlacement(numaNodes) + + return nodes, dists, nil +} + +// buildCoveredHostNodes maps each host NUMA node ID to its guest NUMA node +// index based on the GuestNUMANode HostNodes configuration. +func buildCoveredHostNodes(numaNodes []types.GuestNUMANode) map[int]uint32 { + covered := make(map[int]uint32) + for guestIdx, gn := range numaNodes { + nodeSet, err := cpuset.Parse(gn.HostNodes) + if err != nil { + continue + } + for _, n := range nodeSet.ToSlice() { + covered[n] = uint32(guestIdx) + } + } + return covered +} + +// validateVFIODeviceNUMAPlacement checks that every cold-plugged VFIO device +// (e.g. GPU) resides on a host NUMA node that is covered by the guest NUMA +// topology. A mismatch means the device will incur cross-NUMA memory accesses. 
+func (q *qemu) validateVFIODeviceNUMAPlacement(numaNodes []types.GuestNUMANode) { + coveredHostNodes := buildCoveredHostNodes(numaNodes) + + for _, dev := range q.config.VFIODevices { + hostPath, err := config.GetHostPath(dev, false, "") + if err != nil { + q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement validation") + continue + } + dev.HostPath = hostPath + var vfioDevs []*config.VFIODev + if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) { + vfioDevs, err = drivers.GetDeviceFromVFIODev(dev) + } else { + vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + } + if err != nil { + q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement validation") + continue + } + for _, vd := range vfioDevs { + if vd.NUMANode < 0 { + continue + } + guestNode, ok := coveredHostNodes[vd.NUMANode] + if !ok { + q.Logger().WithFields(logrus.Fields{ + "bdf": vd.BDF, + "host-numa": vd.NUMANode, + "guest-numa": "none", + "covered-nodes": coveredHostNodes, + }).Warn("VFIO device on host NUMA node not covered by guest NUMA topology; cross-NUMA memory accesses may occur") + } else { + q.Logger().WithFields(logrus.Fields{ + "bdf": vd.BDF, + "host-numa": vd.NUMANode, + "guest-numa": guestNode, + }).Debug("VFIO device NUMA placement validated") + } + } + } +} + func (q *qemu) qmpSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket) } From d0d7deb262a5e579e83ffb1168ecb725cf42153c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:03:49 +0200 Subject: [PATCH 05/14] runtime: Add host NUMA distance discovery and build guest NUMA topology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sysfs-based host NUMA distance reading (GetHostNUMADistances) that parses /sys/devices/system/node/nodeN/distance to 
mirror the host NUMA distance matrix into the guest via -numa dist entries. Implement buildNUMATopology() which translates the GuestNUMANodes configuration into govmm NUMANode and NUMADist slices. Each guest NUMA node gets a share of vCPUs proportional to its host CPU count (rounded down) and a matching proportional share of memory, with the last node absorbing any remainder. This handles the common Kata case of +1 VMM overhead vCPU gracefully. Memory backends are selected based on hugepages/virtio-fs/file-backed-mem configuration. Guard multi-NUMA topology generation to amd64 and arm64 only, since other architectures (s390x, riscv64) do not support QEMU NUMA/DIMM. Wire buildNUMATopology() into CreateVM so the QEMU config includes NUMA nodes and distances. Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/katautils/config.go | 24 + src/runtime/pkg/oci/utils.go | 6 +- src/runtime/virtcontainers/hypervisor.go | 9 + src/runtime/virtcontainers/qemu.go | 23 +- src/runtime/virtcontainers/qemu_test.go | 571 +++++++++++++++++- src/runtime/virtcontainers/utils/utils.go | 180 ++++++ .../virtcontainers/utils/utils_darwin.go | 8 + .../virtcontainers/utils/utils_linux.go | 42 ++ .../virtcontainers/utils/utils_test.go | 61 ++ 9 files changed, 919 insertions(+), 5 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 061bf8b2ed..c5c5f70c34 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { IOMMU: h.IOMMU, IOMMUPlatform: h.getIOMMUPlatform(), GuestNUMANodes: h.defaultGuestNUMANodes(), + NUMAMapping: append([]string(nil), h.NUMAMapping...), FileBackedMemRootDir: h.FileBackedMemRootDir, FileBackedMemRootList: h.FileBackedMemRootList, Debug: h.Debug, @@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error { return err } + if err := checkNumaConfig(config); err != nil { + return err + } + hotPlugVFIO :=
config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType @@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error { return nil } +func checkNumaConfig(config oci.RuntimeConfig) error { + if len(config.HypervisorConfig.GuestNUMANodes) <= 1 { + return nil + } + + switch goruntime.GOARCH { + case "amd64", "arm64": + default: + return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH) + } + + if !config.StaticSandboxResourceMgmt { + return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " + + "NUMA topology is not compatible with dynamic CPU/memory hotplug") + } + + return nil +} + // checkPCIeConfig ensures the PCIe configuration is valid. // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index b09a97e994..39bb029400 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig } if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok { - guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation)) + mapping := strings.Fields(annotation) + guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping) if err != nil { return err } sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes + // Record the raw user-provided mapping so the hypervisor + // backend honors it verbatim instead of right-sizing. 
+ sbConfig.HypervisorConfig.NUMAMapping = mapping } return nil diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index b631960f6b..8b93b31428 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -803,6 +803,15 @@ type HypervisorConfig struct { // GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs. GuestNUMANodes []types.GuestNUMANode + // NUMAMapping is the raw user-provided NUMA mapping (TOML + // `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping + // annotation). When empty, GuestNUMANodes was auto-derived from the + // host topology and may be right-sized at sandbox creation (e.g. + // collapsed to a single host node when the sandbox fits, or + // restricted to host nodes containing attached VFIO devices). When + // non-empty, the topology is honored verbatim. + NUMAMapping []string + // DisableNestingChecks is used to override customizations performed // when running on top of another VMM. DisableNestingChecks bool diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index e2a3e446d2..173f72b2c1 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -21,6 +21,7 @@ import ( "os/user" "path/filepath" "regexp" + goruntime "runtime" "strconv" "strings" "sync" @@ -251,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() + // Right-size auto-derived NUMA topology before snapshotting the config. + // We mutate the caller-owned pointer so the sandbox's shared + // HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding) + // observes the same trimmed topology that QEMU is launched with. 
+ // No-op when numa_mapping was set explicitly or when the topology + // already has one or zero nodes. + maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger()) + if err := q.setConfig(hypervisorConfig); err != nil { return err } @@ -326,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso return nil } -func (q *qemu) cpuTopology() govmmQemu.SMP { - return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, q.config.NumGuestNUMANodes()) +func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP { + return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes) } func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { @@ -996,7 +1005,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } - smp := q.cpuTopology() + numaNodes, numaDists, err := q.buildNUMATopology() + if err != nil { + return err + } + + effectiveNUMANodes := uint32(len(numaNodes)) + smp := q.cpuTopology(effectiveNUMANodes) memory, err := q.memoryTopology() if err != nil { @@ -1117,6 +1132,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi QMPSockets: qmpSockets, Knobs: knobs, Incoming: incoming, + NUMANodes: numaNodes, + NUMADists: numaDists, VGA: "none", GlobalParam: "kvm-pit.lost_tick_policy=discard", Bios: firmwarePath, diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index 5d4267f011..db494bf365 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -19,6 +19,7 @@ import ( "os" "path" "path/filepath" + "runtime" "testing" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" @@ -29,6 +30,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/pbnjay/memory" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) @@ -283,7 
+285,7 @@ func TestQemuCPUTopology(t *testing.T) { MaxCPUs: uint32(vcpus), } - smp := q.cpuTopology() + smp := q.cpuTopology(0) assert.Exactly(smp, expectedOut) } @@ -1200,3 +1202,570 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) { // State should remain unchanged assert.Equal(100, q.state.HotpluggedMemory) } + +func TestBuildNUMATopologySingleNode(t *testing.T) { + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologyTwoNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-1", nodes[0].CPUs) + assert.Equal("512M", nodes[0].MemSize) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) + + assert.Equal(uint32(1), nodes[1].NodeID) + assert.Equal("2-3", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + if _, err := os.Stat("/dev/hugepages"); err != nil { + t.Skip("skipping: /dev/hugepages not available") + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: 
"0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal("/dev/hugepages", nodes[0].MemBackendPath) + assert.Equal("512M", nodes[0].MemSize) +} + +func TestBuildNUMATopologyVirtioFS(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + SharedFS: config.VirtioFS, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(fallbackFileBackedMemDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyFileBackedMem(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + tmpDir := t.TempDir() + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + FileBackedMemRootDir: tmpDir, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(tmpDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + 
{HostNodes: "0", HostCPUs: "0"}, + {HostNodes: "1", HostCPUs: "1"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 5, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-4"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // cpuTopology() rounds MaxCPUs to ceil(5/2)*2=6, so 6 CPU slots + // are distributed proportionally: 2 host CPUs → 2 vCPUs, + // 3 host CPUs → 4 vCPUs (3 proportional + 1 remainder). + assert.Equal("0-1", nodes[0].CPUs) + assert.Equal("2-5", nodes[1].CPUs) +} + +func TestBuildNUMATopologyMemMisaligned(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + _, _, err := q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1025, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + _, _, err := 
q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyEvenMemory(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal("0-2", nodes[0].CPUs) + assert.Equal("512M", nodes[0].MemSize) + + assert.Equal("3-5", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 10, + MemorySize: 1000, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + {HostNodes: "1", HostCPUs: "8-9"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // 8 out of 10 host CPUs on node 0 → 8 vCPUs + assert.Equal("0-7", nodes[0].CPUs) + assert.Equal("800M", nodes[0].MemSize) + // 2 out of 10 host CPUs on node 1 → 2 vCPUs + assert.Equal("8-9", nodes[1].CPUs) + assert.Equal("200M", nodes[1].MemSize) +} + +func TestBuildCoveredHostNodes(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(1), covered[1]) +} + +func TestBuildCoveredHostNodesRange(t *testing.T) { + assert := assert.New(t) + + covered := 
buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(0), covered[1]) +} + +func TestBuildCoveredHostNodesEmpty(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes(nil) + assert.Len(covered, 0) +} + +func TestBuildCoveredHostNodesInvalidParse(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "invalid", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 1) + assert.Equal(uint32(1), covered[1]) +} + +// silentLogger returns a logrus.Entry that discards all output, suitable +// for use in unit tests that exercise NUMA right-sizing decisions. +func silentLogger() *logrus.Entry { + l := logrus.New() + l.Out = io.Discard + return logrus.NewEntry(l) +} + +// fakeCapFn returns a hostNUMACapFn backed by a static map. Unknown nodes +// produce an error so we exercise the "skip unknown" branch in +// sumNUMACapacity when intended. +func fakeCapFn(caps map[int]struct { + cpus int + memMB uint64 +}) hostNUMACapFn { + return func(nodeID int) (int, uint64, error) { + if c, ok := caps[nodeID]; ok { + return c.cpus, c.memMB, nil + } + return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID) + } +} + +// twoNodeHostCaps describes a typical 2-socket host: 32 CPUs and 128 GiB +// per node. 
+func twoNodeHostCaps() map[int]struct { + cpus int + memMB uint64 +} { + return map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 32, memMB: 128 * 1024}, + 1: {cpus: 32, memMB: 128 * 1024}, + } +} + +func twoNodeAutoTopology() []types.GuestNUMANode { + return []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "1", HostCPUs: "32-63"}, + } +} + +func TestSumNUMACapacity(t *testing.T) { + assert := assert.New(t) + + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) { + assert := assert.New(t) + + // Two guest entries that both reference host node 0 must only count + // once. The merged "0-1" entry adds host node 1. + nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "0-1", HostCPUs: "0-63"}, + } + cpus, memMB := sumNUMACapacity(nodes, fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacitySkipsUnknown(t *testing.T) { + assert := assert.New(t) + + caps := map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 16, memMB: 32 * 1024}, + // host node 1 missing on purpose + } + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(caps)) + assert.Equal(16, cpus) + assert.Equal(uint64(32*1024), memMB) +} + +func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) { + assert := assert.New(t) + + in := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}} + out := selectNUMANodes(in, 4, 1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) { + // Small sandbox (8 vCPUs / 16 GiB) fits comfortably in one host node: + // expect collapse to the first guest node. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("0", out[0].HostNodes) +} + +func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) { + // 64 vCPUs needs both 32-CPU nodes: expect full topology. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 64, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) { + // CPU fits in one node but memory does not: expect full topology. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 200*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) { + // VFIO device on host node 1; sandbox fits in one node: expect + // collapse to the guest node covering host node 1. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("1", out[0].HostNodes) +} + +func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) { + // VFIO device on host node 1, but sandbox needs more than one node's + // worth of memory: expect the full topology so the sandbox actually + // fits, even at the cost of cross-NUMA traffic. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 200*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) { + // One VFIO device per host node: VFIO subset == full topology, no + // collapse possible. Result is the input unchanged. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{0: {}, 1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIONoCoverage(t *testing.T) { + // VFIO host node not represented in the guest topology (rare, but can + // happen if numa_mapping has been customized). Keep the full topology + // rather than dropping all nodes. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{2: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +// rightSizeNUMAWithFakeCaps mirrors maybeRightSizeAutoNUMA but lets tests +// inject a synthetic capacity oracle in place of realHostNUMACapFn so the +// decision is hermetic. +func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) { + if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 { + return + } + hc.GuestNUMANodes = selectNUMANodes( + hc.GuestNUMANodes, + hc.DefaultMaxVCPUs, + uint64(hc.MemorySize), + nil, // no VFIO devices in this test + capFn, + silentLogger(), + ) +} + +func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) { + // Empty NUMAMapping (auto) + sandbox fits in one host node: + // GuestNUMANodes is trimmed to a single entry. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 1) + assert.Equal("0", hc.GuestNUMANodes[0].HostNodes) +} + +func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) { + // Non-empty NUMAMapping (user-provided) is left untouched, even if + // the sandbox would fit in a single node. 
+ assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + NUMAMapping: []string{"0", "1"}, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) { + // Sandbox needs more CPUs than a single host node has: full topology + // is preserved. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 64, // > one node's 32 CPUs + MemorySize: 1024, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) { + // A topology with ≤ 1 node is a no-op regardless of NUMAMapping or + // budget. + assert := assert.New(t) + + for _, tc := range []struct { + name string + hc *HypervisorConfig + }{ + { + name: "nil config", + hc: nil, + }, + { + name: "single node", + hc: &HypervisorConfig{ + GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}, + }, + }, + { + name: "empty", + hc: &HypervisorConfig{}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + before := 0 + if tc.hc != nil { + before = len(tc.hc.GuestNUMANodes) + } + rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps())) + after := 0 + if tc.hc != nil { + after = len(tc.hc.GuestNUMANodes) + } + assert.Equal(before, after) + }) + } +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 39bcfde8f4..bc2aa98121 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -13,6 +13,7 @@ import ( "os/exec" "path/filepath" "regexp" + "strconv" "strings" "syscall" "time" @@ -623,3 +624,182 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) { return numaNodes, nil } + +// NUMADistEntry represents a single 
NUMA distance measurement between two nodes. +type NUMADistEntry struct { + Src uint32 + Dst uint32 + Val uint32 +} + +// GetHostNUMADistances reads the host NUMA distance matrix for the nodes +// referenced by the given GuestNUMANode list and returns off-diagonal +// pairwise entries (skipping self-distance src==dst). +// The distance row from sysfs is indexed by host NUMA node ID, so we parse +// each guest node's HostNodes to find the representative host node ID and +// use that to index into the distance row. +func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry { + hostNodeIDs := make([]int, len(nodes)) + for i, n := range nodes { + nodeSet, err := cpuset.Parse(n.HostNodes) + if err != nil { + hostNodeIDs[i] = -1 + continue + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + hostNodeIDs[i] = -1 + continue + } + hostNodeIDs[i] = ids[0] + } + + var dists []NUMADistEntry + for srcIdx, srcNode := range nodes { + if hostNodeIDs[srcIdx] < 0 { + continue + } + distStr := getHostNUMADistance(srcNode.HostNodes) + if distStr == "" { + continue + } + fields := strings.Fields(distStr) + for dstIdx := range nodes { + if srcIdx == dstIdx { + continue + } + hostID := hostNodeIDs[dstIdx] + if hostID < 0 || hostID >= len(fields) { + continue + } + val, err := strconv.ParseUint(fields[hostID], 10, 32) + if err != nil { + continue + } + dists = append(dists, NUMADistEntry{ + Src: uint32(srcIdx), + Dst: uint32(dstIdx), + Val: uint32(val), + }) + } + } + return dists +} + +// HostNUMANodeCapacity describes the CPU and memory capacity of a single +// host NUMA node, as seen via sysfs. +type HostNUMANodeCapacity struct { + NodeID int + CPUs int + MemMB uint64 +} + +// GetHostNUMANodeCapacity returns the CPU count and memory size (in MiB) +// of the given host NUMA node. 
+func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) { + cap := HostNUMANodeCapacity{NodeID: nodeID} + cpuList, err := getHostNUMANodeCPUs(nodeID) + if err != nil { + return cap, err + } + cs, err := cpuset.Parse(cpuList) + if err != nil { + return cap, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, cpuList, err) + } + cap.CPUs = cs.Size() + memMB, err := getHostNUMANodeMemoryMB(nodeID) + if err != nil { + return cap, err + } + cap.MemMB = memMB + return cap, nil +} + +// GetHostNUMANodeCapacities returns the capacities of the given host NUMA +// node IDs in the same order. It stops at the first node that cannot be +// read and returns that error, wrapped, together with the capacities +// collected so far (so on error the slice is shorter than the input). +func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) { + out := make([]HostNUMANodeCapacity, 0, len(nodeIDs)) + for _, id := range nodeIDs { + c, err := GetHostNUMANodeCapacity(id) + if err != nil { + return out, fmt.Errorf("read host NUMA node %d capacity: %w", id, err) + } + out = append(out, c) + } + return out, nil +} + +// DistributeVCPUsProportionally distributes totalVCPUs across NUMA nodes +// proportionally to the number of host CPUs available on each node. +// Each node is guaranteed at least 1 vCPU. Remainder vCPUs go to nodes +// with the most host CPUs. 
+func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) { + numNodes := len(numaNodes) + if numNodes == 0 { + return nil, fmt.Errorf("no NUMA nodes") + } + if totalVCPUs < uint32(numNodes) { + return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, numNodes) + } + + hostCPUCounts := make([]int, numNodes) + totalHostCPUs := 0 + for i, gn := range numaNodes { + parsed, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", i, err) + } + if parsed.Size() == 0 { + return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", i) + } + hostCPUCounts[i] = parsed.Size() + totalHostCPUs += hostCPUCounts[i] + } + if totalHostCPUs == 0 { + return nil, fmt.Errorf("total host CPU count is 0") + } + + vcpusPerNode := make([]uint32, numNodes) + var assigned uint32 + for i := range numaNodes { + vcpusPerNode[i] = uint32(int(totalVCPUs) * hostCPUCounts[i] / totalHostCPUs) + if vcpusPerNode[i] == 0 { + vcpusPerNode[i] = 1 + } + assigned += vcpusPerNode[i] + } + + // Use a copy for remainder distribution to avoid mutating the original counts. 
+ weights := make([]int, numNodes) + copy(weights, hostCPUCounts) + + for assigned < totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if weights[i] > weights[bestIdx] { + bestIdx = i + } + } + vcpusPerNode[bestIdx]++ + assigned++ + weights[bestIdx]-- + } + + for assigned > totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if vcpusPerNode[i] > vcpusPerNode[bestIdx] { + bestIdx = i + } + } + if vcpusPerNode[bestIdx] <= 1 { + break + } + vcpusPerNode[bestIdx]-- + assigned-- + } + + return vcpusPerNode, nil +} diff --git a/src/runtime/virtcontainers/utils/utils_darwin.go b/src/runtime/virtcontainers/utils/utils_darwin.go index 4a64c921b1..a29d0378a2 100644 --- a/src/runtime/virtcontainers/utils/utils_darwin.go +++ b/src/runtime/virtcontainers/utils/utils_darwin.go @@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) { func getHostNUMANodeCPUs(nodeId int) (string, error) { return "", nil } + +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + return 0, nil +} + +func getHostNUMADistance(hostNodes string) string { + return "" +} diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go index 0ddb4dd5a9..11ae66b202 100644 --- a/src/runtime/virtcontainers/utils/utils_linux.go +++ b/src/runtime/virtcontainers/utils/utils_linux.go @@ -12,6 +12,8 @@ import ( "io" "math/big" "os" + "regexp" + "strconv" "strings" "syscall" "time" @@ -23,6 +25,8 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" ) +var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`) + var ioctlFunc = Ioctl // maxUInt represents the maximum valid value for the context ID. 
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) { } return strings.TrimSuffix(string(data), "\n"), nil } + +// getHostNUMANodeMemoryMB returns the total memory in MiB for the given +// host NUMA node, parsed from /sys/devices/system/node/nodeN/meminfo. +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId) + data, err := os.ReadFile(fileName) + if err != nil { + return 0, err + } + m := nodeMemTotalRegexp.FindSubmatch(data) + if m == nil { + return 0, fmt.Errorf("MemTotal not found in %s", fileName) + } + kb, err := strconv.ParseUint(string(m[1]), 10, 64) + if err != nil { + return 0, err + } + return kb / 1024, nil +} + +// getHostNUMADistance reads the distance row for the first host NUMA node +// in the given hostNodes specifier (e.g. "0" or "0-1"). +func getHostNUMADistance(hostNodes string) string { + nodeSet, err := cpuset.Parse(hostNodes) + if err != nil { + return "" + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + return "" + } + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/distance", ids[0]) + data, err := os.ReadFile(fileName) + if err != nil { + return "" + } + return strings.TrimSuffix(string(data), "\n") +} diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index 8361caa1ee..cb11770c8c 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -19,6 +19,8 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) const waitLocalProcessTimeoutSecs = 3 @@ -754,3 +756,62 @@ func TestDockerNetnsPath(t *testing.T) { } assert.Equal("", DockerNetnsPath(spec)) } + +func TestDistributeVCPUsProportionallySymmetric(t *testing.T) { + assert := assert.New(t) + nodes := 
[]types.GuestNUMANode{ + {HostCPUs: "0-3"}, + {HostCPUs: "4-7"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 8) + assert.NoError(err) + assert.Equal([]uint32{4, 4}, dist) +} + +func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-7"}, + {HostCPUs: "8-9"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{8, 2}, dist) +} + +func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-99"}, + {HostCPUs: "100"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 2) + assert.NoError(err) + assert.Equal(uint32(1), dist[0]) + assert.Equal(uint32(1), dist[1]) +} + +func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-5"}, + {HostCPUs: "6-8"}, + {HostCPUs: "9"}, + } + // 6+3+1=10 host CPUs, 10 vCPUs: proportional = 6, 3, 1 + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{6, 3, 1}, dist) +} + +func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0"}, + {HostCPUs: "1"}, + {HostCPUs: "2"}, + } + _, err := DistributeVCPUsProportionally(nodes, 2) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +} From 12e5985dbd87f9ea0805c802157c1dc89a258d24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:04:47 +0200 Subject: [PATCH 06/14] runtime: Add NUMA-aware vCPU pinning and cpuset.mems forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make checkVCPUsPinning() NUMA-aware: when GuestNUMANodes are configured, vCPU threads are pinned to host CPUs belonging to the same NUMA node as the vCPU's guest NUMA 
node assignment via checkVCPUsPinningNUMA(), preserving memory locality. vCPUs are distributed proportionally across NUMA nodes, matching the distribution in buildNUMATopology(). Stop unconditionally stripping cpuset.mems in constrainGRPCSpec() and container update(). When multi-NUMA is configured, translate host NUMA node IDs to guest NUMA node IDs using translateHostMemsToGuest() before forwarding to the agent. This allows the agent to enforce NUMA-aware memory placement for containers. Filter guest NUMA nodes at VM creation time: before calling CreateVM(), prune GuestNUMANodes to only those whose HostCPUs intersect the sandbox cpuset. This avoids exposing fake NUMA topology to the guest when Kubernetes allocates CPUs from fewer nodes than the host has (e.g. all CPUs from node 0 on a 2-node host), improving memory locality and avoiding unnecessary cross-node memory traffic. Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/runtime/virtcontainers/container.go | 13 +- src/runtime/virtcontainers/kata_agent.go | 42 +++++- src/runtime/virtcontainers/kata_agent_test.go | 50 ++++++- src/runtime/virtcontainers/sandbox.go | 124 +++++++++++++++++- src/runtime/virtcontainers/sandbox_test.go | 26 ++++ src/runtime/virtcontainers/utils/utils.go | 23 ++++ .../virtcontainers/utils/utils_test.go | 41 ++++++ 7 files changed, 303 insertions(+), 16 deletions(-) diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index dc96e3cf39..c3784712f3 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources) return err } - // There currently isn't a notion of cpusets.cpus or mems being tracked - // inside of the guest. Make sure we clear these before asking agent to update - // the container's cgroups. 
+ // Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU + // numbering differs inside the guest. For Mems, translate host NUMA node + // IDs to guest node IDs when multi-NUMA is configured, otherwise clear. if resources.CPU != nil { - resources.CPU.Mems = "" resources.CPU.Cpus = "" + numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes + if len(numaNodes) > 1 && resources.CPU.Mems != "" { + resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes) + } else { + resources.CPU.Mems = "" + } } return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 452c64f9ce..8b34cb246a 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -34,6 +34,7 @@ import ( kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st return nil } -func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error { +// translateHostMemsToGuest converts a host cpuset.mems string (e.g. "0,2") +// into guest NUMA node IDs. Each guest NUMA node index maps to a set of host +// nodes via GuestNUMANode.HostNodes. 
If a host node from `hostMems` appears in +// a GuestNUMANode's HostNodes, the corresponding guest node index is included. +func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string { + hostSet, err := cpuset.Parse(hostMems) + if err != nil { + return "" + } + hostSlice := hostSet.ToSlice() + var guestNodes []int + for guestIdx, gn := range numaNodes { + nodeSet, err := cpuset.Parse(gn.HostNodes) + if err != nil { + continue + } + for _, hostNode := range hostSlice { + if nodeSet.Contains(hostNode) { + guestNodes = append(guestNodes, guestIdx) + break + } + } + } + if len(guestNodes) == 0 { + return "" + } + return cpuset.NewCPUSet(guestNodes...).String() +} + +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. It would make no sense to try // to apply them on the guest. 
@@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis } } - // By now only CPU constraints are supported // Issue: https://github.com/kata-containers/runtime/issues/158 // Issue: https://github.com/kata-containers/runtime/issues/204 grpcSpec.Linux.Resources.Devices = nil @@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis grpcSpec.Linux.Resources.Network = nil if grpcSpec.Linux.Resources.CPU != nil { grpcSpec.Linux.Resources.CPU.Cpus = "" - grpcSpec.Linux.Resources.CPU.Mems = "" + if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" { + guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes) + grpcSpec.Linux.Resources.CPU.Mems = guestMems + } else { + grpcSpec.Linux.Resources.CPU.Mems = "" + } } // Disable network and time namespaces since they are handled on the host @@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. 
- err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel) + err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes) if err != nil { return nil, err } diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 62bdd76eac..4b27f0c07e 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constrainGRPCSpec(g, true, true, "", true) + k.constrainGRPCSpec(g, true, true, "", true, nil) // Check nil fields assert.Nil(g.Hooks) @@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) { }) } } + +func TestTranslateHostMemsToGuest(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + + result := translateHostMemsToGuest("0", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("1", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0-1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("0,1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("42", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("invalid", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("", numaNodes) + assert.Equal("", result) +} + +func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + {HostNodes: 
"2-3", HostCPUs: "8-15"}, + } + + result := translateHostMemsToGuest("1", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("2", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0,3", numaNodes) + assert.Equal("0-1", result) +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 6ceae42de9..0a236e32b8 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error { // checkVCPUsPinning is used to support CPUSet mode of kata container. // CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning -// is set to true. Then it fetches sandbox's number of vCPU threads -// and number of CPUs in CPUSet. If the two are equal, each vCPU thread -// is then pinned to one fixed CPU in CPUSet. +// is set to true. +// +// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU +// threads are pinned to host CPUs belonging to the same host NUMA node +// as the vCPU's assigned guest NUMA node, preserving memory locality. +// vCPUs are distributed proportionally across nodes and each vCPU is +// pinned round-robin to the host CPUs within its NUMA node; the 1:1 +// count equality check does not apply. +// +// This is true for both multi-node sandboxes and right-sized +// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA +// collapses the topology to one node, that single node still carries a +// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node), +// and pinning to that subset is what makes right-sizing actually deliver +// host-thread locality, not just guest-topology locality. +// +// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false), +// it fetches the sandbox's number of vCPU threads and number of CPUs in +// CPUSet. 
If the two are equal, each vCPU thread is pinned 1:1 to the +// CPUs in CPUSet; otherwise pinning is skipped. func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { if s.config == nil { return fmt.Errorf("no sandbox config found") @@ -2972,7 +2989,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } - // fetch vCPU thread ids and CPUSet vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx) if err != nil { return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) @@ -2987,9 +3003,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } cpuSetSlice := cpuSet.ToSlice() - // check if vCPU thread numbers and CPU numbers are equal + numaNodes := s.config.HypervisorConfig.GuestNUMANodes + + if len(cpuSetSlice) == 0 { + if len(numaNodes) >= 1 { + // No cpuset constraint (e.g. ctr without k8s, or a Burstable + // pod with cpuManagerPolicy=none). Build an effective cpuset + // from the NUMA nodes' HostCPUs so pinning works using the + // (possibly right-sized) host NUMA topology. Even a single + // NUMA node here meaningfully constrains pinning to that + // node's host CPUs. 
+ for _, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + continue + } + cpuSet = cpuSet.Union(hostCPUs) + } + cpuSetSlice = cpuSet.ToSlice() + if len(cpuSetSlice) == 0 { + s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning") + } else { + s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + } + + if len(numaNodes) >= 1 { + return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice) + } + numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice) - // if not equal, we should reset threads scheduling to random pattern if numVCPUs != numCPUs { if s.isVCPUsPinningOn { s.isVCPUsPinningOn = false @@ -2997,7 +3046,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } return nil } - // if equal, we can use vCPU thread pinning for i, tid := range vCPUThreadsMap.vcpus { if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil { if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { @@ -3010,6 +3058,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } +// checkVCPUsPinningNUMA pins vCPU threads to host CPUs that belong to the +// same NUMA node as the vCPU's guest NUMA node assignment. vCPUs are +// distributed proportionally to the host CPU count per NUMA node +// (matching buildNUMATopology). It handles any non-empty numaNodes +// slice — including the right-sized single-node case, where every vCPU +// is pinned within the single chosen host NUMA node's CPU set. 
+func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error { + numVCPUs := uint32(len(vCPUThreadsMap.vcpus)) + numNodes := uint32(len(numaNodes)) + if numVCPUs < numNodes { + return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", numVCPUs, numNodes) + } + + vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, numVCPUs) + if err != nil { + return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err) + } + + cpuSetAll := cpuset.NewCPUSet(cpuSetSlice...) + + var cpuOffset uint32 + for i, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", i, err) + } + allowedCPUs := hostCPUs.Intersection(cpuSetAll).ToSlice() + if len(allowedCPUs) == 0 { + s.Logger().WithFields(logrus.Fields{ + "numa-node": i, + "host-cpus": gn.HostCPUs, + "sandbox-cpus": cpuSetSlice, + }).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node") + allowedCPUs = cpuSetSlice + } + + startVCPU := cpuOffset + endVCPU := startVCPU + vcpusPerNode[i] + cpuOffset = endVCPU + + for vcpuIdx := startVCPU; vcpuIdx < endVCPU; vcpuIdx++ { + tid, ok := vCPUThreadsMap.vcpus[int(vcpuIdx)] + if !ok { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpuIdx) + } + pinIdx := int(vcpuIdx-startVCPU) % len(allowedCPUs) + if err := resCtrl.SetThreadAffinity(tid, allowedCPUs[pinIdx:pinIdx+1]); err != nil { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, allowedCPUs[pinIdx], i, err) + } + } + } + + s.isVCPUsPinningOn = true + return nil +} + // resetVCPUsPinning 
cancels current pinning and restores default random vCPU threads scheduling func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error { for _, tid := range vCPUThreadsMap.vcpus { diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index 7e521f3842..50115c7a5b 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) { err = s.updateResources(context.Background()) assert.NoError(t, err) } + +func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +} + +func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "not-valid"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "failed to parse HostCPUs") +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index bc2aa98121..5e1ff51ae3 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -625,6 +625,29 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) { return numaNodes, nil } +// FilterNUMANodesByCPUSet returns only those 
guest NUMA nodes whose HostCPUs +// intersect with the given sandbox cpuset. If sandboxCPUs is empty (size 0), +// no filtering is applied and the original slice is returned unchanged. +func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode { + if sandboxCPUs.Size() == 0 { + return nodes + } + var filtered []types.GuestNUMANode + for _, n := range nodes { + hostCPUs, err := cpuset.Parse(n.HostCPUs) + if err != nil { + continue + } + if hostCPUs.Intersection(sandboxCPUs).Size() > 0 { + filtered = append(filtered, n) + } + } + if len(filtered) == 0 { + return nodes + } + return filtered +} + // NUMADistEntry represents a single NUMA distance measurement between two nodes. type NUMADistEntry struct { Src uint32 diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index cb11770c8c..90663e64b5 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -20,6 +20,7 @@ import ( "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) @@ -815,3 +816,43 @@ func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) { assert.Error(err) assert.Contains(err.Error(), "must be >= NUMA node count") } + +func TestFilterNUMANodesByCPUSet(t *testing.T) { + assert := assert.New(t) + + nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-55,112-167"}, + {HostNodes: "1", HostCPUs: "56-111,168-223"}, + } + + // Sandbox cpuset only from node 0 -> should return 1 node + sandboxCPUs, _ := cpuset.Parse("1-40,113-152") + filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) + + // Sandbox cpuset from both nodes -> should return 2 nodes + sandboxCPUs, _ = 
cpuset.Parse("1-40,56-80") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 2) + + // Sandbox cpuset only from node 1 -> should return 1 node + sandboxCPUs, _ = cpuset.Parse("60-70,170-180") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("1", filtered[0].HostNodes) + + // Empty cpuset -> no filtering, return all + emptyCPUs := cpuset.NewCPUSet() + filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs) + assert.Len(filtered, 2) + + // Single-node host (1 NUMA node) -> returns 1 regardless + singleNode := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + } + sandboxCPUs, _ = cpuset.Parse("0-3") + filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) +} From b688619314d25584abfc924fe1f9e2eaea0dd0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 30 Apr 2026 17:18:23 +0200 Subject: [PATCH 07/14] runtime: oci: Fix sandbox CPU sizing with cpuManagerPolicy=static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cpuManagerPolicy=static is configured, kubelet sets the sandbox CPU quota to -1 (unconstrained) because it uses cpuset pinning instead of CFS quota. This causes CalculateSandboxSizing to compute 0 workload CPUs, resulting in the VM starting with only default_vcpus. Fall back to deriving the CPU count from sandbox CPU shares (1024 shares per CPU) when the quota-based calculation yields 0. 
Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/oci/utils.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 39bb029400..229f065740 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -1461,7 +1461,7 @@ func (a *annotationConfiguration) setFloat32WithCheck(f func(float32) error) err // be added to the VM if sandbox annotations are provided with this sizing details func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) { var memory, quota int64 - var period uint64 + var shares, period uint64 var err error if spec == nil || spec.Annotations == nil { @@ -1492,6 +1492,15 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) } } + annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUShares] + if ok { + shares, err = strconv.ParseUint(annotation, 10, 64) + if err != nil { + ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUShares: %s", annotation) + shares = 0 + } + } + annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem] if ok { memory, err = strconv.ParseInt(annotation, 10, 64) @@ -1501,7 +1510,16 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) } } - return calculateVMResources(period, quota, memory) + numCPU, memSizeMB = calculateVMResources(period, quota, memory) + + // When cpuManagerPolicy=static is in use, kubelet sets quota=-1 + // (unconstrained) and assigns CPUs via cpuset instead. Fall back + // to deriving the CPU count from shares (1024 shares per CPU). 
+ if numCPU == 0 && shares > 0 { + numCPU = float32(math.Ceil(float64(shares) / 1024.0)) + } + + return numCPU, memSizeMB } // CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed From f53f4278596f0703d645f5a6549333a8b206964e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 22 May 2026 14:48:42 +0200 Subject: [PATCH 08/14] runtime: Fix vCPU pinning race for Go runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU may not have spawned all vCPU threads when pinning starts, so query_cpus_fast can return an incomplete list and leave some vCPUs unpinned. To fix it, let's add exponential backoff retries before pinning and fall back to available threads if retries are exhausted. Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/sandbox.go | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 0a236e32b8..1d7004e441 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2989,10 +2989,39 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } + expectedVCPUs := int(s.config.HypervisorConfig.NumVCPUs()) + vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx) if err != nil { return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) } + + // QEMU may not have spawned all vCPU threads yet. Retry with + // exponential backoff until we see the expected count. 
+ if len(vCPUThreadsMap.vcpus) < expectedVCPUs { + const maxAttempts = 10 + backoff := 50 * time.Millisecond + for attempt := 2; attempt <= maxAttempts && len(vCPUThreadsMap.vcpus) < expectedVCPUs; attempt++ { + s.Logger().WithFields(logrus.Fields{ + "have": len(vCPUThreadsMap.vcpus), + "want": expectedVCPUs, + "attempt": attempt, + }).Debug("waiting for all vCPU threads to be available") + time.Sleep(backoff) + backoff *= 2 + vCPUThreadsMap, err = s.hypervisor.GetThreadIDs(ctx) + if err != nil { + return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) + } + } + if len(vCPUThreadsMap.vcpus) < expectedVCPUs { + s.Logger().WithFields(logrus.Fields{ + "have": len(vCPUThreadsMap.vcpus), + "want": expectedVCPUs, + }).Warn("not all vCPU threads available after retries; pinning available ones") + } + } + cpuSetStr, _, err := s.getSandboxCPUSet() if err != nil { return fmt.Errorf("failed to get CPUSet config: %v", err) From feeb5d8ecc776d053d9bae3cfde0bfee6ff4a65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 22 May 2026 14:48:42 +0200 Subject: [PATCH 09/14] runtime-rs: Fix vCPU pinning race with backoff retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU can report fewer vCPU threads during early startup, causing partial affinity setup. Let's retry with exponential backoff until the expected thread count is visible, then continue with best-effort pinning if the window is exhausted. 
Signed-off-by: Fabiano Fidêncio --- .../resource/src/cgroups/resource_inner.rs | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs index 7fbcdb2e06..d72dc73efe 100644 --- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs +++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs @@ -7,6 +7,7 @@ use std::collections::{HashMap, HashSet}; use std::process; use std::str::FromStr; +use std::time::Duration; use anyhow::{anyhow, Context, Result}; use cgroups_rs::manager::is_systemd_cgroup; @@ -16,6 +17,7 @@ use kata_types::cpu::CpuSet; use nix::sched::{sched_setaffinity, CpuSet as NixCpuSet}; use nix::unistd::Pid; use oci_spec::runtime::{LinuxCpu, LinuxCpuBuilder, LinuxResources, LinuxResourcesBuilder}; +use tokio::time::sleep; use crate::cgroups::utils::get_tgid_from_pid; use crate::cgroups::CgroupConfig; @@ -187,12 +189,46 @@ impl CgroupsResourceInner { let needs_thread_ids = self.overhead_cgroup.is_some() || self.enable_vcpus_pinning; let thread_ids = if needs_thread_ids { - Some( - hypervisor - .get_thread_ids() - .await - .context("get vCPU thread IDs")?, - ) + let mut tids = hypervisor + .get_thread_ids() + .await + .context("get vCPU thread IDs")?; + + // QEMU may not have spawned all vCPU threads yet. Retry with + // exponential backoff until we see the expected count. 
+ let expected = hypervisor.hypervisor_config().await.cpu_info.default_vcpus.ceil() as usize; + if expected > 0 && tids.vcpus.len() < expected { + const MAX_ATTEMPTS: u32 = 10; + let mut backoff = Duration::from_millis(50); + for attempt in 2..=MAX_ATTEMPTS { + if tids.vcpus.len() >= expected { + break; + } + info!( + sl!(), + "waiting for all vCPU threads: have {}, want {}, attempt {}", + tids.vcpus.len(), + expected, + attempt + ); + sleep(backoff).await; + backoff *= 2; + tids = hypervisor + .get_thread_ids() + .await + .context("get vCPU thread IDs (retry)")?; + } + if tids.vcpus.len() < expected { + warn!( + sl!(), + "not all vCPU threads available after retries: have {}, want {}; pinning available ones", + tids.vcpus.len(), + expected + ); + } + } + + Some(tids) } else { None }; From 15292da217e6520a050820c79700ce19321bc87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:16:39 +0200 Subject: [PATCH 10/14] config: Enable NUMA by default for nvidia-gpu configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable enable_numa=true in the three nvidia-gpu QEMU configuration templates (base, SNP, TDX). On single-NUMA hosts this is a no-op since buildNUMATopology() returns nil when there is only one node. On multi-NUMA hosts it ensures GPU memory accesses are NUMA-local. Add documentation to all QEMU config templates explaining the VFIO device NUMA placement validation that occurs when NUMA is enabled. 
Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/runtime/Makefile | 3 +++ .../config/configuration-qemu-nvidia-gpu-snp.toml.in | 7 ++++++- .../config/configuration-qemu-nvidia-gpu-tdx.toml.in | 7 ++++++- src/runtime/config/configuration-qemu-nvidia-gpu.toml.in | 7 ++++++- src/runtime/config/configuration-qemu.toml.in | 5 +++++ 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 91d3eb976c..88ef8077ff 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -511,6 +511,8 @@ ifneq (,$(QEMUCMD)) DEFENABLEVCPUPINNING_NV = true + DEFENABLENUMA_NV = true + # NVIDIA profile: rootfs filesystem type (erofs for read-only, compressed images) DEFROOTFSTYPE_NV := $(ROOTFSTYPE_EROFS) @@ -689,6 +691,7 @@ USER_VARS += DEFAULTTIMEOUT_NV USER_VARS += DEFAULTLAUNCHPROCESSTIMEOUT_NV USER_VARS += DEFSANDBOXCGROUPONLY_NV USER_VARS += DEFENABLEVCPUPINNING_NV +USER_VARS += DEFENABLENUMA_NV USER_VARS += DEFROOTFSTYPE_NV USER_VARS += DEFROOTFSTYPE USER_VARS += MACHINETYPE diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in index 4dae978b9b..b15186867d 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in @@ -360,7 +360,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. -enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# enabled by default in this configuration; set it to false to opt out.
+enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in index 1c1ce20b01..2928389b1c 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in @@ -337,7 +337,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. -enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# enabled by default in this configuration; set it to false to opt out. +enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 49f9db0d6e..f373082129 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -319,7 +319,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation.
-enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# enabled by default in this configuration; set it to false to opt out. +enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 811884a088..5a51f628ca 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -314,6 +314,11 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. +# +# When VFIO devices (e.g. GPUs) are cold-plugged and NUMA is enabled, the +# runtime validates that each device's host NUMA node is covered by the guest +# NUMA topology. A warning is logged if a device falls outside the configured +# nodes, indicating potential cross-NUMA memory access overhead. enable_numa = false # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. From 1cbe930fc9897f61b70a4e88eebdafb96b3c8f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 15 May 2026 17:36:59 +0200 Subject: [PATCH 11/14] runtime: Add pxb-pcie NUMA-aware PCIe topology for VFIO devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When NUMA placement is active and VFIO devices are cold-plugged, create a pxb-pcie (PCIe Expander Bridge) per NUMA node that has devices. Each pxb-pcie carries a numa_node property that gives the guest kernel correct NUMA affinity for all PCI devices beneath it.
Root ports are created on each pxb-pcie bus instead of pcie.0, and VFIODevice.Attach() assigns each device to the root port on its host NUMA node's pxb bridge. Non-VFIO devices remain on pcie.0. NUMA placement is "active" when there is more than one guest NUMA node OR a single guest node mapped to a specific host node (the latter happens when maybeRightSizeAutoNUMA() collapses a multi-node sandbox to the GPU's host NUMA node). In both cases buildNUMATopology() also emits the matching memory-backend-ram,host-nodes=,policy=bind entries so guest memory is sourced from the right host node. So pxb-pcie can never capture a leaf virtio-pci device as the default bus, every virtio-pci device emitter (NetDevice, VSOCK, vhost-user-{net,scsi,blk,fs}) now appends bus=pcie.0 explicitly when the machine actually exposes a pcie.0 root. Detection is done via a new hasPCIeRoot() helper that returns true only for q35/virt machine types — ppc64le's pseries (pci.0), s390x's s390-ccw-virtio (CCW transport) and microvm (no PCI) intentionally skip the pin to avoid "Bus 'pcie.0' not found" at startup. This is the only QEMU mechanism that works for both regular and confidential (TDX/SNP) guests, as it operates through the PCI bus hierarchy rather than ACPI table injection. 
Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/device/config/config.go | 11 ++ src/runtime/pkg/device/drivers/vfio.go | 16 ++- src/runtime/pkg/device/manager/manager.go | 2 + src/runtime/pkg/govmm/qemu/qemu.go | 84 +++++++++++-- .../pkg/govmm/qemu/qemu_arch_base_test.go | 7 +- src/runtime/pkg/govmm/qemu/qemu_test.go | 36 +++++- src/runtime/virtcontainers/qemu.go | 119 +++++++++++++++++- src/runtime/virtcontainers/qemu_test.go | 102 +++++++++++++++ 8 files changed, 356 insertions(+), 21 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index f41263975b..489cf5e4dc 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -235,6 +235,17 @@ var ( // different types of PCI ports. We can deduces the Bus number from it // and eliminate duplicates being assigned. PCIeDevicesPerPort = map[PCIePort][]VFIODev{} + + // NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie + // bridges. When NUMA-aware PCIe topology is active (pxb-pcie), + // createPCIeTopology populates this so VFIODevice.Attach() can assign + // each device to the root port on its host NUMA node's pxb-pcie bus. + // Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb. + NUMARootPorts = map[int][]string{} + + // NUMARootPortDeviceCount tracks how many devices have been assigned + // to each host NUMA node's root ports (for round-robin assignment). + NUMARootPortDeviceCount = map[int]int{} ) // DeviceInfo is an embedded type that contains device data common to all types of devices. 
diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 42d86e2dca..ff70c4ac76 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } if vfio.IsPCIe { - busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) - // We need to keep track the number of devices per port to deduce - // the corectu bus number, additionally we can use the VFIO device - // info to act upon different Vendor IDs and Device IDs. + // When pxb-pcie NUMA topology is active, assign the device + // to a root port on the pxb-pcie bridge for its host NUMA + // node instead of the default rp/swdp numbering. + if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 { + idx := config.NUMARootPortDeviceCount[vfio.NUMANode] + vfio.Bus = rpIDs[idx%len(rpIDs)] + config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1 + } else { + busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + } config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio) } } diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 06f9117676..5726613e3a 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0) + config.NUMARootPorts = make(map[int][]string) + config.NUMARootPortDeviceCount = make(map[int]int) for _, dev := range 
devices { dm.devices[dev.DeviceID()] = dev diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index dc7501c87b..9dca1e959e 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -50,6 +50,20 @@ const ( qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket" ) +// hasPCIeRoot reports whether the configured QEMU machine type exposes a +// `pcie.0` root complex (q35 on x86, virt on arm64). Machines such as +// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport) +// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting +// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU. +// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie. +func hasPCIeRoot(config *Config) bool { + if config == nil { + return false + } + t := config.Machine.Type + return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt") +} + const ( // Well known vsock CID for host system. // https://man7.org/linux/man-pages/man7/vsock.7.html @@ -132,6 +146,10 @@ const ( // VHostVSockPCI is a generic Vsock vhost device with PCI transport. VHostVSockPCI DeviceDriver = "vhost-vsock-pci" + // PXBPCIe is a PCIe Expander Bridge that creates a new PCI root + // complex with NUMA node affinity. + PXBPCIe DeviceDriver = "pxb-pcie" + // PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port. PCIeRootPort DeviceDriver = "pcie-root-port" @@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string { if netdev.Bus != "" { deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus)) + } else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device as the default bus. Skipped on machines + // without a `pcie.0` root (pseries, microvm, s390-ccw-virtio). 
+ deviceParams = append(deviceParams, "bus=pcie.0") } if netdev.Addr != "" { @@ -1587,9 +1610,11 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address)) if vhostuserDev.Transport.isVirtioPCI(config) { - // Pin to pcie.0 so pxb-pcie (when present) doesn't capture - // this leaf device as the default bus. - deviceParams = append(deviceParams, "bus=pcie.0") + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1618,7 +1643,9 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1646,7 +1673,9 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1686,7 +1715,9 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } if vhostuserDev.Transport.isVirtioPCI(config) { - 
deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1752,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string { } } +// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie). +// It creates a new PCI root complex with NUMA node affinity, allowing +// devices attached to its bus hierarchy to inherit the NUMA association. +// This is the only QEMU PCI device that carries a numa_node property. +type PXBPCIeDevice struct { + // ID is the QEMU device identifier (e.g. "pxb-numa0"). + ID string + + // BusNr is the guest PCI bus number for this root complex. + // Use values spaced apart (e.g. 0x20, 0x40) to leave room for + // bridges beneath each pxb-pcie. + BusNr uint8 + + // NUMANode is the guest NUMA node index this root complex belongs to. + NUMANode int +} + +// QemuParams returns the QEMU parameters for a pxb-pcie device. +func (dev PXBPCIeDevice) QemuParams(_ *Config) []string { + return []string{ + "-device", + fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode), + } +} + +// Valid returns true if the PXBPCIeDevice structure is valid and complete. +func (dev PXBPCIeDevice) Valid() bool { + return dev.ID != "" +} + // PCIeRootPortDevice represents a memory balloon device. 
// nolint: govet type PCIeRootPortDevice struct { @@ -2324,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID)) deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID)) - if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + if vsock.Transport.isVirtioPCI(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vsock.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + } } if vsock.Transport.isVirtioCCW(config) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a15e77c184..36e03254ae 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserBlk, ROMFile: romfile, } - testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t) + // vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt. 
+ testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t) vhostuserSCSIDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserSCSI, ROMFile: romfile, } - testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) + testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) vhostuserNetDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserNet, ROMFile: romfile, } - testAppend(vhostuserNetDevice, deviceVhostUserNetString, t) + testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t) } func TestAppendVirtioBalloon(t *testing.T) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 8be4d0d779..e4616a8231 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -24,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) { testConfigAppend(&config, structure, expected, t) } +// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so +// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves +// pinned to bus=pcie.0) take the PCIe path. Use this for tests whose +// expected string contains "bus=pcie.0". +func testAppendQ35(structure interface{}, expected string, t *testing.T) { + config := Config{Machine: Machine{Type: "q35"}} + testConfigAppend(&config, structure, expected, t) +} + func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) { switch s := structure.(type) { case Machine: @@ -343,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) { vsockDevice.DevNo = DevNo } - testAppend(vsockDevice, deviceVSOCKString, t) + // deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines. 
+ testAppendQ35(vsockDevice, deviceVSOCKString, t) +} + +// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0` +// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT +// emit `bus=pcie.0` — doing so would crash QEMU with +// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly +// rather than using the arch-conditional `romfile` constant (which is +// "" on s390x via qemu_s390x_test.go), so the test exercises the +// same code path on every architecture. +func TestAppendVSOCKNoPCIeRoot(t *testing.T) { + const vsockRomfile = "efi-virtio.rom" + vsockDevice := VSOCKDevice{ + ID: "vhost-vsock-pci0", + ContextID: 4, + VHostFD: nil, + DisableModern: true, + ROMFile: vsockRomfile, + Transport: TransportPCI, + } + + // pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted. + expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile + config := Config{Machine: Machine{Type: "pseries"}} + testConfigAppend(&config, vsockDevice, expected, t) } func TestVSOCKValid(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 173f72b2c1..74818ff5d6 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -565,10 +565,12 @@ func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) { func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) { // q.config.GuestNUMANodes has already been right-sized (when applicable) - // by maybeRightSizeAutoNUMA() at hypervisor setup time, so a length - // of 1 here means "no NUMA topology"; fall through to a flat memdev. + // by maybeRightSizeAutoNUMA() at hypervisor setup time. Empty means + // no NUMA topology; a single node may still carry a HostNodes binding + // (e.g. right-sized to the GPU's NUMA node), in which case we must + // emit it so memory is bound to the correct host node. 
numaNodes := q.config.GuestNUMANodes - if len(numaNodes) <= 1 { + if !numaPlacementActive(numaNodes) { return nil, nil, nil } @@ -1298,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if numOfPluggablePorts > maxPCIeRootPort { return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } + + // When NUMA is active (multi-node OR a single node right-sized to a + // specific host node), create pxb-pcie bridges so cold-plugged VFIO + // devices inherit the correct guest NUMA affinity. + if numaPlacementActive(q.config.GuestNUMANodes) && len(hypervisorConfig.VFIODevices) > 0 { + qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts) + return nil + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts) return nil } @@ -3077,7 +3088,107 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff return memory } -// genericAppendPCIeRootPort appends to devices the given pcie-root-port +// numaPlacementActive reports whether the runtime should emit per-NUMA +// pxb-pcie / memory-binding QEMU args. True when there is more than one +// guest node, OR a single guest node with an explicit HostNodes binding. +// +// The single-node case covers two scenarios that the runtime cannot tell +// apart after right-sizing: +// - a multi-NUMA host whose workload was collapsed to one host node +// (e.g. GPU on host node 0) — pxb-pcie + host-nodes binding are +// required so the guest GPU reports the correct NUMA affinity; +// - a single-NUMA host with `enable_numa=true` — emitting the binding +// is a functional no-op (the only host node is node 0 anyway). +// +// Single node without a HostNodes value (no NUMA mapping at all) falls +// through to the flat memdev path. 
+func numaPlacementActive(nodes []types.GuestNUMANode) bool { + if len(nodes) > 1 { + return true + } + return len(nodes) == 1 && nodes[0].HostNodes != "" +} + +// createNUMAPCIeTopology creates pxb-pcie bridges for NUMA nodes that have +// VFIO devices, then creates root ports on each pxb bus. VFIO devices will +// be assigned to these root ports during Attach() based on their host NUMA +// node, giving the guest kernel correct NUMA affinity for the PCI devices. +func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device { + coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes) + + // Count VFIO devices per host NUMA node. + numaDevCount := make(map[int]int) + for _, dev := range hypervisorConfig.VFIODevices { + hostPath, err := config.GetHostPath(dev, false, "") + if err != nil { + continue + } + dev.HostPath = hostPath + var vfioDevs []*config.VFIODev + if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) { + vfioDevs, _ = drivers.GetDeviceFromVFIODev(dev) + } else { + vfioDevs, _ = drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + } + for _, vd := range vfioDevs { + if vd.NUMANode >= 0 && drivers.IsPCIeDevice(vd.BDF) { + numaDevCount[vd.NUMANode]++ + } + } + } + + if len(numaDevCount) == 0 { + return q.arch.appendPCIeRootPortDevice(devices, totalPorts) + } + + // Create a pxb-pcie + root ports per NUMA node that has devices. 
+ var rpIndex uint32 + const busNrSpacing uint8 = 0x20 + + for hostNode, devCount := range numaDevCount { + guestNode, ok := coveredHostNodes[hostNode] + if !ok { + q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie") + continue + } + + pxbID := fmt.Sprintf("pxb-numa%d", guestNode) + busNr := busNrSpacing * uint8(guestNode+1) + + devices = append(devices, govmmQemu.PXBPCIeDevice{ + ID: pxbID, + BusNr: busNr, + NUMANode: int(guestNode), + }) + + // Create root ports on this pxb bus for the VFIO devices. + var rpIDs []string + for i := 0; i < devCount; i++ { + rpID := fmt.Sprintf("rp-numa%d-%d", guestNode, i) + rpIDs = append(rpIDs, rpID) + devices = append(devices, govmmQemu.PCIeRootPortDevice{ + ID: rpID, + Bus: pxbID, + Chassis: fmt.Sprintf("%d", 10+guestNode), + Slot: fmt.Sprintf("%d", i), + }) + rpIndex++ + } + + config.NUMARootPorts[hostNode] = rpIDs + + q.Logger().WithFields(logrus.Fields{ + "pxb-id": pxbID, + "bus-nr": busNr, + "guest-numa": guestNode, + "host-numa": hostNode, + "root-ports": rpIDs, + }).Info("Created pxb-pcie with root ports for NUMA VFIO placement") + } + + return devices +} + func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device { var ( bus string diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index db494bf365..9fcb8dc1fa 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -1203,7 +1203,33 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) { assert.Equal(100, q.state.HotpluggedMemory) } +func TestNumaPlacementActive(t *testing.T) { + assert := assert.New(t) + cases := []struct { + name string + nodes []types.GuestNUMANode + want bool + }{ + {"empty", nil, false}, + {"single-node-no-binding", []types.GuestNUMANode{{}}, false}, + {"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true}, + {"single-node-host-1", 
[]types.GuestNUMANode{{HostNodes: "1"}}, true}, + {"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true}, + {"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true}, + } + for _, c := range cases { + assert.Equal(c.want, numaPlacementActive(c.nodes), c.name) + } +} + func TestBuildNUMATopologySingleNode(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // A single guest node mapped to a specific host node (e.g. produced + // by maybeRightSizeAutoNUMA() collapsing a multi-node sandbox to the + // GPU's host NUMA node) must still emit a one-node topology so that + // the memory backend gets a host-nodes= binding. assert := assert.New(t) q := &qemu{ config: HypervisorConfig{ @@ -1214,12 +1240,88 @@ func TestBuildNUMATopologySingleNode(t *testing.T) { }, }, } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-3", nodes[0].CPUs) + assert.Equal("1024M", nodes[0].MemSize) + assert.Equal("0", nodes[0].HostNodes) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) +} + +func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) { + // A single guest node without a HostNodes value carries no NUMA + // binding intent; buildNUMATopology() must return nil so that the + // QEMU command line falls through to the flat memdev path. 
+ assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "", HostCPUs: "0-3"}, + }, + }, + } nodes, dists, err := q.buildNUMATopology() assert.NoError(err) assert.Nil(nodes) assert.Nil(dists) } +func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped the only guest node to a non-zero host node + // (e.g. numa_mapping = ["1"]). buildNUMATopology() must propagate + // HostNodes verbatim so the memory backend ends up bound to host + // node 1 rather than the default node 0. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + NUMAMapping: []string{"1"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "1", HostCPUs: "0-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("1", nodes[0].HostNodes) +} + +func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped two guest nodes to disjoint host-node ranges + // (e.g. numa_mapping = ["0-1", "2-3"]). buildNUMATopology() must + // preserve the ranged HostNodes strings on each emitted NUMANode. 
+ assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 8, + MemorySize: 2048, + NUMAMapping: []string{"0-1", "2-3"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-3"}, + {HostNodes: "2-3", HostCPUs: "4-7"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("0-1", nodes[0].HostNodes) + assert.Equal("2-3", nodes[1].HostNodes) +} + func TestBuildNUMATopologyTwoNodes(t *testing.T) { if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) From 8787da13a949b88cb0278d1cc46d3687dc9b3d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 14 Apr 2026 15:06:33 +0200 Subject: [PATCH 12/14] agent: Add NUMA-aware PCI path parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend pcipath_from_dev_tree_path() to support the full NUMA-aware path format "root_complex/bus/device" (e.g. "10/00/02") in addition to the legacy "bus/device" format, defaulting to root complex "00" for backward compatibility. Signed-off-by: Fabiano Fidêncio Signed-off-by: Zvonko Kaiser --- src/agent/src/linux_abi.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index cb5c6bc3f0..a89454263e 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -26,15 +26,29 @@ pub fn create_pci_root_bus_path(root_complex: &str) -> String { format!("/devices/pci0000:{root_complex}") } -// This is used in several modules, let's create a helper function to parse the -// qom path and switch easily once the shim sends us the full NUMA path +// Parses a device tree path into a (root_complex, PCI path) pair. +// +// Supports two formats: +// - Full NUMA path: "root_complex/bus/device" (e.g. 
"10/00/02") where the +// first segment is the root complex and the rest form the PCI path. +// - Legacy path: "bus/device" (e.g. "00/02") which defaults to root complex "00". pub fn pcipath_from_dev_tree_path(dev_tree_path: &str) -> Result<(&str, pci::Path)> { - // Placeholder until the shim send us the full NUMA path - // via shim in the form of root_complex/bus/device 10/00/02 - // Currently the shim only sends us the bus/device 00/02 - let pci_path = pci::Path::from_str(dev_tree_path) - .with_context(|| format!("Failed to parse PCI path from QOM path '{}'", dev_tree_path))?; - Ok(("00", pci_path)) + let segments: Vec<&str> = dev_tree_path.split('/').collect(); + if segments.len() >= 3 { + let root_complex = segments[0]; + let pci_part = &dev_tree_path[root_complex.len() + 1..]; + let pci_path = pci::Path::from_str(pci_part).with_context(|| { + format!( + "Failed to parse PCI path from NUMA path '{}'", + dev_tree_path + ) + })?; + Ok((root_complex, pci_path)) + } else { + let pci_path = pci::Path::from_str(dev_tree_path) + .with_context(|| format!("Failed to parse PCI path from '{}'", dev_tree_path))?; + Ok(("00", pci_path)) + } } #[cfg(target_arch = "aarch64")] From 20705470e928903a33403e4a8ac6460b4c3d120f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 14 May 2026 22:43:29 +0200 Subject: [PATCH 13/14] docs: Add NUMA support guide for Kata Containers with QEMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a step-by-step how-to guide covering host inspection, Kata NUMA drop-in setup (via kata-deploy Helm and manual config.d/), pod deployment examples, and guest/host verification procedures. 
Signed-off-by: Fabiano Fidêncio --- docs/.nav.yml | 2 + docs/how-to/README.md | 1 + docs/how-to/how-to-use-numa-with-kata.md | 645 ++++++++++++++++++ .../NVIDIA-GPU-passthrough-and-Kata-QEMU.md | 11 + tests/spellcheck/kata-dictionary.txt | 2 + 5 files changed, 661 insertions(+) create mode 100644 docs/how-to/how-to-use-numa-with-kata.md diff --git a/docs/.nav.yml b/docs/.nav.yml index 7dc1b12238..fa96b03a40 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -16,6 +16,8 @@ nav: - NVIDIA GPU Passthrough: use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md - NVIDIA vGPU: use-cases/NVIDIA-GPU-passthrough-and-Kata.md - Intel QAT: use-cases/using-Intel-QAT-and-kata.md + - How To: + - NUMA Support: how-to/how-to-use-numa-with-kata.md - Contributing: - Documentation: doc-contributing.md - Misc: diff --git a/docs/how-to/README.md b/docs/how-to/README.md index e2742ef374..134dd35e67 100644 --- a/docs/how-to/README.md +++ b/docs/how-to/README.md @@ -52,4 +52,5 @@ - [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md) - [How to use passthroughfd-IO with runtime-rs and Dragonball](how-to-use-passthroughfd-io-within-runtime-rs.md) - [How to use EROFS snapshotter with Kata Containers](how-to-use-erofs-snapshotter-with-kata.md) +- [How to use NUMA with Kata Containers](how-to-use-numa-with-kata.md) diff --git a/docs/how-to/how-to-use-numa-with-kata.md b/docs/how-to/how-to-use-numa-with-kata.md new file mode 100644 index 0000000000..671f1a3d1d --- /dev/null +++ b/docs/how-to/how-to-use-numa-with-kata.md @@ -0,0 +1,645 @@ +# NUMA Support for Kata Containers with QEMU + +## Overview + +Non-Uniform Memory Access (NUMA) is a memory architecture where access +latency depends on which CPU is accessing which memory region. On +multi-socket or multi-chiplet systems, each NUMA node has local memory that +its CPUs can access faster than remote memory belonging to other nodes. 
+ +When running performance-sensitive workloads — particularly GPU passthrough +via VFIO — cross-NUMA memory access can significantly degrade throughput. +Kata Containers can expose the host NUMA topology to the guest VM so that +vCPUs, memory, and devices are all placed on the correct NUMA node, preserving +memory locality. + +This guide walks through the full setup end-to-end: host inspection, +Kubernetes configuration, Kata configuration, pod deployment, and +verification. + +> **Note:** +> +> NUMA support is currently available only for the **Go runtime** with the +> **QEMU hypervisor** on **amd64** and **arm64** architectures. The Rust +> runtime (`runtime-rs`) does not yet support NUMA topology. + +## Step 1: Inspect the Host NUMA Topology + +Before configuring anything, understand your host. Run on each worker node: + +```bash +$ numactl --hardware +``` + +Example output on a 2-socket system with 8 CPUs per socket: + +``` +available: 2 nodes (0-1) +node 0 cpus: 0 1 2 3 4 5 6 7 +node 0 size: 65536 MB +node 1 cpus: 8 9 10 11 12 13 14 15 +node 1 size: 65536 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10 +``` + +Take note of: +- How many NUMA nodes exist (here: 2) +- Which CPUs belong to each node (here: 0-7 on node 0, 8-15 on node 1) +- The distance matrix (here: 10 local, 21 remote) + +If you have GPUs, check which NUMA node each GPU is attached to: + +```bash +$ lspci -nnk -d 10de: | grep -A2 "NVIDIA" +$ cat /sys/bus/pci/devices/0000:41:00.0/numa_node +``` + +Replace `0000:41:00.0` with your GPU's PCI address. The output (`0` or `1`) +tells you which NUMA node the GPU sits on. + +On a single-NUMA host (only node 0), enabling NUMA is a harmless no-op — +the runtime detects one node and skips multi-NUMA topology. + +## Step 2: Kubernetes CPU Manager Policy + +Kata's NUMA-aware vCPU pinning works **without** `cpuManagerPolicy: static`. 
+The recommended policy is the default (`none`): + +```yaml +apiVersion: kubelet.config.k8s.io/v1beta1 +kind: KubeletConfiguration +cpuManagerPolicy: "none" +``` + +> **Why not `static`?** +> +> With `cpuManagerPolicy: static`, Kubernetes assigns dedicated CPUs to +> Guaranteed QoS pods. On a multi-NUMA host, those CPUs are often all from +> a **single** NUMA node (depending on the topology manager policy). This +> causes the sandbox CPUSet to cover only one NUMA node, which defeats the +> purpose of multi-NUMA guest topology. +> +> With `cpuManagerPolicy: none` (the default), the pod inherits the full +> node CPUSet spanning all NUMA nodes, and Kata's NUMA-aware pinning +> distributes vCPU threads proportionally across host NUMA nodes. + +### 2.1 Check the current policy + +```bash +$ grep cpuManagerPolicy /var/lib/kubelet/config.yaml +``` + +If it shows `static`, switch to `none`: + +```bash +$ sudo sed -i 's/cpuManagerPolicy:.*/cpuManagerPolicy: "none"/' /var/lib/kubelet/config.yaml +$ sudo rm -f /var/lib/kubelet/cpu_manager_state +$ sudo systemctl restart kubelet +``` + +## Step 3: Configure Kata Containers for NUMA + +> **Note:** +> +> If you are using the NVIDIA GPU runtime classes +> (`kata-qemu-nvidia-gpu`, `kata-qemu-nvidia-gpu-snp`, +> `kata-qemu-nvidia-gpu-tdx`), NUMA is already enabled by default in their +> configuration templates. You only need the steps below for the base +> `kata-qemu` runtime class or custom configurations. + +Never edit the base `configuration-qemu.toml` directly — use a +**configuration drop-in** so your customizations survive upgrades. 
+ +### 3.1 Via kata-deploy Helm chart (recommended) + +Add a custom runtime with a NUMA drop-in in your Helm values file: + +```yaml +customRuntimes: + enabled: true + runtimes: + numa: + baseConfig: qemu + runtimeClass: | + apiVersion: node.k8s.io/v1 + kind: RuntimeClass + metadata: + name: kata-qemu-numa + handler: kata-qemu-numa + dropIn: | + [hypervisor.qemu] + enable_numa = true + numa_mapping = [] + + [runtime] + static_sandbox_resource_mgmt = true + enable_vcpus_pinning = true +``` + +Then install (or upgrade) the Helm chart: + +```bash +$ helm upgrade kata-deploy \ + --namespace kata-system \ + -f my-values.yaml \ + "${CHART}" --version "${VERSION}" +``` + +Pods using `runtimeClassName: kata-qemu-numa` will get the NUMA-enabled +configuration. + +With `numa_mapping = []` (empty), the runtime auto-discovers host NUMA nodes +and creates a 1:1 guest-to-host mapping, then **right-sizes** the resulting +topology: if the sandbox's CPU and memory budget fits on a single host +NUMA node — and any cold-plugged VFIO devices live on that same node — +the guest topology collapses to that one node so the workload keeps full +memory locality without paying a multi-node penalty. Sandboxes that +genuinely span multiple host nodes keep the auto-derived multi-node +topology. An explicit `numa_mapping` opts out of right-sizing and is +honored verbatim — useful when you want a specific layout regardless of +sandbox size, or to group multiple host nodes into fewer guest nodes +(e.g., on a 4-socket system): + +```yaml + dropIn: | + [hypervisor.qemu] + enable_numa = true + numa_mapping = ["0-1", "2-3"] +``` + +Each entry is a cpuset-style string (ranges like `0-3` and lists like +`0,2,4` are both valid). + +### 3.2 Via manual drop-in on the node + +If you manage nodes directly (without kata-deploy), create a drop-in file +under the `config.d/` directory. 
Use a `50-*` prefix (the reserved range +for user customizations): + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-numa.toml <<'EOF' +[hypervisor.qemu] +enable_numa = true +numa_mapping = [] + +[runtime] +static_sandbox_resource_mgmt = true +enable_vcpus_pinning = true +EOF +``` + +The drop-in is merged on top of the base `configuration-qemu.toml` +automatically. No restart is needed — the shim reads the configuration +at pod creation time. + +> **Note:** +> +> For details on the drop-in mechanism, reserved prefix ranges, and +> additional Helm examples, see the +> [Helm configuration guide](../../docs/helm-configuration.md). + +### 3.3 Verify the effective configuration + +After applying the drop-in, verify the merged configuration on the node: + +```bash +$ grep -rE "enable_numa|numa_mapping|static_sandbox_resource_mgmt|enable_vcpus_pinning" \ + /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/ +``` + +## Step 4: Deploy a NUMA-Aware Pod + +### 4.1 Basic NUMA pod + +Create a pod that requests enough CPUs to span both NUMA nodes. Use the +runtime class matching your NUMA configuration from Step 3 (e.g., +`kata-qemu-numa` if you created a custom runtime, or `kata-qemu` if you +applied a drop-in to the base config). Kata sizes the VM based on +`limits`, so set `limits.cpu` to the desired vCPU count: + +```bash +$ cat <<'EOF' | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: numa-test +spec: + runtimeClassName: kata-qemu-numa + containers: + - name: numa-check + image: ubuntu:24.04 + command: ["sleep", "infinity"] + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "80" + memory: "64Gi" +EOF +``` + +> **Note:** +> +> Kata sizes the VM based on `limits` (not `requests`). Using different +> values for `requests` and `limits` makes the pod **Burstable** QoS, +> which avoids Kubernetes CPU manager interference with NUMA-aware +> pinning. 
The large `limits.cpu` value tells Kata to create a VM with +> that many vCPUs distributed across NUMA nodes. + +### 4.2 GPU passthrough pod with NUMA + +For GPU workloads, use the NVIDIA GPU runtime class. NUMA is enabled by +default in the GPU configuration templates: + +```bash +$ cat <<'EOF' | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: gpu-numa-test +spec: + runtimeClassName: kata-qemu-nvidia-gpu + containers: + - name: cuda-test + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04 + resources: + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/pgpu: "1" +EOF +``` + +## Step 5: Verify NUMA Inside the Guest + +### 5.1 Check guest NUMA topology + +Exec into the running pod and inspect the NUMA layout: + +```bash +$ kubectl exec -it numa-test -- bash +``` + +Inside the pod: + +```bash +$ apt-get update && apt-get install -y numactl +$ numactl --hardware +``` + +Expected output on a 2-NUMA-node guest: + +``` +available: 2 nodes (0-1) +node 0 cpus: 0 1 +node 0 size: 2048 MB +node 1 cpus: 2 3 +node 1 size: 2048 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10 +``` + +Key things to verify: +- **Number of nodes** matches your host (or `numa_mapping` configuration). +- **CPUs** are distributed across nodes (not all on node 0). +- **Memory** is split across nodes (not all on node 0). +- **Distances** mirror the host distances. 
+ +### 5.2 Check CPU-to-NUMA mapping + +```bash +$ lscpu | grep -i numa +``` + +Expected: + +``` +NUMA node(s): 2 +NUMA node0 CPU(s): 0,1 +NUMA node1 CPU(s): 2,3 +``` + +### 5.3 Check from /proc and /sys inside the guest + +```bash +$ cat /sys/devices/system/node/node*/cpulist +``` + +Expected: + +``` +0-1 +2-3 +``` + +```bash +$ cat /sys/devices/system/node/node*/meminfo | grep MemTotal +``` + +Expected (values will vary based on your pod's memory limit): + +``` +Node 0 MemTotal: 2097152 kB +Node 1 MemTotal: 2097152 kB +``` + +## Step 6: Verify NUMA on the Host + +### 6.1 Check vCPU pinning + +From the host, find the QEMU process and check its thread affinities: + +```bash +$ QEMU_PID=$(pgrep -f "qemu.*numa-test") +$ ls /proc/${QEMU_PID}/task/ | while read tid; do + echo "TID ${tid}: $(taskset -p ${tid} 2>/dev/null)" + done +``` + +With NUMA pinning enabled, you should see vCPU threads pinned to specific +CPUs (not the full CPU mask). For example, on a 2-NUMA-node host with +CPUs 0-7 on node 0 and CPUs 8-15 on node 1: + +``` +TID 12345: pid 12345's current affinity mask: 1 # CPU 0 +TID 12346: pid 12346's current affinity mask: 2 # CPU 1 +TID 12347: pid 12347's current affinity mask: 100 # CPU 8 +TID 12348: pid 12348's current affinity mask: 200 # CPU 9 +``` + +### 6.2 Check the shim logs for NUMA configuration + +```bash +$ POD_SANDBOX_ID=$(crictl pods --name numa-test -q) +$ journalctl -t kata | grep "${POD_SANDBOX_ID}" | grep -i numa +``` + +Look for lines like: + +``` +buildNUMATopology: creating 2 guest NUMA nodes +VFIO device NUMA placement validated bdf=0000:41:00.0 host-numa=1 guest-numa=1 +``` + +### 6.3 Check the QEMU command line + +```bash +$ cat /proc/${QEMU_PID}/cmdline | tr '\0' '\n' | grep -E "numa|memory-backend" +``` + +Expected output (varies by configuration): + +``` +-object 
+memory-backend-ram,id=numa-mem1,size=2048M,host-nodes=1,policy=bind,share=on +-numa +node,nodeid=1,memdev=numa-mem1,cpus=2-3 +-numa +dist,src=0,dst=1,val=21 +-numa +dist,src=1,dst=0,val=21 +``` + +Key things to verify: +- Each `-object memory-backend-*` has `host-nodes=N` and `policy=bind` + matching the correct host NUMA node. +- Each `-numa node` has a `cpus=` range and `memdev=` pointing to the + correct memory backend. +- `-numa dist` entries mirror the host distances. + +## Step 7: Verify GPU NUMA Placement (GPU Passthrough Only) + +If using GPU passthrough, verify the device landed on the correct NUMA node: + +### 7.1 Check host-side GPU NUMA node + +```bash +$ GPU_BDF="0000:41:00.0" # Replace with your GPU's PCI address +$ cat /sys/bus/pci/devices/${GPU_BDF}/numa_node +``` + +### 7.2 Check shim logs for VFIO placement validation + +```bash +$ journalctl -t kata | grep -i "VFIO device NUMA" +``` + +Healthy output: + +``` +VFIO device NUMA placement validated bdf=0000:41:00.0 host-numa=1 guest-numa=1 +``` + +Warning output (indicates misconfiguration): + +``` +VFIO device on host NUMA node not covered by guest NUMA topology bdf=0000:41:00.0 host-numa=2 covered-nodes=map[0:0 1:1] +``` + +If you see the warning, extend your `numa_mapping` to include the GPU's host +NUMA node. + +### 7.3 Check GPU NUMA inside the guest + +Inside the GPU pod, verify the GPU reports a valid NUMA node (not `-1`): + +```bash +$ cat /sys/bus/pci/devices/*/numa_node +# Should show 0 or 1 (matching the host GPU's NUMA node), not -1. + +$ nvidia-smi topo --matrix +# Shows the GPU's relationship to NUMA nodes from the guest perspective. +``` + +The runtime uses QEMU's `acpi-generic-initiator` object to wire each VFIO +device to the correct guest NUMA node. If the guest reports `-1`, check +that the QEMU command line contains +`-object acpi-generic-initiator,id=gi-...,pci-dev=...,node=...`. + +## How It Works + +When a VM is created with NUMA enabled, the runtime: + +1. 
**Discovers host NUMA**: Reads + `/sys/devices/system/node/node*/distance` to build the host distance + matrix. + +2. **Right-sizes the topology** (auto-discovery only): When `numa_mapping` + is empty, the runtime compares the sandbox's vCPU and memory budget + against per-node host capacity (read from + `/sys/devices/system/node/node*/meminfo` and `cpulist`). If any + cold-plugged VFIO device pins the sandbox to specific host nodes, the + chosen subset must cover those; otherwise the smallest single host + node that fits the workload is picked. When the resulting subset has + one node, the guest collapses to a single NUMA node bound to that host + node (`host-nodes=`). Sandboxes that exceed any single node + keep the full auto-derived multi-node topology. An explicit + `numa_mapping` opts out of this step entirely and is honored verbatim. + +3. **Builds guest topology**: Creates guest NUMA nodes with per-node memory + backends (`policy=bind` to lock memory to host NUMA nodes), distributes + vCPUs proportionally to host CPU counts, and mirrors distances. For + confidential guests (SEV-SNP, TDX), QEMU automatically enables + `guest_memfd` on each memory backend for private/shared memory + attribute tracking (requires the cross-region conversion patch). + +4. **Restructures SMP**: Sets `sockets = num_NUMA_nodes` and + `cores = ceil(maxvcpus / num_NUMA_nodes)` so QEMU groups vCPUs by socket + per NUMA node. + +5. **Pins vCPUs** (when enabled): Each vCPU thread is pinned to a host CPU + belonging to the same NUMA node. Right-sized single-node sandboxes + also go through this NUMA-aware path, so all vCPUs land on the chosen + host NUMA node's CPUs. + +6. **Places VFIO devices on correct guest NUMA node**: For each + cold-plugged VFIO device (e.g. GPU), the runtime looks up its host + NUMA node, maps it to the corresponding guest NUMA node, and emits a + QEMU `acpi-generic-initiator` object so the guest kernel reports the + correct `numa_node` for the device. 
This ensures GPU memory accesses + stay NUMA-local. If a device's host NUMA node is not covered by the + guest topology, a warning is logged. + +7. **Translates cpuset.mems**: Converts host NUMA node IDs to guest node IDs + before forwarding to the agent. + +## Troubleshooting + +### Guest reports a single NUMA node on a multi-NUMA host + +**Symptom:** Inside a small pod on a 2+ NUMA-node host, `numactl --hardware` +shows only one NUMA node, and the QEMU command line has no `-numa` +arguments. + +**Cause:** Right-sizing collapsed the auto-derived topology because the +sandbox's vCPU + memory budget fits on one host NUMA node. This is the +intended optimization — the pod gets full memory locality without paying +the cross-node penalty for a workload that does not need it. + +**Fix (only if you really want the multi-node layout):** either +- set an explicit `numa_mapping = ["0", "1"]` (or similar) — explicit + mappings skip right-sizing and are honored verbatim, or +- raise the pod's `limits.cpu` / `limits.memory` so the sandbox truly + exceeds any single host node's capacity. + +### Multi-NUMA topology is skipped (too few vCPUs) + +**Symptom:** The shim logs show: + +``` +DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology vcpus=1 numa-nodes=2 +``` + +**Cause:** The pod requested fewer CPUs than there are NUMA nodes. Each +NUMA node needs at least one vCPU. + +**Fix:** Request at least as many CPUs as NUMA nodes in the pod spec: + +```yaml +resources: + limits: + cpu: "2" # At least 2 for a 2-NUMA-node host +``` + +Or increase `default_vcpus` via a drop-in: + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-default-vcpus.toml <<'EOF' +[hypervisor.qemu] +default_vcpus = 2 +EOF +``` + +### vCPU pinning is skipped (empty CPUSet) + +**Symptom:** The shim logs show: + +``` +sandbox CPUSet is empty; skipping vCPU pinning +``` + +**Cause:** The runtime could not determine a CPUSet for pinning. 
With +`cpuManagerPolicy: none` and multi-NUMA enabled, the runtime derives the +CPUSet from the guest NUMA nodes' `HostCPUs`. This message indicates no +NUMA topology was built (e.g., the host has only one NUMA node). + +**Fix:** Verify: + +1. The host has multiple NUMA nodes (`numactl --hardware`) +2. `enable_numa = true` is set in the Kata configuration +3. `enable_vcpus_pinning = true` is set in the Kata configuration +4. `static_sandbox_resource_mgmt = true` is set (so all vCPUs boot at start) + +### NUMA pinning fallback warning + +**Symptom:** The shim logs show: + +``` +NUMA node HostCPUs do not intersect sandbox CPUSet; falling back to full cpuset +``` + +**Cause:** The CPUs Kubernetes assigned to the pod do not overlap with the +host CPUs on the NUMA node. This means NUMA locality is lost for that node. + +**Fix:** Verify that your `numa_mapping` matches the actual host topology: + +```bash +$ numactl --hardware # Check which CPUs are on which nodes +``` + +Ensure the Kubernetes node has CPUs from all mapped NUMA nodes available +for scheduling. + +### Configuration validation error at startup + +**Symptom:** + +``` +NUMA support requires static_sandbox_resource_mgmt to be enabled +``` + +**Fix:** Add `static_sandbox_resource_mgmt` via a drop-in: + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-static-resources.toml <<'EOF' +[runtime] +static_sandbox_resource_mgmt = true +EOF +``` + +## Configuration Reference + +| Option | Section | Default | Description | +|--------|---------|---------|-------------| +| `enable_numa` | `[hypervisor.qemu]` | `false` | Enable guest NUMA topology | +| `numa_mapping` | `[hypervisor.qemu]` | `[]` | Map guest NUMA nodes to host nodes. 
Empty = auto-discover with right-sizing (small sandboxes collapse to one node); non-empty = honored verbatim | +| `static_sandbox_resource_mgmt` | `[runtime]` | varies | Size VM at boot (required for NUMA) | +| `enable_vcpus_pinning` | `[runtime]` | `false` | Pin vCPU threads to host CPUs (NUMA-aware when NUMA enabled) | + +## Limitations + +- NUMA is only supported with the **Go runtime** and **QEMU** hypervisor. +- Only **amd64** and **arm64** architectures are supported. +- NUMA requires `static_sandbox_resource_mgmt = true` (no dynamic + CPU/memory hotplug). +- The VM needs at least as many vCPUs as NUMA nodes. If fewer vCPUs are + available, multi-NUMA is skipped and the shim logs a + `skipping multi-NUMA topology` message (see Troubleshooting). +- vCPU pinning with NUMA works best with `cpuManagerPolicy: none` (the + default). Using `static` may restrict the pod's CPUSet to a single NUMA + node, preventing balanced pinning across nodes. +- Confidential guests (SEV-SNP, TDX) with NUMA require a QEMU patch + ([accel/kvm: Fix kvm_convert_memory calls crossing memory regions](https://github.com/AMDESE/qemu/commit/6b0eaa20)) + to handle page conversions that span multiple NUMA memory backends. + The GPU-experimental QEMU builds (`gpu-snp`, `gpu-tdx`) include this + patch. Without it, QEMU crashes with + `ram_block_attributes_state_change, invalid range`. diff --git a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md index b20a9d9d3e..118cf16919 100644 --- a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md +++ b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md @@ -506,6 +506,17 @@ To stop the pod, run: `kubectl delete pod cuda-vectoradd-kata`. ### Next steps +#### NUMA topology for GPU locality + +On multi-NUMA hosts, enabling NUMA support ensures GPU memory accesses stay +local to the NUMA node where the GPU is physically attached, avoiding +cross-NUMA latency. The NVIDIA GPU configuration templates ship with +`enable_numa = true` by default. 
+ +For details on NUMA configuration, topology verification, and +troubleshooting, see the +[NUMA support guide](../how-to/how-to-use-numa-with-kata.md). + #### Use multi-GPU passthrough If you have machines supporting multi-GPU passthrough, use a pod deployment diff --git a/tests/spellcheck/kata-dictionary.txt b/tests/spellcheck/kata-dictionary.txt index 66fb7076b1..e5701c4b82 100644 --- a/tests/spellcheck/kata-dictionary.txt +++ b/tests/spellcheck/kata-dictionary.txt @@ -20,6 +20,7 @@ materialx # Hardware & Architecture AMD APQN +chiplet cpuid DCAP DGPU @@ -78,6 +79,7 @@ ttrpc vsock # Container, Runtime & Misc terms +Burstable cgroupsv1 coredump CPUSET From f763e9cca9d23cdea49540eb94680ea5e3d8dbba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 14 May 2026 22:44:04 +0200 Subject: [PATCH 14/14] tests: Add NUMA topology / GPU placement tests to the NV CIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add k8s-nvidia-numa.bats with five tests that validate NUMA behaviour on hosts where NUMA is configured by default (qemu-nvidia-gpu, qemu-nvidia-gpu-snp, qemu-nvidia-gpu-tdx): 1. Multi-node sandbox (large workload spanning all host NUMA nodes): - Guest NUMA node count matches host - Guest vCPU distribution is balanced across nodes (max-min <= 1) - Guest memory is distributed across NUMA nodes - Host-side vCPU pinning is balanced across NUMA nodes 2. Right-sized single-node sandbox (small workload fitting one node): - Guest collapses to a single NUMA node - All host vCPU threads pinned to that one NUMA node 3. GPU passthrough with VFIO, multi-node: - Guest NUMA topology is balanced (same as test 1) - Guest GPU's NUMA node matches the host GPU's NUMA node (resolved via the vfio-pci,host= from the QEMU command line and /sys/bus/pci/devices//numa_node) - QEMU command line contains pxb-pcie and policy=bind - Host vCPU pinning is balanced 4. 
GPU passthrough with VFIO, right-sized single-node: small workload plus GPU that fits in a single host NUMA node: - Guest collapses to a single NUMA node - The chosen node is the GPU's host NUMA node, not just any node that fits — verified by matching host-nodes= in the memory backend and pxb-pcie numa_node= against the GPU's host node - Guest GPU reports the same NUMA node as the host GPU 5. Explicit numa_mapping in the runtime TOML (QEMU-only): - Drops a config.d/ fragment that sets numa_mapping = ["1"], so the auto-derive + right-sizing path is bypassed entirely - Guest sees exactly 1 NUMA node - QEMU memory backend is bound to host node 1 (host-nodes=1, policy=bind), not host node 0 - Host-side vCPU threads land on host node 1 - Drop-in is removed on teardown so subsequent tests are unaffected Guest-side checks use a dedicated container image (quay.io/kata-containers/numa) that reads sysfs and prints results to stdout — no kubectl exec or CoCo policy overrides needed. Host-side checks (crictl, pgrep, taskset) run directly on the host via sudo; a standalone numa-pinning-check.sh script handles the vCPU thread affinity inspection. The config.d/ helpers used by test 5 are runtime-agnostic (probe Go vs runtime-rs layout on disk) but the test is gated to qemu-* shims since runtime-rs does not yet implement NUMA. Skips cleanly on single-NUMA hosts, unsupported hypervisors, or when no nvidia.com/pgpu resources are available (GPU tests only). 
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- docs/how-to/how-to-use-numa-with-kata.md | 20 +- .../kubernetes/k8s-nvidia-numa.bats | 745 ++++++++++++++++++ .../kubernetes/numa-pinning-check.sh | 44 ++ .../kubernetes/run_kubernetes_nv_tests.sh | 1 + .../numa-topology-gpu-test.yaml.in | 24 + .../numa-topology-test.yaml.in | 23 + .../runtimeclass_workloads/numa/Dockerfile | 17 + .../runtimeclass_workloads/numa/README.md | 36 + .../runtimeclass_workloads/numa/entrypoint.sh | 73 ++ 9 files changed, 967 insertions(+), 16 deletions(-) create mode 100644 tests/integration/kubernetes/k8s-nvidia-numa.bats create mode 100755 tests/integration/kubernetes/numa-pinning-check.sh create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/numa/README.md create mode 100755 tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh diff --git a/docs/how-to/how-to-use-numa-with-kata.md b/docs/how-to/how-to-use-numa-with-kata.md index 671f1a3d1d..248dec80ea 100644 --- a/docs/how-to/how-to-use-numa-with-kata.md +++ b/docs/how-to/how-to-use-numa-with-kata.md @@ -451,20 +451,13 @@ NUMA node. ### 7.3 Check GPU NUMA inside the guest -Inside the GPU pod, verify the GPU reports a valid NUMA node (not `-1`): +Inside the GPU pod: ```bash -$ cat /sys/bus/pci/devices/*/numa_node -# Should show 0 or 1 (matching the host GPU's NUMA node), not -1. - $ nvidia-smi topo --matrix -# Shows the GPU's relationship to NUMA nodes from the guest perspective. ``` -The runtime uses QEMU's `acpi-generic-initiator` object to wire each VFIO -device to the correct guest NUMA node. 
If the guest reports `-1`, check -that the QEMU command line contains -`-object acpi-generic-initiator,id=gi-...,pci-dev=...,node=...`. +This shows the GPU's relationship to NUMA nodes from the guest perspective. ## How It Works @@ -502,13 +495,8 @@ When a VM is created with NUMA enabled, the runtime: also go through this NUMA-aware path, so all vCPUs land on the chosen host NUMA node's CPUs. -6. **Places VFIO devices on correct guest NUMA node**: For each - cold-plugged VFIO device (e.g. GPU), the runtime looks up its host - NUMA node, maps it to the corresponding guest NUMA node, and emits a - QEMU `acpi-generic-initiator` object so the guest kernel reports the - correct `numa_node` for the device. This ensures GPU memory accesses - stay NUMA-local. If a device's host NUMA node is not covered by the - guest topology, a warning is logged. +6. **Validates VFIO devices**: Checks each cold-plugged device's host NUMA + node against the guest topology and logs placement status. 7. **Translates cpuset.mems**: Converts host NUMA node IDs to guest node IDs before forwarding to the agent. diff --git a/tests/integration/kubernetes/k8s-nvidia-numa.bats b/tests/integration/kubernetes/k8s-nvidia-numa.bats new file mode 100644 index 0000000000..dd695e6811 --- /dev/null +++ b/tests/integration/kubernetes/k8s-nvidia-numa.bats @@ -0,0 +1,745 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# NUMA topology and vCPU pinning verification tests for Kata Containers. +# +# Five tests cover the main paths in the runtime's NUMA logic: +# 1. Multi-node sandbox: a workload that does NOT fit in a single host +# NUMA node should be balanced across host nodes — the guest sees +# multiple NUMA nodes with even vCPU/memory distribution and host +# vCPU pinning is balanced as well. +# 2. 
Right-sized single-node sandbox: a workload that DOES fit in a +# single host NUMA node should be collapsed to one node — the guest +# sees exactly one NUMA node with all vCPUs in it AND all host +# QEMU vCPU threads are pinned to that one host NUMA node. +# 3. GPU passthrough (VFIO), multi-node: when a GPU is attached via +# VFIO and the workload spans every host NUMA node, the runtime +# creates pxb-pcie bridges and the guest GPU reports the same NUMA +# node as the host GPU. +# 4. GPU passthrough (VFIO), right-sized single-node: when a small +# workload + GPU fits on a single host NUMA node, the runtime +# collapses the topology to the GPU's host NUMA node (memory and +# vCPUs land on the same node as the GPU, not just any fitting node). +# 5. Explicit numa_mapping in the runtime TOML: when the user pins the +# guest topology to a specific host node via numa_mapping = ["1"], +# maybeRightSizeAutoNUMA() must be a no-op and buildNUMATopology() +# must propagate the binding (memory + vCPU pinning land on the +# chosen host node, regardless of how small the workload is). +# +# Guest-side checks use the quay.io/kata-containers/numa container image +# which reads sysfs and prints results to stdout. The bats test reads +# the output via "kubectl logs" — no kubectl exec, no CoCo policy +# overrides needed. +# +# WARNING: The host-side pinning check runs numa-pinning-check.sh directly +# on the host (not inside a container). This requires the bats runner to +# execute on the k8s node with privileged access to /proc, /sys, crictl, +# and taskset. If the test environment changes so that bats no longer +# runs on the node, these calls must be reworked to use exec_host or +# equivalent. + +load "${BATS_TEST_DIRNAME}/lib.sh" +load "${BATS_TEST_DIRNAME}/confidential_common.sh" + +export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu-snp}" + +# Hypervisors where NUMA is configured and supported by default. 
+# Only qemu-nvidia-gpu variants ship enable_numa=true in their base config. +# runtime-rs does not yet implement NUMA; non-QEMU hypervisors lack support. +NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT=( + "qemu-nvidia-gpu" + "qemu-nvidia-gpu-snp" + "qemu-nvidia-gpu-tdx" +) + +# Multi-node test: large enough to span every host NUMA node. +NUMA_TEST_VCPUS_LARGE="${NUMA_TEST_VCPUS_LARGE:-64}" +NUMA_TEST_MEMORY_LARGE="${NUMA_TEST_MEMORY_LARGE:-64Gi}" + +# Right-sizing test: small enough to fit in a single host NUMA node on +# any reasonable production-class server. +NUMA_TEST_VCPUS_SMALL="${NUMA_TEST_VCPUS_SMALL:-4}" +NUMA_TEST_MEMORY_SMALL="${NUMA_TEST_MEMORY_SMALL:-4Gi}" + +# GPU test: same sizing as the large test, plus a GPU. +NUMA_TEST_VCPUS_GPU="${NUMA_TEST_VCPUS_GPU:-64}" +NUMA_TEST_MEMORY_GPU="${NUMA_TEST_MEMORY_GPU:-64Gi}" + +# Small GPU test: fits in a single host NUMA node, exercises the +# right-sizing path with VFIO (sandbox should land on the GPU's host +# NUMA node, not just any node that fits). 
+NUMA_TEST_VCPUS_GPU_SMALL="${NUMA_TEST_VCPUS_GPU_SMALL:-4}"
+NUMA_TEST_MEMORY_GPU_SMALL="${NUMA_TEST_MEMORY_GPU_SMALL:-4Gi}"
+
+export POD_NAME_NUMA="numa-topology-test"
+POD_NAME_NUMA_GPU="numa-topology-gpu-test"
+
+POD_WAIT_TIMEOUT=${POD_WAIT_TIMEOUT:-600s}
+export POD_WAIT_TIMEOUT
+
+# wait_for_host_pinning polls numa-pinning-check.sh at most
+# HOST_PINNING_RETRIES times, sleeping HOST_PINNING_SLEEP seconds between
+# attempts.
+HOST_PINNING_RETRIES=20
+HOST_PINNING_SLEEP=0.5
+
+# setup runs before every @test: it performs the common suite setup,
+# derives the template / rendered YAML paths for the non-GPU pod, and
+# creates a temporary policy-settings directory with ReadStreamRequest
+# added (presumably so `kubectl logs` is permitted under the generated
+# policy -- confirm against the policy tooling).
+setup() {
+    setup_common || die "setup_common failed"
+
+    pod_yaml_in="${pod_config_dir}/${POD_NAME_NUMA}.yaml.in"
+    pod_yaml="${pod_config_dir}/${POD_NAME_NUMA}.yaml"
+
+    policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
+    add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"
+}
+
+# -----------------------------------------------------------------------------
+# Skip / topology helpers
+# -----------------------------------------------------------------------------
+
+# numa_skip_reason returns a non-empty skip reason on stdout when the
+# current test should be skipped (hypervisor lacks default NUMA support,
+# the host NUMA node count cannot be determined, OR the host has fewer
+# than 2 NUMA nodes). Empty stdout means run.
+# Callers must invoke `skip` themselves — bats `skip` inside command
+# substitution does not propagate.
+numa_skip_reason() {
+    # The space padding on both sides makes the regex match whole array
+    # entries only; quoting the right-hand side is intentional.
+    # shellcheck disable=SC2076
+    if [[ ! " ${NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT[*]} " =~ " ${KATA_HYPERVISOR} " ]]; then
+        echo "NUMA not configured by default on ${KATA_HYPERVISOR} (only qemu-nvidia-gpu variants)"
+        return 0
+    fi
+    local nodes
+    nodes=$(host_numa_node_count) || true
+    # An empty or non-numeric count would be treated as 0 by `-lt` and
+    # produce a misleading "only  NUMA node(s)" message; report the real
+    # problem instead. The test is still skipped in that case.
+    if [[ ! "${nodes}" =~ ^[0-9]+$ ]]; then
+        echo "Could not determine host NUMA node count (is numactl installed?)"
+        return 0
+    fi
+    if [[ "${nodes}" -lt 2 ]]; then
+        echo "Host has only ${nodes} NUMA node(s), need >= 2 for this test"
+    fi
+}
+
+# host_numa_node_count echoes the number of NUMA nodes on the host.
+# WARNING: numactl runs directly on the host, not via exec_host.
+host_numa_node_count() { + numactl --hardware | grep -oP 'available:\s+\K\d+' +} + +# ----------------------------------------------------------------------------- +# Pod lifecycle helpers +# ----------------------------------------------------------------------------- + +# render_pod renders the pod yaml with the given vCPU and memory limits +# and runs auto_generate_policy against it. Each @test calls this with +# its own sizing so the same template can serve multiple scenarios. +render_pod() { + local vcpus="${1}" memory="${2}" + NUMA_TEST_VCPUS="${vcpus}" NUMA_TEST_MEMORY="${memory}" \ + envsubst < "${pod_yaml_in}" > "${pod_yaml}" + auto_generate_policy "${policy_settings_dir}" "${pod_yaml}" +} + +# deploy_and_get_guest_logs renders, applies, waits for Ready, then +# echoes the pod's stdout (the test image prints NUMA topology then +# sleeps). The brief sleep gives the entrypoint time to print before +# we read. +deploy_and_get_guest_logs() { + local vcpus="${1}" memory="${2}" + render_pod "${vcpus}" "${memory}" + kubectl apply -f "${pod_yaml}" + kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA}" + sleep 2 + kubectl logs "${POD_NAME_NUMA}" +} + +# ----------------------------------------------------------------------------- +# Guest-log parsers (operate on stdout from the test container) +# ----------------------------------------------------------------------------- + +# guest_online_count parses a "numa_online: " payload (e.g. "0", +# "0-1", "0-7") and echoes the number of online NUMA nodes it implies. +guest_online_count() { + local online="${1}" + if [[ "${online}" =~ ^([0-9]+)-([0-9]+)$ ]]; then + echo $(( ${BASH_REMATCH[2]} - ${BASH_REMATCH[1]} + 1 )) + elif [[ "${online}" =~ ^[0-9]+$ ]]; then + echo 1 + else + die "Unexpected format for guest NUMA online nodes: ${online}" + fi +} + +# guest_field +# Echoes the value following ":" in . E.g. 
+# guest_field "$logs" numa_online -> "0-1" +guest_field() { + echo "${1}" | grep -oP "${2}:\s*\K\S+" +} + +# guest_per_node_values +# Emits one value per line for "node\d+: " entries +# (e.g. _cpus or _mem_kb). Suitable for `mapfile -t`. +guest_per_node_values() { + echo "${1}" | grep -oP "node\d+${2}:\s*\K\d+" +} + +# ----------------------------------------------------------------------------- +# Host-side pinning helpers +# ----------------------------------------------------------------------------- + +# get_qemu_pid_for_numa_pod resolves the running pod's sandbox via crictl +# and returns the QEMU PID via pgrep. Fails the test if either lookup +# turns up empty. +get_qemu_pid_for_numa_pod() { + local sandbox_id qemu_pid + sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \ + pods --name "${POD_NAME_NUMA}" -q | head -1) + [[ -n "${sandbox_id}" ]] || die "no sandbox id found for pod ${POD_NAME_NUMA}" + + qemu_pid=$(sudo pgrep -f "qemu.*${sandbox_id}" | head -1) + [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for sandbox ${sandbox_id}" + echo "${qemu_pid}" +} + +# pinning_thread_total sums the per-bucket counts in numa-pinning-check.sh +# output ("nodeN: " lines) and echoes the total. +pinning_thread_total() { + echo "${1}" | awk -F: '/^node[0-9]+:/ {sum+=$2} END {print sum+0}' +} + +# wait_for_host_pinning +# Polls numa-pinning-check.sh until at least threads +# report per-CPU affinity, or until HOST_PINNING_RETRIES is exhausted. +# Echoes the final script output regardless of whether convergence was +# reached, so callers can inspect/assert on the bucket distribution. 
+wait_for_host_pinning() {
+    local qemu_pid="${1}" expected="${2}"
+    local script="${BATS_TEST_DIRNAME}/numa-pinning-check.sh"
+    local output total
+    local attempt
+    for ((attempt = 1; attempt <= HOST_PINNING_RETRIES; attempt++)); do
+        output=$(sudo bash "${script}" "${qemu_pid}")
+        total=$(pinning_thread_total "${output}")
+        if (( total >= expected )); then
+            echo "${output}"
+            return 0
+        fi
+        # Progress goes to stderr so it never mixes with the echoed result.
+        echo "# Host pinning attempt ${attempt}/${HOST_PINNING_RETRIES}: ${total}/${expected} threads pinned" >&2
+        sleep "${HOST_PINNING_SLEEP}"
+    done
+    # Not converged: still hand back the last sample for the caller's asserts.
+    echo "${output}"
+}
+
+# minmax_diff <int>...
+# Echoes (max - min) for the given non-empty integer list.
+minmax_diff() {
+    local lo=$1 hi=$1 v
+    shift
+    for v in "$@"; do
+        (( v > hi )) && hi=$v
+        (( v < lo )) && lo=$v
+    done
+    echo $((hi - lo))
+}
+
+# get_qemu_cmdline <pid>
+# Reads the QEMU process command line from /proc, replacing null bytes
+# with spaces. Runs directly on the host via sudo.
+get_qemu_cmdline() {
+    sudo cat "/proc/${1}/cmdline" | tr '\0' ' '
+}
+
+# host_has_pgpu returns 0 if at least one node in the cluster has
+# allocatable nvidia.com/pgpu resources, 1 otherwise (including when
+# kubectl fails).
+# The jsonpath query yields one value per node that exposes the
+# resource, separated by spaces, so every value is checked on its own
+# rather than comparing the whole string as a single integer.
+host_has_pgpu() {
+    local raw count
+    local -a counts
+    raw=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/pgpu}' 2>/dev/null) || return 1
+    read -ra counts <<< "${raw}"
+    for count in "${counts[@]}"; do
+        if [[ "${count}" =~ ^[0-9]+$ ]] && (( 10#${count} > 0 )); then
+            return 0
+        fi
+    done
+    return 1
+}
+
+# gpu_numa_skip_reason extends numa_skip_reason with a check for GPU
+# availability. Like numa_skip_reason, it prints the reason on stdout
+# and prints nothing when the test should run.
+gpu_numa_skip_reason() {
+    local reason
+    reason=$(numa_skip_reason)
+    if [[ -n "${reason}" ]]; then
+        echo "${reason}"
+        return 0
+    fi
+    if ! host_has_pgpu; then
+        echo "No nvidia.com/pgpu resources available on the cluster"
+    fi
+}
+
+# -----------------------------------------------------------------------------
+# Explicit numa_mapping config helpers (drop-in based)
+# -----------------------------------------------------------------------------
+#
+# Both kata-runtime (Go) and runtime-rs (Rust) read TOML fragments from a
+# `config.d/` directory next to the active configuration-<shim>.toml file
+# and merge them into the loaded config on every sandbox start. These
+# helpers drop in a single override fragment so the main config file is
+# never edited — teardown just deletes the fragment.
+#
+# WARNING: must run on the k8s node (sudo required) and patch/restore must
+# be paired — a leaked drop-in would silently affect every subsequent pod
+# on the same node.
+
+# kata_runtime_config_dir echoes the per-shim runtime config directory
+# (the one that holds configuration-<shim>.toml and config.d/). Handles
+# both the Go layout (.../runtimes/<shim>) and the runtime-rs layout
+# (.../runtime-rs/runtimes/<shim>) by probing the filesystem rather than
+# parsing the shim name (some Rust shims like `dragonball` lack the
+# `-runtime-rs` suffix). The runtime-rs directory wins when both exist;
+# dies when neither does.
+kata_runtime_config_dir() {
+    local base="/opt/kata/share/defaults/kata-containers"
+    local rs_dir="${base}/runtime-rs/runtimes/${KATA_HYPERVISOR}"
+    local go_dir="${base}/runtimes/${KATA_HYPERVISOR}"
+    if [[ -d "${rs_dir}" ]]; then
+        echo "${rs_dir}"
+    elif [[ -d "${go_dir}" ]]; then
+        echo "${go_dir}"
+    else
+        die "no Kata runtime config dir for ${KATA_HYPERVISOR} (looked in ${rs_dir} and ${go_dir})"
+    fi
+}
+
+# kata_hypervisor_section echoes the [hypervisor.X] header from the active
+# config so the drop-in fragment targets the right table. Discovering it
+# at runtime keeps us hypervisor-agnostic (qemu / clh / firecracker / ...).
+kata_hypervisor_section() { + local dir + dir=$(kata_runtime_config_dir) + local cfg="${dir}/configuration-${KATA_HYPERVISOR}.toml" + [[ -f "${cfg}" ]] || die "Kata config not found at ${cfg}" + local section + section=$(sudo grep -oE '^\[hypervisor\.[a-z0-9_-]+\]' "${cfg}" | head -1) + [[ -n "${section}" ]] || die "no [hypervisor.X] section in ${cfg}" + echo "${section}" +} + +# patch_kata_numa_mapping +# Writes a config.d/ drop-in that sets numa_mapping = under +# the active hypervisor section. Example values: '["1"]', '["0-1","2-3"]'. +# Records the file path in KATA_NUMA_DROPIN_PATH so teardown() can remove +# it. No restart needed — the next sandbox start picks it up. +patch_kata_numa_mapping() { + local value="${1}" + local dir section + dir=$(kata_runtime_config_dir) + section=$(kata_hypervisor_section) + + KATA_NUMA_DROPIN_PATH="${dir}/config.d/99-numa-test.toml" + export KATA_NUMA_DROPIN_PATH + + sudo mkdir -p "${dir}/config.d" + sudo tee "${KATA_NUMA_DROPIN_PATH}" >/dev/null < +# Returns the host PCI BDF of the first vfio-pci device passed through. +# E.g. "vfio-pci,host=0000:41:00.0,..." -> "0000:41:00.0". +extract_vfio_host_bdf() { + echo "${1}" | grep -oP 'vfio-pci,host=\K[0-9a-fA-F:.]+' | head -1 +} + +# host_gpu_numa +# Returns the NUMA node ID of a host PCI device from sysfs. +# Reads /sys/bus/pci/devices//numa_node on the host (via sudo +# since the bats runner may not have read access by default). +host_gpu_numa() { + sudo cat "/sys/bus/pci/devices/${1}/numa_node" +} + +# ----------------------------------------------------------------------------- +# Tests +# ----------------------------------------------------------------------------- + +@test "NUMA: guest topology and host pinning are balanced" { + # Skip checks must live inside @test (not setup) to avoid bats + # "Executed 0 instead of expected 1 tests" warnings. 
+ local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local host_nodes + host_nodes=$(host_numa_node_count) + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_LARGE}" "${NUMA_TEST_MEMORY_LARGE}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest topology matches host --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}" + [[ "${guest_count}" -eq "${host_nodes}" ]] \ + || die "guest NUMA node count (${guest_count}) != host (${host_nodes})" + + # --- Guest vCPU balance --- + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -ge 2 ]] \ + || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}" + local diff + diff=$(minmax_diff "${guest_cpus[@]}") + echo "# Guest vCPU balance diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "guest vCPU imbalance: ${guest_cpus[*]}" + + # --- Guest memory presence per node --- + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes" + + # --- Host-side vCPU pinning balance --- + local qemu_pid host_output + qemu_pid=$(get_qemu_pid_for_numa_pod) + echo "# QEMU PID: ${qemu_pid}" + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_LARGE}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -ge 2 ]] \ + || die "expected >= 2 host NUMA buckets, got ${#host_counts[@]}: ${host_output}" + diff=$(minmax_diff "${host_counts[@]}") + echo "# Host pinning diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "host 
pinning imbalance: ${host_output}" +} + +@test "NUMA: small workload right-sizes to a single guest NUMA node" { + # When the sandbox CPU + memory budget fits comfortably on a single + # host NUMA node and no explicit numa_mapping is provided, the + # runtime should collapse the auto-derived multi-node topology to a + # single node to preserve memory locality. This test exercises + # selectNUMANodes()'s right-sizing path on a multi-NUMA host: + # 1. The guest sees exactly one NUMA node with all vCPUs in it. + # 2. The host-side QEMU vCPU threads are all pinned to that one + # host NUMA node (delivered by checkVCPUsPinningNUMA, which + # handles single-node sandboxes too). + local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest topology collapsed to a single node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "right-sized sandbox should expose 1 NUMA node, got ${guest_count}" + + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -eq 1 ]] \ + || die "expected 1 guest NUMA bucket, got ${#guest_cpus[@]}: ${guest_cpus[*]}" + # The runtime may add a default vCPU on top of the workload request, + # so the guest can see slightly more than the pod spec asked for. 
+ [[ "${guest_cpus[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \ + || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPUs on the single node, got ${guest_cpus[0]}" + + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -eq 1 ]] \ + || die "expected 1 guest memory node, got ${#guest_mem[@]}" + + # --- Host-side vCPU pinning collapsed to a single node --- + local qemu_pid host_output + qemu_pid=$(get_qemu_pid_for_numa_pod) + echo "# QEMU PID: ${qemu_pid}" + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "right-sized sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + [[ "${host_counts[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \ + || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPU threads pinned, got ${host_counts[0]}: ${host_output}" +} + +@test "NUMA: GPU passthrough with VFIO has correct NUMA placement" { + local skip_reason + skip_reason=$(gpu_numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local host_nodes + host_nodes=$(host_numa_node_count) + + local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in" + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + + POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU}" \ + NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU}" \ + envsubst < "${gpu_yaml_in}" > "${gpu_yaml}" + auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}" + + kubectl apply -f "${gpu_yaml}" + kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}" + sleep 2 + + local guest_logs + guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}") + echo "# GPU pod guest NUMA output:" + echo "# 
${guest_logs}" + + # --- Guest NUMA topology matches host --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}" + [[ "${guest_count}" -eq "${host_nodes}" ]] \ + || die "GPU pod guest NUMA node count (${guest_count}) != host (${host_nodes})" + + # --- Guest vCPU balance --- + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -ge 2 ]] \ + || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}" + local diff + diff=$(minmax_diff "${guest_cpus[@]}") + echo "# Guest vCPU balance diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "GPU pod guest vCPU imbalance: ${guest_cpus[*]}" + + # --- Guest memory presence per node --- + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes" + + # --- Host-side QEMU lookup (needed for the GPU NUMA assertion) --- + local sandbox_id qemu_pid qemu_cmd host_bdf host_node + sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \ + pods --name "${POD_NAME_NUMA_GPU}" -q | head -1) + [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod" + + qemu_pid=$(sudo pgrep -f "qemu.*${sandbox_id}" | head -1) + [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}" + echo "# QEMU PID: ${qemu_pid}" + + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}") + [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline" + host_node=$(host_gpu_numa "${host_bdf}") + echo "# Host GPU ${host_bdf} on NUMA node ${host_node}" + + # --- Guest GPU NUMA affinity --- + # With pxb-pcie and default numa_mapping (1:1), the guest GPU's NUMA + # node must 
equal the host GPU's NUMA node. + mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+') + echo "# Guest GPU NUMA nodes: ${gpu_numas[*]}" + [[ ${#gpu_numas[@]} -ge 1 ]] \ + || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)" + for gn in "${gpu_numas[@]}"; do + [[ "${gn}" -eq "${host_node}" ]] \ + || die "guest GPU on node ${gn} but host GPU ${host_bdf} is on node ${host_node}" + done + + # --- Host-side vCPU pinning balance --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -ge 2 ]] \ + || die "expected >= 2 host NUMA buckets for GPU pod, got ${#host_counts[@]}: ${host_output}" + diff=$(minmax_diff "${host_counts[@]}") + echo "# Host pinning diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "GPU pod host pinning imbalance: ${host_output}" + + # --- QEMU command line: pxb-pcie and NUMA binding --- + echo "# Checking QEMU cmdline for pxb-pcie..." + [[ "${qemu_cmd}" == *"pxb-pcie"* ]] \ + || die "QEMU command line does not contain 'pxb-pcie' — NUMA PCIe topology not active" + + echo "# Checking QEMU cmdline for NUMA memory binding..." + [[ "${qemu_cmd}" == *"policy=bind"* ]] \ + || die "QEMU command line does not contain 'policy=bind' — NUMA memory binding not active" +} + +@test "NUMA: small GPU workload right-sizes to the GPU's host NUMA node" { + # When a GPU is attached and the sandbox CPU + memory budget fits on + # a single host NUMA node, the runtime's right-sizing path + # (selectNUMANodes with VFIO awareness) should collapse the topology + # to the GPU's host NUMA node — not just any fitting node — so that + # GPU and memory access stay NUMA-local. 
+ local skip_reason + skip_reason=$(gpu_numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in" + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + + POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU_SMALL}" \ + NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU_SMALL}" \ + envsubst < "${gpu_yaml_in}" > "${gpu_yaml}" + auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}" + + kubectl apply -f "${gpu_yaml}" + kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}" + sleep 2 + + local guest_logs + guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}") + echo "# Small GPU pod guest NUMA output:" + echo "# ${guest_logs}" + + # --- Host-side QEMU lookup --- + local sandbox_id qemu_pid qemu_cmd host_bdf host_node + sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \ + pods --name "${POD_NAME_NUMA_GPU}" -q | head -1) + [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod" + + qemu_pid=$(sudo pgrep -f "qemu.*${sandbox_id}" | head -1) + [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}" + + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}") + [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline" + host_node=$(host_gpu_numa "${host_bdf}") + echo "# Host GPU ${host_bdf} on NUMA node ${host_node}" + + # --- Guest collapsed to a single NUMA node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "right-sized GPU sandbox should expose 1 NUMA node, got ${guest_count}" + + # --- Guest GPU sees the (single) node --- + mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+') + echo "# 
Guest GPU NUMA nodes: ${gpu_numas[*]}" + [[ ${#gpu_numas[@]} -ge 1 ]] \ + || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)" + # In a single-node guest, the GPU is on node 0. + for gn in "${gpu_numas[@]}"; do + [[ "${gn}" -eq 0 ]] \ + || die "guest GPU on node ${gn} but right-sized sandbox has only node 0" + done + + # --- QEMU memory backend bound to the GPU's host NUMA node --- + # The right-sizing path should pick the GPU's host node, not just + # any node that fits. With pxb-pcie + right-sizing, the single + # memory-backend-ram for the sandbox must have host-nodes=${host_node}. + echo "# Checking QEMU cmdline for memory binding on host node ${host_node}..." + [[ "${qemu_cmd}" == *"host-nodes=${host_node}"* ]] \ + || die "right-sized GPU sandbox memory not bound to GPU's host NUMA node ${host_node}: cmdline=${qemu_cmd}" + + # --- Host-side vCPU pinning collapsed to the GPU's host node --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "right-sized GPU sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + + local pinned_node + pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1) + [[ "${pinned_node}" -eq "${host_node}" ]] \ + || die "right-sized GPU sandbox vCPUs pinned to node ${pinned_node} but GPU is on host node ${host_node}" +} + +@test "NUMA: explicit numa_mapping in TOML pins the sandbox to the chosen host node" { + # When the user sets numa_mapping = ["1"] in the runtime TOML, the + # right-sizing path must be skipped (maybeRightSizeAutoNUMA bails out + # for non-empty NUMAMapping) and buildNUMATopology must propagate the + # binding verbatim, regardless of how small the workload is. 
+ # + # Verifies end-to-end that: + # - guest sees exactly 1 NUMA node; + # - the QEMU memory backend is bound to host node 1 (not 0); + # - host-side vCPU threads land on host node 1. + # + # QEMU-only: this test asserts on the QEMU command line (host-nodes=, + # policy=bind) and on the kata-runtime (Go) NUMA logic. runtime-rs + # does not yet implement NUMA, so even if numa_skip_reason were + # widened later we'd still want to gate this case explicitly. + [[ "${KATA_HYPERVISOR}" == qemu-* ]] \ + || skip "explicit numa_mapping test is QEMU-only (got ${KATA_HYPERVISOR})" + + local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + # Need at least 2 host nodes so "host node 1" is a non-trivial pick. + local host_nodes + host_nodes=$(host_numa_node_count) + [[ "${host_nodes}" -ge 2 ]] || skip "explicit-mapping test needs >=2 host NUMA nodes" + + # Patch the active runtime config; teardown() restores it. + patch_kata_numa_mapping '["1"]' + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest: explicit mapping always yields exactly one node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "explicit numa_mapping=[1] should expose 1 guest NUMA node, got ${guest_count}" + + # --- QEMU memory backend bound to host node 1 --- + local qemu_pid qemu_cmd + qemu_pid=$(get_qemu_pid_for_numa_pod) + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + echo "# Checking QEMU cmdline for memory binding on host node 1..." 
+ [[ "${qemu_cmd}" == *"host-nodes=1"* ]] \ + || die "explicit numa_mapping=[1] did not pin QEMU memory to host node 1: cmdline=${qemu_cmd}" + [[ "${qemu_cmd}" == *"policy=bind"* ]] \ + || die "explicit numa_mapping=[1] missing policy=bind in QEMU cmdline: cmdline=${qemu_cmd}" + + # --- Host-side vCPU pinning lands on host node 1 --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "explicit numa_mapping=[1] should pin vCPUs to a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + + local pinned_node + pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1) + [[ "${pinned_node}" -eq 1 ]] \ + || die "explicit numa_mapping=[1] pinned vCPUs to node ${pinned_node}, expected 1" +} + +teardown() { + echo "=== NUMA test pod describe ===" + kubectl describe pod "${POD_NAME_NUMA}" || true + kubectl describe pod "${POD_NAME_NUMA_GPU}" 2>/dev/null || true + + echo "=== NUMA test pod logs ===" + kubectl logs "${POD_NAME_NUMA}" || true + kubectl logs "${POD_NAME_NUMA_GPU}" 2>/dev/null || true + + # Always restore the Kata config (no-op if no patch was applied). 
+ restore_kata_numa_mapping || true + + delete_tmp_policy_settings_dir "${policy_settings_dir}" + + [ -f "${pod_yaml}" ] && kubectl delete -f "${pod_yaml}" --ignore-not-found=true + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + [ -f "${gpu_yaml}" ] && kubectl delete -f "${gpu_yaml}" --ignore-not-found=true + + print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}" +} diff --git a/tests/integration/kubernetes/numa-pinning-check.sh b/tests/integration/kubernetes/numa-pinning-check.sh new file mode 100755 index 0000000000..ead2476e6a --- /dev/null +++ b/tests/integration/kubernetes/numa-pinning-check.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# WARNING: This script runs directly on the host, NOT inside a container. +# It requires privileged access to /proc and /sys to inspect QEMU vCPU +# thread affinities and map them to host NUMA nodes. +# +# Usage: numa-pinning-check.sh QEMU_PID +# +# Output: one line per NUMA node with the count of pinned vCPU threads. +# node0: 32 +# node1: 32 +# +# A vCPU thread is counted only when taskset reports it pinned to a single +# CPU (bare number, no ranges or commas). Threads with broad affinity +# masks are silently skipped — the caller is expected to retry until the +# runtime has finished per-vCPU pinning. + +set -o pipefail + +QEMU_PID="${1:?Usage: $0 }" + +if [[ !
-d "/proc/${QEMU_PID}/task" ]]; then + echo "ERROR: /proc/${QEMU_PID}/task not found" >&2 + exit 1 +fi + +for tid in "/proc/${QEMU_PID}/task/"*; do + tid="${tid##*/}" + list=$(taskset -pc "${tid}" 2>/dev/null | sed 's/.*: //') + if [[ "${list}" =~ ^[0-9]+$ ]]; then + # Map the CPU to its NUMA node via the sysfs topology symlink + for node_link in "/sys/devices/system/cpu/cpu${list}/node"*; do + if [[ -d "${node_link}" ]]; then + numa_node="${node_link##*node}" + echo "node${numa_node}" + break + fi + done + fi +done | sort | uniq -c | awk '{print $2 ": " $1}' diff --git a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh index eda7934858..901b97779b 100644 --- a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh @@ -72,6 +72,7 @@ if [[ -n "${K8S_TEST_NV:-}" ]]; then mapfile -d " " -t K8S_TEST_NV <<< "${K8S_TEST_NV}" else K8S_TEST_NV=("k8s-confidential-attestation.bats" \ + "k8s-nvidia-numa.bats" \ "k8s-nvidia-cuda.bats" \ "k8s-nvidia-nim.bats" \ "k8s-nvidia-nim-service.bats") diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in new file mode 100644 index 0000000000..7167fa271c --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in @@ -0,0 +1,24 @@ +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_NUMA} + labels: + app: ${POD_NAME_NUMA} +spec: + runtimeClassName: kata + containers: + - name: numa-check + image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb" + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "${NUMA_TEST_VCPUS}" + memory: "${NUMA_TEST_MEMORY}" + nvidia.com/pgpu: "1" diff 
--git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in new file mode 100644 index 0000000000..731e75a32d --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in @@ -0,0 +1,23 @@ +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_NUMA} + labels: + app: ${POD_NAME_NUMA} +spec: + runtimeClassName: kata + containers: + - name: numa-check + image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb" + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "${NUMA_TEST_VCPUS}" + memory: "${NUMA_TEST_MEMORY}" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile new file mode 100644 index 0000000000..7e9f541ae8 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile @@ -0,0 +1,17 @@ +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Minimal image that reads guest NUMA topology from sysfs. +# Multi-arch: linux/amd64, linux/arm64 +# +# Build & push: +# docker buildx build --platform linux/amd64,linux/arm64 \ +# -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push . 
+ +FROM alpine:3.23 + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md new file mode 100644 index 0000000000..8bed127cb8 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md @@ -0,0 +1,36 @@ +# NUMA Topology Check Container + +Minimal container image that reads guest NUMA topology from sysfs and +prints structured output to stdout. Used by `k8s-nvidia-numa.bats` to +verify guest NUMA node count, vCPU distribution, and memory layout +without needing `kubectl exec` (which requires CoCo policy overrides). + +## Image + +`quay.io/kata-containers/numa:YYYY-MM-DD` + +## Build and push (multi-arch) + +```bash +cd tests/integration/kubernetes/runtimeclass_workloads/numa/ + +docker buildx build --platform linux/amd64,linux/arm64 \ + -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push . +``` + +After pushing, update the image reference (including digest) in both +`numa-topology-test.yaml.in` and `numa-topology-gpu-test.yaml.in`. + +## Output format + +The entrypoint prints one `key: value` pair per line (plus one `gpu_BDF_numa: NODE` line per PCI class `0x030200` GPU, when present): + +``` +numa_online: 0-1 +node0_cpus: 32 +node1_cpus: 32 +node0_mem_kb: 37078332 +node1_mem_kb: 37125524 +``` + +The bats test parses this output from `kubectl logs`. diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh new file mode 100755 index 0000000000..1a8f970305 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh @@ -0,0 +1,73 @@ +#!/bin/sh +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Reads guest NUMA topology from sysfs and prints structured output. +# Designed to run inside a kata VM as the container entrypoint.
+# +# Output format (one key: value per line): +# numa_online: 0-1 +# node0_cpus: 32 +# node1_cpus: 32 +# node0_mem_kb: 37078332 +# node1_mem_kb: 37125524 +# gpu_0000:41:00.0_numa: 1 (only if GPUs are present) + +set -e + +# Print results to stdout (readable via "kubectl logs"), then sleep to +# keep the pod alive so the host-side pinning check can inspect the +# QEMU process. The bats test deletes the pod when done. + +# NUMA nodes online (e.g. "0-1" or "0") +online=$(cat /sys/devices/system/node/online) +echo "numa_online: ${online}" + +# Per-node vCPU count +for cpulist in /sys/devices/system/node/node*/cpulist; do + node_name=$(basename "$(dirname "${cpulist}")") + cpus=$(cat "${cpulist}") + count=0 + # Parse comma-separated ranges like "0-31,64-95" + IFS="," + for range in ${cpus}; do + case "${range}" in + *-*) + lo=${range%-*} + hi=${range#*-} + count=$((count + hi - lo + 1)) + ;; + *) + count=$((count + 1)) + ;; + esac + done + unset IFS + echo "${node_name}_cpus: ${count}" +done + +# Per-node memory +for meminfo in /sys/devices/system/node/node*/meminfo; do + node_name=$(basename "$(dirname "${meminfo}")") + mem_kb=$(awk '/MemTotal/ {print $4}' "${meminfo}") + echo "${node_name}_mem_kb: ${mem_kb}" +done + +# GPU NUMA affinity (if any GPUs are present via VFIO passthrough). +# PCI class 0x030200 = 3D controller (NVIDIA data center GPUs: A100, H100, etc.) +for numa_file in /sys/bus/pci/devices/*/numa_node; do + dev_dir=$(dirname "${numa_file}") + class=$(cat "${dev_dir}/class" 2>/dev/null) || continue + case "${class}" in + 0x030200) + bdf=$(basename "${dev_dir}") + node=$(cat "${numa_file}") + echo "gpu_${bdf}_numa: ${node}" + ;; + esac +done + +# Keep the pod alive for host-side pinning verification. +exec sleep infinity