From 1cbe930fc9897f61b70a4e88eebdafb96b3c8f00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?=
Date: Fri, 15 May 2026 17:36:59 +0200
Subject: [PATCH] runtime: Add pxb-pcie NUMA-aware PCIe topology for VFIO
 devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When NUMA placement is active and VFIO devices are cold-plugged, create
a pxb-pcie (PCIe Expander Bridge) for each NUMA node that has such
devices. Each pxb-pcie carries a numa_node property that gives the guest
kernel correct NUMA affinity for all PCI devices beneath it.

Root ports are created on each pxb-pcie bus instead of pcie.0, and
VFIODevice.Attach() assigns each device to the root port on its host
NUMA node's pxb bridge. Non-VFIO devices remain on pcie.0.

NUMA placement is "active" when there is more than one guest NUMA node
OR a single guest node mapped to a specific host node (the latter
happens, for example, when maybeRightSizeAutoNUMA() collapses a
multi-node sandbox to the GPU's host NUMA node). In both cases
buildNUMATopology() also emits the matching
memory-backend-ram,host-nodes=,policy=bind entries so guest memory is
sourced from the right host node.

To ensure a pxb-pcie bus can never be picked as the default bus for a
leaf virtio-pci device, every virtio-pci device emitter (NetDevice,
VSOCK, vhost-user-{net,scsi,blk,fs}) now appends bus=pcie.0 explicitly
when the machine actually exposes a pcie.0 root. Detection is done via a
new hasPCIeRoot() helper that returns true only for machine types whose
name starts with q35 or virt — ppc64le's pseries (pci.0), s390x's
s390-ccw-virtio (CCW transport) and microvm (no PCI) intentionally skip
the pin to avoid "Bus 'pcie.0' not found" at startup.

pxb-pcie is the only QEMU mechanism that works for both regular and
confidential (TDX/SNP) guests, as it operates through the PCI bus
hierarchy rather than ACPI table injection.

Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/device/config/config.go | 11 ++ src/runtime/pkg/device/drivers/vfio.go | 16 ++- src/runtime/pkg/device/manager/manager.go | 2 + src/runtime/pkg/govmm/qemu/qemu.go | 84 +++++++++++-- .../pkg/govmm/qemu/qemu_arch_base_test.go | 7 +- src/runtime/pkg/govmm/qemu/qemu_test.go | 36 +++++- src/runtime/virtcontainers/qemu.go | 119 +++++++++++++++++- src/runtime/virtcontainers/qemu_test.go | 102 +++++++++++++++ 8 files changed, 356 insertions(+), 21 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index f41263975b..489cf5e4dc 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -235,6 +235,17 @@ var ( // different types of PCI ports. We can deduces the Bus number from it // and eliminate duplicates being assigned. PCIeDevicesPerPort = map[PCIePort][]VFIODev{} + + // NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie + // bridges. When NUMA-aware PCIe topology is active (pxb-pcie), + // createPCIeTopology populates this so VFIODevice.Attach() can assign + // each device to the root port on its host NUMA node's pxb-pcie bus. + // Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb. + NUMARootPorts = map[int][]string{} + + // NUMARootPortDeviceCount tracks how many devices have been assigned + // to each host NUMA node's root ports (for round-robin assignment). + NUMARootPortDeviceCount = map[int]int{} ) // DeviceInfo is an embedded type that contains device data common to all types of devices. 
diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 42d86e2dca..ff70c4ac76 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } if vfio.IsPCIe { - busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) - // We need to keep track the number of devices per port to deduce - // the corectu bus number, additionally we can use the VFIO device - // info to act upon different Vendor IDs and Device IDs. + // When pxb-pcie NUMA topology is active, assign the device + // to a root port on the pxb-pcie bridge for its host NUMA + // node instead of the default rp/swdp numbering. + if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 { + idx := config.NUMARootPortDeviceCount[vfio.NUMANode] + vfio.Bus = rpIDs[idx%len(rpIDs)] + config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1 + } else { + busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + } config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio) } } diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 06f9117676..5726613e3a 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0) + config.NUMARootPorts = make(map[int][]string) + config.NUMARootPortDeviceCount = make(map[int]int) for _, dev := range 
devices { dm.devices[dev.DeviceID()] = dev diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index dc7501c87b..9dca1e959e 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -50,6 +50,20 @@ const ( qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket" ) +// hasPCIeRoot reports whether the configured QEMU machine type exposes a +// `pcie.0` root complex (q35 on x86, virt on arm64). Machines such as +// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport) +// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting +// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU. +// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie. +func hasPCIeRoot(config *Config) bool { + if config == nil { + return false + } + t := config.Machine.Type + return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt") +} + const ( // Well known vsock CID for host system. // https://man7.org/linux/man-pages/man7/vsock.7.html @@ -132,6 +146,10 @@ const ( // VHostVSockPCI is a generic Vsock vhost device with PCI transport. VHostVSockPCI DeviceDriver = "vhost-vsock-pci" + // PXBPCIe is a PCIe Expander Bridge that creates a new PCI root + // complex with NUMA node affinity. + PXBPCIe DeviceDriver = "pxb-pcie" + // PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port. PCIeRootPort DeviceDriver = "pcie-root-port" @@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string { if netdev.Bus != "" { deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus)) + } else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device as the default bus. Skipped on machines + // without a `pcie.0` root (pseries, microvm, s390-ccw-virtio). 
+ deviceParams = append(deviceParams, "bus=pcie.0") } if netdev.Addr != "" { @@ -1587,9 +1610,11 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address)) if vhostuserDev.Transport.isVirtioPCI(config) { - // Pin to pcie.0 so pxb-pcie (when present) doesn't capture - // this leaf device as the default bus. - deviceParams = append(deviceParams, "bus=pcie.0") + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1618,7 +1643,9 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1646,7 +1673,9 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1686,7 +1715,9 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } if vhostuserDev.Transport.isVirtioPCI(config) { - 
deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1752,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string { } } +// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie). +// It creates a new PCI root complex with NUMA node affinity, allowing +// devices attached to its bus hierarchy to inherit the NUMA association. +// This is the only QEMU PCI device that carries a numa_node property. +type PXBPCIeDevice struct { + // ID is the QEMU device identifier (e.g. "pxb-numa0"). + ID string + + // BusNr is the guest PCI bus number for this root complex. + // Use values spaced apart (e.g. 0x20, 0x40) to leave room for + // bridges beneath each pxb-pcie. + BusNr uint8 + + // NUMANode is the guest NUMA node index this root complex belongs to. + NUMANode int +} + +// QemuParams returns the QEMU parameters for a pxb-pcie device. +func (dev PXBPCIeDevice) QemuParams(_ *Config) []string { + return []string{ + "-device", + fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode), + } +} + +// Valid returns true if the PXBPCIeDevice structure is valid and complete. +func (dev PXBPCIeDevice) Valid() bool { + return dev.ID != "" +} + // PCIeRootPortDevice represents a memory balloon device. 
// nolint: govet type PCIeRootPortDevice struct { @@ -2324,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID)) deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID)) - if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + if vsock.Transport.isVirtioPCI(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vsock.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + } } if vsock.Transport.isVirtioCCW(config) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a15e77c184..36e03254ae 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserBlk, ROMFile: romfile, } - testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t) + // vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt. 
+ testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t) vhostuserSCSIDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserSCSI, ROMFile: romfile, } - testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) + testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) vhostuserNetDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserNet, ROMFile: romfile, } - testAppend(vhostuserNetDevice, deviceVhostUserNetString, t) + testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t) } func TestAppendVirtioBalloon(t *testing.T) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 8be4d0d779..e4616a8231 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -24,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) { testConfigAppend(&config, structure, expected, t) } +// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so +// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves +// pinned to bus=pcie.0) take the PCIe path. Use this for tests whose +// expected string contains "bus=pcie.0". +func testAppendQ35(structure interface{}, expected string, t *testing.T) { + config := Config{Machine: Machine{Type: "q35"}} + testConfigAppend(&config, structure, expected, t) +} + func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) { switch s := structure.(type) { case Machine: @@ -343,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) { vsockDevice.DevNo = DevNo } - testAppend(vsockDevice, deviceVSOCKString, t) + // deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines. 
+ testAppendQ35(vsockDevice, deviceVSOCKString, t) +} + +// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0` +// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT +// emit `bus=pcie.0` — doing so would crash QEMU with +// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly +// rather than using the arch-conditional `romfile` constant (which is +// "" on s390x via qemu_s390x_test.go), so the test exercises the +// same code path on every architecture. +func TestAppendVSOCKNoPCIeRoot(t *testing.T) { + const vsockRomfile = "efi-virtio.rom" + vsockDevice := VSOCKDevice{ + ID: "vhost-vsock-pci0", + ContextID: 4, + VHostFD: nil, + DisableModern: true, + ROMFile: vsockRomfile, + Transport: TransportPCI, + } + + // pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted. + expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile + config := Config{Machine: Machine{Type: "pseries"}} + testConfigAppend(&config, vsockDevice, expected, t) } func TestVSOCKValid(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 173f72b2c1..74818ff5d6 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -565,10 +565,12 @@ func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) { func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) { // q.config.GuestNUMANodes has already been right-sized (when applicable) - // by maybeRightSizeAutoNUMA() at hypervisor setup time, so a length - // of 1 here means "no NUMA topology"; fall through to a flat memdev. + // by maybeRightSizeAutoNUMA() at hypervisor setup time. Empty means + // no NUMA topology; a single node may still carry a HostNodes binding + // (e.g. right-sized to the GPU's NUMA node), in which case we must + // emit it so memory is bound to the correct host node. 
numaNodes := q.config.GuestNUMANodes - if len(numaNodes) <= 1 { + if !numaPlacementActive(numaNodes) { return nil, nil, nil } @@ -1298,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if numOfPluggablePorts > maxPCIeRootPort { return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } + + // When NUMA is active (multi-node OR a single node right-sized to a + // specific host node), create pxb-pcie bridges so cold-plugged VFIO + // devices inherit the correct guest NUMA affinity. + if numaPlacementActive(q.config.GuestNUMANodes) && len(hypervisorConfig.VFIODevices) > 0 { + qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts) + return nil + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts) return nil } @@ -3077,7 +3088,107 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff return memory } -// genericAppendPCIeRootPort appends to devices the given pcie-root-port +// numaPlacementActive reports whether the runtime should emit per-NUMA +// pxb-pcie / memory-binding QEMU args. True when there is more than one +// guest node, OR a single guest node with an explicit HostNodes binding. +// +// The single-node case covers two scenarios that the runtime cannot tell +// apart after right-sizing: +// - a multi-NUMA host whose workload was collapsed to one host node +// (e.g. GPU on host node 0) — pxb-pcie + host-nodes binding are +// required so the guest GPU reports the correct NUMA affinity; +// - a single-NUMA host with `enable_numa=true` — emitting the binding +// is a functional no-op (the only host node is node 0 anyway). +// +// Single node without a HostNodes value (no NUMA mapping at all) falls +// through to the flat memdev path. 
+func numaPlacementActive(nodes []types.GuestNUMANode) bool { + if len(nodes) > 1 { + return true + } + return len(nodes) == 1 && nodes[0].HostNodes != "" +} + +// createNUMAPCIeTopology creates pxb-pcie bridges for NUMA nodes that have +// VFIO devices, then creates root ports on each pxb bus. VFIO devices will +// be assigned to these root ports during Attach() based on their host NUMA +// node, giving the guest kernel correct NUMA affinity for the PCI devices. +func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device { + coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes) + + // Count VFIO devices per host NUMA node. + numaDevCount := make(map[int]int) + for _, dev := range hypervisorConfig.VFIODevices { + hostPath, err := config.GetHostPath(dev, false, "") + if err != nil { + continue + } + dev.HostPath = hostPath + var vfioDevs []*config.VFIODev + if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) { + vfioDevs, _ = drivers.GetDeviceFromVFIODev(dev) + } else { + vfioDevs, _ = drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + } + for _, vd := range vfioDevs { + if vd.NUMANode >= 0 && drivers.IsPCIeDevice(vd.BDF) { + numaDevCount[vd.NUMANode]++ + } + } + } + + if len(numaDevCount) == 0 { + return q.arch.appendPCIeRootPortDevice(devices, totalPorts) + } + + // Create a pxb-pcie + root ports per NUMA node that has devices. 
+ var rpIndex uint32 + const busNrSpacing uint8 = 0x20 + + for hostNode, devCount := range numaDevCount { + guestNode, ok := coveredHostNodes[hostNode] + if !ok { + q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie") + continue + } + + pxbID := fmt.Sprintf("pxb-numa%d", guestNode) + busNr := busNrSpacing * uint8(guestNode+1) + + devices = append(devices, govmmQemu.PXBPCIeDevice{ + ID: pxbID, + BusNr: busNr, + NUMANode: int(guestNode), + }) + + // Create root ports on this pxb bus for the VFIO devices. + var rpIDs []string + for i := 0; i < devCount; i++ { + rpID := fmt.Sprintf("rp-numa%d-%d", guestNode, i) + rpIDs = append(rpIDs, rpID) + devices = append(devices, govmmQemu.PCIeRootPortDevice{ + ID: rpID, + Bus: pxbID, + Chassis: fmt.Sprintf("%d", 10+guestNode), + Slot: fmt.Sprintf("%d", i), + }) + rpIndex++ + } + + config.NUMARootPorts[hostNode] = rpIDs + + q.Logger().WithFields(logrus.Fields{ + "pxb-id": pxbID, + "bus-nr": busNr, + "guest-numa": guestNode, + "host-numa": hostNode, + "root-ports": rpIDs, + }).Info("Created pxb-pcie with root ports for NUMA VFIO placement") + } + + return devices +} + func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device { var ( bus string diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index db494bf365..9fcb8dc1fa 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -1203,7 +1203,33 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) { assert.Equal(100, q.state.HotpluggedMemory) } +func TestNumaPlacementActive(t *testing.T) { + assert := assert.New(t) + cases := []struct { + name string + nodes []types.GuestNUMANode + want bool + }{ + {"empty", nil, false}, + {"single-node-no-binding", []types.GuestNUMANode{{}}, false}, + {"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true}, + {"single-node-host-1", 
[]types.GuestNUMANode{{HostNodes: "1"}}, true}, + {"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true}, + {"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true}, + } + for _, c := range cases { + assert.Equal(c.want, numaPlacementActive(c.nodes), c.name) + } +} + func TestBuildNUMATopologySingleNode(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // A single guest node mapped to a specific host node (e.g. produced + // by maybeRightSizeAutoNUMA() collapsing a multi-node sandbox to the + // GPU's host NUMA node) must still emit a one-node topology so that + // the memory backend gets a host-nodes= binding. assert := assert.New(t) q := &qemu{ config: HypervisorConfig{ @@ -1214,12 +1240,88 @@ func TestBuildNUMATopologySingleNode(t *testing.T) { }, }, } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-3", nodes[0].CPUs) + assert.Equal("1024M", nodes[0].MemSize) + assert.Equal("0", nodes[0].HostNodes) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) +} + +func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) { + // A single guest node without a HostNodes value carries no NUMA + // binding intent; buildNUMATopology() must return nil so that the + // QEMU command line falls through to the flat memdev path. 
+ assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "", HostCPUs: "0-3"}, + }, + }, + } nodes, dists, err := q.buildNUMATopology() assert.NoError(err) assert.Nil(nodes) assert.Nil(dists) } +func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped the only guest node to a non-zero host node + // (e.g. numa_mapping = ["1"]). buildNUMATopology() must propagate + // HostNodes verbatim so the memory backend ends up bound to host + // node 1 rather than the default node 0. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + NUMAMapping: []string{"1"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "1", HostCPUs: "0-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("1", nodes[0].HostNodes) +} + +func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped two guest nodes to disjoint host-node ranges + // (e.g. numa_mapping = ["0-1", "2-3"]). buildNUMATopology() must + // preserve the ranged HostNodes strings on each emitted NUMANode. 
+ assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 8, + MemorySize: 2048, + NUMAMapping: []string{"0-1", "2-3"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-3"}, + {HostNodes: "2-3", HostCPUs: "4-7"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("0-1", nodes[0].HostNodes) + assert.Equal("2-3", nodes[1].HostNodes) +} + func TestBuildNUMATopologyTwoNodes(t *testing.T) { if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)