diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index f41263975b..489cf5e4dc 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -235,6 +235,17 @@ var ( // different types of PCI ports. We can deduces the Bus number from it // and eliminate duplicates being assigned. PCIeDevicesPerPort = map[PCIePort][]VFIODev{} + + // NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie + // bridges. When NUMA-aware PCIe topology is active (pxb-pcie), + // createPCIeTopology populates this so VFIODevice.Attach() can assign + // each device to the root port on its host NUMA node's pxb-pcie bus. + // Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb. + NUMARootPorts = map[int][]string{} + + // NUMARootPortDeviceCount tracks how many devices have been assigned + // to each host NUMA node's root ports (for round-robin assignment). + NUMARootPortDeviceCount = map[int]int{} ) // DeviceInfo is an embedded type that contains device data common to all types of devices. diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 42d86e2dca..ff70c4ac76 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } if vfio.IsPCIe { - busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) - // We need to keep track the number of devices per port to deduce - // the corectu bus number, additionally we can use the VFIO device - // info to act upon different Vendor IDs and Device IDs. + // When pxb-pcie NUMA topology is active, assign the device + // to a root port on the pxb-pcie bridge for its host NUMA + // node instead of the default rp/swdp numbering. 
+ if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 { + idx := config.NUMARootPortDeviceCount[vfio.NUMANode] + vfio.Bus = rpIDs[idx%len(rpIDs)] + config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1 + } else { + busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + } config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio) } } diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 06f9117676..5726613e3a 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0) + config.NUMARootPorts = make(map[int][]string) + config.NUMARootPortDeviceCount = make(map[int]int) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index dc7501c87b..9dca1e959e 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -50,6 +50,20 @@ const ( qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket" ) +// hasPCIeRoot reports whether the configured QEMU machine type exposes a +// `pcie.0` root complex (q35 on x86, virt on arm64). Machines such as +// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport) +// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting +// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU. +// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie. 
+func hasPCIeRoot(config *Config) bool {
+	if config == nil {
+		return false
+	}
+	t := config.Machine.Type // alias ("q35", "virt") or versioned name ("pc-q35-9.0", "virt-9.0")
+	return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "pc-q35") || strings.HasPrefix(t, "virt")
+}
+
 const (
 	// Well known vsock CID for host system.
 	// https://man7.org/linux/man-pages/man7/vsock.7.html
@@ -132,6 +146,10 @@ const (
 	// VHostVSockPCI is a generic Vsock vhost device with PCI transport.
 	VHostVSockPCI DeviceDriver = "vhost-vsock-pci"
 
+	// PXBPCIe is a PCIe Expander Bridge that creates a new PCI root
+	// complex with NUMA node affinity.
+	PXBPCIe DeviceDriver = "pxb-pcie"
+
 	// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
 	PCIeRootPort DeviceDriver = "pcie-root-port"
 
@@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
 
 	if netdev.Bus != "" {
 		deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
+	} else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) {
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device as the default bus. Skipped on machines
+		// without a `pcie.0` root (pseries, microvm, s390-ccw-virtio).
+		deviceParams = append(deviceParams, "bus=pcie.0")
 	}
 
 	if netdev.Addr != "" {
@@ -1587,9 +1610,11 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string {
 	deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address))
 
 	if vhostuserDev.Transport.isVirtioPCI(config) {
-		// Pin to pcie.0 so pxb-pcie (when present) doesn't capture
-		// this leaf device as the default bus.
-		deviceParams = append(deviceParams, "bus=pcie.0")
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device. See hasPCIeRoot() for skipped machines.
+ if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1618,7 +1643,9 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1646,7 +1673,9 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1686,7 +1715,9 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } if vhostuserDev.Transport.isVirtioPCI(config) { - deviceParams = append(deviceParams, "bus=pcie.0") + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } if vhostuserDev.ROMFile != "" { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) } @@ -1752,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string { } } +// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie). +// It creates a new PCI root complex with NUMA node affinity, allowing +// devices attached to its bus hierarchy to inherit the NUMA association. 
+// This is the only QEMU PCI device that carries a numa_node property.
+type PXBPCIeDevice struct {
+	// ID is the QEMU device identifier (e.g. "pxb-numa0").
+	ID string
+
+	// BusNr is the guest PCI bus number for this root complex.
+	// Use values spaced apart (e.g. 0x20, 0x40) to leave room for
+	// bridges beneath each pxb-pcie.
+	BusNr uint8
+
+	// NUMANode is the guest NUMA node index this root complex belongs to.
+	NUMANode int
+}
+
+// QemuParams returns the QEMU parameters for a pxb-pcie device.
+func (dev PXBPCIeDevice) QemuParams(_ *Config) []string {
+	return []string{
+		"-device",
+		fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode),
+	}
+}
+
+// Valid reports whether the device can be emitted: ID set, BusNr non-zero and NUMANode non-negative.
+func (dev PXBPCIeDevice) Valid() bool {
+	return dev.ID != "" && dev.BusNr != 0 && dev.NUMANode >= 0 // bus 0 is the main root bus
+}
+
 // PCIeRootPortDevice represents a memory balloon device.
 // nolint: govet
 type PCIeRootPortDevice struct {
@@ -2324,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string {
 	deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID))
 	deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID))
 
-	if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
+	if vsock.Transport.isVirtioPCI(config) {
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device. See hasPCIeRoot() for skipped machines.
+ if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vsock.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + } } if vsock.Transport.isVirtioCCW(config) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a15e77c184..36e03254ae 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserBlk, ROMFile: romfile, } - testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t) + // vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt. + testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t) vhostuserSCSIDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserSCSI, ROMFile: romfile, } - testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) + testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) vhostuserNetDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserNet, ROMFile: romfile, } - testAppend(vhostuserNetDevice, deviceVhostUserNetString, t) + testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t) } func TestAppendVirtioBalloon(t *testing.T) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 8be4d0d779..e4616a8231 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -24,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) { testConfigAppend(&config, structure, expected, t) } +// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so +// device emitters that gate on 
hasPCIeRoot() (e.g. virtio-pci leaves +// pinned to bus=pcie.0) take the PCIe path. Use this for tests whose +// expected string contains "bus=pcie.0". +func testAppendQ35(structure interface{}, expected string, t *testing.T) { + config := Config{Machine: Machine{Type: "q35"}} + testConfigAppend(&config, structure, expected, t) +} + func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) { switch s := structure.(type) { case Machine: @@ -343,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) { vsockDevice.DevNo = DevNo } - testAppend(vsockDevice, deviceVSOCKString, t) + // deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines. + testAppendQ35(vsockDevice, deviceVSOCKString, t) +} + +// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0` +// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT +// emit `bus=pcie.0` — doing so would crash QEMU with +// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly +// rather than using the arch-conditional `romfile` constant (which is +// "" on s390x via qemu_s390x_test.go), so the test exercises the +// same code path on every architecture. +func TestAppendVSOCKNoPCIeRoot(t *testing.T) { + const vsockRomfile = "efi-virtio.rom" + vsockDevice := VSOCKDevice{ + ID: "vhost-vsock-pci0", + ContextID: 4, + VHostFD: nil, + DisableModern: true, + ROMFile: vsockRomfile, + Transport: TransportPCI, + } + + // pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted. 
+ expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile + config := Config{Machine: Machine{Type: "pseries"}} + testConfigAppend(&config, vsockDevice, expected, t) } func TestVSOCKValid(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 173f72b2c1..74818ff5d6 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -565,10 +565,12 @@ func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) { func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) { // q.config.GuestNUMANodes has already been right-sized (when applicable) - // by maybeRightSizeAutoNUMA() at hypervisor setup time, so a length - // of 1 here means "no NUMA topology"; fall through to a flat memdev. + // by maybeRightSizeAutoNUMA() at hypervisor setup time. Empty means + // no NUMA topology; a single node may still carry a HostNodes binding + // (e.g. right-sized to the GPU's NUMA node), in which case we must + // emit it so memory is bound to the correct host node. numaNodes := q.config.GuestNUMANodes - if len(numaNodes) <= 1 { + if !numaPlacementActive(numaNodes) { return nil, nil, nil } @@ -1298,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if numOfPluggablePorts > maxPCIeRootPort { return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } + + // When NUMA is active (multi-node OR a single node right-sized to a + // specific host node), create pxb-pcie bridges so cold-plugged VFIO + // devices inherit the correct guest NUMA affinity. 
+		if numaPlacementActive(q.config.GuestNUMANodes) && len(hypervisorConfig.VFIODevices) > 0 {
+			qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts)
+			return nil
+		}
+
 		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts)
 		return nil
 	}
@@ -3077,7 +3088,158 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
 	return memory
 }
 
-// genericAppendPCIeRootPort appends to devices the given pcie-root-port
+// numaPlacementActive reports whether the runtime should emit per-NUMA
+// pxb-pcie / memory-binding QEMU args. True when there is more than one
+// guest node, OR a single guest node with an explicit HostNodes binding.
+//
+// The single-node case covers two scenarios that the runtime cannot tell
+// apart after right-sizing:
+//   - a multi-NUMA host whose workload was collapsed to one host node
+//     (e.g. GPU on host node 0) — pxb-pcie + host-nodes binding are
+//     required so the guest GPU reports the correct NUMA affinity;
+//   - a single-NUMA host with `enable_numa=true` — emitting the binding
+//     is a functional no-op (the only host node is node 0 anyway).
+//
+// Single node without a HostNodes value (no NUMA mapping at all) falls
+// through to the flat memdev path.
+func numaPlacementActive(nodes []types.GuestNUMANode) bool {
+	if len(nodes) > 1 {
+		return true
+	}
+	return len(nodes) == 1 && nodes[0].HostNodes != ""
+}
+
+// createNUMAPCIeTopology creates one pxb-pcie bridge per guest NUMA node that
+// has PCIe VFIO devices, plus one root port per such device on the bridge's
+// bus. The root port IDs are stored in config.NUMARootPorts, keyed by host
+// NUMA node, so that Attach() can place each device behind the bridge of its
+// node and the guest kernel sees the correct NUMA affinity.
+//
+// Host nodes are visited in ascending order to keep the generated device
+// list deterministic. Host nodes mapped to the same guest node share a single
+// bridge and get distinct root ports on it. When no device can be placed
+// behind a bridge, the flat layout with totalPorts root ports is returned.
+//
+// NOTE(review): once a bridge exists no flat root ports are created, so a
+// device whose NUMA node is unknown or uncovered has no port to attach to;
+// confirm whether such devices need a fallback port.
+func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device {
+	// bus_nr is 8 bits wide; bridges are spaced busNrSpacing apart to leave
+	// room for the buses beneath each one, which bounds the guest node index.
+	const (
+		busNrSpacing = 0x20
+		maxGuestNode = 0xff/busNrSpacing - 1
+	)
+
+	coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes)
+
+	// Count PCIe VFIO devices per host NUMA node.
+	numaDevCount := make(map[int]int)
+	maxHostNode := -1
+	for _, dev := range hypervisorConfig.VFIODevices {
+		hostPath, err := config.GetHostPath(dev, false, "")
+		if err != nil {
+			q.Logger().WithError(err).Warn("Cannot resolve VFIO host path; device ignored for NUMA placement")
+			continue
+		}
+		dev.HostPath = hostPath
+
+		var vfioDevs []*config.VFIODev
+		if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
+			vfioDevs, err = drivers.GetDeviceFromVFIODev(dev)
+		} else {
+			vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+		}
+		if err != nil {
+			// Best effort: still count whatever was returned.
+			q.Logger().WithError(err).WithField("host-path", hostPath).Warn("Cannot enumerate all VFIO devices for NUMA placement")
+		}
+		for _, vd := range vfioDevs {
+			if vd.NUMANode < 0 || !drivers.IsPCIeDevice(vd.BDF) {
+				continue
+			}
+			numaDevCount[vd.NUMANode]++
+			if vd.NUMANode > maxHostNode {
+				maxHostNode = vd.NUMANode
+			}
+		}
+	}
+
+	if len(numaDevCount) == 0 {
+		return q.arch.appendPCIeRootPortDevice(devices, totalPorts)
+	}
+
+	// nextPort holds, per guest node with a bridge, the next free root port
+	// index on that bridge.
+	nextPort := make(map[int]int)
+	for hostNode := 0; hostNode <= maxHostNode; hostNode++ {
+		devCount, ok := numaDevCount[hostNode]
+		if !ok {
+			continue
+		}
+
+		guestNode, ok := coveredHostNodes[hostNode]
+		if !ok {
+			q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie")
+			continue
+		}
+
+		guest := int(guestNode)
+		if guest < 0 || guest > maxGuestNode {
+			q.Logger().WithFields(logrus.Fields{
+				"guest-numa": guest,
+				"host-numa":  hostNode,
+			}).Warn("Guest NUMA node has no valid pxb-pcie bus number; skipping pxb-pcie")
+			continue
+		}
+
+		pxbID := fmt.Sprintf("pxb-numa%d", guest)
+		busNr := uint8(busNrSpacing * (guest + 1))
+
+		// Emit the bridge only the first time its guest node is seen.
+		firstPort, bridged := nextPort[guest]
+		if !bridged {
+			devices = append(devices, govmmQemu.PXBPCIeDevice{
+				ID:       pxbID,
+				BusNr:    busNr,
+				NUMANode: guest,
+			})
+		}
+
+		// Create one root port on this pxb bus per VFIO device.
+		rpIDs := make([]string, 0, devCount)
+		for i := firstPort; i < firstPort+devCount; i++ {
+			rpID := fmt.Sprintf("rp-numa%d-%d", guest, i)
+			rpIDs = append(rpIDs, rpID)
+			devices = append(devices, govmmQemu.PCIeRootPortDevice{
+				ID:      rpID,
+				Bus:     pxbID,
+				Chassis: fmt.Sprintf("%d", 10+guest),
+				Slot:    fmt.Sprintf("%d", i),
+			})
+		}
+		nextPort[guest] = firstPort + devCount
+		config.NUMARootPorts[hostNode] = rpIDs
+
+		q.Logger().WithFields(logrus.Fields{
+			"pxb-id":     pxbID,
+			"bus-nr":     busNr,
+			"guest-numa": guest,
+			"host-numa":  hostNode,
+			"root-ports": rpIDs,
+		}).Info("Created pxb-pcie with root ports for NUMA VFIO placement")
+	}
+
+	if len(nextPort) == 0 {
+		// Nothing was placed behind a bridge: keep the flat layout so
+		// the default root port naming used by Attach() still resolves.
+		return q.arch.appendPCIeRootPortDevice(devices, totalPorts)
+	}
+
+	return devices
+}
+
+// genericAppendPCIeRootPort appends to devices the given pcie-root-port
 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
 	var (
 		bus string
diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go
index db494bf365..9fcb8dc1fa 100644
--- a/src/runtime/virtcontainers/qemu_test.go
+++ b/src/runtime/virtcontainers/qemu_test.go
@@ -1203,7 +1203,33 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) {
 	assert.Equal(100, q.state.HotpluggedMemory)
 }
 
+func TestNumaPlacementActive(t *testing.T) {
+	assert := assert.New(t)
+	cases := []struct {
+		name  string
+		nodes []types.GuestNUMANode
+		want  bool
+	}{
+		{"empty", nil, false},
+		{"single-node-no-binding", []types.GuestNUMANode{{}}, false},
+		{"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true},
+		{"single-node-host-1", []types.GuestNUMANode{{HostNodes: "1"}}, true},
+		{"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true},
+		{"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true},
+	}
+	for _, c := range cases {
+		assert.Equal(c.want, numaPlacementActive(c.nodes), c.name)
+	}
+}
+
 func TestBuildNUMATopologySingleNode(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	// A single guest node mapped to a specific host node (e.g.
produced + // by maybeRightSizeAutoNUMA() collapsing a multi-node sandbox to the + // GPU's host NUMA node) must still emit a one-node topology so that + // the memory backend gets a host-nodes= binding. assert := assert.New(t) q := &qemu{ config: HypervisorConfig{ @@ -1214,12 +1240,88 @@ func TestBuildNUMATopologySingleNode(t *testing.T) { }, }, } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-3", nodes[0].CPUs) + assert.Equal("1024M", nodes[0].MemSize) + assert.Equal("0", nodes[0].HostNodes) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) +} + +func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) { + // A single guest node without a HostNodes value carries no NUMA + // binding intent; buildNUMATopology() must return nil so that the + // QEMU command line falls through to the flat memdev path. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "", HostCPUs: "0-3"}, + }, + }, + } nodes, dists, err := q.buildNUMATopology() assert.NoError(err) assert.Nil(nodes) assert.Nil(dists) } +func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped the only guest node to a non-zero host node + // (e.g. numa_mapping = ["1"]). buildNUMATopology() must propagate + // HostNodes verbatim so the memory backend ends up bound to host + // node 1 rather than the default node 0. 
+ assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + NUMAMapping: []string{"1"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "1", HostCPUs: "0-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("1", nodes[0].HostNodes) +} + +func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped two guest nodes to disjoint host-node ranges + // (e.g. numa_mapping = ["0-1", "2-3"]). buildNUMATopology() must + // preserve the ranged HostNodes strings on each emitted NUMANode. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 8, + MemorySize: 2048, + NUMAMapping: []string{"0-1", "2-3"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-3"}, + {HostNodes: "2-3", HostCPUs: "4-7"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("0-1", nodes[0].HostNodes) + assert.Equal("2-3", nodes[1].HostNodes) +} + func TestBuildNUMATopologyTwoNodes(t *testing.T) { if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)