mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 22:50:54 +00:00
runtime: Add pxb-pcie NUMA-aware PCIe topology for VFIO devices
When NUMA placement is active and VFIO devices are cold-plugged,
create a pxb-pcie (PCIe Expander Bridge) per NUMA node that has
devices. Each pxb-pcie carries a numa_node property that gives the
guest kernel correct NUMA affinity for all PCI devices beneath it.
Root ports are created on each pxb-pcie bus instead of pcie.0, and
VFIODevice.Attach() assigns each device to the root port on its host
NUMA node's pxb bridge. Non-VFIO devices remain on pcie.0.
NUMA placement is "active" when there is more than one guest NUMA
node OR a single guest node mapped to a specific host node (the
latter happens when maybeRightSizeAutoNUMA() collapses a multi-node
sandbox to the GPU's host NUMA node). In both cases
buildNUMATopology() also emits the matching
memory-backend-ram,host-nodes=,policy=bind entries so guest memory
is sourced from the right host node.
So pxb-pcie can never capture a leaf virtio-pci device as the
default bus, every virtio-pci device emitter (NetDevice, VSOCK,
vhost-user-{net,scsi,blk,fs}) now appends bus=pcie.0 explicitly when
the machine actually exposes a pcie.0 root. Detection is done via a
new hasPCIeRoot() helper that returns true only for q35/virt machine
types — ppc64le's pseries (pci.0), s390x's s390-ccw-virtio (CCW
transport) and microvm (no PCI) intentionally skip the pin to avoid
"Bus 'pcie.0' not found" at startup.
This is the only QEMU mechanism that works for both regular and
confidential (TDX/SNP) guests, as it operates through the PCI bus
hierarchy rather than ACPI table injection.
Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
15292da217
commit
1cbe930fc9
@@ -235,6 +235,17 @@ var (
|
||||
// different types of PCI ports. We can deduce the Bus number from it
|
||||
// and eliminate duplicates being assigned.
|
||||
PCIeDevicesPerPort = map[PCIePort][]VFIODev{}
|
||||
|
||||
// NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie
|
||||
// bridges. When NUMA-aware PCIe topology is active (pxb-pcie),
|
||||
// createPCIeTopology populates this so VFIODevice.Attach() can assign
|
||||
// each device to the root port on its host NUMA node's pxb-pcie bus.
|
||||
// Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb.
|
||||
NUMARootPorts = map[int][]string{}
|
||||
|
||||
// NUMARootPortDeviceCount tracks how many devices have been assigned
|
||||
// to each host NUMA node's root ports (for round-robin assignment).
|
||||
NUMARootPortDeviceCount = map[int]int{}
|
||||
)
|
||||
|
||||
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
||||
|
||||
@@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
|
||||
}
|
||||
|
||||
if vfio.IsPCIe {
|
||||
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
// We need to keep track of the number of devices per port to deduce
|
||||
// the correct bus number; additionally, we can use the VFIO device
|
||||
// info to act upon different Vendor IDs and Device IDs.
|
||||
// When pxb-pcie NUMA topology is active, assign the device
|
||||
// to a root port on the pxb-pcie bridge for its host NUMA
|
||||
// node instead of the default rp/swdp numbering.
|
||||
if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 {
|
||||
idx := config.NUMARootPortDeviceCount[vfio.NUMANode]
|
||||
vfio.Bus = rpIDs[idx%len(rpIDs)]
|
||||
config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1
|
||||
} else {
|
||||
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
}
|
||||
config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
|
||||
config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0)
|
||||
config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0)
|
||||
config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0)
|
||||
config.NUMARootPorts = make(map[int][]string)
|
||||
config.NUMARootPortDeviceCount = make(map[int]int)
|
||||
|
||||
for _, dev := range devices {
|
||||
dm.devices[dev.DeviceID()] = dev
|
||||
|
||||
@@ -50,6 +50,20 @@ const (
|
||||
qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket"
|
||||
)
|
||||
|
||||
// hasPCIeRoot reports whether the configured QEMU machine type exposes a
|
||||
// `pcie.0` root complex (q35 on x86, virt on arm64). Machines such as
|
||||
// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport)
|
||||
// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting
|
||||
// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU.
|
||||
// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie.
|
||||
func hasPCIeRoot(config *Config) bool {
|
||||
if config == nil {
|
||||
return false
|
||||
}
|
||||
t := config.Machine.Type
|
||||
return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt")
|
||||
}
|
||||
|
||||
const (
|
||||
// Well known vsock CID for host system.
|
||||
// https://man7.org/linux/man-pages/man7/vsock.7.html
|
||||
@@ -132,6 +146,10 @@ const (
|
||||
// VHostVSockPCI is a generic Vsock vhost device with PCI transport.
|
||||
VHostVSockPCI DeviceDriver = "vhost-vsock-pci"
|
||||
|
||||
// PXBPCIe is a PCIe Expander Bridge that creates a new PCI root
|
||||
// complex with NUMA node affinity.
|
||||
PXBPCIe DeviceDriver = "pxb-pcie"
|
||||
|
||||
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
|
||||
PCIeRootPort DeviceDriver = "pcie-root-port"
|
||||
|
||||
@@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
|
||||
|
||||
if netdev.Bus != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
|
||||
} else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) {
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device as the default bus. Skipped on machines
|
||||
// without a `pcie.0` root (pseries, microvm, s390-ccw-virtio).
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
|
||||
if netdev.Addr != "" {
|
||||
@@ -1587,9 +1610,11 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
// Pin to pcie.0 so pxb-pcie (when present) doesn't capture
|
||||
// this leaf device as the default bus.
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device. See hasPCIeRoot() for skipped machines.
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
@@ -1618,7 +1643,9 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
@@ -1646,7 +1673,9 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
@@ -1686,7 +1715,9 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo))
|
||||
}
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
@@ -1752,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string {
|
||||
}
|
||||
}
|
||||
|
||||
// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie).
|
||||
// It creates a new PCI root complex with NUMA node affinity, allowing
|
||||
// devices attached to its bus hierarchy to inherit the NUMA association.
|
||||
// This is the only QEMU PCI device that carries a numa_node property.
|
||||
type PXBPCIeDevice struct {
|
||||
// ID is the QEMU device identifier (e.g. "pxb-numa0").
|
||||
ID string
|
||||
|
||||
// BusNr is the guest PCI bus number for this root complex.
|
||||
// Use values spaced apart (e.g. 0x20, 0x40) to leave room for
|
||||
// bridges beneath each pxb-pcie.
|
||||
BusNr uint8
|
||||
|
||||
// NUMANode is the guest NUMA node index this root complex belongs to.
|
||||
NUMANode int
|
||||
}
|
||||
|
||||
// QemuParams returns the QEMU parameters for a pxb-pcie device.
|
||||
func (dev PXBPCIeDevice) QemuParams(_ *Config) []string {
|
||||
return []string{
|
||||
"-device",
|
||||
fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode),
|
||||
}
|
||||
}
|
||||
|
||||
// Valid returns true if the PXBPCIeDevice structure is valid and complete.
|
||||
func (dev PXBPCIeDevice) Valid() bool {
|
||||
return dev.ID != ""
|
||||
}
|
||||
|
||||
// PCIeRootPortDevice represents a PCIe root port device.
|
||||
// nolint: govet
|
||||
type PCIeRootPortDevice struct {
|
||||
@@ -2324,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID))
|
||||
|
||||
if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
|
||||
if vsock.Transport.isVirtioPCI(config) {
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device. See hasPCIeRoot() for skipped machines.
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vsock.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
if vsock.Transport.isVirtioCCW(config) {
|
||||
|
||||
@@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserBlk,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t)
|
||||
// vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt.
|
||||
testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t)
|
||||
|
||||
vhostuserSCSIDevice := VhostUserDevice{
|
||||
SocketPath: "/tmp/nonexistentsocket.socket",
|
||||
@@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserSCSI,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
|
||||
testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
|
||||
|
||||
vhostuserNetDevice := VhostUserDevice{
|
||||
SocketPath: "/tmp/nonexistentsocket.socket",
|
||||
@@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserNet,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserNetDevice, deviceVhostUserNetString, t)
|
||||
testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t)
|
||||
}
|
||||
|
||||
func TestAppendVirtioBalloon(t *testing.T) {
|
||||
|
||||
@@ -24,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) {
|
||||
testConfigAppend(&config, structure, expected, t)
|
||||
}
|
||||
|
||||
// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so
|
||||
// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves
|
||||
// pinned to bus=pcie.0) take the PCIe path. Use this for tests whose
|
||||
// expected string contains "bus=pcie.0".
|
||||
func testAppendQ35(structure interface{}, expected string, t *testing.T) {
|
||||
config := Config{Machine: Machine{Type: "q35"}}
|
||||
testConfigAppend(&config, structure, expected, t)
|
||||
}
|
||||
|
||||
func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) {
|
||||
switch s := structure.(type) {
|
||||
case Machine:
|
||||
@@ -343,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) {
|
||||
vsockDevice.DevNo = DevNo
|
||||
}
|
||||
|
||||
testAppend(vsockDevice, deviceVSOCKString, t)
|
||||
// deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines.
|
||||
testAppendQ35(vsockDevice, deviceVSOCKString, t)
|
||||
}
|
||||
|
||||
// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0`
|
||||
// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT
|
||||
// emit `bus=pcie.0` — doing so would crash QEMU with
|
||||
// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly
|
||||
// rather than using the arch-conditional `romfile` constant (which is
|
||||
// "" on s390x via qemu_s390x_test.go), so the test exercises the
|
||||
// same code path on every architecture.
|
||||
func TestAppendVSOCKNoPCIeRoot(t *testing.T) {
|
||||
const vsockRomfile = "efi-virtio.rom"
|
||||
vsockDevice := VSOCKDevice{
|
||||
ID: "vhost-vsock-pci0",
|
||||
ContextID: 4,
|
||||
VHostFD: nil,
|
||||
DisableModern: true,
|
||||
ROMFile: vsockRomfile,
|
||||
Transport: TransportPCI,
|
||||
}
|
||||
|
||||
// pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted.
|
||||
expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile
|
||||
config := Config{Machine: Machine{Type: "pseries"}}
|
||||
testConfigAppend(&config, vsockDevice, expected, t)
|
||||
}
|
||||
|
||||
func TestVSOCKValid(t *testing.T) {
|
||||
|
||||
@@ -565,10 +565,12 @@ func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) {
|
||||
|
||||
func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) {
|
||||
// q.config.GuestNUMANodes has already been right-sized (when applicable)
|
||||
// by maybeRightSizeAutoNUMA() at hypervisor setup time, so a length
|
||||
// of 1 here means "no NUMA topology"; fall through to a flat memdev.
|
||||
// by maybeRightSizeAutoNUMA() at hypervisor setup time. Empty means
|
||||
// no NUMA topology; a single node may still carry a HostNodes binding
|
||||
// (e.g. right-sized to the GPU's NUMA node), in which case we must
|
||||
// emit it so memory is bound to the correct host node.
|
||||
numaNodes := q.config.GuestNUMANodes
|
||||
if len(numaNodes) <= 1 {
|
||||
if !numaPlacementActive(numaNodes) {
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
@@ -1298,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
|
||||
if numOfPluggablePorts > maxPCIeRootPort {
|
||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||
}
|
||||
|
||||
// When NUMA is active (multi-node OR a single node right-sized to a
|
||||
// specific host node), create pxb-pcie bridges so cold-plugged VFIO
|
||||
// devices inherit the correct guest NUMA affinity.
|
||||
if numaPlacementActive(q.config.GuestNUMANodes) && len(hypervisorConfig.VFIODevices) > 0 {
|
||||
qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts)
|
||||
return nil
|
||||
}
|
||||
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts)
|
||||
return nil
|
||||
}
|
||||
@@ -3077,7 +3088,107 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
|
||||
return memory
|
||||
}
|
||||
|
||||
// genericAppendPCIeRootPort appends to devices the given pcie-root-port
|
||||
// numaPlacementActive reports whether the runtime should emit per-NUMA
|
||||
// pxb-pcie / memory-binding QEMU args. True when there is more than one
|
||||
// guest node, OR a single guest node with an explicit HostNodes binding.
|
||||
//
|
||||
// The single-node case covers two scenarios that the runtime cannot tell
|
||||
// apart after right-sizing:
|
||||
// - a multi-NUMA host whose workload was collapsed to one host node
|
||||
// (e.g. GPU on host node 0) — pxb-pcie + host-nodes binding are
|
||||
// required so the guest GPU reports the correct NUMA affinity;
|
||||
// - a single-NUMA host with `enable_numa=true` — emitting the binding
|
||||
// is a functional no-op (the only host node is node 0 anyway).
|
||||
//
|
||||
// Single node without a HostNodes value (no NUMA mapping at all) falls
|
||||
// through to the flat memdev path.
|
||||
func numaPlacementActive(nodes []types.GuestNUMANode) bool {
|
||||
if len(nodes) > 1 {
|
||||
return true
|
||||
}
|
||||
return len(nodes) == 1 && nodes[0].HostNodes != ""
|
||||
}
|
||||
|
||||
// createNUMAPCIeTopology creates pxb-pcie bridges for NUMA nodes that have
|
||||
// VFIO devices, then creates root ports on each pxb bus. VFIO devices will
|
||||
// be assigned to these root ports during Attach() based on their host NUMA
|
||||
// node, giving the guest kernel correct NUMA affinity for the PCI devices.
|
||||
func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device {
|
||||
coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes)
|
||||
|
||||
// Count VFIO devices per host NUMA node.
|
||||
numaDevCount := make(map[int]int)
|
||||
for _, dev := range hypervisorConfig.VFIODevices {
|
||||
hostPath, err := config.GetHostPath(dev, false, "")
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
dev.HostPath = hostPath
|
||||
var vfioDevs []*config.VFIODev
|
||||
if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
|
||||
vfioDevs, _ = drivers.GetDeviceFromVFIODev(dev)
|
||||
} else {
|
||||
vfioDevs, _ = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
|
||||
}
|
||||
for _, vd := range vfioDevs {
|
||||
if vd.NUMANode >= 0 && drivers.IsPCIeDevice(vd.BDF) {
|
||||
numaDevCount[vd.NUMANode]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(numaDevCount) == 0 {
|
||||
return q.arch.appendPCIeRootPortDevice(devices, totalPorts)
|
||||
}
|
||||
|
||||
// Create a pxb-pcie + root ports per NUMA node that has devices.
|
||||
var rpIndex uint32
|
||||
const busNrSpacing uint8 = 0x20
|
||||
|
||||
for hostNode, devCount := range numaDevCount {
|
||||
guestNode, ok := coveredHostNodes[hostNode]
|
||||
if !ok {
|
||||
q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie")
|
||||
continue
|
||||
}
|
||||
|
||||
pxbID := fmt.Sprintf("pxb-numa%d", guestNode)
|
||||
busNr := busNrSpacing * uint8(guestNode+1)
|
||||
|
||||
devices = append(devices, govmmQemu.PXBPCIeDevice{
|
||||
ID: pxbID,
|
||||
BusNr: busNr,
|
||||
NUMANode: int(guestNode),
|
||||
})
|
||||
|
||||
// Create root ports on this pxb bus for the VFIO devices.
|
||||
var rpIDs []string
|
||||
for i := 0; i < devCount; i++ {
|
||||
rpID := fmt.Sprintf("rp-numa%d-%d", guestNode, i)
|
||||
rpIDs = append(rpIDs, rpID)
|
||||
devices = append(devices, govmmQemu.PCIeRootPortDevice{
|
||||
ID: rpID,
|
||||
Bus: pxbID,
|
||||
Chassis: fmt.Sprintf("%d", 10+guestNode),
|
||||
Slot: fmt.Sprintf("%d", i),
|
||||
})
|
||||
rpIndex++
|
||||
}
|
||||
|
||||
config.NUMARootPorts[hostNode] = rpIDs
|
||||
|
||||
q.Logger().WithFields(logrus.Fields{
|
||||
"pxb-id": pxbID,
|
||||
"bus-nr": busNr,
|
||||
"guest-numa": guestNode,
|
||||
"host-numa": hostNode,
|
||||
"root-ports": rpIDs,
|
||||
}).Info("Created pxb-pcie with root ports for NUMA VFIO placement")
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
|
||||
var (
|
||||
bus string
|
||||
|
||||
@@ -1203,7 +1203,33 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) {
|
||||
assert.Equal(100, q.state.HotpluggedMemory)
|
||||
}
|
||||
|
||||
func TestNumaPlacementActive(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
cases := []struct {
|
||||
name string
|
||||
nodes []types.GuestNUMANode
|
||||
want bool
|
||||
}{
|
||||
{"empty", nil, false},
|
||||
{"single-node-no-binding", []types.GuestNUMANode{{}}, false},
|
||||
{"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true},
|
||||
{"single-node-host-1", []types.GuestNUMANode{{HostNodes: "1"}}, true},
|
||||
{"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true},
|
||||
{"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
assert.Equal(c.want, numaPlacementActive(c.nodes), c.name)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNUMATopologySingleNode(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
// A single guest node mapped to a specific host node (e.g. produced
|
||||
// by maybeRightSizeAutoNUMA() collapsing a multi-node sandbox to the
|
||||
// GPU's host NUMA node) must still emit a one-node topology so that
|
||||
// the memory backend gets a host-nodes= binding.
|
||||
assert := assert.New(t)
|
||||
q := &qemu{
|
||||
config: HypervisorConfig{
|
||||
@@ -1214,12 +1240,88 @@ func TestBuildNUMATopologySingleNode(t *testing.T) {
|
||||
},
|
||||
},
|
||||
}
|
||||
nodes, _, err := q.buildNUMATopology()
|
||||
assert.NoError(err)
|
||||
assert.Len(nodes, 1)
|
||||
assert.Equal(uint32(0), nodes[0].NodeID)
|
||||
assert.Equal("0-3", nodes[0].CPUs)
|
||||
assert.Equal("1024M", nodes[0].MemSize)
|
||||
assert.Equal("0", nodes[0].HostNodes)
|
||||
assert.Equal("memory-backend-ram", nodes[0].MemBackendType)
|
||||
}
|
||||
|
||||
func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) {
|
||||
// A single guest node without a HostNodes value carries no NUMA
|
||||
// binding intent; buildNUMATopology() must return nil so that the
|
||||
// QEMU command line falls through to the flat memdev path.
|
||||
assert := assert.New(t)
|
||||
q := &qemu{
|
||||
config: HypervisorConfig{
|
||||
DefaultMaxVCPUs: 4,
|
||||
MemorySize: 1024,
|
||||
GuestNUMANodes: []types.GuestNUMANode{
|
||||
{HostNodes: "", HostCPUs: "0-3"},
|
||||
},
|
||||
},
|
||||
}
|
||||
nodes, dists, err := q.buildNUMATopology()
|
||||
assert.NoError(err)
|
||||
assert.Nil(nodes)
|
||||
assert.Nil(dists)
|
||||
}
|
||||
|
||||
func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
// User explicitly mapped the only guest node to a non-zero host node
|
||||
// (e.g. numa_mapping = ["1"]). buildNUMATopology() must propagate
|
||||
// HostNodes verbatim so the memory backend ends up bound to host
|
||||
// node 1 rather than the default node 0.
|
||||
assert := assert.New(t)
|
||||
q := &qemu{
|
||||
config: HypervisorConfig{
|
||||
DefaultMaxVCPUs: 4,
|
||||
MemorySize: 1024,
|
||||
NUMAMapping: []string{"1"},
|
||||
GuestNUMANodes: []types.GuestNUMANode{
|
||||
{HostNodes: "1", HostCPUs: "0-3"},
|
||||
},
|
||||
},
|
||||
}
|
||||
nodes, _, err := q.buildNUMATopology()
|
||||
assert.NoError(err)
|
||||
assert.Len(nodes, 1)
|
||||
assert.Equal(uint32(0), nodes[0].NodeID)
|
||||
assert.Equal("1", nodes[0].HostNodes)
|
||||
}
|
||||
|
||||
func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
// User explicitly mapped two guest nodes to disjoint host-node ranges
|
||||
// (e.g. numa_mapping = ["0-1", "2-3"]). buildNUMATopology() must
|
||||
// preserve the ranged HostNodes strings on each emitted NUMANode.
|
||||
assert := assert.New(t)
|
||||
q := &qemu{
|
||||
config: HypervisorConfig{
|
||||
DefaultMaxVCPUs: 8,
|
||||
MemorySize: 2048,
|
||||
NUMAMapping: []string{"0-1", "2-3"},
|
||||
GuestNUMANodes: []types.GuestNUMANode{
|
||||
{HostNodes: "0-1", HostCPUs: "0-3"},
|
||||
{HostNodes: "2-3", HostCPUs: "4-7"},
|
||||
},
|
||||
},
|
||||
}
|
||||
nodes, _, err := q.buildNUMATopology()
|
||||
assert.NoError(err)
|
||||
assert.Len(nodes, 2)
|
||||
assert.Equal("0-1", nodes[0].HostNodes)
|
||||
assert.Equal("2-3", nodes[1].HostNodes)
|
||||
}
|
||||
|
||||
func TestBuildNUMATopologyTwoNodes(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
|
||||
Reference in New Issue
Block a user