Merge pull request #12948 from fidencio/topic/numa

runtime (go): agent: Add NUMA support for QEMU
This commit is contained in:
Zvonko Kaiser
2026-05-25 15:33:14 +02:00
committed by GitHub
49 changed files with 4252 additions and 82 deletions

View File

@@ -235,6 +235,17 @@ var (
// different types of PCI ports. We can deduce the Bus number from it
// and eliminate duplicates being assigned.
PCIeDevicesPerPort = map[PCIePort][]VFIODev{}
// NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie
// bridges. When NUMA-aware PCIe topology is active (pxb-pcie),
// createPCIeTopology populates this so VFIODevice.Attach() can assign
// each device to the root port on its host NUMA node's pxb-pcie bus.
// Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb.
NUMARootPorts = map[int][]string{}
// NUMARootPortDeviceCount tracks how many devices have been assigned
// to each host NUMA node's root ports (for round-robin assignment).
NUMARootPortDeviceCount = map[int]int{}
)
// DeviceInfo is an embedded type that contains device data common to all types of devices.
@@ -418,6 +429,10 @@ type VFIODev struct {
// Type of VFIO device
Type VFIODeviceType
// NUMANode is the host NUMA node this device is attached to.
// -1 means no affinity or unknown.
NUMANode int
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool

View File

@@ -46,6 +46,7 @@ var (
PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed
PCISysFsDevicesVendor PCISysFsProperty = "vendor" // /sys/bus/pci/devices/xxx/vendor
PCISysFsDevicesDevice PCISysFsProperty = "device" // /sys/bus/pci/devices/xxx/device
PCISysFsDevicesNUMANode PCISysFsProperty = "numa_node" // /sys/bus/pci/devices/xxx/numa_node
)
func deviceLogger() *logrus.Entry {
@@ -85,6 +86,20 @@ func GetPCIDeviceProperty(bdf string, property PCISysFsProperty) string {
return rlt
}
// GetPCIDeviceNUMANode looks up the "numa_node" sysfs property of the PCI
// device identified by bdf and returns it as an integer.
// It returns -1 when the property is empty or is not a valid integer.
func GetPCIDeviceNUMANode(bdf string) int {
	const noAffinity = -1

	value := GetPCIDeviceProperty(bdf, PCISysFsDevicesNUMANode)
	if len(value) == 0 {
		return noAffinity
	}

	if node, err := strconv.Atoi(value); err == nil {
		return node
	}
	return noAffinity
}
func readPCIProperty(propertyPath string) (string, error) {
var (
buf []byte
@@ -240,6 +255,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) {
Class: pciClass,
VendorID: vendorID,
DeviceID: deviceID,
NUMANode: GetPCIDeviceNUMANode(deviceBDF),
Port: device.Port,
HostPath: device.HostPath,
}
@@ -291,7 +307,6 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
vendorID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor)
deviceID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice)
// Do not directly assign to `vfio` -- need to access field still
vfio = config.VFIODev{
ID: id,
Type: vfioDeviceType,
@@ -301,6 +316,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
Class: pciClass,
VendorID: vendorID,
DeviceID: deviceID,
NUMANode: GetPCIDeviceNUMANode(deviceBDF),
Port: device.Port,
HostPath: device.HostPath,
}
@@ -315,6 +331,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
APDevices: devices,
NUMANode: -1,
Port: device.Port,
}
default:

View File

@@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
if vfio.IsPCIe {
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
// We need to keep track of the number of devices per port to deduce
// the correct bus number; additionally we can use the VFIO device
// info to act upon different Vendor IDs and Device IDs.
// When pxb-pcie NUMA topology is active, assign the device
// to a root port on the pxb-pcie bridge for its host NUMA
// node instead of the default rp/swdp numbering.
if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 {
idx := config.NUMARootPortDeviceCount[vfio.NUMANode]
vfio.Bus = rpIDs[idx%len(rpIDs)]
config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1
} else {
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
}
config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio)
}
}

View File

@@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0)
config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0)
config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0)
config.NUMARootPorts = make(map[int][]string)
config.NUMARootPortDeviceCount = make(map[int]int)
for _, dev := range devices {
dm.devices[dev.DeviceID()] = dev

View File

@@ -50,6 +50,20 @@ const (
qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket"
)
// hasPCIeRoot tells whether the configured QEMU machine type provides a
// `pcie.0` root complex (q35 on x86, virt on arm64). Machine types such as
// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport) and
// microvm (no PCI at all) have no `pcie.0` bus, so emitting `bus=pcie.0`
// for virtio-pci leaf devices there would prevent QEMU from starting.
// It gates the bus= pin used to keep leaf devices off pxb-pcie.
// A nil config yields false.
func hasPCIeRoot(config *Config) bool {
	if config == nil {
		return false
	}

	// Machine types are matched by prefix.
	for _, prefix := range []string{"q35", "virt"} {
		if strings.HasPrefix(config.Machine.Type, prefix) {
			return true
		}
	}
	return false
}
const (
// Well known vsock CID for host system.
// https://man7.org/linux/man-pages/man7/vsock.7.html
@@ -132,6 +146,10 @@ const (
// VHostVSockPCI is a generic Vsock vhost device with PCI transport.
VHostVSockPCI DeviceDriver = "vhost-vsock-pci"
// PXBPCIe is a PCIe Expander Bridge that creates a new PCI root
// complex with NUMA node affinity.
PXBPCIe DeviceDriver = "pxb-pcie"
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
@@ -152,7 +170,7 @@ const (
func isDimmSupported(config *Config) bool {
switch runtime.GOARCH {
case "amd64", "386", "ppc64le", "arm64":
case "amd64", "ppc64le", "arm64":
if config != nil && config.Machine.Type == MachineTypeMicrovm {
// microvm does not support NUMA
return false
@@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
if netdev.Bus != "" {
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
} else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) {
// Pin to pcie.0 (when present) so pxb-pcie can't capture
// this leaf device as the default bus. Skipped on machines
// without a `pcie.0` root (pseries, microvm, s390-ccw-virtio).
deviceParams = append(deviceParams, "bus=pcie.0")
}
if netdev.Addr != "" {
@@ -1586,8 +1609,15 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string {
deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID))
deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address))
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
if vhostuserDev.Transport.isVirtioPCI(config) {
// Pin to pcie.0 (when present) so pxb-pcie can't capture
// this leaf device. See hasPCIeRoot() for skipped machines.
if hasPCIeRoot(config) {
deviceParams = append(deviceParams, "bus=pcie.0")
}
if vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
}
}
qemuParams = append(qemuParams, "-netdev")
@@ -1612,8 +1642,13 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string {
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID))
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
if vhostuserDev.Transport.isVirtioPCI(config) {
if hasPCIeRoot(config) {
deviceParams = append(deviceParams, "bus=pcie.0")
}
if vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
}
}
qemuParams = append(qemuParams, "-device")
@@ -1637,8 +1672,13 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string {
deviceParams = append(deviceParams, "size=512M")
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
if vhostuserDev.Transport.isVirtioPCI(config) {
if hasPCIeRoot(config) {
deviceParams = append(deviceParams, "bus=pcie.0")
}
if vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
}
}
qemuParams = append(qemuParams, "-device")
@@ -1674,8 +1714,13 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string {
}
deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo))
}
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
if vhostuserDev.Transport.isVirtioPCI(config) {
if hasPCIeRoot(config) {
deviceParams = append(deviceParams, "bus=pcie.0")
}
if vhostuserDev.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
}
}
qemuParams = append(qemuParams, "-device")
@@ -1738,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string {
}
}
// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie).
// It creates a new PCI root complex with NUMA node affinity, allowing
// devices attached to its bus hierarchy to inherit the NUMA association.
// This is the only QEMU PCI device that carries a numa_node property.
type PXBPCIeDevice struct {
	// ID is the QEMU device identifier (e.g. "pxb-numa0").
	ID string

	// BusNr is the guest PCI bus number for this root complex.
	// Use values spaced apart (e.g. 0x20, 0x40) to leave room for
	// bridges beneath each pxb-pcie.
	BusNr uint8

	// NUMANode is the guest NUMA node index this root complex belongs to.
	// It must not be negative; see Valid.
	NUMANode int
}

// QemuParams returns the QEMU parameters for a pxb-pcie device as a
// "-device" flag followed by its comma-separated property list.
// The config argument is not used.
func (dev PXBPCIeDevice) QemuParams(_ *Config) []string {
	return []string{
		"-device",
		fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode),
	}
}

// Valid returns true if the PXBPCIeDevice structure is valid and complete:
// it needs a non-empty ID and a non-negative NUMANode. A negative node
// index would otherwise be rendered by QemuParams as e.g. "numa_node=-1".
func (dev PXBPCIeDevice) Valid() bool {
	return dev.ID != "" && dev.NUMANode >= 0
}
// PCIeRootPortDevice represents a PCIe root port device.
// nolint: govet
type PCIeRootPortDevice struct {
@@ -2310,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string {
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID))
deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID))
if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
if vsock.Transport.isVirtioPCI(config) {
// Pin to pcie.0 (when present) so pxb-pcie can't capture
// this leaf device. See hasPCIeRoot() for skipped machines.
if hasPCIeRoot(config) {
deviceParams = append(deviceParams, "bus=pcie.0")
}
if vsock.ROMFile != "" {
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
}
}
if vsock.Transport.isVirtioCCW(config) {
@@ -2689,7 +2771,8 @@ type SMP struct {
Sockets uint32
// MaxCPUs is the maximum number of VCPUs that a VM can have.
// This value, if non-zero, MUST BE equal to or greater than CPUs
// This value, if non-zero, MUST BE equal to or greater than CPUs,
// and must be equal to Sockets * Cores * Threads if all are non-zero.
MaxCPUs uint32
}
@@ -2775,6 +2858,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string {
return qemuParams
}
// NUMANode describes a guest NUMA node and its mapping to host resources.
// appendMultiNUMAMemoryKnobs turns each entry into one "-object" memory
// backend plus one "-numa node" argument.
type NUMANode struct {
// NodeID is the guest NUMA node identifier (0-based). It is used both
// as the "nodeid=" value and to derive the backend id "numa-mem<NodeID>".
NodeID uint32
// CPUs is the guest vCPU range assigned to this node (e.g. "0-3").
// When empty, no "cpus=" option is emitted for the node.
CPUs string
// MemSize is the amount of memory for this node (e.g. "512M", "1G").
// It is passed through unchanged as the backend "size=" value.
MemSize string
// HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1").
// When non-empty it is emitted as "host-nodes=<value>,policy=bind".
HostNodes string
// MemBackendType selects the QEMU memory backend object type.
// Typical values: "memory-backend-ram" or "memory-backend-file".
// When empty, appendMultiNUMAMemoryKnobs chooses a default
// ("memory-backend-ram" for nodes without a MemBackendPath).
MemBackendType string
// MemBackendPath is the mem-path for file-backed memory (hugepages, file-backed).
// Empty when using memory-backend-ram. When non-empty it is emitted as
// "mem-path=<value>".
MemBackendPath string
}
// NUMADist describes a NUMA distance entry for `-numa dist`.
// Each entry is emitted as "dist,src=<Src>,dst=<Dst>,val=<Val>".
type NUMADist struct {
// Src is the value emitted as "src=".
Src uint32
// Dst is the value emitted as "dst=".
Dst uint32
// Val is the value emitted as "val=".
Val uint32
}
// Knobs regroups a set of qemu boolean settings
type Knobs struct {
// NoUserConfig prevents qemu from loading user config files.
@@ -2922,6 +3035,14 @@ type Config struct {
IOThreads []IOThread
// NUMANodes defines multi-NUMA guest topology. When non-empty,
// appendMemoryKnobs creates per-node memory backends and -numa entries
// instead of a single flat memory region.
NUMANodes []NUMANode
// NUMADists defines inter-node distance entries emitted as -numa dist.
NUMADists []NUMADist
// PidFile is the -pidfile parameter
PidFile string
@@ -3096,6 +3217,13 @@ func (config *Config) appendCPUs() error {
return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d",
config.SMP.MaxCPUs, config.SMP.CPUs)
}
if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 {
expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
if config.SMP.MaxCPUs != expected {
return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d",
config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected)
}
}
SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
}
@@ -3169,6 +3297,12 @@ func (config *Config) appendMemoryKnobs() {
if config.Memory.Size == "" {
return
}
if len(config.NUMANodes) > 0 && isDimmSupported(config) {
config.appendMultiNUMAMemoryKnobs()
return
}
var objMemParam, numaMemParam string
dimmName := "dimm1"
if config.Knobs.HugePages {
@@ -3200,6 +3334,49 @@ func (config *Config) appendMemoryKnobs() {
}
}
// appendMultiNUMAMemoryKnobs emits the QEMU arguments for a multi-NUMA guest.
//
// For every entry in config.NUMANodes it appends:
//   - "-object <backend>,id=numa-mem<N>,size=<MemSize>[,mem-path=...]
//     [,host-nodes=...,policy=bind][,share=on][,prealloc=on]"
//   - "-numa node,nodeid=<N>,memdev=numa-mem<N>[,cpus=...]"
//
// After all nodes, every entry in config.NUMADists is appended as
// "-numa dist,src=..,dst=..,val=..".
//
// When a node does not name a backend type, the default is
// "memory-backend-file" if a MemBackendPath is set and "memory-backend-ram"
// otherwise, so a defaulted RAM backend is never combined with mem-path.
// An explicitly set MemBackendType is always emitted verbatim.
func (config *Config) appendMultiNUMAMemoryKnobs() {
	for _, node := range config.NUMANodes {
		memID := fmt.Sprintf("numa-mem%d", node.NodeID)

		backendType := node.MemBackendType
		if backendType == "" {
			// mem-path is a file-backend option; pick the backend that
			// matches what the node actually asks for.
			if node.MemBackendPath != "" {
				backendType = "memory-backend-file"
			} else {
				backendType = "memory-backend-ram"
			}
		}

		// Option order is part of the emitted command line; keep it stable.
		objOpts := []string{backendType, "id=" + memID, "size=" + node.MemSize}
		if node.MemBackendPath != "" {
			objOpts = append(objOpts, "mem-path="+node.MemBackendPath)
		}
		if node.HostNodes != "" {
			objOpts = append(objOpts, "host-nodes="+node.HostNodes, "policy=bind")
		}
		if config.Knobs.MemShared {
			objOpts = append(objOpts, "share=on")
		}
		if config.Knobs.MemPrealloc {
			objOpts = append(objOpts, "prealloc=on")
		}
		config.qemuParams = append(config.qemuParams, "-object", strings.Join(objOpts, ","))

		numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID)
		if node.CPUs != "" {
			numaParam += ",cpus=" + node.CPUs
		}
		config.qemuParams = append(config.qemuParams, "-numa", numaParam)
	}

	for _, dist := range config.NUMADists {
		config.qemuParams = append(config.qemuParams, "-numa",
			fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val))
	}
}
func (config *Config) appendKnobs() {
if config.Knobs.NoUserConfig {
config.qemuParams = append(config.qemuParams, "-no-user-config")

View File

@@ -14,8 +14,8 @@ var (
deviceNetworkString = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom"
deviceNetworkStringMq = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom"
deviceSerialString = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2"
deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom"
deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom"
deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom"
deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom"
deviceVFIOString = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom"
devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off"
devicePCIeRootPortFullString = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom"
@@ -23,8 +23,8 @@ var (
deviceVFIOPCIeFullString = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1"
deviceSCSIControllerStr = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom"
deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom"
deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom"
deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom"
deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom"
deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom"
deviceBlockString = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on"
devicePCIBridgeString = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom"
devicePCIBridgeStringReserved = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m"
@@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) {
VhostUserType: VhostUserBlk,
ROMFile: romfile,
}
testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t)
// vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt.
testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t)
vhostuserSCSIDevice := VhostUserDevice{
SocketPath: "/tmp/nonexistentsocket.socket",
@@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
VhostUserType: VhostUserSCSI,
ROMFile: romfile,
}
testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
vhostuserNetDevice := VhostUserDevice{
SocketPath: "/tmp/nonexistentsocket.socket",
@@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
VhostUserType: VhostUserNet,
ROMFile: romfile,
}
testAppend(vhostuserNetDevice, deviceVhostUserNetString, t)
testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t)
}
func TestAppendVirtioBalloon(t *testing.T) {

View File

@@ -9,6 +9,7 @@ import (
"fmt"
"os"
"reflect"
"runtime"
"strings"
"testing"
)
@@ -23,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) {
testConfigAppend(&config, structure, expected, t)
}
// testAppendQ35 behaves like testAppend but runs against a Config whose
// Machine.Type is "q35". Device emitters gated on hasPCIeRoot() (e.g.
// virtio-pci leaves pinned to bus=pcie.0) therefore take the PCIe path.
// Use it for tests whose expected string contains "bus=pcie.0".
func testAppendQ35(structure interface{}, expected string, t *testing.T) {
	q35 := &Config{Machine: Machine{Type: "q35"}}
	testConfigAppend(q35, structure, expected, t)
}
func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) {
switch s := structure.(type) {
case Machine:
@@ -342,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) {
vsockDevice.DevNo = DevNo
}
testAppend(vsockDevice, deviceVSOCKString, t)
// deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines.
testAppendQ35(vsockDevice, deviceVSOCKString, t)
}
// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0`
// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT
// emit `bus=pcie.0` — doing so would crash QEMU with
// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly
// rather than using the arch-conditional `romfile` constant (which is
// "" on s390x via qemu_s390x_test.go), so the test exercises the
// same code path on every architecture.
func TestAppendVSOCKNoPCIeRoot(t *testing.T) {
const vsockRomfile = "efi-virtio.rom"
vsockDevice := VSOCKDevice{
ID: "vhost-vsock-pci0",
ContextID: 4,
VHostFD: nil,
DisableModern: true,
ROMFile: vsockRomfile,
Transport: TransportPCI,
}
// pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted.
expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile
config := Config{Machine: Machine{Type: "pseries"}}
testConfigAppend(&config, vsockDevice, expected, t)
}
func TestVSOCKValid(t *testing.T) {
@@ -1117,6 +1152,140 @@ func TestBadMemoryKnobs(t *testing.T) {
}
}
func TestAppendMultiNUMAMemoryKnobs(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
c := &Config{
Memory: Memory{
Size: "2G",
Slots: 8,
MaxMem: "4G",
},
NUMANodes: []NUMANode{
{
NodeID: 0,
CPUs: "0-3",
MemSize: "1G",
HostNodes: "0",
MemBackendType: "memory-backend-ram",
},
{
NodeID: 1,
CPUs: "4-7",
MemSize: "1G",
HostNodes: "1",
MemBackendType: "memory-backend-ram",
},
},
Knobs: Knobs{
MemShared: true,
MemPrealloc: true,
},
}
c.appendMemoryKnobs()
expected := []string{
"-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on",
"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3",
"-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on",
"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7",
}
if len(c.qemuParams) != len(expected) {
t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
}
for i, p := range expected {
if c.qemuParams[i] != p {
t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
}
}
}
func TestAppendMultiNUMAHugePages(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
c := &Config{
Memory: Memory{
Size: "2G",
Slots: 8,
MaxMem: "4G",
},
NUMANodes: []NUMANode{
{
NodeID: 0,
CPUs: "0-1",
MemSize: "1G",
HostNodes: "0",
MemBackendType: "memory-backend-file",
MemBackendPath: "/dev/hugepages",
},
{
NodeID: 1,
CPUs: "2-3",
MemSize: "1G",
HostNodes: "1",
MemBackendType: "memory-backend-file",
MemBackendPath: "/dev/hugepages",
},
},
Knobs: Knobs{
MemShared: true,
},
}
c.appendMemoryKnobs()
expected := []string{
"-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on",
"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1",
"-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on",
"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3",
}
if len(c.qemuParams) != len(expected) {
t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
}
for i, p := range expected {
if c.qemuParams[i] != p {
t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
}
}
}
// TestAppendNUMADist verifies that the configured NUMADists are emitted as
// trailing "-numa dist,..." arguments, after the per-node arguments
// produced by appendMemoryKnobs. The test is skipped on architectures
// other than amd64 and arm64.
func TestAppendNUMADist(t *testing.T) {
	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
	}

	c := &Config{
		Memory: Memory{
			Size: "2G",
		},
		NUMANodes: []NUMANode{
			{NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"},
			{NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"},
		},
		NUMADists: []NUMADist{
			{Src: 0, Dst: 1, Val: 20},
			{Src: 1, Dst: 0, Val: 20},
		},
	}

	c.appendMemoryKnobs()

	expectedDist := []string{
		"-numa", "dist,src=0,dst=1,val=20",
		"-numa", "dist,src=1,dst=0,val=20",
	}

	params := c.qemuParams
	// Guard the tail slice below: fail with a readable message instead of
	// panicking when fewer params than expected were generated.
	if len(params) < len(expectedDist) {
		t.Fatalf("Expected at least %d params, got %d: %v", len(expectedDist), len(params), params)
	}

	distParams := params[len(params)-len(expectedDist):]
	for i, p := range expectedDist {
		if distParams[i] != p {
			t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i])
		}
	}
}
func TestBadBios(t *testing.T) {
c := &Config{}
c.appendBios()

View File

@@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
IOMMU: h.IOMMU,
IOMMUPlatform: h.getIOMMUPlatform(),
GuestNUMANodes: h.defaultGuestNUMANodes(),
NUMAMapping: append([]string(nil), h.NUMAMapping...),
FileBackedMemRootDir: h.FileBackedMemRootDir,
FileBackedMemRootList: h.FileBackedMemRootList,
Debug: h.Debug,
@@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
if err := checkNumaConfig(config); err != nil {
return err
}
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
@@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error {
return nil
}
// checkNumaConfig validates the prerequisites for a multi-NUMA guest.
// Configurations with at most one guest NUMA node are always accepted.
// Otherwise it returns an error when the runtime architecture is neither
// amd64 nor arm64, or when StaticSandboxResourceMgmt is disabled.
func checkNumaConfig(config oci.RuntimeConfig) error {
	if len(config.HypervisorConfig.GuestNUMANodes) < 2 {
		return nil
	}

	arch := goruntime.GOARCH
	if arch != "amd64" && arch != "arm64" {
		return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", arch)
	}

	if config.StaticSandboxResourceMgmt {
		return nil
	}
	return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " +
		"NUMA topology is not compatible with dynamic CPU/memory hotplug")
}
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port

View File

@@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
}
if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation))
mapping := strings.Fields(annotation)
guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping)
if err != nil {
return err
}
sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes
// Record the raw user-provided mapping so the hypervisor
// backend honors it verbatim instead of right-sizing.
sbConfig.HypervisorConfig.NUMAMapping = mapping
}
return nil
@@ -1457,7 +1461,7 @@ func (a *annotationConfiguration) setFloat32WithCheck(f func(float32) error) err
// be added to the VM if sandbox annotations are provided with this sizing details
func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) {
var memory, quota int64
var period uint64
var shares, period uint64
var err error
if spec == nil || spec.Annotations == nil {
@@ -1488,6 +1492,15 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
}
}
annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUShares]
if ok {
shares, err = strconv.ParseUint(annotation, 10, 64)
if err != nil {
ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUShares: %s", annotation)
shares = 0
}
}
annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem]
if ok {
memory, err = strconv.ParseInt(annotation, 10, 64)
@@ -1497,7 +1510,16 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
}
}
return calculateVMResources(period, quota, memory)
numCPU, memSizeMB = calculateVMResources(period, quota, memory)
// When cpuManagerPolicy=static is in use, kubelet sets quota=-1
// (unconstrained) and assigns CPUs via cpuset instead. Fall back
// to deriving the CPU count from shares (1024 shares per CPU).
if numCPU == 0 && shares > 0 {
numCPU = float32(math.Ceil(float64(shares) / 1024.0))
}
return numCPU, memSizeMB
}
// CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed