diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 7cf6915df9..dc7501c87b 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -152,7 +152,7 @@ const ( func isDimmSupported(config *Config) bool { switch runtime.GOARCH { - case "amd64", "386", "ppc64le", "arm64": + case "amd64", "ppc64le", "arm64": if config != nil && config.Machine.Type == MachineTypeMicrovm { // microvm does not support NUMA return false @@ -1586,8 +1586,13 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + // Pin to pcie.0 so pxb-pcie (when present) doesn't capture + // this leaf device as the default bus. 
+ deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-netdev") @@ -1612,8 +1617,11 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1637,8 +1645,11 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, "size=512M") deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1674,8 +1685,11 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { } deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if 
vhostuserDev.Transport.isVirtioPCI(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -2689,7 +2703,8 @@ type SMP struct { Sockets uint32 // MaxCPUs is the maximum number of VCPUs that a VM can have. - // This value, if non-zero, MUST BE equal to or greater than CPUs + // This value, if non-zero, MUST BE equal to or greater than CPUs, + // and must be equal to Sockets * Cores * Threads if all are non-zero (appendCPUs checks this only when more than one NUMA node is configured). MaxCPUs uint32 } @@ -2775,6 +2790,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string { return qemuParams } +// NUMANode describes a guest NUMA node and its mapping to host resources. +type NUMANode struct { + // NodeID is the guest NUMA node identifier (0-based). + NodeID uint32 + + // CPUs is the guest vCPU range assigned to this node (e.g. "0-3"). + CPUs string + + // MemSize is the amount of memory for this node (e.g. "512M", "1G"). + MemSize string + + // HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1"). + HostNodes string + + // MemBackendType selects the QEMU memory backend object type. + // Typical values: "memory-backend-ram" or "memory-backend-file". + MemBackendType string + + // MemBackendPath is the mem-path for file-backed memory (e.g. a hugepages mount). + // Empty when using memory-backend-ram. + MemBackendPath string +} + +// NUMADist describes a NUMA distance entry for `-numa dist`. +type NUMADist struct { + Src uint32 + Dst uint32 + Val uint32 +} + // Knobs regroups a set of qemu boolean settings type Knobs struct { // NoUserConfig prevents qemu from loading user config files. @@ -2922,6 +2967,14 @@ type Config struct { IOThreads []IOThread + // NUMANodes defines multi-NUMA guest topology. When non-empty and DIMMs are + // supported (isDimmSupported), appendMemoryKnobs creates per-node memory backends and -numa entries + // instead of a single flat memory region. 
+ NUMANodes []NUMANode + + // NUMADists defines inter-node distance entries emitted as -numa dist. + NUMADists []NUMADist + // PidFile is the -pidfile parameter PidFile string @@ -3096,6 +3149,13 @@ func (config *Config) appendCPUs() error { return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs) } + if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 { + expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads + if config.SMP.MaxCPUs != expected { + return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d", + config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected) + } + } SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs)) } @@ -3169,6 +3229,12 @@ func (config *Config) appendMemoryKnobs() { if config.Memory.Size == "" { return } + + if len(config.NUMANodes) > 0 && isDimmSupported(config) { + config.appendMultiNUMAMemoryKnobs() + return + } + var objMemParam, numaMemParam string dimmName := "dimm1" if config.Knobs.HugePages { @@ -3200,6 +3266,49 @@ func (config *Config) appendMemoryKnobs() { } } +func (config *Config) appendMultiNUMAMemoryKnobs() { + for _, node := range config.NUMANodes { + memID := fmt.Sprintf("numa-mem%d", node.NodeID) + + backendType := node.MemBackendType + if backendType == "" { + backendType = "memory-backend-ram" + } + + objMemParam := fmt.Sprintf("%s,id=%s,size=%s", backendType, memID, node.MemSize) + + if node.MemBackendPath != "" { + objMemParam += ",mem-path=" + node.MemBackendPath + } + + if node.HostNodes != "" { + objMemParam += ",host-nodes=" + node.HostNodes + ",policy=bind" + } + + if config.Knobs.MemShared { + objMemParam += ",share=on" + } + if config.Knobs.MemPrealloc { + objMemParam += ",prealloc=on" + } + + config.qemuParams = append(config.qemuParams, "-object") + config.qemuParams = append(config.qemuParams, objMemParam) + 
+ numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID) + if node.CPUs != "" { + numaParam += ",cpus=" + node.CPUs + } + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, numaParam) + } + + for _, dist := range config.NUMADists { + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val)) + } +} + func (config *Config) appendKnobs() { if config.Knobs.NoUserConfig { config.qemuParams = append(config.qemuParams, "-no-user-config") diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a14e0fb032..a15e77c184 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -14,8 +14,8 @@ var ( deviceNetworkString = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom" deviceNetworkStringMq = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom" deviceSerialString = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2" - deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom" - deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom" + deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device 
virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom" + deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom" deviceVFIOString = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom" devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off" devicePCIeRootPortFullString = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom" @@ -23,8 +23,8 @@ var ( deviceVFIOPCIeFullString = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1" deviceSCSIControllerStr = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom" deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom" - deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom" - deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom" + deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom" + deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom" deviceBlockString = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive 
id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on" devicePCIBridgeString = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom" devicePCIBridgeStringReserved = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m" diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 5d4c15ed9d..8be4d0d779 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "reflect" + "runtime" "strings" "testing" ) @@ -1117,6 +1118,140 @@ func TestBadMemoryKnobs(t *testing.T) { } } +func TestAppendMultiNUMAMemoryKnobs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-3", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-ram", + }, + { + NodeID: 1, + CPUs: "4-7", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-ram", + }, + }, + Knobs: Knobs{ + MemShared: true, + MemPrealloc: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3", + "-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func 
TestAppendMultiNUMAHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-1", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + { + NodeID: 1, + CPUs: "2-3", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + }, + Knobs: Knobs{ + MemShared: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1", + "-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func TestAppendNUMADist(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + }, + NUMANodes: []NUMANode{ + {NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + {NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + }, + NUMADists: []NUMADist{ + {Src: 0, Dst: 1, Val: 20}, + {Src: 1, Dst: 0, Val: 20}, + }, + } + + c.appendMemoryKnobs() + + expectedDist := []string{ + "-numa", "dist,src=0,dst=1,val=20", + "-numa", "dist,src=1,dst=0,val=20", + } + params := c.qemuParams + distParams := params[len(params)-4:] 
+ for i, p := range expectedDist { + if distParams[i] != p { + t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i]) + } + } +} + func TestBadBios(t *testing.T) { c := &Config{} c.appendBios()