mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-02 15:09:45 +00:00
Merge pull request #12948 from fidencio/topic/numa
runtime (go): agent: Add NUMA support for QEMU
This commit is contained in:
@@ -235,6 +235,17 @@ var (
|
||||
// different types of PCI ports. We can deduce the bus number from it
|
||||
// and eliminate duplicates being assigned.
|
||||
PCIeDevicesPerPort = map[PCIePort][]VFIODev{}
|
||||
|
||||
// NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie
|
||||
// bridges. When NUMA-aware PCIe topology is active (pxb-pcie),
|
||||
// createPCIeTopology populates this so VFIODevice.Attach() can assign
|
||||
// each device to the root port on its host NUMA node's pxb-pcie bus.
|
||||
// Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb.
|
||||
NUMARootPorts = map[int][]string{}
|
||||
|
||||
// NUMARootPortDeviceCount tracks how many devices have been assigned
|
||||
// to each host NUMA node's root ports (for round-robin assignment).
|
||||
NUMARootPortDeviceCount = map[int]int{}
|
||||
)
|
||||
|
||||
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
||||
@@ -418,6 +429,10 @@ type VFIODev struct {
|
||||
// Type of VFIO device
|
||||
Type VFIODeviceType
|
||||
|
||||
// NUMANode is the host NUMA node this device is attached to.
|
||||
// -1 means no affinity or unknown.
|
||||
NUMANode int
|
||||
|
||||
// IsPCIe specifies device is PCIe or PCI
|
||||
IsPCIe bool
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ var (
|
||||
PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed
|
||||
PCISysFsDevicesVendor PCISysFsProperty = "vendor" // /sys/bus/pci/devices/xxx/vendor
|
||||
PCISysFsDevicesDevice PCISysFsProperty = "device" // /sys/bus/pci/devices/xxx/device
|
||||
PCISysFsDevicesNUMANode PCISysFsProperty = "numa_node" // /sys/bus/pci/devices/xxx/numa_node
|
||||
)
|
||||
|
||||
func deviceLogger() *logrus.Entry {
|
||||
@@ -85,6 +86,20 @@ func GetPCIDeviceProperty(bdf string, property PCISysFsProperty) string {
|
||||
return rlt
|
||||
}
|
||||
|
||||
// GetPCIDeviceNUMANode returns the host NUMA node for a PCI device.
|
||||
// Returns -1 if the device has no NUMA affinity or the value cannot be read.
|
||||
func GetPCIDeviceNUMANode(bdf string) int {
|
||||
raw := GetPCIDeviceProperty(bdf, PCISysFsDevicesNUMANode)
|
||||
if raw == "" {
|
||||
return -1
|
||||
}
|
||||
n, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
return -1
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func readPCIProperty(propertyPath string) (string, error) {
|
||||
var (
|
||||
buf []byte
|
||||
@@ -240,6 +255,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) {
|
||||
Class: pciClass,
|
||||
VendorID: vendorID,
|
||||
DeviceID: deviceID,
|
||||
NUMANode: GetPCIDeviceNUMANode(deviceBDF),
|
||||
Port: device.Port,
|
||||
HostPath: device.HostPath,
|
||||
}
|
||||
@@ -291,7 +307,6 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
|
||||
vendorID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor)
|
||||
deviceID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice)
|
||||
|
||||
// Do not directly assign to `vfio` -- need to access field still
|
||||
vfio = config.VFIODev{
|
||||
ID: id,
|
||||
Type: vfioDeviceType,
|
||||
@@ -301,6 +316,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
|
||||
Class: pciClass,
|
||||
VendorID: vendorID,
|
||||
DeviceID: deviceID,
|
||||
NUMANode: GetPCIDeviceNUMANode(deviceBDF),
|
||||
Port: device.Port,
|
||||
HostPath: device.HostPath,
|
||||
}
|
||||
@@ -315,6 +331,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
|
||||
SysfsDev: deviceSysfsDev,
|
||||
Type: config.VFIOAPDeviceMediatedType,
|
||||
APDevices: devices,
|
||||
NUMANode: -1,
|
||||
Port: device.Port,
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
|
||||
}
|
||||
|
||||
if vfio.IsPCIe {
|
||||
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
// We need to keep track the number of devices per port to deduce
|
||||
// the corectu bus number, additionally we can use the VFIO device
|
||||
// info to act upon different Vendor IDs and Device IDs.
|
||||
// When pxb-pcie NUMA topology is active, assign the device
|
||||
// to a root port on the pxb-pcie bridge for its host NUMA
|
||||
// node instead of the default rp/swdp numbering.
|
||||
if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 {
|
||||
idx := config.NUMARootPortDeviceCount[vfio.NUMANode]
|
||||
vfio.Bus = rpIDs[idx%len(rpIDs)]
|
||||
config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1
|
||||
} else {
|
||||
busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
}
|
||||
config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
|
||||
config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0)
|
||||
config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0)
|
||||
config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0)
|
||||
config.NUMARootPorts = make(map[int][]string)
|
||||
config.NUMARootPortDeviceCount = make(map[int]int)
|
||||
|
||||
for _, dev := range devices {
|
||||
dm.devices[dev.DeviceID()] = dev
|
||||
|
||||
@@ -50,6 +50,20 @@ const (
|
||||
qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket"
|
||||
)
|
||||
|
||||
// hasPCIeRoot reports whether the configured QEMU machine type exposes a
|
||||
// `pcie.0` root complex (q35 on x86, virt on arm64). Machines such as
|
||||
// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport)
|
||||
// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting
|
||||
// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU.
|
||||
// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie.
|
||||
func hasPCIeRoot(config *Config) bool {
|
||||
if config == nil {
|
||||
return false
|
||||
}
|
||||
t := config.Machine.Type
|
||||
return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt")
|
||||
}
|
||||
|
||||
const (
|
||||
// Well known vsock CID for host system.
|
||||
// https://man7.org/linux/man-pages/man7/vsock.7.html
|
||||
@@ -132,6 +146,10 @@ const (
|
||||
// VHostVSockPCI is a generic Vsock vhost device with PCI transport.
|
||||
VHostVSockPCI DeviceDriver = "vhost-vsock-pci"
|
||||
|
||||
// PXBPCIe is a PCIe Expander Bridge that creates a new PCI root
|
||||
// complex with NUMA node affinity.
|
||||
PXBPCIe DeviceDriver = "pxb-pcie"
|
||||
|
||||
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
|
||||
PCIeRootPort DeviceDriver = "pcie-root-port"
|
||||
|
||||
@@ -152,7 +170,7 @@ const (
|
||||
|
||||
func isDimmSupported(config *Config) bool {
|
||||
switch runtime.GOARCH {
|
||||
case "amd64", "386", "ppc64le", "arm64":
|
||||
case "amd64", "ppc64le", "arm64":
|
||||
if config != nil && config.Machine.Type == MachineTypeMicrovm {
|
||||
// microvm does not support NUMA
|
||||
return false
|
||||
@@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
|
||||
|
||||
if netdev.Bus != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
|
||||
} else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) {
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device as the default bus. Skipped on machines
|
||||
// without a `pcie.0` root (pseries, microvm, s390-ccw-virtio).
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
|
||||
if netdev.Addr != "" {
|
||||
@@ -1586,8 +1609,15 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device. See hasPCIeRoot() for skipped machines.
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-netdev")
|
||||
@@ -1612,8 +1642,13 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
@@ -1637,8 +1672,13 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, "size=512M")
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
|
||||
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
@@ -1674,8 +1714,13 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string {
|
||||
}
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo))
|
||||
}
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
if vhostuserDev.Transport.isVirtioPCI(config) {
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vhostuserDev.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
@@ -1738,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string {
|
||||
}
|
||||
}
|
||||
|
||||
// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie).
|
||||
// It creates a new PCI root complex with NUMA node affinity, allowing
|
||||
// devices attached to its bus hierarchy to inherit the NUMA association.
|
||||
// This is the only QEMU PCI device that carries a numa_node property.
|
||||
type PXBPCIeDevice struct {
|
||||
// ID is the QEMU device identifier (e.g. "pxb-numa0").
|
||||
ID string
|
||||
|
||||
// BusNr is the guest PCI bus number for this root complex.
|
||||
// Use values spaced apart (e.g. 0x20, 0x40) to leave room for
|
||||
// bridges beneath each pxb-pcie.
|
||||
BusNr uint8
|
||||
|
||||
// NUMANode is the guest NUMA node index this root complex belongs to.
|
||||
NUMANode int
|
||||
}
|
||||
|
||||
// QemuParams returns the QEMU parameters for a pxb-pcie device.
|
||||
func (dev PXBPCIeDevice) QemuParams(_ *Config) []string {
|
||||
return []string{
|
||||
"-device",
|
||||
fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode),
|
||||
}
|
||||
}
|
||||
|
||||
// Valid returns true if the PXBPCIeDevice structure is valid and complete.
|
||||
func (dev PXBPCIeDevice) Valid() bool {
|
||||
return dev.ID != ""
|
||||
}
|
||||
|
||||
// PCIeRootPortDevice represents a PCIe root port device.
|
||||
// nolint: govet
|
||||
type PCIeRootPortDevice struct {
|
||||
@@ -2310,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID))
|
||||
|
||||
if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
|
||||
if vsock.Transport.isVirtioPCI(config) {
|
||||
// Pin to pcie.0 (when present) so pxb-pcie can't capture
|
||||
// this leaf device. See hasPCIeRoot() for skipped machines.
|
||||
if hasPCIeRoot(config) {
|
||||
deviceParams = append(deviceParams, "bus=pcie.0")
|
||||
}
|
||||
if vsock.ROMFile != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
|
||||
}
|
||||
}
|
||||
|
||||
if vsock.Transport.isVirtioCCW(config) {
|
||||
@@ -2689,7 +2771,8 @@ type SMP struct {
|
||||
Sockets uint32
|
||||
|
||||
// MaxCPUs is the maximum number of VCPUs that a VM can have.
|
||||
// This value, if non-zero, MUST BE equal to or greater than CPUs
|
||||
// This value, if non-zero, MUST BE equal to or greater than CPUs,
|
||||
// and must be equal to Sockets * Cores * Threads if all are non-zero.
|
||||
MaxCPUs uint32
|
||||
}
|
||||
|
||||
@@ -2775,6 +2858,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string {
|
||||
return qemuParams
|
||||
}
|
||||
|
||||
// NUMANode describes a guest NUMA node and its mapping to host resources.
// appendMultiNUMAMemoryKnobs turns each NUMANode into one "-object" memory
// backend plus one "-numa node" entry that references it.
type NUMANode struct {
	// NodeID is the guest NUMA node identifier (0-based). It is emitted as
	// "nodeid=" and also names the memory backend ("numa-mem<NodeID>").
	NodeID uint32

	// CPUs is the guest vCPU range assigned to this node (e.g. "0-3").
	// When empty, no "cpus=" option is emitted for the node.
	CPUs string

	// MemSize is the amount of memory for this node (e.g. "512M", "1G").
	// It is passed through unchanged as the backend's "size=" option.
	MemSize string

	// HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1").
	// When non-empty it is emitted as "host-nodes=" together with "policy=bind".
	HostNodes string

	// MemBackendType selects the QEMU memory backend object type.
	// Typical values: "memory-backend-ram" or "memory-backend-file".
	// An empty value is treated as "memory-backend-ram".
	MemBackendType string

	// MemBackendPath is the mem-path for file-backed memory (hugepages, file-backed).
	// Empty when using memory-backend-ram; "mem-path=" is only emitted when set.
	MemBackendPath string
}
|
||||
|
||||
// NUMADist describes a NUMA distance entry for `-numa dist`.
// Each entry is emitted as "dist,src=<Src>,dst=<Dst>,val=<Val>".
type NUMADist struct {
	// Src is the value emitted as "src=".
	Src uint32
	// Dst is the value emitted as "dst=".
	Dst uint32
	// Val is the value emitted as "val=".
	Val uint32
}
|
||||
|
||||
// Knobs regroups a set of qemu boolean settings
|
||||
type Knobs struct {
|
||||
// NoUserConfig prevents qemu from loading user config files.
|
||||
@@ -2922,6 +3035,14 @@ type Config struct {
|
||||
|
||||
IOThreads []IOThread
|
||||
|
||||
// NUMANodes defines multi-NUMA guest topology. When non-empty,
|
||||
// appendMemoryKnobs creates per-node memory backends and -numa entries
|
||||
// instead of a single flat memory region.
|
||||
NUMANodes []NUMANode
|
||||
|
||||
// NUMADists defines inter-node distance entries emitted as -numa dist.
|
||||
NUMADists []NUMADist
|
||||
|
||||
// PidFile is the -pidfile parameter
|
||||
PidFile string
|
||||
|
||||
@@ -3096,6 +3217,13 @@ func (config *Config) appendCPUs() error {
|
||||
return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d",
|
||||
config.SMP.MaxCPUs, config.SMP.CPUs)
|
||||
}
|
||||
if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 {
|
||||
expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
|
||||
if config.SMP.MaxCPUs != expected {
|
||||
return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d",
|
||||
config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected)
|
||||
}
|
||||
}
|
||||
SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
|
||||
}
|
||||
|
||||
@@ -3169,6 +3297,12 @@ func (config *Config) appendMemoryKnobs() {
|
||||
if config.Memory.Size == "" {
|
||||
return
|
||||
}
|
||||
|
||||
if len(config.NUMANodes) > 0 && isDimmSupported(config) {
|
||||
config.appendMultiNUMAMemoryKnobs()
|
||||
return
|
||||
}
|
||||
|
||||
var objMemParam, numaMemParam string
|
||||
dimmName := "dimm1"
|
||||
if config.Knobs.HugePages {
|
||||
@@ -3200,6 +3334,49 @@ func (config *Config) appendMemoryKnobs() {
|
||||
}
|
||||
}
|
||||
|
||||
func (config *Config) appendMultiNUMAMemoryKnobs() {
|
||||
for _, node := range config.NUMANodes {
|
||||
memID := fmt.Sprintf("numa-mem%d", node.NodeID)
|
||||
|
||||
backendType := node.MemBackendType
|
||||
if backendType == "" {
|
||||
backendType = "memory-backend-ram"
|
||||
}
|
||||
|
||||
objMemParam := fmt.Sprintf("%s,id=%s,size=%s", backendType, memID, node.MemSize)
|
||||
|
||||
if node.MemBackendPath != "" {
|
||||
objMemParam += ",mem-path=" + node.MemBackendPath
|
||||
}
|
||||
|
||||
if node.HostNodes != "" {
|
||||
objMemParam += ",host-nodes=" + node.HostNodes + ",policy=bind"
|
||||
}
|
||||
|
||||
if config.Knobs.MemShared {
|
||||
objMemParam += ",share=on"
|
||||
}
|
||||
if config.Knobs.MemPrealloc {
|
||||
objMemParam += ",prealloc=on"
|
||||
}
|
||||
|
||||
config.qemuParams = append(config.qemuParams, "-object")
|
||||
config.qemuParams = append(config.qemuParams, objMemParam)
|
||||
|
||||
numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID)
|
||||
if node.CPUs != "" {
|
||||
numaParam += ",cpus=" + node.CPUs
|
||||
}
|
||||
config.qemuParams = append(config.qemuParams, "-numa")
|
||||
config.qemuParams = append(config.qemuParams, numaParam)
|
||||
}
|
||||
|
||||
for _, dist := range config.NUMADists {
|
||||
config.qemuParams = append(config.qemuParams, "-numa")
|
||||
config.qemuParams = append(config.qemuParams, fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val))
|
||||
}
|
||||
}
|
||||
|
||||
func (config *Config) appendKnobs() {
|
||||
if config.Knobs.NoUserConfig {
|
||||
config.qemuParams = append(config.qemuParams, "-no-user-config")
|
||||
|
||||
@@ -14,8 +14,8 @@ var (
|
||||
deviceNetworkString = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom"
|
||||
deviceNetworkStringMq = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom"
|
||||
deviceSerialString = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2"
|
||||
deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom"
|
||||
deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom"
|
||||
deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom"
|
||||
deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom"
|
||||
deviceVFIOString = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom"
|
||||
devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off"
|
||||
devicePCIeRootPortFullString = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom"
|
||||
@@ -23,8 +23,8 @@ var (
|
||||
deviceVFIOPCIeFullString = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1"
|
||||
deviceSCSIControllerStr = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom"
|
||||
deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom"
|
||||
deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom"
|
||||
deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom"
|
||||
deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom"
|
||||
deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom"
|
||||
deviceBlockString = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on"
|
||||
devicePCIBridgeString = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom"
|
||||
devicePCIBridgeStringReserved = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m"
|
||||
@@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserBlk,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t)
|
||||
// vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt.
|
||||
testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t)
|
||||
|
||||
vhostuserSCSIDevice := VhostUserDevice{
|
||||
SocketPath: "/tmp/nonexistentsocket.socket",
|
||||
@@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserSCSI,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
|
||||
testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
|
||||
|
||||
vhostuserNetDevice := VhostUserDevice{
|
||||
SocketPath: "/tmp/nonexistentsocket.socket",
|
||||
@@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
|
||||
VhostUserType: VhostUserNet,
|
||||
ROMFile: romfile,
|
||||
}
|
||||
testAppend(vhostuserNetDevice, deviceVhostUserNetString, t)
|
||||
testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t)
|
||||
}
|
||||
|
||||
func TestAppendVirtioBalloon(t *testing.T) {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
@@ -23,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) {
|
||||
testConfigAppend(&config, structure, expected, t)
|
||||
}
|
||||
|
||||
// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so
|
||||
// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves
|
||||
// pinned to bus=pcie.0) take the PCIe path. Use this for tests whose
|
||||
// expected string contains "bus=pcie.0".
|
||||
func testAppendQ35(structure interface{}, expected string, t *testing.T) {
|
||||
config := Config{Machine: Machine{Type: "q35"}}
|
||||
testConfigAppend(&config, structure, expected, t)
|
||||
}
|
||||
|
||||
func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) {
|
||||
switch s := structure.(type) {
|
||||
case Machine:
|
||||
@@ -342,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) {
|
||||
vsockDevice.DevNo = DevNo
|
||||
}
|
||||
|
||||
testAppend(vsockDevice, deviceVSOCKString, t)
|
||||
// deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines.
|
||||
testAppendQ35(vsockDevice, deviceVSOCKString, t)
|
||||
}
|
||||
|
||||
// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0`
|
||||
// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT
|
||||
// emit `bus=pcie.0` — doing so would crash QEMU with
|
||||
// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly
|
||||
// rather than using the arch-conditional `romfile` constant (which is
|
||||
// "" on s390x via qemu_s390x_test.go), so the test exercises the
|
||||
// same code path on every architecture.
|
||||
func TestAppendVSOCKNoPCIeRoot(t *testing.T) {
|
||||
const vsockRomfile = "efi-virtio.rom"
|
||||
vsockDevice := VSOCKDevice{
|
||||
ID: "vhost-vsock-pci0",
|
||||
ContextID: 4,
|
||||
VHostFD: nil,
|
||||
DisableModern: true,
|
||||
ROMFile: vsockRomfile,
|
||||
Transport: TransportPCI,
|
||||
}
|
||||
|
||||
// pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted.
|
||||
expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile
|
||||
config := Config{Machine: Machine{Type: "pseries"}}
|
||||
testConfigAppend(&config, vsockDevice, expected, t)
|
||||
}
|
||||
|
||||
func TestVSOCKValid(t *testing.T) {
|
||||
@@ -1117,6 +1152,140 @@ func TestBadMemoryKnobs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendMultiNUMAMemoryKnobs(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
c := &Config{
|
||||
Memory: Memory{
|
||||
Size: "2G",
|
||||
Slots: 8,
|
||||
MaxMem: "4G",
|
||||
},
|
||||
NUMANodes: []NUMANode{
|
||||
{
|
||||
NodeID: 0,
|
||||
CPUs: "0-3",
|
||||
MemSize: "1G",
|
||||
HostNodes: "0",
|
||||
MemBackendType: "memory-backend-ram",
|
||||
},
|
||||
{
|
||||
NodeID: 1,
|
||||
CPUs: "4-7",
|
||||
MemSize: "1G",
|
||||
HostNodes: "1",
|
||||
MemBackendType: "memory-backend-ram",
|
||||
},
|
||||
},
|
||||
Knobs: Knobs{
|
||||
MemShared: true,
|
||||
MemPrealloc: true,
|
||||
},
|
||||
}
|
||||
|
||||
c.appendMemoryKnobs()
|
||||
|
||||
expected := []string{
|
||||
"-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on",
|
||||
"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3",
|
||||
"-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on",
|
||||
"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7",
|
||||
}
|
||||
if len(c.qemuParams) != len(expected) {
|
||||
t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
|
||||
}
|
||||
for i, p := range expected {
|
||||
if c.qemuParams[i] != p {
|
||||
t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendMultiNUMAHugePages(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
c := &Config{
|
||||
Memory: Memory{
|
||||
Size: "2G",
|
||||
Slots: 8,
|
||||
MaxMem: "4G",
|
||||
},
|
||||
NUMANodes: []NUMANode{
|
||||
{
|
||||
NodeID: 0,
|
||||
CPUs: "0-1",
|
||||
MemSize: "1G",
|
||||
HostNodes: "0",
|
||||
MemBackendType: "memory-backend-file",
|
||||
MemBackendPath: "/dev/hugepages",
|
||||
},
|
||||
{
|
||||
NodeID: 1,
|
||||
CPUs: "2-3",
|
||||
MemSize: "1G",
|
||||
HostNodes: "1",
|
||||
MemBackendType: "memory-backend-file",
|
||||
MemBackendPath: "/dev/hugepages",
|
||||
},
|
||||
},
|
||||
Knobs: Knobs{
|
||||
MemShared: true,
|
||||
},
|
||||
}
|
||||
|
||||
c.appendMemoryKnobs()
|
||||
|
||||
expected := []string{
|
||||
"-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on",
|
||||
"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1",
|
||||
"-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on",
|
||||
"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3",
|
||||
}
|
||||
if len(c.qemuParams) != len(expected) {
|
||||
t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
|
||||
}
|
||||
for i, p := range expected {
|
||||
if c.qemuParams[i] != p {
|
||||
t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendNUMADist(t *testing.T) {
|
||||
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
|
||||
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
|
||||
}
|
||||
c := &Config{
|
||||
Memory: Memory{
|
||||
Size: "2G",
|
||||
},
|
||||
NUMANodes: []NUMANode{
|
||||
{NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"},
|
||||
{NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"},
|
||||
},
|
||||
NUMADists: []NUMADist{
|
||||
{Src: 0, Dst: 1, Val: 20},
|
||||
{Src: 1, Dst: 0, Val: 20},
|
||||
},
|
||||
}
|
||||
|
||||
c.appendMemoryKnobs()
|
||||
|
||||
expectedDist := []string{
|
||||
"-numa", "dist,src=0,dst=1,val=20",
|
||||
"-numa", "dist,src=1,dst=0,val=20",
|
||||
}
|
||||
params := c.qemuParams
|
||||
distParams := params[len(params)-4:]
|
||||
for i, p := range expectedDist {
|
||||
if distParams[i] != p {
|
||||
t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBadBios(t *testing.T) {
|
||||
c := &Config{}
|
||||
c.appendBios()
|
||||
|
||||
@@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
IOMMU: h.IOMMU,
|
||||
IOMMUPlatform: h.getIOMMUPlatform(),
|
||||
GuestNUMANodes: h.defaultGuestNUMANodes(),
|
||||
NUMAMapping: append([]string(nil), h.NUMAMapping...),
|
||||
FileBackedMemRootDir: h.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: h.FileBackedMemRootList,
|
||||
Debug: h.Debug,
|
||||
@@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := checkNumaConfig(config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
|
||||
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
|
||||
machineType := config.HypervisorConfig.HypervisorMachineType
|
||||
@@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func checkNumaConfig(config oci.RuntimeConfig) error {
|
||||
if len(config.HypervisorConfig.GuestNUMANodes) <= 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
switch goruntime.GOARCH {
|
||||
case "amd64", "arm64":
|
||||
default:
|
||||
return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH)
|
||||
}
|
||||
|
||||
if !config.StaticSandboxResourceMgmt {
|
||||
return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " +
|
||||
"NUMA topology is not compatible with dynamic CPU/memory hotplug")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkPCIeConfig ensures the PCIe configuration is valid.
|
||||
// Only allow one of the following settings for cold-plug:
|
||||
// no-port, root-port, switch-port
|
||||
|
||||
@@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
|
||||
}
|
||||
|
||||
if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
|
||||
guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation))
|
||||
mapping := strings.Fields(annotation)
|
||||
guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes
|
||||
// Record the raw user-provided mapping so the hypervisor
|
||||
// backend honors it verbatim instead of right-sizing.
|
||||
sbConfig.HypervisorConfig.NUMAMapping = mapping
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -1457,7 +1461,7 @@ func (a *annotationConfiguration) setFloat32WithCheck(f func(float32) error) err
|
||||
// be added to the VM if sandbox annotations are provided with this sizing details
|
||||
func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) {
|
||||
var memory, quota int64
|
||||
var period uint64
|
||||
var shares, period uint64
|
||||
var err error
|
||||
|
||||
if spec == nil || spec.Annotations == nil {
|
||||
@@ -1488,6 +1492,15 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
|
||||
}
|
||||
}
|
||||
|
||||
annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUShares]
|
||||
if ok {
|
||||
shares, err = strconv.ParseUint(annotation, 10, 64)
|
||||
if err != nil {
|
||||
ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUShares: %s", annotation)
|
||||
shares = 0
|
||||
}
|
||||
}
|
||||
|
||||
annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem]
|
||||
if ok {
|
||||
memory, err = strconv.ParseInt(annotation, 10, 64)
|
||||
@@ -1497,7 +1510,16 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
|
||||
}
|
||||
}
|
||||
|
||||
return calculateVMResources(period, quota, memory)
|
||||
numCPU, memSizeMB = calculateVMResources(period, quota, memory)
|
||||
|
||||
// When cpuManagerPolicy=static is in use, kubelet sets quota=-1
|
||||
// (unconstrained) and assigns CPUs via cpuset instead. Fall back
|
||||
// to deriving the CPU count from shares (1024 shares per CPU).
|
||||
if numCPU == 0 && shares > 0 {
|
||||
numCPU = float32(math.Ceil(float64(shares) / 1024.0))
|
||||
}
|
||||
|
||||
return numCPU, memSizeMB
|
||||
}
|
||||
|
||||
// CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed
|
||||
|
||||
Reference in New Issue
Block a user