gpu: reintroduce pcie_root_port and add pcie_switch_port

In Kubernetes we still do not have proper VM sizing
at the sandbox creation level. This KEP tries to mitigate
that: kubernetes/enhancements#4113, but it can take
some time until Kubernetes and containerd or other runtimes
have those changes rolled out.

Previously we used a static config of VFIO ports, and then we
introduced CDI support, which needs a patched containerd.
We want to eliminate the need for a patched containerd in the
GPU case as well.

Fixes: #8860

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Zvonko Kaiser
2024-01-18 10:56:38 -08:00
parent 6f6a164451
commit c7b41361b2
20 changed files with 264 additions and 56 deletions

View File

@@ -83,6 +83,8 @@ type QemuState struct {
VirtiofsDaemonPid int
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
PCIeRootPort uint32
PCIeSwitchPort uint32
}
// qemu is a Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -283,7 +285,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.PCIeRootPort = q.config.PCIeRootPort
q.state.PCIeSwitchPort = q.config.PCIeSwitchPort
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
@@ -803,11 +806,19 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
// If the devices are not advertised via CRI or cold-plugged we need to
// get the number of pluggable root/switch ports from the config
numPCIeRootPorts := hypervisorConfig.PCIeRootPort
numPCIeSwitchPorts := hypervisorConfig.PCIeSwitchPort
// If the number of PCIe root ports is > 16 then bail out, otherwise we may
// use up all slots or IO memory on the root bus, and vfio-XXX-pci devices,
// which are crucial for Kata, cannot be added. Max slots on the root bus is 32;
// max slots on the complete pci(e) topology is 256 in QEMU.
if vfioOnRootPort {
if numOfPluggablePorts < numPCIeRootPorts {
numOfPluggablePorts = numPCIeRootPorts
}
if numOfPluggablePorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
@@ -815,6 +826,9 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
return nil
}
if vfioOnSwitchPort {
if numOfPluggablePorts < numPCIeSwitchPorts {
numOfPluggablePorts = numPCIeSwitchPorts
}
if numOfPluggablePorts > maxPCIeSwitchPort {
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
}
@@ -924,8 +938,9 @@ func (q *qemu) setupVirtioMem(ctx context.Context) error {
machineType := q.HypervisorConfig().HypervisorMachineType
if machineType == QemuVirt {
addr = "00"
bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort]["virtiomem"] = true
bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
dev := config.VFIODev{ID: "virtiomem"}
config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)
}
err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0", addr, bridgeID)
@@ -1640,8 +1655,9 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
//Since the dev is the first and only one on this bus(root port), it should be 0.
addr := "00"
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort][devID] = true
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
dev := config.VFIODev{ID: devID}
config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh)
@@ -1901,8 +1917,10 @@ func (q *qemu) hotplugNetDevice(ctx context.Context, endpoint Endpoint, op Opera
// Hotplug net dev to pcie root port for QemuVirt
if machineType == QemuVirt {
addr := "00"
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort][devID] = true
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort]))
dev := config.VFIODev{ID: devID}
config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev)
return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridgeID, romFile, int(q.config.NumVCPUs()), defaultDisableModern)
}