diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 8c9055ae25..9ce16040b6 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -178,22 +178,22 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe } id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize) - pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) - // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the - // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. - // Class 0x0604 is PCI bridge, 0x0600 is Host bridge - ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) - if err != nil { - return nil, err - } - if ignorePCIDevice { - continue - } - var vfio config.VFIODev switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: + // This is vfio-pci and vfio-mdev specific + pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) + // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the + // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. 
+ // Class 0x0604 is PCI bridge, 0x0600 is Host bridge + ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) + if err != nil { + return nil, err + } + if ignorePCIDevice { + continue + } // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 801b1a81f8..7edd5b3c6e 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -70,6 +70,12 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece return err } for _, vfio := range device.VfioDevs { + // If vfio.Port is not set we bail out, users should set + // explicitly the port in the config file + if vfio.Port == "" { + return fmt.Errorf("cold_plug_vfio= or hot_plug_vfio= port is not set for device %s (BridgePort | RootPort | SwitchPort)", vfio.BDF) + } + if vfio.IsPCIe { busIndex := len(config.PCIeDevices[vfio.Port]) vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 70c76b67d7..bcd321477f 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -132,6 +132,8 @@ func TestAttachVFIODevice(t *testing.T) { HostPath: path, ContainerPath: path, DevType: "c", + ColdPlug: false, + Port: config.RootPort, } device, err := dm.NewDevice(deviceInfo) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 453012169f..46e845f9e4 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -163,6 +163,9 @@ const ( // TransportMMIO is the MMIO transport for virtio devices. TransportMMIO VirtioTransport = "mmio" + + // TransportAP is the AP transport for virtio devices. 
+ TransportAP VirtioTransport = "ap" ) // defaultTransport returns the default transport for the current combination @@ -199,6 +202,14 @@ func (transport VirtioTransport) isVirtioCCW(config *Config) bool { return transport == TransportCCW } +func (transport VirtioTransport) isVirtioAP(config *Config) bool { + if transport == "" { + transport = transport.defaultTransport(config) + } + + return transport == TransportAP +} + // getName returns the name of the current transport. func (transport VirtioTransport) getName(config *Config) string { if transport == "" { @@ -1811,6 +1822,9 @@ type VFIODevice struct { // Transport is the virtio transport for this device. Transport VirtioTransport + + // SysfsDev specifies the sysfs matrix entry for the AP device + SysfsDev string } // VFIODeviceTransport is a map of the vfio device name that corresponds to @@ -1819,11 +1833,13 @@ var VFIODeviceTransport = map[VirtioTransport]string{ TransportPCI: "vfio-pci", TransportCCW: "vfio-ccw", TransportMMIO: "vfio-device", + TransportAP: "vfio-ap", } // Valid returns true if the VFIODevice structure is valid and complete. +// s390x architecture requires SysfsDev to be set. func (vfioDev VFIODevice) Valid() bool { - return vfioDev.BDF != "" + return vfioDev.BDF != "" || vfioDev.SysfsDev != "" } // QemuParams returns the qemu parameters built out of this vfio device. 
@@ -1833,6 +1849,15 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string { driver := vfioDev.deviceName(config) + if vfioDev.Transport.isVirtioAP(config) { + deviceParams = append(deviceParams, fmt.Sprintf("%s,sysfsdev=%s", driver, vfioDev.SysfsDev)) + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + + return qemuParams + } + deviceParams = append(deviceParams, fmt.Sprintf("%s,host=%s", driver, vfioDev.BDF)) if vfioDev.Transport.isVirtioPCI(config) { if vfioDev.VendorID != "" { @@ -2837,10 +2862,9 @@ func (config *Config) appendDevices(logger QMPLog) { for _, d := range config.Devices { if !d.Valid() { - logger.Errorf("vm device is not valid: %+v", config.Devices) + logger.Errorf("vm device is not valid: %+v", d) continue } - config.qemuParams = append(config.qemuParams, d.QemuParams(config)...) } } diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 6b020103da..46eb1c0827 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -1217,10 +1217,11 @@ func (q *QMP) ExecutePCIVFIOMediatedDeviceAdd(ctx context.Context, devID, sysfsd } // ExecuteAPVFIOMediatedDeviceAdd adds a VFIO mediated AP device to a QEMU instance using the device_add command. 
-func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string) error { +func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string, devID string) error { args := map[string]interface{}{ "driver": VfioAP, "sysfsdev": sysfsdev, + "id": devID, } return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/govmm/qemu/qmp_test.go b/src/runtime/pkg/govmm/qemu/qmp_test.go index 17492f6fd7..06738a40d6 100644 --- a/src/runtime/pkg/govmm/qemu/qmp_test.go +++ b/src/runtime/pkg/govmm/qemu/qmp_test.go @@ -1128,7 +1128,7 @@ func TestQMPAPVFIOMediatedDeviceAdd(t *testing.T) { q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh) checkVersion(t, connectedCh) sysfsDev := "/sys/devices/vfio_ap/matrix/a297db4a-f4c2-11e6-90f6-d3b88d6c9525" - err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev) + err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev, "test-id") if err != nil { t.Fatalf("Unexpected error %v", err) } diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index c0f0789d1f..1e2316236d 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -869,6 +869,23 @@ func (c *Container) create(ctx context.Context) (err error) { } } + // If cold-plug we've attached the devices already, do not try to + // attach them a second time. 
+ coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) + if coldPlugVFIO { + var cntDevices []ContainerDevice + for _, dev := range c.devices { + if strings.HasPrefix(dev.ContainerPath, vfioPath) { + c.Logger().WithFields(logrus.Fields{ + "device": dev, + }).Info("Removing device since we're cold-plugging no Attach needed") + continue + } + cntDevices = append(cntDevices, dev) + } + c.devices = cntDevices + } + c.Logger().WithFields(logrus.Fields{ "devices": c.devices, }).Info("Attach devices") diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 2fd763f496..1a74944baa 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -65,11 +65,6 @@ const romFile = "" // Default value is false. const defaultDisableModern = false -// A deeper PCIe topology than 5 is already not advisable just for the sake -// of having enough buffer we limit ourselves to 10 and exit if we reach -// the root bus -const maxPCIeTopoDepth = 10 - type qmpChannel struct { qmp *govmmQemu.QMP ctx context.Context @@ -80,15 +75,14 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - HotPlugVFIO config.PCIePort - Bridges []types.Bridge - HotpluggedVCPUs []hv.CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - HotplugVFIOOnRootBus bool - HotplugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort + UUID string + HotPlugVFIO config.PCIePort + Bridges []types.Bridge + HotpluggedVCPUs []hv.CPUDevice + HotpluggedMemory int + VirtiofsDaemonPid int + HotplugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. 
@@ -289,7 +283,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.UUID = uuid.Generate().String() q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO - q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, @@ -792,7 +785,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig } } } - vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort) vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) @@ -1638,7 +1631,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V config.PCIeDevices[config.RootPort][devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) - bridgeSlot, err := q.qomGetSlot(bridgeQomPath) + bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh) if err != nil { return err } @@ -1741,88 +1734,6 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } -// Query QMP to find the PCI slot of a device, given its QOM path or ID -func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { - addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr") - if err != nil { - return types.PciSlot{}, err - } - addrf, ok := addr.(float64) - // XXX going via float makes no real sense, but that's how - // JSON works, and we'll get away with it for the small values - // we have here - if !ok { - return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) - } - addri := int(addrf) - - slotNum, 
funcNum := addri>>3, addri&0x7 - if funcNum != 0 { - return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", - slotNum, funcNum, qomPath) - } - - return types.PciSlotFromInt(slotNum) -} - -// Query QMP to find a device's PCI path given its QOM path or ID -func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - - var slots []types.PciSlot - - devSlot, err := q.qomGetSlot(qemuID) - if err != nil { - return types.PciPath{}, err - } - slots = append(slots, devSlot) - - // This only works for Q35 and Virt - r, _ := regexp.Compile(`^/machine/.*/pcie.0`) - - var parentPath = qemuID - // We do not want to use a forever loop here, a deeper PCIe topology - // than 5 is already not advisable just for the sake of having enough - // buffer we limit ourselves to 10 and leave the loop early if we hit - // the root bus. - for i := 1; i <= maxPCIeTopoDepth; i++ { - parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") - if err != nil { - return types.PciPath{}, err - } - - busQOM, ok := parenBusQOM.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) - } - - // If we hit /machine/q35/pcie.0 we're done this is the root bus - // we climbed the complete hierarchy - if r.Match([]byte(busQOM)) { - break - } - - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI parent_bus which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component. - idx := strings.LastIndex(busQOM, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) - } - parentBus := busQOM[:idx] - - parentSlot, err := q.qomGetSlot(parentBus) - if err != nil { - return types.PciPath{}, err - } - - // Prepend the slots, since we're climbing the hierarchy - slots = append([]types.PciSlot{parentSlot}, slots...) 
- parentPath = parentBus - } - return types.PciPathFromSlots(slots...) -} - func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { return q.executeVFIODeviceAdd(device) } @@ -1852,7 +1763,7 @@ func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, brid case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1865,7 +1776,7 @@ func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1883,46 +1794,43 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op "hot-plug-vfio": q.state.HotPlugVFIO, "device-info": string(buf), }).Info("Start hot-plug VFIO device") - // In case MachineType is q35, a PCIe device is hotplugged on - // a PCIe Root Port or alternatively on a PCIe Switch Port - if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt { - device.Bus = "" - } else { - var err error - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. 
This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { - err = q.hotplugVFIODeviceRootPort(ctx, device) - } else if q.state.HotPlugVFIO == config.SwitchPort { - err = q.hotplugVFIODeviceSwitchPort(ctx, device) - } else { - err = q.hotplugVFIODeviceBridgePort(ctx, device) - } - if err != nil { - return err - } + + err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device) + // Dispatch on the configured hot-plug port type (root, switch or bridge port). + // Any other value leaves the error pre-set above in place, so an unset or + // unknown port type fails the hot-plug instead of being silently ignored. + if q.state.HotPlugVFIO == config.RootPort { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == config.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else if q.state.HotPlugVFIO == config.BridgePort { + err = q.hotplugVFIODeviceBridgePort(ctx, device) } - // XXX: Depending on whether we're doing root port or + if err != nil { + return err + } + + // Depending on whether we're doing root port or // bridge hotplug, and how the bridge is set up in // other parts of the code, we may or may not already // have information about the slot number of the // bridge and or the device. 
For simplicity, just - // query both of them back from qemu - device.GuestPciPath, err = q.qomGetPciPath(device.ID) + // query both of them back from qemu based on the arch + device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh) + return err - } + } else { - q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { - return err + if q.state.HotPlugVFIO == config.BridgePort { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) - } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2881,7 +2789,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.Type = string(QemuHypervisor) s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory - s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2903,7 +2810,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { func (q *qemu) Load(s hv.HypervisorState) { q.state.UUID = s.UUID q.state.HotpluggedMemory = s.HotpluggedMemory - q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid for _, bridge := range s.Bridges { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index b74f260ec8..0dc9f46cde 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -13,6 +13,7 @@ import ( "errors" "fmt" "os" + "regexp" "runtime" "strings" @@ -24,6 +25,11 @@ import ( 
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) +// A deeper PCIe topology than 5 is already not advisable just for the sake +// of having enough buffer we limit ourselves to 10 and exit if we reach +// the root bus +const maxPCIeTopoDepth = 10 + type qemuArch interface { // enableNestingChecks nesting checks will be honoured enableNestingChecks() @@ -158,6 +164,12 @@ type qemuArch interface { // scans the PCIe space and returns the biggest BAR sizes for 32-bit // and 64-bit addressable memory getBARsMaxAddressableMemory() (uint64, uint64) + + // Query QMP to find a device's PCI path given its QOM path or ID + qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) + + // Query QMP to find the PCI slot of a device, given its QOM path or ID + qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) } type qemuArchBase struct { @@ -881,3 +893,85 @@ func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmwa hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture") return devices, firmware, nil } + +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) { + addr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr") + if err != nil { + return types.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return types.PciSlotFromInt(slotNum) +} + +// Query QMP to 
find a device's PCI path given its QOM path or ID +func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { + + var slots []types.PciSlot + + devSlot, err := q.qomGetSlot(qemuID, qmpCh) + if err != nil { + return types.PciPath{}, err + } + slots = append(slots, devSlot) + + // This only works for Q35 and Virt + r := regexp.MustCompile(`^/machine/.*/pcie.0`) + + var parentPath = qemuID + // We do not want to use a forever loop here, a deeper PCIe topology + // than 5 is already not advisable just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= maxPCIeTopoDepth; i++ { + parenBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %T not a string", parentPath, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done this is the root bus + // we climbed the complete hierarchy + if r.Match([]byte(busQOM)) { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus, qmpCh) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) 
+} diff --git a/src/runtime/virtcontainers/qemu_s390x.go b/src/runtime/virtcontainers/qemu_s390x.go index b0c1ede543..bc5c45bff8 100644 --- a/src/runtime/virtcontainers/qemu_s390x.go +++ b/src/runtime/virtcontainers/qemu_s390x.go @@ -351,3 +351,32 @@ func (q *qemuS390x) appendProtectionDevice(devices []govmmQemu.Device, firmware, return devices, firmware, fmt.Errorf("Unsupported guest protection technology: %v", q.protection) } } + +func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { + if vfioDev.SysfsDev == "" { + return devices + } + + if len(vfioDev.APDevices) > 0 { + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + Transport: govmmQemu.TransportAP, + }, + ) + return devices + + } + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + }, + ) + return devices +} + +// qomGetPciPath is a stub on s390x: it only logs a warning and returns an empty PciPath with a nil error +func (q *qemuS390x) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { + hvLogger.Warnf("qomGetPciPath not implemented for s390x") + return types.PciPath{}, nil +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9949656a6d..52ed76567d 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -619,6 +619,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // Aggregate all the containner devices for hot-plug and use them to dedcue // the correct amount of ports to reserve for the hypervisor. 
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) + stripVFIO := sandboxConfig.VfioMode == config.VFIOModeGuestKernel var vfioDevices []config.DeviceInfo // vhost-user-block device is a PCIe device in Virt, keep track of it @@ -644,7 +645,9 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" + if stripVFIO { + sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" + } } } var filteredDevices []config.DeviceInfo @@ -656,6 +659,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices } + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices