From 114542e2ba55e84d77414ccaf13659127fa21226 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Jul 2023 07:24:26 +0000 Subject: [PATCH 1/5] s390x: Fixing device.Bus assignment The device.Bus was reset if a specific combination of configuration parameters were not met. With the new PCIe topology this should not happen anymore Fixes: #7381 Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 24 +-- src/runtime/pkg/device/drivers/vfio.go | 6 + .../pkg/device/manager/manager_test.go | 2 + src/runtime/pkg/govmm/qemu/qemu.go | 30 ++- src/runtime/pkg/govmm/qemu/qmp.go | 3 +- src/runtime/pkg/govmm/qemu/qmp_test.go | 2 +- src/runtime/virtcontainers/container.go | 17 ++ src/runtime/virtcontainers/qemu.go | 172 ++++-------------- src/runtime/virtcontainers/qemu_arch_base.go | 94 ++++++++++ src/runtime/virtcontainers/qemu_s390x.go | 29 +++ src/runtime/virtcontainers/sandbox.go | 6 +- 11 files changed, 234 insertions(+), 151 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 8c9055ae25..9ce16040b6 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -178,22 +178,22 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe } id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize) - pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) - // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the - // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. - // Class 0x0604 is PCI bridge, 0x0600 is Host bridge - ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) - if err != nil { - return nil, err - } - if ignorePCIDevice { - continue - } - var vfio config.VFIODev switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: + // This is vfio-pci and vfio-mdev specific + pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) + // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the + // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. + // Class 0x0604 is PCI bridge, 0x0600 is Host bridge + ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) + if err != nil { + return nil, err + } + if ignorePCIDevice { + continue + } // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 801b1a81f8..7edd5b3c6e 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -70,6 +70,12 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece return err } for _, vfio := range device.VfioDevs { + // If vfio.Port is not set we bail out, users should set + // explicitly the port in the config file + if vfio.Port == "" { + return fmt.Errorf("cold_plug_vfio= or hot_plug_vfio= port is not set for device %s (BridgePort | RootPort | SwitchPort)", vfio.BDF) + } + if vfio.IsPCIe { busIndex := len(config.PCIeDevices[vfio.Port]) vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 70c76b67d7..bcd321477f 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -132,6 +132,8 @@ func TestAttachVFIODevice(t *testing.T) { HostPath: path, ContainerPath: path, DevType: "c", + ColdPlug: false, + Port: config.RootPort, } device, err := dm.NewDevice(deviceInfo) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 453012169f..46e845f9e4 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -163,6 +163,9 @@ const ( // TransportMMIO is the MMIO transport for virtio devices. TransportMMIO VirtioTransport = "mmio" + + // TransportAP is the AP transport for virtio devices. + TransportAP VirtioTransport = "ap" ) // defaultTransport returns the default transport for the current combination @@ -199,6 +202,14 @@ func (transport VirtioTransport) isVirtioCCW(config *Config) bool { return transport == TransportCCW } +func (transport VirtioTransport) isVirtioAP(config *Config) bool { + if transport == "" { + transport = transport.defaultTransport(config) + } + + return transport == TransportAP +} + // getName returns the name of the current transport. func (transport VirtioTransport) getName(config *Config) string { if transport == "" { @@ -1811,6 +1822,9 @@ type VFIODevice struct { // Transport is the virtio transport for this device. Transport VirtioTransport + + // SysfsDev specifies the sysfs matrix entry for the AP device + SysfsDev string } // VFIODeviceTransport is a map of the vfio device name that corresponds to @@ -1819,11 +1833,13 @@ var VFIODeviceTransport = map[VirtioTransport]string{ TransportPCI: "vfio-pci", TransportCCW: "vfio-ccw", TransportMMIO: "vfio-device", + TransportAP: "vfio-ap", } // Valid returns true if the VFIODevice structure is valid and complete. +// s390x architecture requires SysfsDev to be set. func (vfioDev VFIODevice) Valid() bool { - return vfioDev.BDF != "" + return vfioDev.BDF != "" || vfioDev.SysfsDev != "" } // QemuParams returns the qemu parameters built out of this vfio device. @@ -1833,6 +1849,15 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string { driver := vfioDev.deviceName(config) + if vfioDev.Transport.isVirtioAP(config) { + deviceParams = append(deviceParams, fmt.Sprintf("%s,sysfsdev=%s", driver, vfioDev.SysfsDev)) + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + + return qemuParams + } + deviceParams = append(deviceParams, fmt.Sprintf("%s,host=%s", driver, vfioDev.BDF)) if vfioDev.Transport.isVirtioPCI(config) { if vfioDev.VendorID != "" { @@ -2837,10 +2862,9 @@ func (config *Config) appendDevices(logger QMPLog) { for _, d := range config.Devices { if !d.Valid() { - logger.Errorf("vm device is not valid: %+v", config.Devices) + logger.Errorf("vm device is not valid: %+v", d) continue } - config.qemuParams = append(config.qemuParams, d.QemuParams(config)...) } } diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 6b020103da..46eb1c0827 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -1217,10 +1217,11 @@ func (q *QMP) ExecutePCIVFIOMediatedDeviceAdd(ctx context.Context, devID, sysfsd } // ExecuteAPVFIOMediatedDeviceAdd adds a VFIO mediated AP device to a QEMU instance using the device_add command. -func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string) error { +func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string, devID string) error { args := map[string]interface{}{ "driver": VfioAP, "sysfsdev": sysfsdev, + "id": devID, } return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/govmm/qemu/qmp_test.go b/src/runtime/pkg/govmm/qemu/qmp_test.go index 17492f6fd7..06738a40d6 100644 --- a/src/runtime/pkg/govmm/qemu/qmp_test.go +++ b/src/runtime/pkg/govmm/qemu/qmp_test.go @@ -1128,7 +1128,7 @@ func TestQMPAPVFIOMediatedDeviceAdd(t *testing.T) { q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh) checkVersion(t, connectedCh) sysfsDev := "/sys/devices/vfio_ap/matrix/a297db4a-f4c2-11e6-90f6-d3b88d6c9525" - err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev) + err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev, "test-id") if err != nil { t.Fatalf("Unexpected error %v", err) } diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index c0f0789d1f..1e2316236d 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -869,6 +869,23 @@ func (c *Container) create(ctx context.Context) (err error) { } } + // If cold-plug we've attached the devices already, do not try to + // attach them a second time. + coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) + if coldPlugVFIO { + var cntDevices []ContainerDevice + for _, dev := range c.devices { + if strings.HasPrefix(dev.ContainerPath, vfioPath) { + c.Logger().WithFields(logrus.Fields{ + "device": dev, + }).Info("Remvoing device since we're cold-plugging no Attach needed") + continue + } + cntDevices = append(cntDevices, dev) + } + c.devices = cntDevices + } + c.Logger().WithFields(logrus.Fields{ "devices": c.devices, }).Info("Attach devices") diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 2fd763f496..1a74944baa 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -65,11 +65,6 @@ const romFile = "" // Default value is false. const defaultDisableModern = false -// A deeper PCIe topology than 5 is already not advisable just for the sake -// of having enough buffer we limit ourselves to 10 and exit if we reach -// the root bus -const maxPCIeTopoDepth = 10 - type qmpChannel struct { qmp *govmmQemu.QMP ctx context.Context @@ -80,15 +75,14 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - HotPlugVFIO config.PCIePort - Bridges []types.Bridge - HotpluggedVCPUs []hv.CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - HotplugVFIOOnRootBus bool - HotplugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort + UUID string + HotPlugVFIO config.PCIePort + Bridges []types.Bridge + HotpluggedVCPUs []hv.CPUDevice + HotpluggedMemory int + VirtiofsDaemonPid int + HotplugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. @@ -289,7 +283,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.UUID = uuid.Generate().String() q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO - q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, @@ -792,7 +785,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig } } } - vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort) vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) @@ -1638,7 +1631,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V config.PCIeDevices[config.RootPort][devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) - bridgeSlot, err := q.qomGetSlot(bridgeQomPath) + bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh) if err != nil { return err } @@ -1741,88 +1734,6 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } -// Query QMP to find the PCI slot of a device, given its QOM path or ID -func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { - addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr") - if err != nil { - return types.PciSlot{}, err - } - addrf, ok := addr.(float64) - // XXX going via float makes no real sense, but that's how - // JSON works, and we'll get away with it for the small values - // we have here - if !ok { - return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) - } - addri := int(addrf) - - slotNum, funcNum := addri>>3, addri&0x7 - if funcNum != 0 { - return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", - slotNum, funcNum, qomPath) - } - - return types.PciSlotFromInt(slotNum) -} - -// Query QMP to find a device's PCI path given its QOM path or ID -func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - - var slots []types.PciSlot - - devSlot, err := q.qomGetSlot(qemuID) - if err != nil { - return types.PciPath{}, err - } - slots = append(slots, devSlot) - - // This only works for Q35 and Virt - r, _ := regexp.Compile(`^/machine/.*/pcie.0`) - - var parentPath = qemuID - // We do not want to use a forever loop here, a deeper PCIe topology - // than 5 is already not advisable just for the sake of having enough - // buffer we limit ourselves to 10 and leave the loop early if we hit - // the root bus. - for i := 1; i <= maxPCIeTopoDepth; i++ { - parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") - if err != nil { - return types.PciPath{}, err - } - - busQOM, ok := parenBusQOM.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) - } - - // If we hit /machine/q35/pcie.0 we're done this is the root bus - // we climbed the complete hierarchy - if r.Match([]byte(busQOM)) { - break - } - - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI parent_bus which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component. - idx := strings.LastIndex(busQOM, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) - } - parentBus := busQOM[:idx] - - parentSlot, err := q.qomGetSlot(parentBus) - if err != nil { - return types.PciPath{}, err - } - - // Prepend the slots, since we're climbing the hierarchy - slots = append([]types.PciSlot{parentSlot}, slots...) - parentPath = parentBus - } - return types.PciPathFromSlots(slots...) -} - func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { return q.executeVFIODeviceAdd(device) } @@ -1852,7 +1763,7 @@ func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, brid case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1865,7 +1776,7 @@ func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1883,46 +1794,43 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op "hot-plug-vfio": q.state.HotPlugVFIO, "device-info": string(buf), }).Info("Start hot-plug VFIO device") - // In case MachineType is q35, a PCIe device is hotplugged on - // a PCIe Root Port or alternatively on a PCIe Switch Port - if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt { - device.Bus = "" - } else { - var err error - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { - err = q.hotplugVFIODeviceRootPort(ctx, device) - } else if q.state.HotPlugVFIO == config.SwitchPort { - err = q.hotplugVFIODeviceSwitchPort(ctx, device) - } else { - err = q.hotplugVFIODeviceBridgePort(ctx, device) - } - if err != nil { - return err - } + + err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device) + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus + // for pc machine type instead of bridge. This is useful for devices that require + // a large PCI BAR which is a currently a limitation with PCI bridges. + if q.state.HotPlugVFIO == config.RootPort { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == config.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else if q.state.HotPlugVFIO == config.BridgePort { + err = q.hotplugVFIODeviceBridgePort(ctx, device) } - // XXX: Depending on whether we're doing root port or + if err != nil { + return err + } + + // Depending on whether we're doing root port or // bridge hotplug, and how the bridge is set up in // other parts of the code, we may or may not already // have information about the slot number of the // bridge and or the device. For simplicity, just - // query both of them back from qemu - device.GuestPciPath, err = q.qomGetPciPath(device.ID) + // query both of them back from qemu based on the arch + device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh) + return err - } + } else { - q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { - return err + if q.state.HotPlugVFIO == config.BridgePort { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) - } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2881,7 +2789,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.Type = string(QemuHypervisor) s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory - s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2903,7 +2810,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { func (q *qemu) Load(s hv.HypervisorState) { q.state.UUID = s.UUID q.state.HotpluggedMemory = s.HotpluggedMemory - q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid for _, bridge := range s.Bridges { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index b74f260ec8..0dc9f46cde 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -13,6 +13,7 @@ import ( "errors" "fmt" "os" + "regexp" "runtime" "strings" @@ -24,6 +25,11 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) +// A deeper PCIe topology than 5 is already not advisable just for the sake +// of having enough buffer we limit ourselves to 10 and exit if we reach +// the root bus +const maxPCIeTopoDepth = 10 + type qemuArch interface { // enableNestingChecks nesting checks will be honoured enableNestingChecks() @@ -158,6 +164,12 @@ type qemuArch interface { // scans the PCIe space and returns the biggest BAR sizes for 32-bit // and 64-bit addressable memory getBARsMaxAddressableMemory() (uint64, uint64) + + // Query QMP to find a device's PCI path given its QOM path or ID + qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) + + // Query QMP to find the PCI slot of a device, given its QOM path or ID + qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) } type qemuArchBase struct { @@ -881,3 +893,85 @@ func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmwa hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture") return devices, firmware, nil } + +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) { + addr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr") + if err != nil { + return types.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return types.PciSlotFromInt(slotNum) +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { + + var slots []types.PciSlot + + devSlot, err := q.qomGetSlot(qemuID, qmpCh) + if err != nil { + return types.PciPath{}, err + } + slots = append(slots, devSlot) + + // This only works for Q35 and Virt + r, _ := regexp.Compile(`^/machine/.*/pcie.0`) + + var parentPath = qemuID + // We do not want to use a forever loop here, a deeper PCIe topology + // than 5 is already not advisable just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= maxPCIeTopoDepth; i++ { + parenBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done this is the root bus + // we climbed the complete hierarchy + if r.Match([]byte(busQOM)) { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus, qmpCh) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) +} diff --git a/src/runtime/virtcontainers/qemu_s390x.go b/src/runtime/virtcontainers/qemu_s390x.go index b0c1ede543..bc5c45bff8 100644 --- a/src/runtime/virtcontainers/qemu_s390x.go +++ b/src/runtime/virtcontainers/qemu_s390x.go @@ -351,3 +351,32 @@ func (q *qemuS390x) appendProtectionDevice(devices []govmmQemu.Device, firmware, return devices, firmware, fmt.Errorf("Unsupported guest protection technology: %v", q.protection) } } + +func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { + if vfioDev.SysfsDev == "" { + return devices + } + + if len(vfioDev.APDevices) > 0 { + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + Transport: govmmQemu.TransportAP, + }, + ) + return devices + + } + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + }, + ) + return devices +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { + hvLogger.Warnf("qomGetPciPath not implemented for s390x") + return types.PciPath{}, nil +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9949656a6d..52ed76567d 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -619,6 +619,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // Aggregate all the containner devices for hot-plug and use them to dedcue // the correct amount of ports to reserve for the hypervisor. hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) + stripVFIO := sandboxConfig.VfioMode == config.VFIOModeGuestKernel var vfioDevices []config.DeviceInfo // vhost-user-block device is a PCIe device in Virt, keep track of it @@ -644,7 +645,9 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" + if stripVFIO { + sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" + } } } var filteredDevices []config.DeviceInfo @@ -656,6 +659,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices } + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices From dd422ccb696b5b94b9a7b4e3c111228b0d7c23cc Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Jul 2023 07:25:40 +0000 Subject: [PATCH 2/5] vfio: Remove obsolete HotplugVFIOonRootBus Removing HotplugVFIOonRootBus which is obsolete with the latest PCI topology changes, users can set cold_plug_vfio or hot_plug_vfio either in the configuration.toml or via annotations. Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 54 +++++++++---------- src/runtime/cmd/kata-runtime/kata-env_test.go | 8 +-- .../pkg/containerd-shim-v2/create_test.go | 34 ++++++------ .../pkg/hypervisors/hypervisor_state.go | 11 ++-- src/runtime/pkg/katatestutils/utils.go | 3 +- .../pkg/katautils/config-settings.go.in | 1 - src/runtime/pkg/katautils/config.go | 22 ++++---- src/runtime/pkg/katautils/config_test.go | 11 ---- src/runtime/pkg/oci/utils.go | 6 --- src/runtime/pkg/oci/utils_test.go | 2 - .../documentation/api/1.0/api.md | 4 -- src/runtime/virtcontainers/hypervisor.go | 4 -- src/runtime/virtcontainers/persist.go | 2 - .../virtcontainers/persist/api/config.go | 4 -- .../pkg/annotations/annotations.go | 4 -- 15 files changed, 61 insertions(+), 109 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index f74cb9f89c..9b4fc80640 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -103,20 +103,19 @@ type RuntimeVersionInfo struct { // HypervisorInfo stores hypervisor details type HypervisorInfo struct { - MachineType string - Version string - Path string - BlockDeviceDriver string - EntropySource string - SharedFS string - VirtioFSDaemon string - SocketPath string - Msize9p uint32 - MemorySlots uint32 - HotPlugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort - HotplugVFIOOnRootBus bool - Debug bool + MachineType string + Version string + Path string + BlockDeviceDriver string + EntropySource string + SharedFS string + VirtioFSDaemon string + SocketPath string + Msize9p uint32 + MemorySlots uint32 + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort + Debug bool } // AgentInfo stores agent details @@ -307,20 +306,19 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { } return HypervisorInfo{ - Debug: config.HypervisorConfig.Debug, - MachineType: config.HypervisorConfig.HypervisorMachineType, - Version: version, - Path: hypervisorPath, - BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, - Msize9p: config.HypervisorConfig.Msize9p, - MemorySlots: config.HypervisorConfig.MemSlots, - EntropySource: config.HypervisorConfig.EntropySource, - SharedFS: config.HypervisorConfig.SharedFS, - VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, - HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, - ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, - HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - SocketPath: socketPath, + Debug: config.HypervisorConfig.Debug, + MachineType: config.HypervisorConfig.HypervisorMachineType, + Version: version, + Path: hypervisorPath, + BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, + Msize9p: config.HypervisorConfig.Msize9p, + MemorySlots: config.HypervisorConfig.MemSlots, + EntropySource: config.HypervisorConfig.EntropySource, + SharedFS: config.HypervisorConfig.SharedFS, + VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + SocketPath: socketPath, }, nil } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index c8d4d0ea9f..5bf2c5a880 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -87,7 +87,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti disableBlock := true blockStorageDriver := "virtio-scsi" enableIOThreads := true - hotplugVFIOOnRootBus := true hotPlugVFIO = config.BridgePort coldPlugVFIO = config.NoPort disableNewNetNs := false @@ -132,7 +131,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti DisableBlock: disableBlock, BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, @@ -276,10 +274,8 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, - - HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, - ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } if os.Geteuid() == 0 { diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index c24e3ced3d..2ba6ce9e56 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -330,7 +330,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, disableBlockDevice := true blockDeviceDriver := "virtio-scsi" enableIOThreads := true - hotplugVFIOOnRootBus := true disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -338,23 +337,22 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, coldPlugVFIO = config.RootPort configFileOptions := ktu.RuntimeConfigOptions{ - Hypervisor: "qemu", - HypervisorPath: hypervisorPath, - KernelPath: kernelPath, - ImagePath: imagePath, - RootfsType: rootfsType, - KernelParams: kernelParams, - MachineType: machineType, - LogPath: logPath, - DisableBlock: disableBlockDevice, - BlockDeviceDriver: blockDeviceDriver, - EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - DisableNewNetNs: disableNewNetNs, - SharedFS: sharedFS, - VirtioFSDaemon: virtioFSdaemon, - HotPlugVFIO: hotPlugVFIO, - ColdPlugVFIO: coldPlugVFIO, + Hypervisor: "qemu", + HypervisorPath: hypervisorPath, + KernelPath: kernelPath, + ImagePath: imagePath, + RootfsType: rootfsType, + KernelParams: kernelParams, + MachineType: machineType, + LogPath: logPath, + DisableBlock: disableBlockDevice, + BlockDeviceDriver: blockDeviceDriver, + EnableIOThreads: enableIOThreads, + DisableNewNetNs: disableNewNetNs, + SharedFS: sharedFS, + VirtioFSDaemon: virtioFSdaemon, + HotPlugVFIO: hotPlugVFIO, + ColdPlugVFIO: coldPlugVFIO, } runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions) diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index f0ba941deb..7384cca5e4 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -42,10 +42,9 @@ type HypervisorState struct { // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - Pid int - HotPlugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort - HotplugVFIOOnRootBus bool + HotpluggedMemory int + VirtiofsDaemonPid int + Pid int + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index ec1d85c3ac..041a2ec5ed 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -233,7 +233,6 @@ type RuntimeConfigOptions struct { DefaultMsize9p uint32 DisableBlock bool EnableIOThreads bool - HotplugVFIOOnRootBus bool DisableNewNetNs bool HypervisorDebug bool RuntimeDebug bool @@ -317,8 +316,8 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { default_memory = ` + strconv.FormatUint(uint64(config.DefaultMemSize), 10) + ` disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + ` enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` - hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" + hot_plug_vfio = "` + config.HotPlugVFIO.String() + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 1a4b7da04f..77caa53fb8 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -81,7 +81,6 @@ const defaultFileBackedMemRootDir string = "" const defaultEnableDebug bool = false const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 -const defaultHotplugVFIOOnRootBus bool = false const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "never" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index c6c08bfe08..cb59b4f167 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -146,7 +146,6 @@ type hypervisor struct { DisableNestingChecks bool `toml:"disable_nesting_checks"` EnableIOThreads bool `toml:"enable_iothreads"` DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` @@ -867,7 +866,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), DisableVhostNet: h.DisableVhostNet, @@ -1063,7 +1061,6 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), HotPlugVFIO: h.hotPlugVFIO(), DisableVhostNet: true, @@ -1294,7 +1291,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { BlockDeviceCacheNoflush: defaultBlockDeviceCacheNoflush, EnableIOThreads: defaultEnableIOThreads, Msize9p: defaultMsize9p, - HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, HotPlugVFIO: defaultHotPlugVFIO, GuestHookPath: defaultGuestHookPath, @@ -1682,18 +1678,19 @@ func checkConfig(config oci.RuntimeConfig) error { // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error { - // Currently only QEMU q35 supports advanced PCIe topologies + // Currently only QEMU q35,virt support advanced PCIe topologies // firecracker, dragonball do not have right now any PCIe support - if machineType != "q35" { - return nil - } - if coldPlug != config.NoPort && hotPlug != config.NoPort { return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug) } if coldPlug == config.NoPort && hotPlug == config.NoPort { return nil } + + if machineType != "q35" && machineType != "virt" { + return nil + } + var port config.PCIePort if coldPlug != config.NoPort { port = coldPlug @@ -1701,10 +1698,13 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if hotPlug != config.NoPort { port = hotPlug } - if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { + if port == config.NoPort { + return fmt.Errorf("invalid vfio_port=%s setting, use on of %s, %s, %s", + port, config.BridgePort, config.RootPort, config.SwitchPort) + } + if port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { return nil } - return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s", coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort) } diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 059c1d69e8..90f9f4d568 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -85,7 +85,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime blockDeviceDriver := "virtio-scsi" blockDeviceAIO := "io_uring" enableIOThreads := true - hotplugVFIOOnRootBus := true hotPlugVFIO = config.NoPort coldPlugVFIO = config.BridgePort disableNewNetNs := false @@ -108,7 +107,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime BlockDeviceDriver: blockDeviceDriver, BlockDeviceAIO: blockDeviceAIO, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, @@ -172,7 +170,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime BlockDeviceAIO: defaultBlockDeviceAIO, DefaultBridges: defaultBridgesCount, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, Msize9p: defaultMsize9p, @@ -611,7 +608,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { machineType := "machineType" disableBlock := true enableIOThreads := true - hotplugVFIOOnRootBus := true coldPlugVFIO = config.BridgePort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" @@ -630,7 +626,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { MachineType: machineType, DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, ColdPlugVFIO: coldPlugVFIO, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, @@ -682,10 +677,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { t.Errorf("Expected value for enable IOThreads %v, got %v", enableIOThreads, config.EnableIOThreads) } - if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus { - t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) - } - if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate { t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate) } @@ -807,7 +798,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { machineType := "machineType" disableBlock := true enableIOThreads := true - hotplugVFIOOnRootBus := true hypervisor := hypervisor{ Path: hypervisorPath, @@ -817,7 +807,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { MachineType: machineType, DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, } _, err := newQemuHypervisorConfig(hypervisor) diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index d782211dd1..1ec4fc5bf9 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -500,12 +500,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } - if err := newAnnotationConfiguration(ocispec, vcAnnotations.HotplugVFIOOnRootBus).setBool(func(hotplugVFIOOnRootBus bool) { - config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus - }); err != nil { - return err - } - if err := newAnnotationConfiguration(ocispec, vcAnnotations.UseLegacySerial).setBool(func(useLegacySerial bool) { config.HypervisorConfig.LegacySerial = useLegacySerial }); err != nil { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index d54e7092e5..4eeaedd115 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -659,7 +659,6 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.DisableVhostNet] = "true" ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" - ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" @@ -700,7 +699,6 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true) assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/") assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) - assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true) assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort)) assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort)) assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index ca5cb4a1ad..75acda0667 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -284,10 +284,6 @@ type HypervisorConfig struct { // DisableImageNvdimm is used to disable guest rootfs image nvdimm devices DisableImageNvdimm bool - // HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus bool - // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root port, switch, bridge or no port HotPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 43a631c1df..14d8b19373 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -599,10 +599,6 @@ type HypervisorConfig struct { // DisableImageNvdimm is used to disable guest rootfs image nvdimm devices DisableImageNvdimm bool - // HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus bool - // GuestMemoryDumpPaging is used to indicate if enable paging // for QEMU dump-guest-memory command GuestMemoryDumpPaging bool diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 91ab51ebfb..aa1875a3b2 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -244,7 +244,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList, DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, - HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, @@ -485,7 +484,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { FileBackedMemRootList: hconf.FileBackedMemRootList, DisableNestingChecks: hconf.DisableNestingChecks, DisableImageNvdimm: hconf.DisableImageNvdimm, - HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, BootToBeTemplate: hconf.BootToBeTemplate, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 6ca5ee6906..5b43ab24ab 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -191,10 +191,6 @@ type HypervisorConfig struct { // DisableImageNvdimm disables nvdimm for guest rootfs image DisableImageNvdimm bool - // HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus bool - // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root, switch, bridge or no-port HotPlugVFIO config.PCIePort diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index bad1f66ee2..eb8656cc28 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -122,10 +122,6 @@ const ( // DisableImageNvdimm is a sandbox annotation to specify use of nvdimm device for guest rootfs image. DisableImageNvdimm = kataAnnotHypervisorPrefix + "disable_image_nvdimm" - // HotplugVFIOOnRootBus is a sandbox annotation used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" - // ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged. ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio" From 62aa6750eca8985503f43dd1377a3483253a2d78 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Jul 2023 08:46:09 +0000 Subject: [PATCH 3/5] vfio: Added better handling of VFIO Control Devices Depending on the vfio_mode we need to mount the VFIO control device additionally into the container. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 1 + src/runtime/pkg/device/manager/manager.go | 6 ++-- src/runtime/pkg/device/manager/utils.go | 9 +++++- src/runtime/pkg/device/manager/utils_test.go | 2 +- src/runtime/virtcontainers/container.go | 8 ++++++ src/runtime/virtcontainers/sandbox.go | 29 +++++++++++++------- 6 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 9ce16040b6..4fa0fdd0e2 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -216,6 +216,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, APDevices: devices, + Port: device.Port, } default: return nil, fmt.Errorf("Failed to append device: VFIO device type unrecognized") diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 735061d9ee..cb5e86a045 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -120,7 +120,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device if devInfo.ID, err = dm.newDeviceID(); err != nil { return nil, err } - if IsVFIO(devInfo.HostPath) { + if IsVFIODevice(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil } else if IsVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { @@ -191,12 +191,12 @@ func (dm *deviceManager) AttachDevice(ctx context.Context, id string, dr api.Dev dm.Lock() defer dm.Unlock() - d, ok := dm.devices[id] + dev, ok := dm.devices[id] if !ok { return ErrDeviceNotExist } - if err := d.Attach(ctx, dr); err != nil { + if err := dev.Attach(ctx, dr); err != nil { return err } return nil diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index a9e4ee8c6a..6658b19a41 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -17,8 +17,15 @@ const ( vfioPath = "/dev/vfio/" ) +// IsVFIOControlDevice checks if the device provided is a vfio control device. +// Depending no the vfio_mode we need to know if a device is a VFIO device +// or the VFIO control device +func IsVFIOControlDevice(path string) bool { + return path == filepath.Join(vfioPath, "vfio") +} + // IsVFIO checks if the device provided is a vfio group. -func IsVFIO(hostPath string) bool { +func IsVFIODevice(hostPath string) bool { // Ignore /dev/vfio/vfio character device if strings.HasPrefix(hostPath, filepath.Join(vfioPath, "vfio")) { return false diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go index 6752719ddb..9fbc829d7e 100644 --- a/src/runtime/pkg/device/manager/utils_test.go +++ b/src/runtime/pkg/device/manager/utils_test.go @@ -31,7 +31,7 @@ func TestIsVFIO(t *testing.T) { } for _, d := range data { - isVFIO := IsVFIO(d.path) + isVFIO := IsVFIODevice(d.path) assert.Equal(t, d.expected, isVFIO) } } diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index 1e2316236d..4927bdab3f 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -18,6 +18,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" + deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" @@ -872,9 +873,16 @@ func (c *Container) create(ctx context.Context) (err error) { // If cold-plug we've attached the devices already, do not try to // attach them a second time. coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) + modeVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO) + if coldPlugVFIO { var cntDevices []ContainerDevice for _, dev := range c.devices { + isVFIOControlDevice := deviceManager.IsVFIOControlDevice(dev.ContainerPath) + if isVFIOControlDevice && modeVFIO { + cntDevices = append(cntDevices, dev) + } + if strings.HasPrefix(dev.ContainerPath, vfioPath) { c.Logger().WithFields(logrus.Fields{ "device": dev, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 52ed76567d..9f18b5c2cd 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -619,7 +619,9 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // Aggregate all the containner devices for hot-plug and use them to dedcue // the correct amount of ports to reserve for the hypervisor. hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) - stripVFIO := sandboxConfig.VfioMode == config.VFIOModeGuestKernel + + modeGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) + modeVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) var vfioDevices []config.DeviceInfo // vhost-user-block device is a PCIe device in Virt, keep track of it @@ -633,19 +635,26 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor vhostUserBlkDevices = append(vhostUserBlkDevices, device) continue } - isVFIO := deviceManager.IsVFIO(device.ContainerPath) - if hotPlugVFIO && isVFIO { + isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath) + isVFIOControlDevice := deviceManager.IsVFIOControlDevice(device.ContainerPath) + // vfio_mode=vfio needs the VFIO control device add it to the list + // of devices to be added to the VM. + if modeVFIO && isVFIOControlDevice { + vfioDevices = append(vfioDevices, device) + } + + if hotPlugVFIO && isVFIODevice { vfioDevices = append(vfioDevices, device) sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO } - if coldPlugVFIO && isVFIO { + if coldPlugVFIO && isVFIODevice { device.ColdPlug = true device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO vfioDevices = append(vfioDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - if stripVFIO { + if modeGK { sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" } } @@ -2053,26 +2062,26 @@ func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.De } var err error - b, err := s.devManager.NewDevice(info) + add, err := s.devManager.NewDevice(info) if err != nil { return nil, err } defer func() { if err != nil { - s.devManager.RemoveDevice(b.DeviceID()) + s.devManager.RemoveDevice(add.DeviceID()) } }() - if err = s.devManager.AttachDevice(ctx, b.DeviceID(), s); err != nil { + if err = s.devManager.AttachDevice(ctx, add.DeviceID(), s); err != nil { return nil, err } defer func() { if err != nil { - s.devManager.DetachDevice(ctx, b.DeviceID(), s) + s.devManager.DetachDevice(ctx, add.DeviceID(), s) } }() - return b, nil + return add, nil } // updateResources will: From 545de5042ab27a76d1599eb092352f4dba333c9e Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Jul 2023 09:31:01 +0000 Subject: [PATCH 4/5] vfio: Fix tests Now with more elaborate checking of cold|hot plug ports we needed to update some of the tests. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/containerd-shim-v2/create_test.go | 2 +- src/runtime/virtcontainers/qemu_s390x.go | 2 +- src/runtime/virtcontainers/sandbox.go | 3 +++ src/runtime/virtcontainers/sandbox_test.go | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index 2ba6ce9e56..82ce7357f2 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -334,7 +334,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") hotPlugVFIO = config.BridgePort - coldPlugVFIO = config.RootPort + coldPlugVFIO = config.NoPort configFileOptions := ktu.RuntimeConfigOptions{ Hypervisor: "qemu", diff --git a/src/runtime/virtcontainers/qemu_s390x.go b/src/runtime/virtcontainers/qemu_s390x.go index bc5c45bff8..29eaafe5b3 100644 --- a/src/runtime/virtcontainers/qemu_s390x.go +++ b/src/runtime/virtcontainers/qemu_s390x.go @@ -376,7 +376,7 @@ func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config. } // Query QMP to find a device's PCI path given its QOM path or ID -func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { +func (q *qemuS390x) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { hvLogger.Warnf("qomGetPciPath not implemented for s390x") return types.PciPath{}, nil } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9f18b5c2cd..3507807323 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -644,6 +644,8 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } if hotPlugVFIO && isVFIODevice { + device.ColdPlug = false + device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO vfioDevices = append(vfioDevices, device) sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO } @@ -686,6 +688,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } for _, dev := range vfioDevices { + s.Logger().Info("cold-plug device: ", dev) _, err := s.AddDevice(ctx, dev) if err != nil { s.Logger().WithError(err).Debug("Cannot cold-plug add device") diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index 90a2af7ee3..b735c55115 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -606,6 +606,7 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) { HostPath: path, ContainerPath: path, DevType: "c", + Port: config.RootPort, } dev, err := dm.NewDevice(deviceInfo) assert.Nil(t, err, "deviceManager.NewDevice return error: %v", err) From 1fc715bc6526f53057e3bc5f0bc118907e4f089a Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 21 Jul 2023 07:44:36 +0000 Subject: [PATCH 5/5] s390x: Add AP Attach/Detach test Now that we have propper AP device support add a unit test for testing the correct Attach/Detach of AP devices. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 2 +- src/runtime/pkg/device/drivers/vfio.go | 1 + .../pkg/device/manager/manager_test.go | 94 +++++++++++++++++++ src/runtime/pkg/katautils/config.go | 16 +++- src/runtime/virtcontainers/qemu.go | 1 + src/runtime/virtcontainers/sandbox.go | 62 ++++++------ 6 files changed, 143 insertions(+), 33 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 4fa0fdd0e2..5f76bff48e 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -111,7 +111,7 @@ func GetVFIODeviceType(deviceFilePath string) (config.VFIODeviceType, error) { return config.VFIODeviceErrorType, err } - if strings.HasPrefix(deviceSysfsDev, vfioAPSysfsDir) { + if strings.Contains(deviceSysfsDev, vfioAPSysfsDir) { return config.VFIOAPDeviceMediatedType, nil } diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 7edd5b3c6e..feec9c4482 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -69,6 +69,7 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece if err != nil { return err } + for _, vfio := range device.VfioDevs { // If vfio.Port is not set we bail out, users should set // explicitly the port in the config file diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index bcd321477f..c08cb66ab4 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -90,6 +90,100 @@ func TestNewDevice(t *testing.T) { assert.Equal(t, vfioDev.DeviceInfo.GID, uint32(2)) } +func TestAttachVFIOAPDevice(t *testing.T) { + + var err error + var ok bool + + dm := &deviceManager{ + devices: make(map[string]api.Device), + } + + tmpDir := t.TempDir() + // sys/devices/vfio_ap/matrix/f94290f8-78ac-45fb-bb22-e55e519fa64f + testSysfsAP := "/sys/devices/vfio_ap/" + testDeviceAP := "f94290f8-78ac-45fb-bb22-e55e519fa64f" + testVFIOGroup := "42" + + matrixDir := filepath.Join(tmpDir, testSysfsAP, "matrix") + err = os.MkdirAll(matrixDir, dirMode) + assert.Nil(t, err) + + deviceAPFile := filepath.Join(matrixDir, testDeviceAP) + err = os.MkdirAll(deviceAPFile, dirMode) + assert.Nil(t, err) + + matrixDeviceAPFile := filepath.Join(deviceAPFile, "matrix") + _, err = os.Create(matrixDeviceAPFile) + assert.Nil(t, err) + // create AP devices in the matrix file + APDevices := []byte("05.001f\n") + err = os.WriteFile(matrixDeviceAPFile, APDevices, 0644) + assert.Nil(t, err) + + devicesVFIOGroupDir := filepath.Join(tmpDir, testVFIOGroup, "devices") + err = os.MkdirAll(devicesVFIOGroupDir, dirMode) + assert.Nil(t, err) + + deviceAPSymlink := filepath.Join(devicesVFIOGroupDir, testDeviceAP) + err = os.Symlink(deviceAPFile, deviceAPSymlink) + assert.Nil(t, err) + + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir + + savedSysBusPciDevicesPath := config.SysBusPciDevicesPath + config.SysBusPciDevicesPath = devicesVFIOGroupDir + + defer func() { + config.SysIOMMUGroupPath = savedIOMMUPath + config.SysBusPciDevicesPath = savedSysBusPciDevicesPath + }() + + path := filepath.Join(vfioPath, testVFIOGroup) + deviceInfo := config.DeviceInfo{ + HostPath: path, + ContainerPath: path, + DevType: "c", + ColdPlug: false, + Port: config.RootPort, + } + + device, err := dm.NewDevice(deviceInfo) + assert.Nil(t, err) + _, ok = device.(*drivers.VFIODevice) + assert.True(t, ok) + + devReceiver := &api.MockDeviceReceiver{} + err = device.Attach(context.Background(), devReceiver) + assert.Nil(t, err) + + err = device.Detach(context.Background(), devReceiver) + assert.Nil(t, err) + + // If we omit the port setting we should fail + failDm := &deviceManager{ + devices: make(map[string]api.Device), + } + + failDeviceInfo := config.DeviceInfo{ + HostPath: path, + ContainerPath: path, + DevType: "c", + ColdPlug: false, + } + + failDevice, err := failDm.NewDevice(failDeviceInfo) + assert.Nil(t, err) + _, ok = failDevice.(*drivers.VFIODevice) + assert.True(t, ok) + + failDevReceiver := &api.MockDeviceReceiver{} + err = failDevice.Attach(context.Background(), failDevReceiver) + assert.Error(t, err) + +} + func TestAttachVFIODevice(t *testing.T) { dm := &deviceManager{ blockDriver: config.VirtioBlock, diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index cb59b4f167..f577df5263 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -22,6 +22,7 @@ import ( govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -1667,7 +1668,8 @@ func checkConfig(config oci.RuntimeConfig) error { hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType - if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil { + hypervisorType := config.HypervisorType + if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType, hypervisorType); err != nil { return err } @@ -1677,16 +1679,20 @@ func checkConfig(config oci.RuntimeConfig) error { // checkPCIeConfig ensures the PCIe configuration is valid. // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port -func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error { - // Currently only QEMU q35,virt support advanced PCIe topologies - // firecracker, dragonball do not have right now any PCIe support +func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string, hypervisorType virtcontainers.HypervisorType) error { + if hypervisorType != virtcontainers.QemuHypervisor { + kataUtilsLogger.Warn("Advanced PCIe Topology only available for QEMU hypervisor, ignoring hot(cold)_vfio_port setting") + return nil + } + if coldPlug != config.NoPort && hotPlug != config.NoPort { return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug) } if coldPlug == config.NoPort && hotPlug == config.NoPort { return nil } - + // Currently only QEMU q35,virt support advanced PCIe topologies + // firecracker, dragonball do not have right now any PCIe support if machineType != "q35" && machineType != "virt" { return nil } diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 1a74944baa..57fa036f61 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -775,6 +775,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if err != nil { return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) } + devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev) if err != nil { return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 3507807323..a4d7e07302 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -613,6 +613,36 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } + coldPlugVFIO, err := s.coldOrHotPlugVFIO(&sandboxConfig) + if err != nil { + return nil, err + } + + // store doesn't require hypervisor to be stored immediately + if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { + return nil, err + } + + if s.disableVMShutdown, err = s.agent.init(ctx, s, sandboxConfig.AgentConfig); err != nil { + return nil, err + } + + if !coldPlugVFIO { + return s, nil + } + + for _, dev := range sandboxConfig.HypervisorConfig.VFIODevices { + s.Logger().Info("cold-plug device: ", dev) + _, err := s.AddDevice(ctx, dev) + if err != nil { + s.Logger().WithError(err).Debug("Cannot cold-plug add device") + return nil, err + } + } + return s, nil +} + +func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) { // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) @@ -620,8 +650,8 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // the correct amount of ports to reserve for the hypervisor. hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) - modeGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) - modeVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) + modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) + modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) var vfioDevices []config.DeviceInfo // vhost-user-block device is a PCIe device in Virt, keep track of it @@ -639,7 +669,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor isVFIOControlDevice := deviceManager.IsVFIOControlDevice(device.ContainerPath) // vfio_mode=vfio needs the VFIO control device add it to the list // of devices to be added to the VM. - if modeVFIO && isVFIOControlDevice { + if modeIsVFIO && isVFIOControlDevice && !hotPlugVFIO { vfioDevices = append(vfioDevices, device) } @@ -656,7 +686,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - if modeGK { + if modeIsGK { sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" } } @@ -668,34 +698,12 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } } sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices - } sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices - // store doesn't require hypervisor to be stored immediately - if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { - return nil, err - } - - if s.disableVMShutdown, err = s.agent.init(ctx, s, sandboxConfig.AgentConfig); err != nil { - return nil, err - } - - if !coldPlugVFIO { - return s, nil - } - - for _, dev := range vfioDevices { - s.Logger().Info("cold-plug device: ", dev) - _, err := s.AddDevice(ctx, dev) - if err != nil { - s.Logger().WithError(err).Debug("Cannot cold-plug add device") - return nil, err - } - } - return s, nil + return coldPlugVFIO, nil } func (s *Sandbox) createResourceController() error {