From ad45c52fbed3681d56ee676915e94ef0ef5743b4 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 14:09:03 +1000 Subject: [PATCH] runtime/device: Record guest PCI path for VFIO devices For several device types which correspond to a PCI device in the guest we record the device's PCI path in the guest. We don't currently do that for VFIO devices, but we're going to need to for better handling of SR-IOV devices. To accomplish this, we have to determine the guest PCI path from the information the VMM gives us: For qemu, we query the slot of the device and its bridge from QMP. For cloud-hypervisor, the device add interface gives us a guest PCI address. In fact this represents a design error in the clh API - there's no way it can really know the guest PCI address in general. It works in this case, because clh doesn't use PCI bridges, so the device will always be on the root bus. Based on that, the PCI path is simply the device's slot number. Signed-off-by: David Gibson --- src/runtime/virtcontainers/clh.go | 26 ++++++- .../virtcontainers/device/config/config.go | 3 + src/runtime/virtcontainers/qemu.go | 73 +++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index b117fb7e11..170f76f4ec 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -503,7 +503,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro return err } -func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { +func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { cl := clh.client() ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second) defer cancel() @@ -512,10 +512,28 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { clhDevice := *chclient.NewVmAddDevice() clhDevice.Path = &device.SysfsDev clhDevice.Id = &device.ID - _, _, err := cl.VmAddDevicePut(ctx, clhDevice) + pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { - err = fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) + return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } + + // clh doesn't use bridges, so the PCI path is simply the slot + // number of the device. This will break if clh starts using + // bridges (including PCI-E root ports), but so will the clh + // API, since there's no way it can reliably predict a guest + // Bdf when bridges are present. + tokens := strings.Split(pciInfo.Bdf, ":") + if len(tokens) != 3 || tokens[0] != "0000" || tokens[1] != "00" { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + tokens = strings.Split(tokens[2], ".") + if len(tokens) != 2 || tokens[1] != "0" || len(tokens[0]) != 2 { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + device.GuestPciPath, err = vcTypes.PciPathFromString(tokens[0]) + return err } @@ -529,7 +547,7 @@ func (clh *cloudHypervisor) hotplugAddDevice(ctx context.Context, devInfo interf return nil, clh.hotplugAddBlockDevice(drive) case vfioDev: device := devInfo.(*config.VFIODev) - return nil, clh.hotPlugVFIODevice(*device) + return nil, clh.hotPlugVFIODevice(device) default: return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType) } diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index c2a31a403c..8627094a91 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -224,6 +224,9 @@ type VFIODev struct { // Bus of VFIO PCIe device Bus string + // Guest PCI path of device + GuestPciPath vcTypes.PciPath + // Type of VFIO device Type VFIODeviceType diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index fd48ad639e..cdbc3e3d83 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -1438,6 +1438,69 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemu) qomGetSlot(qomPath string) (vcTypes.PciSlot, error) { + addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr") + if err != nil { + return vcTypes.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return vcTypes.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return vcTypes.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return vcTypes.PciSlotFromInt(slotNum) +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemu) qomGetPciPath(qemuID string) (vcTypes.PciPath, error) { + // XXX: For now we assume there's exactly one bridge, since + // that's always how we configure qemu from Kata for now. It + // would be good to generalize this to different PCI + // topologies + devSlot, err := q.qomGetSlot(qemuID) + if err != nil { + return vcTypes.PciPath{}, err + } + + busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + if err != nil { + return vcTypes.PciPath{}, err + } + + bus, ok := busq.(string) + if !ok { + return vcTypes.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI bridge which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(bus, "/") + if idx == -1 { + return vcTypes.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) + } + bridge := bus[:idx] + + bridgeSlot, err := q.qomGetSlot(bridge) + if err != nil { + return vcTypes.PciPath{}, err + } + + return vcTypes.PciPathFromSlots(bridgeSlot, devSlot) +} + func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op operation) (err error) { if err = q.qmpSetup(); err != nil { return err @@ -1509,6 +1572,16 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op return fmt.Errorf("Incorrect VFIO device type found") } } + if err != nil { + return err + } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(devID) return err } else { q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")