From cf36fd87ad788fa54bcfb21a757ede8dccfb310a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 18 Aug 2021 14:41:18 +1000 Subject: [PATCH 01/10] runtime: Fix some leftover go fmt errors A few "go fmt" errors appear to have crept it. Clean them up with "go fmt ./..." in the src/runtime directory. Signed-off-by: David Gibson --- .../virtcontainers/hypervisor_s390x.go | 2 +- src/runtime/virtcontainers/qemu_arm64.go | 38 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/runtime/virtcontainers/hypervisor_s390x.go b/src/runtime/virtcontainers/hypervisor_s390x.go index b80fce767..a8afca4ad 100644 --- a/src/runtime/virtcontainers/hypervisor_s390x.go +++ b/src/runtime/virtcontainers/hypervisor_s390x.go @@ -90,7 +90,7 @@ func availableGuestProtection() (guestProtection, error) { return noneProtection, err } if !seCmdlinePresent { - return noneProtection, fmt.Errorf("Protected Virtualization is not enabled on kernel command line! " + + return noneProtection, fmt.Errorf("Protected Virtualization is not enabled on kernel command line! "+ "Need %s=%s (or %s) to enable Secure Execution", seCmdlineParam, seCmdlineValues[0], strings.Join(seCmdlineValues[1:], ", ")) } diff --git a/src/runtime/virtcontainers/qemu_arm64.go b/src/runtime/virtcontainers/qemu_arm64.go index eced3b044..d14ec4131 100644 --- a/src/runtime/virtcontainers/qemu_arm64.go +++ b/src/runtime/virtcontainers/qemu_arm64.go @@ -8,8 +8,8 @@ package virtcontainers import ( "context" "fmt" - "time" "os" + "time" govmmQemu "github.com/kata-containers/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" @@ -100,29 +100,29 @@ func (q *qemuArm64) appendImage(ctx context.Context, devices []govmmQemu.Device, // so we temporarily add this specific implementation for arm64 here until // the qemu used by arm64 is capable for that feature func (q *qemuArm64) appendNvdimmImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) { - imageFile, err := os.Open(path) - if err != nil { - return nil, err - } - defer imageFile.Close() + imageFile, err := os.Open(path) + if err != nil { + return nil, err + } + defer imageFile.Close() - imageStat, err := imageFile.Stat() - if err != nil { - return nil, err - } + imageStat, err := imageFile.Stat() + if err != nil { + return nil, err + } object := govmmQemu.Object{ - Driver: govmmQemu.NVDIMM, - Type: govmmQemu.MemoryBackendFile, - DeviceID: "nv0", - ID: "mem0", - MemPath: path, - Size: (uint64)(imageStat.Size()), - } + Driver: govmmQemu.NVDIMM, + Type: govmmQemu.MemoryBackendFile, + DeviceID: "nv0", + ID: "mem0", + MemPath: path, + Size: (uint64)(imageStat.Size()), + } - devices = append(devices, object) + devices = append(devices, object) - return devices, nil + return devices, nil } func (q *qemuArm64) setIgnoreSharedMemoryMigrationCaps(_ context.Context, _ *govmmQemu.QMP) error { From 5b1eb08bdeb186e7b7afa117b65b95f255162c4c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 14 Jul 2021 12:32:10 +1000 Subject: [PATCH 02/10] agent/uevent: Improve logging of wait_for_uevent() These messages will help when debugging matchers not matching properly. Signed-off-by: David Gibson --- src/agent/src/uevent.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs index c5ecefcc1..1b1dbb72e 100644 --- a/src/agent/src/uevent.rs +++ b/src/agent/src/uevent.rs @@ -111,10 +111,13 @@ pub async fn wait_for_uevent( sandbox: &Arc>, matcher: impl UeventMatcher, ) -> Result { + let logprefix = format!("Waiting for {:?}", &matcher); + + info!(sl!(), "{}", logprefix); let mut sb = sandbox.lock().await; for uev in sb.uevent_map.values() { if matcher.is_match(uev) { - info!(sl!(), "Device {:?} found in device map", uev); + info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev); return Ok(uev.clone()); } } @@ -129,7 +132,8 @@ pub async fn wait_for_uevent( sb.uevent_watchers.push(Some((Box::new(matcher), tx))); drop(sb); // unlock - info!(sl!(), "Waiting on channel for uevent notification\n"); + info!(sl!(), "{}: waiting on channel", logprefix); + let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout; let uev = match tokio::time::timeout(hotplug_timeout, rx).await { @@ -146,6 +150,7 @@ pub async fn wait_for_uevent( } }; + info!(sl!(), "{}: found {:?} on channel", logprefix, &uev); Ok(uev) } From f7a2707505099af5f9a26fd5b3e157be02e2fdbf Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 9 Apr 2021 16:29:40 +1000 Subject: [PATCH 03/10] agent: Move driver type constants into device.rs Currently the constants giving the names for each device/driver type in the protocol are in mount.rs, and used in device.rs. Since these constants are inherently related to, well, devices, it makes more sense to put them in device.rs and use them from mount.rs. This will become even more so with planned extensions which will add some device types that will not be used in mount.rs at all. Signed-off-by: David Gibson --- src/agent/src/device.rs | 15 +++++++++++---- src/agent/src/mount.rs | 14 +++----------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 91ce30ea7..b179eaac6 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -17,10 +17,6 @@ use tokio::sync::Mutex; #[cfg(target_arch = "s390x")] use crate::ccw; use crate::linux_abi::*; -use crate::mount::{ - DRIVER_BLK_CCW_TYPE, DRIVER_BLK_TYPE, DRIVER_MMIO_BLK_TYPE, DRIVER_NVDIMM_TYPE, - DRIVER_SCSI_TYPE, -}; use crate::pci; use crate::sandbox::Sandbox; use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher}; @@ -38,6 +34,17 @@ macro_rules! sl { const VM_ROOTFS: &str = "/"; +pub const DRIVER_9P_TYPE: &str = "9p"; +pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; +pub const DRIVER_BLK_TYPE: &str = "blk"; +pub const DRIVER_BLK_CCW_TYPE: &str = "blk-ccw"; +pub const DRIVER_MMIO_BLK_TYPE: &str = "mmioblk"; +pub const DRIVER_SCSI_TYPE: &str = "scsi"; +pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; +pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; +pub const DRIVER_LOCAL_TYPE: &str = "local"; +pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; + #[derive(Debug)] struct DevIndexEntry { idx: usize, diff --git a/src/agent/src/mount.rs b/src/agent/src/mount.rs index 1c2b2dab2..3d55f874f 100644 --- a/src/agent/src/mount.rs +++ b/src/agent/src/mount.rs @@ -22,6 +22,9 @@ use regex::Regex; use crate::device::{ get_scsi_device_name, get_virtio_blk_pci_device_name, online_device, wait_for_pmem_device, + DRIVER_9P_TYPE, DRIVER_BLK_CCW_TYPE, DRIVER_BLK_TYPE, DRIVER_EPHEMERAL_TYPE, DRIVER_LOCAL_TYPE, + DRIVER_MMIO_BLK_TYPE, DRIVER_NVDIMM_TYPE, DRIVER_SCSI_TYPE, DRIVER_VIRTIOFS_TYPE, + DRIVER_WATCHABLE_BIND_TYPE, }; use crate::linux_abi::*; use crate::pci; @@ -33,17 +36,6 @@ use anyhow::{anyhow, Context, Result}; use slog::Logger; use tracing::instrument; -pub const DRIVER_9P_TYPE: &str = "9p"; -pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; -pub const DRIVER_BLK_TYPE: &str = "blk"; -pub const DRIVER_BLK_CCW_TYPE: &str = "blk-ccw"; -pub const DRIVER_MMIO_BLK_TYPE: &str = "mmioblk"; -pub const DRIVER_SCSI_TYPE: &str = "scsi"; -pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; -pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; -pub const DRIVER_LOCAL_TYPE: &str = "local"; -pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; - pub const TYPE_ROOTFS: &str = "rootfs"; pub const MOUNT_GUEST_TAG: &str = "kataShared"; From 8bc71105f48d806191dfbfb6bcf5bfe12ed1f5b2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 11:35:09 +1000 Subject: [PATCH 04/10] agent/device: Add device type for VFIO devices Currently, VFIO devices attached to a Kata container aren't described to the agent at all. We essentially just hope they're ready by the time we've entered the container proper, which is usually the case because of the PCI rescan - but that causes other problems. This adds a new device type to the agent representing VFIO devices. The agent will use its existing uevent watching mechanisms to wait for the associated guest PCI device to appear before proceeding. Signed-off-by: David Gibson --- src/agent/src/device.rs | 73 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index b179eaac6..4ee275f30 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -44,6 +44,8 @@ pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; pub const DRIVER_LOCAL_TYPE: &str = "local"; pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; +// VFIO device to be bound to a guest kernel driver +pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk"; #[derive(Debug)] struct DevIndexEntry { @@ -262,6 +264,35 @@ pub async fn wait_for_pmem_device(sandbox: &Arc>, devpath: &str) Ok(()) } +#[derive(Debug)] +struct PciMatcher { + devpath: String, +} + +impl PciMatcher { + fn new(relpath: &str) -> Result { + let root_bus = create_pci_root_bus_path(); + Ok(PciMatcher { + devpath: format!("{}{}", root_bus, relpath), + }) + } +} + +impl UeventMatcher for PciMatcher { + fn is_match(&self, uev: &Uevent) -> bool { + uev.devpath == self.devpath + } +} + +pub async fn wait_for_pci_device(sandbox: &Arc>, pcipath: &pci::Path) -> Result<()> { + let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path()); + let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; + let matcher = PciMatcher::new(&sysfs_rel_path)?; + + let _ = wait_for_uevent(sandbox, matcher).await?; + Ok(()) +} + /// Scan SCSI bus for the given SCSI address(SCSI-Id and LUN) #[instrument] fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { @@ -458,6 +489,37 @@ async fn virtio_nvdimm_device_handler( update_spec_device_list(device, spec, devidx) } +fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { + let mut tokens = opt.split('='); + let hostbdf = tokens.next()?; + let path = tokens.next()?; + if tokens.next().is_some() { + None + } else { + Some((hostbdf, path)) + } +} + +// device.options should have one entry for each PCI device in the VFIO group +// Each option should have the form "DDDD:BB:DD.F=" +// DDDD:BB:DD.F is the device's PCI address in the host +// is a PCI path to the device in the guest (see pci.rs) +async fn vfio_gk_device_handler( + device: &Device, + _: &mut Spec, + sandbox: &Arc>, + _: &DevIndex, +) -> Result<()> { + for opt in device.options.iter() { + let (_, pcipath) = + split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?; + let pcipath = pci::Path::from_str(pcipath)?; + + wait_for_pci_device(sandbox, &pcipath).await?; + } + Ok(()) +} + impl DevIndex { fn new(spec: &Spec) -> DevIndex { let mut map = HashMap::new(); @@ -527,6 +589,7 @@ async fn add_device( DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await, DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await, DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await, + DRIVER_VFIO_GK_TYPE => vfio_gk_device_handler(device, spec, sandbox, devidx).await, _ => Err(anyhow!("Unknown device type {}", device.field_type)), } } @@ -1075,4 +1138,14 @@ mod tests { assert!(!matcher_b.is_match(&uev_a)); assert!(!matcher_a.is_match(&uev_b)); } + + #[test] + fn test_split_vfio_option() { + assert_eq!( + split_vfio_option("0000:01:00.0=02/01"), + Some(("0000:01:00.0", "02/01")) + ); + assert_eq!(split_vfio_option("0000:01:00.0=02/01=rubbish"), None); + assert_eq!(split_vfio_option("0000:01:00.0"), None); + } } From 5c2af3e3082081dff175c175e4b009e74466b1ef Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 13:52:18 +1000 Subject: [PATCH 05/10] runtime/device: Refactor hotplugVFIODevice() to have common exit path hotplugVFIODevice() has several different paths depending if we're plugging into a root port or a PCIE<->PCI bridge and if we're using a regular or mediated VFIO device. We're going to want some common code on the successful exit path here, so refactor the function to allow that without duplication. Signed-off-by: David Gibson --- src/runtime/virtcontainers/qemu.go | 55 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 48252a0b5..fd48ad639 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -1474,39 +1474,42 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op switch device.Type { case config.VFIODeviceNormalType: - return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) + err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) case config.VFIODeviceMediatedType: if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + } else { + err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) + } + default: + return fmt.Errorf("Incorrect VFIO device type found") + } + } else { + addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) + if err != nil { + return err + } + + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(devID) + } + }() + + switch device.Type { + case config.VFIODeviceNormalType: + err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile) + case config.VFIODeviceMediatedType: + if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { + err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + } else { + err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile) } - return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) default: return fmt.Errorf("Incorrect VFIO device type found") } } - - addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) - if err != nil { - return err - } - - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(devID) - } - }() - - switch device.Type { - case config.VFIODeviceNormalType: - return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile) - case config.VFIODeviceMediatedType: - if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) - } - return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile) - default: - return fmt.Errorf("Incorrect VFIO device type found") - } + return err } else { q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") From ad45c52fbed3681d56ee676915e94ef0ef5743b4 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 14:09:03 +1000 Subject: [PATCH 06/10] runtime/device: Record guest PCI path for VFIO devices For several device types which correspond to a PCI device in the guest we record the device's PCI path in the guest. We don't currently do that for VFIO devices, but we're going to need to for better handling of SR-IOV devices. To accomplish this, we have to determine the guest PCI path from the information the VMM gives us: For qemu, we query the slot of the device and its bridge from QMP. For cloud-hypervisor, the device add interface gives us a guest PCI address. In fact this represents a design error in the clh API - there's no way it can really know the guest PCI address in general. It works in this case, because clh doesn't use PCI bridges, so the device will always be on the root bus. Based on that, the PCI path is simply the device's slot number. Signed-off-by: David Gibson --- src/runtime/virtcontainers/clh.go | 26 ++++++- .../virtcontainers/device/config/config.go | 3 + src/runtime/virtcontainers/qemu.go | 73 +++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index b117fb7e1..170f76f4e 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -503,7 +503,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro return err } -func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { +func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { cl := clh.client() ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second) defer cancel() @@ -512,10 +512,28 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { clhDevice := *chclient.NewVmAddDevice() clhDevice.Path = &device.SysfsDev clhDevice.Id = &device.ID - _, _, err := cl.VmAddDevicePut(ctx, clhDevice) + pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { - err = fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) + return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } + + // clh doesn't use bridges, so the PCI path is simply the slot + // number of the device. This will break if clh starts using + // bridges (including PCI-E root ports), but so will the clh + // API, since there's no way it can reliably predict a guest + // Bdf when bridges are present. + tokens := strings.Split(pciInfo.Bdf, ":") + if len(tokens) != 3 || tokens[0] != "0000" || tokens[1] != "00" { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + tokens = strings.Split(tokens[2], ".") + if len(tokens) != 2 || tokens[1] != "0" || len(tokens[0]) != 2 { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + device.GuestPciPath, err = vcTypes.PciPathFromString(tokens[0]) + return err } @@ -529,7 +547,7 @@ func (clh *cloudHypervisor) hotplugAddDevice(ctx context.Context, devInfo interf return nil, clh.hotplugAddBlockDevice(drive) case vfioDev: device := devInfo.(*config.VFIODev) - return nil, clh.hotPlugVFIODevice(*device) + return nil, clh.hotPlugVFIODevice(device) default: return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType) } diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index c2a31a403..8627094a9 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -224,6 +224,9 @@ type VFIODev struct { // Bus of VFIO PCIe device Bus string + // Guest PCI path of device + GuestPciPath vcTypes.PciPath + // Type of VFIO device Type VFIODeviceType diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index fd48ad639..cdbc3e3d8 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -1438,6 +1438,69 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemu) qomGetSlot(qomPath string) (vcTypes.PciSlot, error) { + addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr") + if err != nil { + return vcTypes.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return vcTypes.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return vcTypes.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return vcTypes.PciSlotFromInt(slotNum) +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemu) qomGetPciPath(qemuID string) (vcTypes.PciPath, error) { + // XXX: For now we assume there's exactly one bridge, since + // that's always how we configure qemu from Kata for now. It + // would be good to generalize this to different PCI + // topologies + devSlot, err := q.qomGetSlot(qemuID) + if err != nil { + return vcTypes.PciPath{}, err + } + + busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + if err != nil { + return vcTypes.PciPath{}, err + } + + bus, ok := busq.(string) + if !ok { + return vcTypes.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI bridge which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(bus, "/") + if idx == -1 { + return vcTypes.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) + } + bridge := bus[:idx] + + bridgeSlot, err := q.qomGetSlot(bridge) + if err != nil { + return vcTypes.PciPath{}, err + } + + return vcTypes.PciPathFromSlots(bridgeSlot, devSlot) +} + func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op operation) (err error) { if err = q.qmpSetup(); err != nil { return err @@ -1509,6 +1572,16 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op return fmt.Errorf("Incorrect VFIO device type found") } } + if err != nil { + return err + } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(devID) return err } else { q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") From ebd7b6188404450f1cdd0a8599708ff1c18ea027 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 14:49:42 +1000 Subject: [PATCH 07/10] runtime: Don't repeat GetDeviceByID between appendDevices() and append*() Both appendBlockDevice and appendVhostUserBlkDevice start by using GetDeviceByID to lookup the api.Device object corresponding to their ContainerDevice object. However their common caller, appendDevices() has already done this. This changes it so the looked up api.Device is passed to the individual append*Device() functions. This slightly reduces duplicated work, but more importantly it makes it clearer that append*Device() don't need to check for a nil result from GetDeviceByID, since the caller has already done that. Signed-off-by: David Gibson --- src/runtime/virtcontainers/kata_agent.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 698353a9e..81c44a9fd 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -1144,9 +1144,7 @@ func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) { } } -func (k *kataAgent) appendBlockDevice(dev ContainerDevice, c *Container) *grpc.Device { - device := c.sandbox.devManager.GetDeviceByID(dev.ID) - +func (k *kataAgent) appendBlockDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { d, ok := device.GetDeviceInfo().(*config.BlockDrive) if !ok || d == nil { k.Logger().WithField("device", device).Error("malformed block drive") @@ -1187,9 +1185,7 @@ func (k *kataAgent) appendBlockDevice(dev ContainerDevice, c *Container) *grpc.D return kataDevice } -func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, c *Container) *grpc.Device { - device := c.sandbox.devManager.GetDeviceByID(dev.ID) - +func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { d, ok := device.GetDeviceInfo().(*config.VhostUserDeviceAttrs) if !ok || d == nil { k.Logger().WithField("device", device).Error("malformed vhost-user-blk drive") @@ -1217,9 +1213,9 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr switch device.DeviceType() { case config.DeviceBlock: - kataDevice = k.appendBlockDevice(dev, c) + kataDevice = k.appendBlockDevice(dev, device, c) case config.VhostUserBlk: - kataDevice = k.appendVhostUserBlkDevice(dev, c) + kataDevice = k.appendVhostUserBlkDevice(dev, device, c) } if kataDevice == nil { From aad1a8734f854c7d2674de1ebb40b12b2db97025 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 13 Apr 2021 15:00:19 +1000 Subject: [PATCH 08/10] runtime/device: Give the agent information about VFIO devices We send information about several kinds of devices to the agent so that it can apply specific handling. We don't currently do this with VFIO devices. However we need to do that so that the agent can properly wait for VFIO devices to be ready (previously it did that using a PCI rescan which may not be reliable and has some very bad side effects). This patch collates and sends the relevant information. Signed-off-by: David Gibson --- src/runtime/virtcontainers/kata_agent.go | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 81c44a9fd..1c9bbf1aa 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -92,6 +92,7 @@ var ( kataNvdimmDevType = "nvdimm" kataVirtioFSDevType = "virtio-fs" kataWatchableBindDevType = "watchable-bind" + kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"} sharedDirVirtioFSOptions = []string{} sharedDirVirtioFSDaxOptions = "dax" @@ -1201,6 +1202,38 @@ func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, device api.Dev return kataDevice } +func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { + devList, ok := device.GetDeviceInfo().([]*config.VFIODev) + if !ok || devList == nil { + k.Logger().WithField("device", device).Error("malformed vfio device") + return nil + } + + groupNum := filepath.Base(dev.ContainerPath) + + // Each /dev/vfio/NN device represents a VFIO group, which + // could include several PCI devices. So we give group + // information in the main structure, then list each + // individual PCI device in the Options array. + // + // Each option is formatted as "DDDD:BB:DD.F=" + // DDDD:BB:DD.F is the device's PCI address on the + // *host*. is the device's PCI path in the guest + // (see qomGetPciPath() for details). + kataDevice := &grpc.Device{ + ContainerPath: dev.ContainerPath, + Type: kataVfioGuestKernelDevType, + Id: groupNum, + Options: make([]string, len(devList)), + } + + for i, pciDev := range devList { + kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath) + } + + return kataDevice +} + func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*grpc.Device { var kataDevice *grpc.Device @@ -1216,6 +1249,8 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr kataDevice = k.appendBlockDevice(dev, device, c) case config.VhostUserBlk: kataDevice = k.appendVhostUserBlkDevice(dev, device, c) + case config.DeviceVFIO: + kataDevice = k.appendVfioDevice(dev, device, c) } if kataDevice == nil { From 75f426dd1ec2395b3d27d2481b1f53ec11b6daf9 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 24 Sep 2021 15:36:40 +1000 Subject: [PATCH 09/10] agent: Simplify do_add_swap() do_add_swap() has some mildly complex code to translate the PCI path of a virtio-blk device (where the swap will reside) into a /dev path. However, the device module already has get_virtio_blk_pci_device_name() which does exactly that. The existing code has some further advantages: it uses more precise matching of the sysfs paths, and if necessary it will wait for the device to be added to the guest. While we're there, remove an unnecessary 'as u8' from the PCI path construction: pci::Path::new() already accepts anything which implements TryInfo, which u32 certainly does. Signed-off-by: David Gibson --- src/agent/src/rpc.rs | 46 ++++++++++---------------------------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 775b119df..c748beb04 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // -use crate::pci; use async_trait::async_trait; use rustjail::{pipestream::PipeStream, process::StreamType}; use tokio::io::{AsyncReadExt, AsyncWriteExt, ReadHalf}; @@ -44,12 +43,15 @@ use nix::sys::stat; use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; -use crate::device::{add_devices, pcipath_to_sysfs, rescan_pci_bus, update_device_cgroup}; +use crate::device::{ + add_devices, get_virtio_blk_pci_device_name, rescan_pci_bus, update_device_cgroup, +}; use crate::linux_abi::*; use crate::metrics::get_metrics; use crate::mount::{add_storages, baremount, remove_mounts, STORAGE_HANDLER_LIST}; use crate::namespace::{NSTYPEIPC, NSTYPEPID, NSTYPEUTS}; use crate::network::setup_guest_dns; +use crate::pci; use crate::random; use crate::sandbox::Sandbox; use crate::version::{AGENT_VERSION, API_VERSION}; @@ -1204,7 +1206,9 @@ impl protocols::agent_ttrpc::AgentService for AgentService { ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_swap", req); - do_add_swap(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?; + do_add_swap(&self.sandbox, &req) + .await + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?; Ok(Empty::new()) } @@ -1557,43 +1561,13 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { Ok(()) } -pub fn path_name_lookup + std::fmt::Debug>( - path: P, - lookup: &str, -) -> Result<(PathBuf, String)> { - for entry in fs::read_dir(path.clone())? { - let entry = entry?; - if let Some(name) = entry.path().file_name() { - if let Some(name) = name.to_str() { - if Some(0) == name.find(lookup) { - return Ok((entry.path(), name.to_string())); - } - } - } - } - Err(anyhow!("cannot get {} dir in {:?}", lookup, path)) -} - -fn do_add_swap(req: &AddSwapRequest) -> Result<()> { - // re-scan PCI bus - // looking for hidden devices - rescan_pci_bus().context("Could not rescan PCI bus")?; - +async fn do_add_swap(sandbox: &Arc>, req: &AddSwapRequest) -> Result<()> { let mut slots = Vec::new(); for slot in &req.PCIPath { - slots.push(pci::Slot::new(*slot as u8)?); + slots.push(pci::Slot::new(*slot)?); } let pcipath = pci::Path::new(slots)?; - let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path()); - let sysfs_rel_path = format!( - "{}{}", - root_bus_sysfs, - pcipath_to_sysfs(&root_bus_sysfs, &pcipath)? - ); - let (mut virtio_path, _) = path_name_lookup(sysfs_rel_path, "virtio")?; - virtio_path.push("block"); - let (_, dev_name) = path_name_lookup(virtio_path, "vd")?; - let dev_name = format!("/dev/{}", dev_name); + let dev_name = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; let c_str = CString::new(dev_name)?; let ret = unsafe { libc::swapon(c_str.as_ptr() as *const c_char, 0) }; From 907459c1c169c15749717d522a972bad3f806613 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 14 Apr 2021 14:07:16 +1000 Subject: [PATCH 10/10] agent/device: Don't force PCI rescans The agent initiates a PCI rescan from two places. One is triggered for each virtio-blk PCI device, and one is triggered unconditionally when we start a new container. The PCI bus rescan code was added long time ago in Clear Containers due to lack of ACPI support in QEMU 2.9 + q35. Since Kata routinely plugs devices under a PCIe-to-PCI bridge, that left SHPC as the only available hotplug mechanism. However, while Kata was using SHPC on the qemu side, it wasn't actually using it on the guest side. Due to a quirk of our guest kernel configuration, the SHPC driver never bound to the bridge, and *no* hotplug was working at all. To work around that, Kata was forcing the rescan manually, which would discover the new device. That was very fragile (we were arguably relying on a kernel bug). Even if we were using SHPC propertly, it includes a mandatory 5s delay during plug operations (designed for physical cards and human operators), which makes it unsuitable quick start up. Worse, the forced PCI rescans could race with either SHPC or PCIe native hotplug sequences, causing several problems. In some cases this could put the device into an entirely broken state where it wouldn't respond to config space accesses at all. Since pull request #2323 was merged, we have instead used ACPI hotplug which is both fast, and more solid in terms of semantics and races. So, the forced PCI rescans are no longer necessary. Remove them all. fixes #683 Signed-off-by: David Gibson --- src/agent/src/device.rs | 7 ------- src/agent/src/linux_abi.rs | 1 - src/agent/src/rpc.rs | 8 +------- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 4ee275f30..067c6d03d 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -56,11 +56,6 @@ struct DevIndexEntry { #[derive(Debug)] struct DevIndex(HashMap); -#[instrument] -pub fn rescan_pci_bus() -> Result<()> { - online_device(SYSFS_PCI_BUS_RESCAN_FILE) -} - #[instrument] pub fn online_device(path: &str) -> Result<()> { fs::write(path, "1")?; @@ -171,8 +166,6 @@ pub async fn get_virtio_blk_pci_device_name( let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; let matcher = VirtioBlkPciMatcher::new(&sysfs_rel_path); - rescan_pci_bus()?; - let uev = wait_for_uevent(sandbox, matcher).await?; Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname)) } diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index 08600d9f7..e49d901f8 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -9,7 +9,6 @@ use std::fs; pub const SYSFS_DIR: &str = "/sys"; -pub const SYSFS_PCI_BUS_RESCAN_FILE: &str = "/sys/bus/pci/rescan"; #[cfg(any( target_arch = "powerpc64", target_arch = "s390x", diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index c748beb04..8154975b7 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -43,9 +43,7 @@ use nix::sys::stat; use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; -use crate::device::{ - add_devices, get_virtio_blk_pci_device_name, rescan_pci_bus, update_device_cgroup, -}; +use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_device_cgroup}; use crate::linux_abi::*; use crate::metrics::get_metrics; use crate::mount::{add_storages, baremount, remove_mounts, STORAGE_HANDLER_LIST}; @@ -136,10 +134,6 @@ impl AgentService { info!(sl!(), "receive createcontainer, spec: {:?}", &oci); - // re-scan PCI bus - // looking for hidden devices - rescan_pci_bus().context("Could not rescan PCI bus")?; - // Some devices need some extra processing (the ones invoked with // --device for instance), and that's what this call is doing. It // updates the devices listed in the OCI spec, so that they actually