diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 91ce30ea7..067c6d03d 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -17,10 +17,6 @@ use tokio::sync::Mutex; #[cfg(target_arch = "s390x")] use crate::ccw; use crate::linux_abi::*; -use crate::mount::{ - DRIVER_BLK_CCW_TYPE, DRIVER_BLK_TYPE, DRIVER_MMIO_BLK_TYPE, DRIVER_NVDIMM_TYPE, - DRIVER_SCSI_TYPE, -}; use crate::pci; use crate::sandbox::Sandbox; use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher}; @@ -38,6 +34,19 @@ macro_rules! sl { const VM_ROOTFS: &str = "/"; +pub const DRIVER_9P_TYPE: &str = "9p"; +pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; +pub const DRIVER_BLK_TYPE: &str = "blk"; +pub const DRIVER_BLK_CCW_TYPE: &str = "blk-ccw"; +pub const DRIVER_MMIO_BLK_TYPE: &str = "mmioblk"; +pub const DRIVER_SCSI_TYPE: &str = "scsi"; +pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; +pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; +pub const DRIVER_LOCAL_TYPE: &str = "local"; +pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; +// VFIO device to be bound to a guest kernel driver +pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk"; + #[derive(Debug)] struct DevIndexEntry { idx: usize, @@ -47,11 +56,6 @@ struct DevIndexEntry { #[derive(Debug)] struct DevIndex(HashMap); -#[instrument] -pub fn rescan_pci_bus() -> Result<()> { - online_device(SYSFS_PCI_BUS_RESCAN_FILE) -} - #[instrument] pub fn online_device(path: &str) -> Result<()> { fs::write(path, "1")?; @@ -162,8 +166,6 @@ pub async fn get_virtio_blk_pci_device_name( let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; let matcher = VirtioBlkPciMatcher::new(&sysfs_rel_path); - rescan_pci_bus()?; - let uev = wait_for_uevent(sandbox, matcher).await?; Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname)) } @@ -255,6 +257,35 @@ pub async fn wait_for_pmem_device(sandbox: &Arc>, devpath: &str) Ok(()) } +#[derive(Debug)] +struct PciMatcher { + devpath: String, +} + +impl PciMatcher { + fn new(relpath: &str) -> Result { + let root_bus = create_pci_root_bus_path(); + Ok(PciMatcher { + devpath: format!("{}{}", root_bus, relpath), + }) + } +} + +impl UeventMatcher for PciMatcher { + fn is_match(&self, uev: &Uevent) -> bool { + uev.devpath == self.devpath + } +} + +pub async fn wait_for_pci_device(sandbox: &Arc>, pcipath: &pci::Path) -> Result<()> { + let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path()); + let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; + let matcher = PciMatcher::new(&sysfs_rel_path)?; + + let _ = wait_for_uevent(sandbox, matcher).await?; + Ok(()) +} + /// Scan SCSI bus for the given SCSI address(SCSI-Id and LUN) #[instrument] fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { @@ -451,6 +482,37 @@ async fn virtio_nvdimm_device_handler( update_spec_device_list(device, spec, devidx) } +fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { + let mut tokens = opt.split('='); + let hostbdf = tokens.next()?; + let path = tokens.next()?; + if tokens.next().is_some() { + None + } else { + Some((hostbdf, path)) + } +} + +// device.options should have one entry for each PCI device in the VFIO group +// Each option should have the form "DDDD:BB:DD.F=" +// DDDD:BB:DD.F is the device's PCI address in the host +// is a PCI path to the device in the guest (see pci.rs) +async fn vfio_gk_device_handler( + device: &Device, + _: &mut Spec, + sandbox: &Arc>, + _: &DevIndex, +) -> Result<()> { + for opt in device.options.iter() { + let (_, pcipath) = + split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?; + let pcipath = pci::Path::from_str(pcipath)?; + + wait_for_pci_device(sandbox, &pcipath).await?; + } + Ok(()) +} + impl DevIndex { fn new(spec: &Spec) -> DevIndex { let mut map = HashMap::new(); @@ -520,6 +582,7 @@ async fn add_device( DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await, DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await, DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await, + DRIVER_VFIO_GK_TYPE => vfio_gk_device_handler(device, spec, sandbox, devidx).await, _ => Err(anyhow!("Unknown device type {}", device.field_type)), } } @@ -1068,4 +1131,14 @@ mod tests { assert!(!matcher_b.is_match(&uev_a)); assert!(!matcher_a.is_match(&uev_b)); } + + #[test] + fn test_split_vfio_option() { + assert_eq!( + split_vfio_option("0000:01:00.0=02/01"), + Some(("0000:01:00.0", "02/01")) + ); + assert_eq!(split_vfio_option("0000:01:00.0=02/01=rubbish"), None); + assert_eq!(split_vfio_option("0000:01:00.0"), None); + } } diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index 08600d9f7..e49d901f8 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -9,7 +9,6 @@ use std::fs; pub const SYSFS_DIR: &str = "/sys"; -pub const SYSFS_PCI_BUS_RESCAN_FILE: &str = "/sys/bus/pci/rescan"; #[cfg(any( target_arch = "powerpc64", target_arch = "s390x", diff --git a/src/agent/src/mount.rs b/src/agent/src/mount.rs index 1c2b2dab2..3d55f874f 100644 --- a/src/agent/src/mount.rs +++ b/src/agent/src/mount.rs @@ -22,6 +22,9 @@ use regex::Regex; use crate::device::{ get_scsi_device_name, get_virtio_blk_pci_device_name, online_device, wait_for_pmem_device, + DRIVER_9P_TYPE, DRIVER_BLK_CCW_TYPE, DRIVER_BLK_TYPE, DRIVER_EPHEMERAL_TYPE, DRIVER_LOCAL_TYPE, + DRIVER_MMIO_BLK_TYPE, DRIVER_NVDIMM_TYPE, DRIVER_SCSI_TYPE, DRIVER_VIRTIOFS_TYPE, + DRIVER_WATCHABLE_BIND_TYPE, }; use crate::linux_abi::*; use crate::pci; @@ -33,17 +36,6 @@ use anyhow::{anyhow, Context, Result}; use slog::Logger; use tracing::instrument; -pub const DRIVER_9P_TYPE: &str = "9p"; -pub const DRIVER_VIRTIOFS_TYPE: &str = "virtio-fs"; -pub const DRIVER_BLK_TYPE: &str = "blk"; -pub const DRIVER_BLK_CCW_TYPE: &str = "blk-ccw"; -pub const DRIVER_MMIO_BLK_TYPE: &str = "mmioblk"; -pub const DRIVER_SCSI_TYPE: &str = "scsi"; -pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm"; -pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral"; -pub const DRIVER_LOCAL_TYPE: &str = "local"; -pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; - pub const TYPE_ROOTFS: &str = "rootfs"; pub const MOUNT_GUEST_TAG: &str = "kataShared"; diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 775b119df..8154975b7 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // -use crate::pci; use async_trait::async_trait; use rustjail::{pipestream::PipeStream, process::StreamType}; use tokio::io::{AsyncReadExt, AsyncWriteExt, ReadHalf}; @@ -44,12 +43,13 @@ use nix::sys::stat; use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; -use crate::device::{add_devices, pcipath_to_sysfs, rescan_pci_bus, update_device_cgroup}; +use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_device_cgroup}; use crate::linux_abi::*; use crate::metrics::get_metrics; use crate::mount::{add_storages, baremount, remove_mounts, STORAGE_HANDLER_LIST}; use crate::namespace::{NSTYPEIPC, NSTYPEPID, NSTYPEUTS}; use crate::network::setup_guest_dns; +use crate::pci; use crate::random; use crate::sandbox::Sandbox; use crate::version::{AGENT_VERSION, API_VERSION}; @@ -134,10 +134,6 @@ impl AgentService { info!(sl!(), "receive createcontainer, spec: {:?}", &oci); - // re-scan PCI bus - // looking for hidden devices - rescan_pci_bus().context("Could not rescan PCI bus")?; - // Some devices need some extra processing (the ones invoked with // --device for instance), and that's what this call is doing. It // updates the devices listed in the OCI spec, so that they actually @@ -1204,7 +1200,9 @@ impl protocols::agent_ttrpc::AgentService for AgentService { ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_swap", req); - do_add_swap(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?; + do_add_swap(&self.sandbox, &req) + .await + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?; Ok(Empty::new()) } @@ -1557,43 +1555,13 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> { Ok(()) } -pub fn path_name_lookup + std::fmt::Debug>( - path: P, - lookup: &str, -) -> Result<(PathBuf, String)> { - for entry in fs::read_dir(path.clone())? { - let entry = entry?; - if let Some(name) = entry.path().file_name() { - if let Some(name) = name.to_str() { - if Some(0) == name.find(lookup) { - return Ok((entry.path(), name.to_string())); - } - } - } - } - Err(anyhow!("cannot get {} dir in {:?}", lookup, path)) -} - -fn do_add_swap(req: &AddSwapRequest) -> Result<()> { - // re-scan PCI bus - // looking for hidden devices - rescan_pci_bus().context("Could not rescan PCI bus")?; - +async fn do_add_swap(sandbox: &Arc>, req: &AddSwapRequest) -> Result<()> { let mut slots = Vec::new(); for slot in &req.PCIPath { - slots.push(pci::Slot::new(*slot as u8)?); + slots.push(pci::Slot::new(*slot)?); } let pcipath = pci::Path::new(slots)?; - let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path()); - let sysfs_rel_path = format!( - "{}{}", - root_bus_sysfs, - pcipath_to_sysfs(&root_bus_sysfs, &pcipath)? - ); - let (mut virtio_path, _) = path_name_lookup(sysfs_rel_path, "virtio")?; - virtio_path.push("block"); - let (_, dev_name) = path_name_lookup(virtio_path, "vd")?; - let dev_name = format!("/dev/{}", dev_name); + let dev_name = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; let c_str = CString::new(dev_name)?; let ret = unsafe { libc::swapon(c_str.as_ptr() as *const c_char, 0) }; diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs index c5ecefcc1..1b1dbb72e 100644 --- a/src/agent/src/uevent.rs +++ b/src/agent/src/uevent.rs @@ -111,10 +111,13 @@ pub async fn wait_for_uevent( sandbox: &Arc>, matcher: impl UeventMatcher, ) -> Result { + let logprefix = format!("Waiting for {:?}", &matcher); + + info!(sl!(), "{}", logprefix); let mut sb = sandbox.lock().await; for uev in sb.uevent_map.values() { if matcher.is_match(uev) { - info!(sl!(), "Device {:?} found in device map", uev); + info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev); return Ok(uev.clone()); } } @@ -129,7 +132,8 @@ pub async fn wait_for_uevent( sb.uevent_watchers.push(Some((Box::new(matcher), tx))); drop(sb); // unlock - info!(sl!(), "Waiting on channel for uevent notification\n"); + info!(sl!(), "{}: waiting on channel", logprefix); + let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout; let uev = match tokio::time::timeout(hotplug_timeout, rx).await { @@ -146,6 +150,7 @@ pub async fn wait_for_uevent( } }; + info!(sl!(), "{}: found {:?} on channel", logprefix, &uev); Ok(uev) } diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index b117fb7e1..170f76f4e 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -503,7 +503,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro return err } -func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { +func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { cl := clh.client() ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second) defer cancel() @@ -512,10 +512,28 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device config.VFIODev) error { clhDevice := *chclient.NewVmAddDevice() clhDevice.Path = &device.SysfsDev clhDevice.Id = &device.ID - _, _, err := cl.VmAddDevicePut(ctx, clhDevice) + pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { - err = fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) + return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } + + // clh doesn't use bridges, so the PCI path is simply the slot + // number of the device. This will break if clh starts using + // bridges (including PCI-E root ports), but so will the clh + // API, since there's no way it can reliably predict a guest + // Bdf when bridges are present. + tokens := strings.Split(pciInfo.Bdf, ":") + if len(tokens) != 3 || tokens[0] != "0000" || tokens[1] != "00" { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + tokens = strings.Split(tokens[2], ".") + if len(tokens) != 2 || tokens[1] != "0" || len(tokens[0]) != 2 { + return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) + } + + device.GuestPciPath, err = vcTypes.PciPathFromString(tokens[0]) + return err } @@ -529,7 +547,7 @@ func (clh *cloudHypervisor) hotplugAddDevice(ctx context.Context, devInfo interf return nil, clh.hotplugAddBlockDevice(drive) case vfioDev: device := devInfo.(*config.VFIODev) - return nil, clh.hotPlugVFIODevice(*device) + return nil, clh.hotPlugVFIODevice(device) default: return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType) } diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index c2a31a403..8627094a9 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -224,6 +224,9 @@ type VFIODev struct { // Bus of VFIO PCIe device Bus string + // Guest PCI path of device + GuestPciPath vcTypes.PciPath + // Type of VFIO device Type VFIODeviceType diff --git a/src/runtime/virtcontainers/hypervisor_s390x.go b/src/runtime/virtcontainers/hypervisor_s390x.go index b80fce767..a8afca4ad 100644 --- a/src/runtime/virtcontainers/hypervisor_s390x.go +++ b/src/runtime/virtcontainers/hypervisor_s390x.go @@ -90,7 +90,7 @@ func availableGuestProtection() (guestProtection, error) { return noneProtection, err } if !seCmdlinePresent { - return noneProtection, fmt.Errorf("Protected Virtualization is not enabled on kernel command line! " + + return noneProtection, fmt.Errorf("Protected Virtualization is not enabled on kernel command line! "+ "Need %s=%s (or %s) to enable Secure Execution", seCmdlineParam, seCmdlineValues[0], strings.Join(seCmdlineValues[1:], ", ")) } diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 698353a9e..1c9bbf1aa 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -92,6 +92,7 @@ var ( kataNvdimmDevType = "nvdimm" kataVirtioFSDevType = "virtio-fs" kataWatchableBindDevType = "watchable-bind" + kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"} sharedDirVirtioFSOptions = []string{} sharedDirVirtioFSDaxOptions = "dax" @@ -1144,9 +1145,7 @@ func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) { } } -func (k *kataAgent) appendBlockDevice(dev ContainerDevice, c *Container) *grpc.Device { - device := c.sandbox.devManager.GetDeviceByID(dev.ID) - +func (k *kataAgent) appendBlockDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { d, ok := device.GetDeviceInfo().(*config.BlockDrive) if !ok || d == nil { k.Logger().WithField("device", device).Error("malformed block drive") @@ -1187,9 +1186,7 @@ func (k *kataAgent) appendBlockDevice(dev ContainerDevice, c *Container) *grpc.D return kataDevice } -func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, c *Container) *grpc.Device { - device := c.sandbox.devManager.GetDeviceByID(dev.ID) - +func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { d, ok := device.GetDeviceInfo().(*config.VhostUserDeviceAttrs) if !ok || d == nil { k.Logger().WithField("device", device).Error("malformed vhost-user-blk drive") @@ -1205,6 +1202,38 @@ func (k *kataAgent) appendVhostUserBlkDevice(dev ContainerDevice, c *Container) return kataDevice } +func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *Container) *grpc.Device { + devList, ok := device.GetDeviceInfo().([]*config.VFIODev) + if !ok || devList == nil { + k.Logger().WithField("device", device).Error("malformed vfio device") + return nil + } + + groupNum := filepath.Base(dev.ContainerPath) + + // Each /dev/vfio/NN device represents a VFIO group, which + // could include several PCI devices. So we give group + // information in the main structure, then list each + // individual PCI device in the Options array. + // + // Each option is formatted as "DDDD:BB:DD.F=" + // DDDD:BB:DD.F is the device's PCI address on the + // *host*. is the device's PCI path in the guest + // (see qomGetPciPath() for details). + kataDevice := &grpc.Device{ + ContainerPath: dev.ContainerPath, + Type: kataVfioGuestKernelDevType, + Id: groupNum, + Options: make([]string, len(devList)), + } + + for i, pciDev := range devList { + kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath) + } + + return kataDevice +} + func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*grpc.Device { var kataDevice *grpc.Device @@ -1217,9 +1246,11 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr switch device.DeviceType() { case config.DeviceBlock: - kataDevice = k.appendBlockDevice(dev, c) + kataDevice = k.appendBlockDevice(dev, device, c) case config.VhostUserBlk: - kataDevice = k.appendVhostUserBlkDevice(dev, c) + kataDevice = k.appendVhostUserBlkDevice(dev, device, c) + case config.DeviceVFIO: + kataDevice = k.appendVfioDevice(dev, device, c) } if kataDevice == nil { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index b189245d8..a78545c5c 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -1440,6 +1440,69 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemu) qomGetSlot(qomPath string) (vcTypes.PciSlot, error) { + addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr") + if err != nil { + return vcTypes.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return vcTypes.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return vcTypes.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return vcTypes.PciSlotFromInt(slotNum) +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemu) qomGetPciPath(qemuID string) (vcTypes.PciPath, error) { + // XXX: For now we assume there's exactly one bridge, since + // that's always how we configure qemu from Kata for now. It + // would be good to generalize this to different PCI + // topologies + devSlot, err := q.qomGetSlot(qemuID) + if err != nil { + return vcTypes.PciPath{}, err + } + + busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + if err != nil { + return vcTypes.PciPath{}, err + } + + bus, ok := busq.(string) + if !ok { + return vcTypes.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI bridge which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(bus, "/") + if idx == -1 { + return vcTypes.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) + } + bridge := bus[:idx] + + bridgeSlot, err := q.qomGetSlot(bridge) + if err != nil { + return vcTypes.PciPath{}, err + } + + return vcTypes.PciPathFromSlots(bridgeSlot, devSlot) +} + func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op operation) (err error) { if err = q.qmpSetup(); err != nil { return err @@ -1476,39 +1539,52 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op switch device.Type { case config.VFIODeviceNormalType: - return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) + err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) case config.VFIODeviceMediatedType: if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + } else { + err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) + } + default: + return fmt.Errorf("Incorrect VFIO device type found") + } + } else { + addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) + if err != nil { + return err + } + + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(devID) + } + }() + + switch device.Type { + case config.VFIODeviceNormalType: + err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile) + case config.VFIODeviceMediatedType: + if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { + err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + } else { + err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile) } - return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) default: return fmt.Errorf("Incorrect VFIO device type found") } } - - addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) if err != nil { return err } - - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(devID) - } - }() - - switch device.Type { - case config.VFIODeviceNormalType: - return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile) - case config.VFIODeviceMediatedType: - if utils.IsAPVFIOMediatedDevice(device.SysfsDev) { - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) - } - return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile) - default: - return fmt.Errorf("Incorrect VFIO device type found") - } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(devID) + return err } else { q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") diff --git a/src/runtime/virtcontainers/qemu_arm64.go b/src/runtime/virtcontainers/qemu_arm64.go index eced3b044..d14ec4131 100644 --- a/src/runtime/virtcontainers/qemu_arm64.go +++ b/src/runtime/virtcontainers/qemu_arm64.go @@ -8,8 +8,8 @@ package virtcontainers import ( "context" "fmt" - "time" "os" + "time" govmmQemu "github.com/kata-containers/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" @@ -100,29 +100,29 @@ func (q *qemuArm64) appendImage(ctx context.Context, devices []govmmQemu.Device, // so we temporarily add this specific implementation for arm64 here until // the qemu used by arm64 is capable for that feature func (q *qemuArm64) appendNvdimmImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) { - imageFile, err := os.Open(path) - if err != nil { - return nil, err - } - defer imageFile.Close() + imageFile, err := os.Open(path) + if err != nil { + return nil, err + } + defer imageFile.Close() - imageStat, err := imageFile.Stat() - if err != nil { - return nil, err - } + imageStat, err := imageFile.Stat() + if err != nil { + return nil, err + } object := govmmQemu.Object{ - Driver: govmmQemu.NVDIMM, - Type: govmmQemu.MemoryBackendFile, - DeviceID: "nv0", - ID: "mem0", - MemPath: path, - Size: (uint64)(imageStat.Size()), - } + Driver: govmmQemu.NVDIMM, + Type: govmmQemu.MemoryBackendFile, + DeviceID: "nv0", + ID: "mem0", + MemPath: path, + Size: (uint64)(imageStat.Size()), + } - devices = append(devices, object) + devices = append(devices, object) - return devices, nil + return devices, nil } func (q *qemuArm64) setIgnoreSharedMemoryMigrationCaps(_ context.Context, _ *govmmQemu.QMP) error {