From e22bd78249783c22141447fd2eb4bd8a83d7b833 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Oct 2021 16:37:29 +1100 Subject: [PATCH 01/15] agent/device: Add helper function for binding a guest device to a driver For better VFIO support, we're going to need to take control of which guest driver controls specific guest devices. To assist with that, add the pci_driver_override() function to force a specific guest device to be bound to a specific guest driver. Signed-off-by: David Gibson --- src/agent/src/device.rs | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 3258be1165..60f661c120 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -7,7 +7,9 @@ use libc::{c_uint, major, minor}; use nix::sys::stat; use regex::Regex; use std::collections::HashMap; +use std::ffi::OsStr; use std::fs; +use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::MetadataExt; use std::path::Path; use std::str::FromStr; @@ -62,6 +64,40 @@ pub fn online_device(path: &str) -> Result<()> { Ok(()) } +// Force a given PCI device to bind to the given driver, does +// basically the same thing as +// driverctl set-override +#[instrument] +pub fn pci_driver_override(syspci: T, dev: pci::Address, drv: U) -> Result<()> +where + T: AsRef + std::fmt::Debug, + U: AsRef + std::fmt::Debug, +{ + let syspci = Path::new(&syspci); + let drv = drv.as_ref(); + info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv); + + let devpath = syspci.join("devices").join(dev.to_string()); + let overridepath = &devpath.join("driver_override"); + + fs::write(overridepath, drv.as_bytes())?; + + let drvpath = &devpath.join("driver"); + let need_unbind = match fs::read_link(drvpath) { + Ok(d) if d.file_name() == Some(drv) => return Ok(()), // Nothing to do + Err(e) if e.kind() == std::io::ErrorKind::NotFound => false, // No current driver + Err(e) => return Err(anyhow!("Error checking driver on {}: {}", dev, e)), + Ok(_) => true, // Current driver needs unbinding + }; + if need_unbind { + let unbindpath = &drvpath.join("unbind"); + fs::write(unbindpath, dev.to_string())?; + } + let probepath = syspci.join("drivers_probe"); + fs::write(probepath, dev.to_string())?; + Ok(()) +} + // pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to // the sysfs path for the PCI host bridge, based on the PCI path // provided. @@ -1151,4 +1187,42 @@ mod tests { assert_eq!(split_vfio_option("0000:01:00.0=02/01=rubbish"), None); assert_eq!(split_vfio_option("0000:01:00.0"), None); } + + #[test] + fn test_pci_driver_override() { + let testdir = tempdir().expect("failed to create tmpdir"); + let syspci = testdir.path(); // Path to mock /sys/bus/pci + + let dev0 = pci::Address::new(0, 0, pci::SlotFn::new(0, 0).unwrap()); + let dev0path = syspci.join("devices").join(dev0.to_string()); + let dev0drv = dev0path.join("driver"); + let dev0override = dev0path.join("driver_override"); + + let drvapath = syspci.join("drivers").join("drv_a"); + let drvaunbind = drvapath.join("unbind"); + + let probepath = syspci.join("drivers_probe"); + + // Start mocking dev0 as being unbound + fs::create_dir_all(&dev0path).unwrap(); + + pci_driver_override(syspci, dev0, "drv_a").unwrap(); + assert_eq!(fs::read_to_string(&dev0override).unwrap(), "drv_a"); + assert_eq!(fs::read_to_string(&probepath).unwrap(), dev0.to_string()); + + // Now mock dev0 already being attached to drv_a + fs::create_dir_all(&drvapath).unwrap(); + std::os::unix::fs::symlink(&drvapath, dev0drv).unwrap(); + std::fs::remove_file(&probepath).unwrap(); + + pci_driver_override(syspci, dev0, "drv_a").unwrap(); // no-op + assert_eq!(fs::read_to_string(&dev0override).unwrap(), "drv_a"); + assert!(!probepath.exists()); + + // Now try binding to a different driver + pci_driver_override(syspci, dev0, "drv_b").unwrap(); + assert_eq!(fs::read_to_string(&dev0override).unwrap(), "drv_b"); + assert_eq!(fs::read_to_string(&probepath).unwrap(), dev0.to_string()); + assert_eq!(fs::read_to_string(&drvaunbind).unwrap(), dev0.to_string()); + } } From 13b06a35d5183b5bb295b76859fe333285767108 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 6 Oct 2021 14:08:16 +1100 Subject: [PATCH 02/15] agent/device: Rebind VFIO devices to VFIO driver inside guest VFIO devices can be added to a Kata container and they will be passed through to the sandbox guest. However, inside the guest those devices will bind to a native guest driver, so they will no longer appear as VFIO devices within the guest. This behaviour differs from runc or other conventional container runtimes. This code allows the agent to match the behaviour of other runtimes, if instructed to by kata-runtime. VFIO devices it's informed about with the "vfio" type instead of the existing "vfio-gk" type will be rebound to the vfio-pci driver within the guest. Signed-off-by: David Gibson --- src/agent/src/device.rs | 16 +++++++++++++--- src/agent/src/linux_abi.rs | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 60f661c120..abc2cbdea7 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -48,6 +48,9 @@ pub const DRIVER_LOCAL_TYPE: &str = "local"; pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind"; // VFIO device to be bound to a guest kernel driver pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk"; +// VFIO device to be bound to vfio-pci and made available inside the +// container as a VFIO device node +pub const DRIVER_VFIO_TYPE: &str = "vfio"; #[derive(Debug)] struct DevIndexEntry { @@ -543,18 +546,23 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { // Each option should have the form "DDDD:BB:DD.F=" // DDDD:BB:DD.F is the device's PCI address in the host // is a PCI path to the device in the guest (see pci.rs) -async fn vfio_gk_device_handler( +async fn vfio_device_handler( device: &Device, _: &mut Spec, sandbox: &Arc>, _: &DevIndex, ) -> Result<()> { + let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE; + for opt in device.options.iter() { let (_, pcipath) = split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?; let pcipath = pci::Path::from_str(pcipath)?; - wait_for_pci_device(sandbox, &pcipath).await?; + let guestdev = wait_for_pci_device(sandbox, &pcipath).await?; + if vfio_in_guest { + pci_driver_override(SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?; + } } Ok(()) } @@ -628,7 +636,9 @@ async fn add_device( DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await, DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await, DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await, - DRIVER_VFIO_GK_TYPE => vfio_gk_device_handler(device, spec, sandbox, devidx).await, + DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => { + vfio_device_handler(device, spec, sandbox, devidx).await + } _ => Err(anyhow!("Unknown device type {}", device.field_type)), } } diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index e49d901f80..ee3823d380 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -83,6 +83,8 @@ pub const SYSFS_MEMORY_ONLINE_PATH: &str = "/sys/devices/system/memory"; pub const SYSFS_SCSI_HOST_PATH: &str = "/sys/class/scsi_host"; +pub const SYSFS_BUS_PCI_PATH: &str = "/sys/bus/pci"; + pub const SYSFS_CGROUPPATH: &str = "/sys/fs/cgroup"; pub const SYSFS_ONLINE_FILE: &str = "online"; From ff59db7534a33abea9d4293c51e0523661660f85 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 6 Oct 2021 15:55:49 +1100 Subject: [PATCH 03/15] agent/device: Add function to get IOMMU group for a PCI device For upcoming VFIO extensions we'll need to work with the IOMMU groups of VFIO devices. This helps us towards that by adding pci_iommu_group() to retrieve the IOMMU group (if any) of a given PCI device. Signed-off-by: David Gibson --- src/agent/src/device.rs | 83 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index abc2cbdea7..a07ca6c7ca 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -8,6 +8,7 @@ use nix::sys::stat; use regex::Regex; use std::collections::HashMap; use std::ffi::OsStr; +use std::fmt; use std::fs; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::MetadataExt; @@ -101,6 +102,49 @@ where Ok(()) } +// Represents an IOMMU group +#[derive(Clone, Debug, PartialEq, Eq)] +struct IommuGroup(u32); + +impl fmt::Display for IommuGroup { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{}", self.0) + } +} + +// Determine the IOMMU group of a PCI device +#[instrument] +fn pci_iommu_group(syspci: T, dev: pci::Address) -> Result> +where + T: AsRef + std::fmt::Debug, +{ + let syspci = Path::new(&syspci); + let grouppath = syspci + .join("devices") + .join(dev.to_string()) + .join("iommu_group"); + + match fs::read_link(&grouppath) { + // Device has no group + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(e) => Err(anyhow!("Error reading link {:?}: {}", &grouppath, e)), + Ok(group) => { + if let Some(group) = group.file_name() { + if let Some(group) = group.to_str() { + if let Ok(group) = group.parse::() { + return Ok(Some(IommuGroup(group))); + } + } + } + Err(anyhow!( + "Unexpected IOMMU group link {:?} => {:?}", + grouppath, + group + )) + } + } +} + // pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to // the sysfs path for the PCI host bridge, based on the PCI path // provided. @@ -1235,4 +1279,43 @@ mod tests { assert_eq!(fs::read_to_string(&probepath).unwrap(), dev0.to_string()); assert_eq!(fs::read_to_string(&drvaunbind).unwrap(), dev0.to_string()); } + + #[test] + fn test_pci_iommu_group() { + let testdir = tempdir().expect("failed to create tmpdir"); // mock /sys + let syspci = testdir.path().join("bus").join("pci"); + + // Mock dev0, which has no group + let dev0 = pci::Address::new(0, 0, pci::SlotFn::new(0, 0).unwrap()); + let dev0path = syspci.join("devices").join(dev0.to_string()); + + fs::create_dir_all(&dev0path).unwrap(); + + // Test dev0 + assert!(pci_iommu_group(&syspci, dev0).unwrap().is_none()); + + // Mock dev1, which is in group 12 + let dev1 = pci::Address::new(0, 1, pci::SlotFn::new(0, 0).unwrap()); + let dev1path = syspci.join("devices").join(dev1.to_string()); + let dev1group = dev1path.join("iommu_group"); + + fs::create_dir_all(&dev1path).unwrap(); + std::os::unix::fs::symlink("../../../kernel/iommu_groups/12", &dev1group).unwrap(); + + // Test dev1 + assert_eq!( + pci_iommu_group(&syspci, dev1).unwrap(), + Some(IommuGroup(12)) + ); + + // Mock dev2, which has a bogus group (dir instead of symlink) + let dev2 = pci::Address::new(0, 2, pci::SlotFn::new(0, 0).unwrap()); + let dev2path = syspci.join("devices").join(dev2.to_string()); + let dev2group = dev2path.join("iommu_group"); + + fs::create_dir_all(&dev2group).unwrap(); + + // Test dev2 + assert!(pci_iommu_group(&syspci, dev2).is_err()); + } } From 8ceadcc5a95dfd2b11bcbb538b51c6f7bc7c6a3f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 6 Oct 2021 16:29:51 +1100 Subject: [PATCH 04/15] agent/device: Sanity check guest IOMMU groups Each VFIO device passed into the guest could represent a whole IOMMU group of devices on the host. Since these devices aren't DMA isolated from each other, they must appear as the same IOMMU group in the guest as well. The VMM should enforce that for us, but double check it, since things can't work otherwise. This also means we determine the guest IOMMU group for the VFIO device, which we'll be needing later. Signed-off-by: David Gibson --- src/agent/src/device.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index a07ca6c7ca..f8389fc171 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -597,6 +597,7 @@ async fn vfio_device_handler( _: &DevIndex, ) -> Result<()> { let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE; + let mut group = None; for opt in device.options.iter() { let (_, pcipath) = @@ -606,6 +607,26 @@ async fn vfio_device_handler( let guestdev = wait_for_pci_device(sandbox, &pcipath).await?; if vfio_in_guest { pci_driver_override(SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?; + + let devgroup = pci_iommu_group(SYSFS_BUS_PCI_PATH, guestdev)?; + if devgroup.is_none() { + // Devices must have an IOMMU group to be usable via VFIO + return Err(anyhow!("{} has no IOMMU group", guestdev)); + } + + if group.is_some() && group != devgroup { + // If PCI devices associated with the same VFIO device + // (and therefore group) in the host don't end up in + // the same group in the guest, something has gone + // horribly wrong + return Err(anyhow!( + "{} is not in guest IOMMU group {}", + guestdev, + group.unwrap() + )); + } + + group = devgroup; } } Ok(()) From 827a41f973b9bd74badcee0cbc9d1ab617d90d2a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 6 Oct 2021 17:04:24 +1100 Subject: [PATCH 05/15] agent/device: Refactor update_spec_device_list() update_spec_device_list() is used to update the container configuration to change device major/minor numbers configured by the Kata client based on host details to values suitable for the sandbox VM, which may differ. It takes a 'device' object, but the only things it actually uses from there are container_path and vm_path. Refactor this as update_spec_device(), taking the host and guest paths to the device as explicit parameters. This makes the function more self-contained and will enable some future extensions. Signed-off-by: David Gibson --- src/agent/src/device.rs | 114 ++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index f8389fc171..5d93eca4cc 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -409,24 +409,25 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { Ok(()) } -// update_spec_device_list takes a device description provided by the caller, -// trying to find it on the guest. Once this device has been identified, the -// "real" information that can be read from inside the VM is used to update -// the same device in the list of devices provided through the OCI spec. -// This is needed to update information about minor/major numbers that cannot -// be predicted from the caller. +// update_spec_device updates the device list in the OCI spec to make +// it include details appropriate for the VM, instead of the host. It +// is given the host path to the device (to locate the device in the +// original OCI spec) and the VM path which it uses to determine the +// VM major/minor numbers. #[instrument] -fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex) -> Result<()> { +fn update_spec_device( + spec: &mut Spec, + devidx: &DevIndex, + host_path: &str, + vm_path: &str, +) -> Result<()> { let major_id: c_uint; let minor_id: c_uint; // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. - if device.container_path.is_empty() { - return Err(anyhow!( - "container_path cannot empty for device {:?}", - device - )); + if host_path.is_empty() { + return Err(anyhow!("Host path cannot empty for device")); } let linux = spec @@ -434,11 +435,11 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex) .as_mut() .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; - if !Path::new(&device.vm_path).exists() { - return Err(anyhow!("vm_path:{} doesn't exist", device.vm_path)); + if !Path::new(vm_path).exists() { + return Err(anyhow!("vm_path:{} doesn't exist", vm_path)); } - let meta = fs::metadata(&device.vm_path)?; + let meta = fs::metadata(vm_path)?; let dev_id = meta.rdev(); unsafe { major_id = major(dev_id); @@ -447,10 +448,10 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex) info!( sl!(), - "got the device: dev_path: {}, major: {}, minor: {}\n", &device.vm_path, major_id, minor_id + "got the device: dev_path: {}, major: {}, minor: {}\n", vm_path, major_id, minor_id ); - if let Some(idxdata) = devidx.0.get(device.container_path.as_str()) { + if let Some(idxdata) = devidx.0.get(host_path) { let dev = &mut linux.devices[idxdata.idx]; let host_major = dev.major; let host_minor = dev.minor; @@ -485,7 +486,7 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex) } else { Err(anyhow!( "Should have found a matching device {} in the spec", - device.vm_path + vm_path )) } } @@ -503,7 +504,7 @@ async fn virtiommio_blk_device_handler( return Err(anyhow!("Invalid path for virtio mmio blk device")); } - update_spec_device_list(device, spec, devidx) + update_spec_device(spec, devidx, &device.container_path, &device.vm_path) } // device.Id should be a PCI path string @@ -519,7 +520,7 @@ async fn virtio_blk_device_handler( dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - update_spec_device_list(&dev, spec, devidx) + update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) } // device.id should be a CCW path string @@ -534,7 +535,7 @@ async fn virtio_blk_ccw_device_handler( let mut dev = device.clone(); let ccw_device = ccw::Device::from_str(&device.id)?; dev.vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; - update_spec_device_list(&dev, spec, devidx) + update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) } #[cfg(not(target_arch = "s390x"))] @@ -558,7 +559,7 @@ async fn virtio_scsi_device_handler( ) -> Result<()> { let mut dev = device.clone(); dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device_list(&dev, spec, devidx) + update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) } #[instrument] @@ -572,7 +573,7 @@ async fn virtio_nvdimm_device_handler( return Err(anyhow!("Invalid path for nvdimm device")); } - update_spec_device_list(device, spec, devidx) + update_spec_device(spec, devidx, &device.container_path, &device.vm_path) } fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { @@ -768,28 +769,28 @@ mod tests { } #[test] - fn test_update_spec_device_list() { + fn test_update_spec_device() { let (major, minor) = (7, 2); - let mut device = Device::default(); let mut spec = Spec::default(); // container_path empty + let container_path = ""; + let vm_path = ""; let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_err()); - device.container_path = "/dev/null".to_string(); - // linux is empty + let container_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_err()); spec.linux = Some(Linux::default()); // linux.devices is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_err()); spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice { @@ -801,26 +802,32 @@ mod tests { // vm_path empty let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_err()); - device.vm_path = "/dev/null".to_string(); + let vm_path = "/dev/null"; // guest and host path are not the same let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); - assert!(res.is_err(), "device={:?} spec={:?}", device, spec); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + assert!( + res.is_err(), + "container_path={:?} vm_path={:?} spec={:?}", + container_path, + vm_path, + spec + ); - spec.linux.as_mut().unwrap().devices[0].path = device.container_path.clone(); + spec.linux.as_mut().unwrap().devices[0].path = container_path.to_string(); // spec.linux.resources is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_ok()); // update both devices and cgroup lists spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice { - path: device.container_path.clone(), + path: container_path.to_string(), major, minor, ..oci::LinuxDevice::default() @@ -836,12 +843,12 @@ mod tests { }); let devidx = DevIndex::new(&spec); - let res = update_spec_device_list(&device, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_ok()); } #[test] - fn test_update_spec_device_list_guest_host_conflict() { + fn test_update_spec_device_guest_host_conflict() { let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); let zero_rdev = fs::metadata("/dev/zero").unwrap().rdev(); let full_rdev = fs::metadata("/dev/full").unwrap().rdev(); @@ -892,20 +899,14 @@ mod tests { }; let devidx = DevIndex::new(&spec); - let dev_a = Device { - container_path: "/dev/a".to_string(), - vm_path: "/dev/zero".to_string(), - ..Device::default() - }; + let container_path_a = "/dev/a"; + let vm_path_a = "/dev/zero"; let guest_major_a = stat::major(zero_rdev) as i64; let guest_minor_a = stat::minor(zero_rdev) as i64; - let dev_b = Device { - container_path: "/dev/b".to_string(), - vm_path: "/dev/full".to_string(), - ..Device::default() - }; + let container_path_b = "/dev/b"; + let vm_path_b = "/dev/full"; let guest_major_b = stat::major(full_rdev) as i64; let guest_minor_b = stat::minor(full_rdev) as i64; @@ -922,7 +923,7 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device_list(&dev_a, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path_a, vm_path_a); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -937,7 +938,7 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device_list(&dev_b, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path_b, vm_path_b); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -954,7 +955,7 @@ mod tests { } #[test] - fn test_update_spec_device_list_char_block_conflict() { + fn test_update_spec_device_char_block_conflict() { let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); let guest_major = stat::major(null_rdev) as i64; @@ -1003,11 +1004,8 @@ mod tests { }; let devidx = DevIndex::new(&spec); - let dev = Device { - container_path: "/dev/char".to_string(), - vm_path: "/dev/null".to_string(), - ..Device::default() - }; + let container_path = "/dev/char"; + let vm_path = "/dev/null"; let specresources = spec.linux.as_ref().unwrap().resources.as_ref().unwrap(); assert_eq!(Some(host_major), specresources.devices[0].major); @@ -1015,7 +1013,7 @@ mod tests { assert_eq!(Some(host_major), specresources.devices[1].major); assert_eq!(Some(host_minor), specresources.devices[1].minor); - let res = update_spec_device_list(&dev, &mut spec, &devidx); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); assert!(res.is_ok()); // Only the char device, not the block device should be updated From 42b92b2b05b7644be7b65fe7fb2ebb934a4bd727 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 6 Oct 2021 17:10:27 +1100 Subject: [PATCH 06/15] agent/device: Allow container devname to differ from the host Currently, update_spec_device() assumes that the proper device path in the (inner) container is the same as the device path specified in the outer OCI spec on the host. Usually that's correct. However for VFIO group devices we actually need the container to see the VM's device path, since it's normal to correlate that with IOMMU group information from sysfs which will be different in the guest and which we can't namespace away. So, add an extra "final_path" parameter to update_spec_device() to allow callers to chose the device path that should be used for the inner container. All current callers pass the same thing as container_path, but that will change in future. Signed-off-by: David Gibson --- src/agent/src/device.rs | 124 +++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 5d93eca4cc..39429bb349 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -413,13 +413,15 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { // it include details appropriate for the VM, instead of the host. It // is given the host path to the device (to locate the device in the // original OCI spec) and the VM path which it uses to determine the -// VM major/minor numbers. +// VM major/minor numbers, and the final path with which to present +// the device in the (inner) container #[instrument] fn update_spec_device( spec: &mut Spec, devidx: &DevIndex, host_path: &str, vm_path: &str, + final_path: &str, ) -> Result<()> { let major_id: c_uint; let minor_id: c_uint; @@ -448,7 +450,7 @@ fn update_spec_device( info!( sl!(), - "got the device: dev_path: {}, major: {}, minor: {}\n", vm_path, major_id, minor_id + "update_spec_device(): vm_path={}, major: {}, minor: {}\n", vm_path, major_id, minor_id ); if let Some(idxdata) = devidx.0.get(host_path) { @@ -458,14 +460,17 @@ fn update_spec_device( dev.major = major_id as i64; dev.minor = minor_id as i64; + dev.path = final_path.to_string(); info!( sl!(), - "change the device from major: {} minor: {} to vm device major: {} minor: {}", + "change the device from path: {} major: {} minor: {} to vm device path: {} major: {} minor: {}", + host_path, host_major, host_minor, - major_id, - minor_id + dev.path, + dev.major, + dev.minor, ); // Resources must be updated since they are used to identify @@ -504,7 +509,13 @@ async fn virtiommio_blk_device_handler( return Err(anyhow!("Invalid path for virtio mmio blk device")); } - update_spec_device(spec, devidx, &device.container_path, &device.vm_path) + update_spec_device( + spec, + devidx, + &device.container_path, + &device.vm_path, + &device.container_path, + ) } // device.Id should be a PCI path string @@ -520,7 +531,13 @@ async fn virtio_blk_device_handler( dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) + update_spec_device( + spec, + devidx, + &dev.container_path, + &dev.vm_path, + &dev.container_path, + ) } // device.id should be a CCW path string @@ -535,7 +552,13 @@ async fn virtio_blk_ccw_device_handler( let mut dev = device.clone(); let ccw_device = ccw::Device::from_str(&device.id)?; dev.vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; - update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) + update_spec_device( + spec, + devidx, + &dev.container_path, + &dev.vm_path, + &dev.container_path, + ) } #[cfg(not(target_arch = "s390x"))] @@ -559,7 +582,13 @@ async fn virtio_scsi_device_handler( ) -> Result<()> { let mut dev = device.clone(); dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path) + update_spec_device( + spec, + devidx, + &dev.container_path, + &dev.vm_path, + &dev.container_path, + ) } #[instrument] @@ -573,7 +602,13 @@ async fn virtio_nvdimm_device_handler( return Err(anyhow!("Invalid path for nvdimm device")); } - update_spec_device(spec, devidx, &device.container_path, &device.vm_path) + update_spec_device( + spec, + devidx, + &device.container_path, + &device.vm_path, + &device.container_path, + ) } fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { @@ -777,20 +812,20 @@ mod tests { let container_path = ""; let vm_path = ""; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_err()); // linux is empty let container_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_err()); spec.linux = Some(Linux::default()); // linux.devices is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_err()); spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice { @@ -802,14 +837,14 @@ mod tests { // vm_path empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_err()); let vm_path = "/dev/null"; // guest and host path are not the same let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!( res.is_err(), "container_path={:?} vm_path={:?} spec={:?}", @@ -822,7 +857,7 @@ mod tests { // spec.linux.resources is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_ok()); // update both devices and cgroup lists @@ -843,7 +878,7 @@ mod tests { }); let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_ok()); } @@ -923,7 +958,13 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path_a, vm_path_a); + let res = update_spec_device( + &mut spec, + &devidx, + container_path_a, + vm_path_a, + container_path_a, + ); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -938,7 +979,13 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path_b, vm_path_b); + let res = update_spec_device( + &mut spec, + &devidx, + container_path_b, + vm_path_b, + container_path_b, + ); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1013,7 +1060,7 @@ mod tests { assert_eq!(Some(host_major), specresources.devices[1].major); assert_eq!(Some(host_minor), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); assert!(res.is_ok()); // Only the char device, not the block device should be updated @@ -1024,6 +1071,43 @@ mod tests { assert_eq!(Some(host_minor), specresources.devices[1].minor); } + #[test] + fn test_update_spec_device_final_path() { + let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); + let guest_major = stat::major(null_rdev) as i64; + let guest_minor = stat::minor(null_rdev) as i64; + + let host_path = "/dev/host"; + let host_major: i64 = 99; + let host_minor: i64 = 99; + + let mut spec = Spec { + linux: Some(Linux { + devices: vec![oci::LinuxDevice { + path: host_path.to_string(), + r#type: "c".to_string(), + major: host_major, + minor: host_minor, + ..oci::LinuxDevice::default() + }], + ..Linux::default() + }), + ..Spec::default() + }; + let devidx = DevIndex::new(&spec); + + let vm_path = "/dev/null"; + let final_path = "/dev/final"; + + let res = update_spec_device(&mut spec, &devidx, host_path, vm_path, final_path); + assert!(res.is_ok()); + + let specdevices = &spec.linux.as_ref().unwrap().devices; + assert_eq!(guest_major, specdevices[0].major); + assert_eq!(guest_minor, specdevices[0].minor); + assert_eq!(final_path, specdevices[0].path); + } + #[test] fn test_pcipath_to_sysfs() { let testdir = tempdir().expect("failed to create tmpdir"); From 2680c0bfee2455f422bdc6bc13afe06f2ad9ff8d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 13 Oct 2021 15:04:57 +1100 Subject: [PATCH 07/15] rustjail: Provide useful context on device node creation errors create_devices() within the rustjail module is responsible for creating device nodes within the (inner) containers. Errors that occur here will be propagated up, but are likely to be low level failures of mknod() - e.g. ENOENT or EACCESS - which won't be very useful without context when reported all the way up to the runtime without the context of what we were trying to do. Add some anyhow context information giving the details of the device we were trying to create when it failed. Signed-off-by: David Gibson --- src/agent/rustjail/src/mount.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index 8c021cce39..4ec36761a9 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -832,14 +832,14 @@ fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> { let op: fn(&LinuxDevice) -> Result<()> = if bind { bind_dev } else { mknod_dev }; let old = stat::umask(Mode::from_bits_truncate(0o000)); for dev in DEFAULT_DEVICES.iter() { - op(dev)?; + op(dev).context(format!("Creating container device {:?}", dev))?; } for dev in devices { if !dev.path.starts_with("/dev") || dev.path.contains("..") { let msg = format!("{} is not a valid device path", dev.path); bail!(anyhow!(msg)); } - op(dev)?; + op(dev).context(format!("Creating container device {:?}", dev))?; } stat::umask(old); Ok(()) From d6b62c029e137d7d2f294cdcaa539fd61e2048c2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 13 Oct 2021 16:43:20 +1100 Subject: [PATCH 08/15] rustjail: Change mknod_dev() and bind_dev() to take relative device path Both these functions take the absolute path from LinuxDevice and drop the leading '/' to make a relative path. They do that with a simple &dev.path[1..]. That can be technically incorrect in some edge cases such as a path with redundant /s like "//dev//sda". To handle cases like that, have the explicit relative path passed into these functions. For now we calculate it in the same buggy way, but we'll fix that shortly. Signed-off-by: David Gibson --- src/agent/rustjail/src/mount.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index 4ec36761a9..ec808d5492 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -829,17 +829,19 @@ fn default_symlinks() -> Result<()> { Ok(()) } fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> { - let op: fn(&LinuxDevice) -> Result<()> = if bind { bind_dev } else { mknod_dev }; + let op: fn(&LinuxDevice, &Path) -> Result<()> = if bind { bind_dev } else { mknod_dev }; let old = stat::umask(Mode::from_bits_truncate(0o000)); for dev in DEFAULT_DEVICES.iter() { - op(dev).context(format!("Creating container device {:?}", dev))?; + let path = Path::new(&dev.path[1..]); + op(dev, path).context(format!("Creating container device {:?}", dev))?; } for dev in devices { if !dev.path.starts_with("/dev") || dev.path.contains("..") { let msg = format!("{} is not a valid device path", dev.path); bail!(anyhow!(msg)); } - op(dev).context(format!("Creating container device {:?}", dev))?; + let path = Path::new(&dev.path[1..]); + op(dev, path).context(format!("Creating container device {:?}", dev))?; } stat::umask(old); Ok(()) @@ -861,21 +863,21 @@ lazy_static! { }; } -fn mknod_dev(dev: &LinuxDevice) -> Result<()> { +fn mknod_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> { let f = match LINUXDEVICETYPE.get(dev.r#type.as_str()) { Some(v) => v, None => return Err(anyhow!("invalid spec".to_string())), }; stat::mknod( - &dev.path[1..], + relpath, *f, Mode::from_bits_truncate(dev.file_mode.unwrap_or(0)), nix::sys::stat::makedev(dev.major as u64, dev.minor as u64), )?; unistd::chown( - &dev.path[1..], + relpath, Some(Uid::from_raw(dev.uid.unwrap_or(0) as uid_t)), Some(Gid::from_raw(dev.gid.unwrap_or(0) as uid_t)), )?; @@ -883,9 +885,9 @@ fn mknod_dev(dev: &LinuxDevice) -> Result<()> { Ok(()) } -fn bind_dev(dev: &LinuxDevice) -> Result<()> { +fn bind_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> { let fd = fcntl::open( - &dev.path[1..], + relpath, OFlag::O_RDWR | OFlag::O_CREAT, Mode::from_bits_truncate(0o644), )?; @@ -894,7 +896,7 @@ fn bind_dev(dev: &LinuxDevice) -> Result<()> { mount( Some(&*dev.path), - &dev.path[1..], + relpath, None::<&str>, MsFlags::MS_BIND, None::<&str>, @@ -1258,11 +1260,12 @@ mod tests { uid: Some(unistd::getuid().as_raw()), gid: Some(unistd::getgid().as_raw()), }; + let path = Path::new("fifo"); - let ret = mknod_dev(&dev); + let ret = mknod_dev(&dev, path); assert!(ret.is_ok(), "Should pass. Got: {:?}", ret); - let ret = stat::stat("fifo"); + let ret = stat::stat(path); assert!(ret.is_ok(), "Should pass. Got: {:?}", ret); } #[test] From 9891efc61f4203a0b56bb49456d8d4486993ecd8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 13 Oct 2021 15:43:30 +1100 Subject: [PATCH 09/15] rustjail: Correct sanity checks on device path For each user supplied device, create_devices() checks that the given path actually is in /dev, by checking that its path starts with /dev and does not contain "..". However, this has subtle errors because it's interpreting the path as a raw string without considering separators. It will accept the path /devfoo which it should not, while it will not accept the valid (though weird) paths /dev/... and /dev/a..b. Correct this by using std::path::Path methods designed for the purpose. Having done this, it's trivial to also generate the relative path that mknod_dev() or bind_dev() will need, so do that at the same time. We also move this logic into a helper function so that we can add some unit tests for it. Signed-off-by: David Gibson --- src/agent/rustjail/src/mount.rs | 46 ++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index ec808d5492..ac9498c73d 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{anyhow, Context, Result}; use libc::uid_t; use nix::errno::Errno; use nix::fcntl::{self, OFlag}; @@ -19,7 +19,7 @@ use std::fs::{self, OpenOptions}; use std::mem::MaybeUninit; use std::os::unix; use std::os::unix::io::RawFd; -use std::path::{Path, PathBuf}; +use std::path::{Component, Path, PathBuf}; use path_absolutize::*; use std::fs::File; @@ -828,6 +828,19 @@ fn default_symlinks() -> Result<()> { } Ok(()) } + +fn dev_rel_path(path: &str) -> Option<&Path> { + let path = Path::new(path); + + if !path.starts_with("/dev") + || path == Path::new("/dev") + || path.components().any(|c| c == Component::ParentDir) + { + return None; + } + path.strip_prefix("/").ok() +} + fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> { let op: fn(&LinuxDevice, &Path) -> Result<()> = if bind { bind_dev } else { mknod_dev }; let old = stat::umask(Mode::from_bits_truncate(0o000)); @@ -836,11 +849,10 @@ fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> { op(dev, path).context(format!("Creating container device {:?}", dev))?; } for dev in devices { - if !dev.path.starts_with("/dev") || dev.path.contains("..") { + let path = dev_rel_path(&dev.path).ok_or_else(|| { let msg = format!("{} is not a valid device path", dev.path); - bail!(anyhow!(msg)); - } - let path = Path::new(&dev.path[1..]); + anyhow!(msg) + })?; op(dev, path).context(format!("Creating container device {:?}", dev))?; } stat::umask(old); @@ -1382,4 +1394,26 @@ mod tests { assert!(result == t.result, "{}", msg); } } + + #[test] + fn test_dev_rel_path() { + // Valid device paths + assert_eq!(dev_rel_path("/dev/sda").unwrap(), Path::new("dev/sda")); + assert_eq!(dev_rel_path("//dev/sda").unwrap(), Path::new("dev/sda")); + assert_eq!( + dev_rel_path("/dev/vfio/99").unwrap(), + Path::new("dev/vfio/99") + ); + assert_eq!(dev_rel_path("/dev/...").unwrap(), Path::new("dev/...")); + assert_eq!(dev_rel_path("/dev/a..b").unwrap(), Path::new("dev/a..b")); + assert_eq!(dev_rel_path("/dev//foo").unwrap(), Path::new("dev/foo")); + + // Bad device paths + assert!(dev_rel_path("/devfoo").is_none()); + assert!(dev_rel_path("/etc/passwd").is_none()); + assert!(dev_rel_path("/dev/../etc/passwd").is_none()); + assert!(dev_rel_path("dev/foo").is_none()); + assert!(dev_rel_path("").is_none()); + assert!(dev_rel_path("/dev").is_none()); + } } From 175f9b06e92d466d7abae4c21fca887131bce216 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 13 Oct 2021 16:59:58 +1100 Subject: [PATCH 10/15] rustjail: Allow container devices in subdirectories Many device nodes go directly under /dev, however some are conventionally placed in subdirectories under /dev. For example /dev/vfio/vfio or /dev/pts/ptmx. Currently, attempting to pass such a device into a Kata container will fail because mknod() will get an ENOENT because the parent directory is missing (or an equivalent error for bind_dev()). Correct that by making subdirectories as necessary in create_devices(). Signed-off-by: David Gibson --- src/agent/rustjail/src/mount.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index ac9498c73d..7d1f7bc8fe 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -853,6 +853,9 @@ fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> { let msg = format!("{} is not a valid device path", dev.path); anyhow!(msg) })?; + if let Some(dir) = path.parent() { + fs::create_dir_all(dir).context(format!("Creating container device {:?}", dev))?; + } op(dev, path).context(format!("Creating container device {:?}", dev))?; } stat::umask(old); From 730b9c433f6e924de6dc47f4057cab81e12a7b09 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 7 Oct 2021 14:26:50 +1100 Subject: [PATCH 11/15] agent/device: Create device nodes for VFIO devices Add and adjust the vfio devices in the inner container spec so that rustjail will create device nodes for them. In order to do that, we also need to make sure the VFIO device node is ready within the guest VM first. That may take (slightly) longer than just the underlying PCI device(s) being ready, because vfio-pci needs to initialize. So, add a helper function that will wait for a specific VFIO device node to be ready, using the existing uevent listening mechanism. It also returns the device node name for the device (though in practice it will always /dev/vfio/NN where NN is the group number). Signed-off-by: David Gibson --- src/agent/src/device.rs | 63 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 39429bb349..df0221c746 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -104,7 +104,7 @@ where // Represents an IOMMU group #[derive(Clone, Debug, PartialEq, Eq)] -struct IommuGroup(u32); +pub struct IommuGroup(u32); impl fmt::Display for IommuGroup { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { @@ -379,6 +379,33 @@ pub async fn wait_for_pci_device( Ok(addr) } +#[derive(Debug)] +struct VfioMatcher { + syspath: String, +} + +impl VfioMatcher { + fn new(grp: IommuGroup) -> VfioMatcher { + VfioMatcher { + syspath: format!("/devices/virtual/vfio/{}", grp), + } + } +} + +impl UeventMatcher for VfioMatcher { + fn is_match(&self, uev: &Uevent) -> bool { + uev.devpath == self.syspath + } +} + +#[instrument] +async fn get_vfio_device_name(sandbox: &Arc>, grp: IommuGroup) -> Result { + let matcher = VfioMatcher::new(grp); + + let uev = wait_for_uevent(sandbox, matcher).await?; + Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname)) +} + /// Scan SCSI bus for the given SCSI address(SCSI-Id and LUN) #[instrument] fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { @@ -628,9 +655,9 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { // is a PCI path to the device in the guest (see pci.rs) async fn vfio_device_handler( device: &Device, - _: &mut Spec, + spec: &mut Spec, sandbox: &Arc>, - _: &DevIndex, + devidx: &DevIndex, ) -> Result<()> { let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE; let mut group = None; @@ -665,6 +692,15 @@ async fn vfio_device_handler( group = devgroup; } } + + if vfio_in_guest { + // If there are any devices at all, logic above ensures that group is not None + let group = group.unwrap(); + let vmpath = get_vfio_device_name(sandbox, group).await?; + + update_spec_device(spec, devidx, &device.container_path, &vmpath, &vmpath)?; + } + Ok(()) } @@ -1335,6 +1371,27 @@ mod tests { assert!(!matcher_a.is_match(&uev_b)); } + #[tokio::test] + async fn test_vfio_matcher() { + let grpa = IommuGroup(1); + let grpb = IommuGroup(22); + + let mut uev_a = crate::uevent::Uevent::default(); + uev_a.action = crate::linux_abi::U_EVENT_ACTION_ADD.to_string(); + uev_a.devname = format!("vfio/{}", grpa); + uev_a.devpath = format!("/devices/virtual/vfio/{}", grpa); + let matcher_a = VfioMatcher::new(grpa); + + let mut uev_b = uev_a.clone(); + uev_b.devpath = format!("/devices/virtual/vfio/{}", grpb); + let matcher_b = VfioMatcher::new(grpb); + + assert!(matcher_a.is_match(&uev_a)); + assert!(matcher_b.is_match(&uev_b)); + assert!(!matcher_b.is_match(&uev_a)); + assert!(!matcher_a.is_match(&uev_b)); + } + #[test] fn test_split_vfio_option() { assert_eq!( From 57ab408576290a7d7f1b172fd45cac5ecb18916b Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 16:37:28 +1100 Subject: [PATCH 12/15] runtime: Introduce "vfio_mode" config variable and annotation In order to support DPDK workloads, we need to change the way VFIO devices will be handled in Kata containers. However, the current method, although it is not remotely OCI compliant has real uses. Therefore, introduce a new runtime configuration field "vfio_mode" to control how VFIO devices will be presented to the container. We also add a new sandbox annotation - io.katacontainers.config.runtime.vfio_mode - to override this on a per-sandbox basis. For now, the only allowed value is "guest-kernel" which refers to the current behaviour where VFIO devices added to the container will be bound to whatever driver in the VM kernel claims them. Signed-off-by: David Gibson --- src/runtime/Makefile | 2 ++ src/runtime/config/configuration-clh.toml.in | 14 +++++++++++ src/runtime/config/configuration-qemu.toml.in | 14 +++++++++++ .../pkg/katautils/config-settings.go.in | 1 + src/runtime/pkg/katautils/config.go | 14 +++++++++++ .../virtcontainers/device/config/config.go | 25 +++++++++++++++++++ .../pkg/annotations/annotations.go | 4 +++ src/runtime/virtcontainers/pkg/oci/utils.go | 13 ++++++++++ src/runtime/virtcontainers/sandbox.go | 2 ++ 9 files changed, 89 insertions(+) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 5bfc803f13..4fbbcbfb78 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -190,6 +190,7 @@ DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"] DEFFILEMEMBACKEND := "" DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"] DEFMSIZE9P := 8192 +DEFVFIOMODE := guest-kernel # Default cgroup model DEFSANDBOXCGROUPONLY ?= false @@ -459,6 +460,7 @@ USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFBINDMOUNTS +USER_VARS += DEFVFIOMODE USER_VARS += FEATURE_SELINUX USER_VARS += BUILDFLAGS diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index a128aea6a9..076a113941 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -263,6 +263,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f055ae1303..f2c7c4fa73 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -543,6 +543,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 66f30e073f..bd793e23f4 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -88,6 +88,7 @@ const defaultConfidentialGuest = false const defaultGuestSwap = false const defaultRootlessHypervisor = false const defaultDisableSeccomp = false +const defaultVfioMode = "guest-kernel" var defaultSGXEPCSize = int64(0) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 93a870e8a0..79634ba266 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -143,6 +143,7 @@ type runtime struct { JaegerEndpoint string `toml:"jaeger_endpoint"` JaegerUser string `toml:"jaeger_user"` JaegerPassword string `toml:"jaeger_password"` + VfioMode string `toml:"vfio_mode"` SandboxBindMounts []string `toml:"sandbox_bind_mounts"` Experimental []string `toml:"experimental"` Debug bool `toml:"enable_debug"` @@ -1068,6 +1069,11 @@ func initConfig() (config oci.RuntimeConfig, err error) { return oci.RuntimeConfig{}, err } + err = config.VfioMode.VFIOSetMode(defaultVfioMode) + if err != nil { + return oci.RuntimeConfig{}, err + } + config = oci.RuntimeConfig{ HypervisorType: defaultHypervisor, HypervisorConfig: GetDefaultHypervisorConfig(), @@ -1114,6 +1120,14 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat } } + if tomlConf.Runtime.VfioMode != "" { + err = config.VfioMode.VFIOSetMode(tomlConf.Runtime.VfioMode) + + if err != nil { + return "", config, err + } + } + if !ignoreLogging { err := handleSystemLog("", "") if err != nil { diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index 8627094a91..5e62d0c179 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -187,6 +187,31 @@ type BlockDrive struct { Swap bool } +// VFIOMode indicates e behaviour mode for handling devices in the VM +type VFIOModeType uint32 + +const ( + // VFIOModeGuestKernel specifies Kata-specific behaviour + // useful in certain cases: VFIO devices specified to Kata are + // bound to whatever driver in the VM will take them. This + // requires specialized containers expecting this behaviour to + // locate and use the devices + VFIOModeGuestKernel = iota +) + +const ( + vfioModeGuestKernelStr = "guest-kernel" +) + +func (m *VFIOModeType) VFIOSetMode(modeName string) error { + switch modeName { + case vfioModeGuestKernelStr: + *m = VFIOModeGuestKernel + return nil + } + return fmt.Errorf("Unknown VFIO mode %s", modeName) +} + // VFIODeviceType indicates VFIO device type type VFIODeviceType uint32 diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 2a7e88f725..5161e84b17 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -250,6 +250,10 @@ const ( // DisableNewNetNs is a sandbox annotation that determines if create a netns for hypervisor process. DisableNewNetNs = kataAnnotRuntimePrefix + "disable_new_netns" + + // VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated + // Overrides the runtime.vfio_mode parameter in the global configuration.toml + VfioMode = kataAnnotRuntimePrefix + "vfio_mode" ) // Agent related annotations diff --git a/src/runtime/virtcontainers/pkg/oci/utils.go b/src/runtime/virtcontainers/pkg/oci/utils.go index ba7977db19..d6fc10d8d6 100644 --- a/src/runtime/virtcontainers/pkg/oci/utils.go +++ b/src/runtime/virtcontainers/pkg/oci/utils.go @@ -116,6 +116,10 @@ type RuntimeConfig struct { //the container network interface InterNetworkModel vc.NetInterworkingModel + //Determines how VFIO devices should be presented to the + //container + VfioMode config.VFIOModeType + Debug bool Trace bool @@ -826,6 +830,13 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r sbConfig.NetworkConfig.InterworkingModel = runtimeConfig.InterNetworkModel } + if value, ok := ocispec.Annotations[vcAnnotations.VfioMode]; ok { + if err := sbConfig.VfioMode.VFIOSetMode(value); err != nil { + return fmt.Errorf("Unknown VFIO mode \"%s\" in annotation %s", + value, vcAnnotations.VfioMode) + } + } + return nil } @@ -893,6 +904,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c ShmSize: shmSize, + VfioMode: runtime.VfioMode, + SystemdCgroup: systemdCgroup, SandboxCgroupOnly: runtime.SandboxCgroupOnly, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 8ce76497a8..5b9bc7fc4e 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -134,6 +134,8 @@ type SandboxConfig struct { ShmSize uint64 + VfioMode config.VFIOModeType + // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool From d9e2e9edb2c8ee480c0dd0dc63da216fa041d7c8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 17:07:42 +1100 Subject: [PATCH 13/15] runtime: Rename constraintGRPCSpec to improve grammar "constraint" is a noun, "constrain" is the associated verb, which makes more sense in this context. Signed-off-by: David Gibson --- src/runtime/virtcontainers/kata_agent.go | 8 ++++---- src/runtime/virtcontainers/kata_agent_test.go | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index feccb6382c..f46f5f3a1b 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -995,7 +995,7 @@ func (k *kataAgent) replaceOCIMountsForStorages(spec *specs.Spec, volumeStorages return nil } -func (k *kataAgent) constraintGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) { +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. It would make no sense to try // to apply them on the guest. @@ -1411,9 +1411,9 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported - // We need to constraint the spec to make sure we're not passing - // irrelevant information to the agent. - k.constraintGRPCSpec(grpcSpec, passSeccomp) + // We need to constrain the spec to make sure we're not + // passing irrelevant information to the agent. + k.constrainGRPCSpec(grpcSpec, passSeccomp) req := &grpc.CreateContainerRequest{ ContainerId: c.id, diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index ce4515e2ee..fa50569cea 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -541,7 +541,7 @@ func TestAppendVhostUserBlkDevices(t *testing.T) { updatedDevList, expected) } -func TestConstraintGRPCSpec(t *testing.T) { +func TestConstrainGRPCSpec(t *testing.T) { assert := assert.New(t) expectedCgroupPath := "/foo/bar" @@ -589,7 +589,7 @@ func TestConstraintGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constraintGRPCSpec(g, true) + k.constrainGRPCSpec(g, true) // check nil fields assert.Nil(g.Hooks) From 68696e051dd29329209fc9f52c9fe64899f2a55d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 17:03:59 +1100 Subject: [PATCH 14/15] runtime: Add parameter to constrainGRPCSpec to control VFIO handling Currently constrainGRPCSpec always removes VFIO devices from the OCI container spec which will be used for the inner container. For upcoming support for VFIO devices in DPDK usecases we'll need to not do that. As a preliminary to that, add an extra parameter to the function to control whether or not it will remove the VFIO devices from the spec. Signed-off-by: David Gibson --- src/runtime/virtcontainers/kata_agent.go | 26 +++++++++++-------- src/runtime/virtcontainers/kata_agent_test.go | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index f46f5f3a1b..c1a39220fb 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -995,7 +995,7 @@ func (k *kataAgent) replaceOCIMountsForStorages(spec *specs.Spec, volumeStorages return nil } -func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) { +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, stripVfio bool) { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. It would make no sense to try // to apply them on the guest. @@ -1058,17 +1058,21 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) { } grpcSpec.Linux.Namespaces = tmpNamespaces - // VFIO char device shouldn't not appear in the guest, - // the device driver should handle it and determinate its group. - var linuxDevices []grpc.LinuxDevice - for _, dev := range grpcSpec.Linux.Devices { - if dev.Type == "c" && strings.HasPrefix(dev.Path, vfioPath) { - k.Logger().WithField("vfio-dev", dev.Path).Debug("removing vfio device from grpcSpec") - continue + if stripVfio { + // VFIO char device shouldn't appear in the guest + // (because the VM device driver will do something + // with it rather than just presenting it to the + // container unmodified) + var linuxDevices []grpc.LinuxDevice + for _, dev := range grpcSpec.Linux.Devices { + if dev.Type == "c" && strings.HasPrefix(dev.Path, vfioPath) { + k.Logger().WithField("vfio-dev", dev.Path).Debug("removing vfio device from grpcSpec") + continue + } + linuxDevices = append(linuxDevices, dev) } - linuxDevices = append(linuxDevices, dev) + grpcSpec.Linux.Devices = linuxDevices } - grpcSpec.Linux.Devices = linuxDevices } func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) { @@ -1413,7 +1417,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. - k.constrainGRPCSpec(grpcSpec, passSeccomp) + k.constrainGRPCSpec(grpcSpec, passSeccomp, true) req := &grpc.CreateContainerRequest{ ContainerId: c.id, diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index fa50569cea..0db6f4b28b 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -589,7 +589,7 @@ func TestConstrainGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constrainGRPCSpec(g, true) + k.constrainGRPCSpec(g, true, true) // check nil fields assert.Nil(g.Hooks) From 34273da98f11589bdfa46327d3418643b492275a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 16:57:48 +1100 Subject: [PATCH 15/15] runtime/device: Allow VFIO devices to be presented to guest as VFIO devices On a conventional (e.g. runc) container, passing in a VFIO group device, /dev/vfio/NN, will result in the same VFIO group device being available within the container. With Kata, however, the VFIO device will be bound to the guest kernel's driver (if it has one), possibly appearing as some other device (or a network interface) within the guest. This add a new `vfio_mode` option to alter this. If set to "vfio" it will instruct the agent to remap VFIO devices to the VFIO driver within the guest as well, meaning they will appear as VFIO devices within the container. Unlike a runc container, the VFIO devices will have different names to the host, since the names correspond to the IOMMU groups of the guest and those can't be remapped with namespaces. For now we keep 'guest-kernel' as the value in the default configuration files, to maintain current Kata behaviour. In future we should change this to 'vfio' as the default. That will make Kata's default behaviour more closely resemble OCI specified behaviour. fixes #693 Signed-off-by: David Gibson --- src/runtime/config/configuration-clh.toml.in | 7 +++++++ src/runtime/config/configuration-qemu.toml.in | 7 +++++++ src/runtime/virtcontainers/device/config/config.go | 11 ++++++++++- src/runtime/virtcontainers/kata_agent.go | 13 +++++++++++-- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 076a113941..3e46d4961d 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -267,6 +267,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@ # Determines how VFIO devices should be be presented to the container. # Options: # +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# # - guest-kernel # This is a Kata-specific behaviour that's useful in certain cases. # The VFIO device is managed by whatever driver in the VM kernel diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f2c7c4fa73..5bc8e9cc38 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -547,6 +547,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@ # Determines how VFIO devices should be be presented to the container. # Options: # +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# # - guest-kernel # This is a Kata-specific behaviour that's useful in certain cases. # The VFIO device is managed by whatever driver in the VM kernel diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index 5e62d0c179..4f27d9358b 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -191,20 +191,29 @@ type BlockDrive struct { type VFIOModeType uint32 const ( + // VFIOModeVFIO specifies OCI compliant behaviour: VFIO + // devices specified to Kata appear as VFIO devices within the + // container + VFIOModeVFIO VFIOModeType = iota + // VFIOModeGuestKernel specifies Kata-specific behaviour // useful in certain cases: VFIO devices specified to Kata are // bound to whatever driver in the VM will take them. This // requires specialized containers expecting this behaviour to // locate and use the devices - VFIOModeGuestKernel = iota + VFIOModeGuestKernel ) const ( + vfioModeVfioStr = "vfio" vfioModeGuestKernelStr = "guest-kernel" ) func (m *VFIOModeType) VFIOSetMode(modeName string) error { switch modeName { + case vfioModeVfioStr: + *m = VFIOModeVFIO + return nil case vfioModeGuestKernelStr: *m = VFIOModeGuestKernel return nil diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index c1a39220fb..d3c4fa48b8 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -92,6 +92,7 @@ var ( kataNvdimmDevType = "nvdimm" kataVirtioFSDevType = "virtio-fs" kataWatchableBindDevType = "watchable-bind" + kataVfioDevType = "vfio" // VFIO device to used as VFIO in the container kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"} sharedDirVirtioFSOptions = []string{} @@ -1183,11 +1184,19 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * // (see qomGetPciPath() for details). kataDevice := &grpc.Device{ ContainerPath: dev.ContainerPath, - Type: kataVfioGuestKernelDevType, + Type: kataVfioDevType, Id: groupNum, Options: make([]string, len(devList)), } + // We always pass the device information to the agent, since + // it needs that to wait for them to be ready. But depending + // on the vfio_mode, we need to use a different device type so + // the agent can handle it properly + if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel { + kataDevice.Type = kataVfioGuestKernelDevType + } + for i, pciDev := range devList { kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath) } @@ -1417,7 +1426,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. - k.constrainGRPCSpec(grpcSpec, passSeccomp, true) + k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel) req := &grpc.CreateContainerRequest{ ContainerId: c.id,