diff --git a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in index fdf9f7a585..8e468795c3 100644 --- a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in @@ -381,6 +381,12 @@ disable_image_nvdimm = false # Default false hotplug_vfio_on_root_bus = false +# Enable cold-plugging of VFIO devices to a PCIe port type. +# Accepted values: "no-port" (default, disabled), "root-port". +# When set to "root-port", devices discovered via CDI / Pod Resources +# are cold-plugged before VM boot. +cold_plug_vfio = "no-port" + # Before hot plugging a PCIe device, you need to add a pcie_root_port device. # Use this parameter when using some large PCI bar devices, such as Nvidia GPU # The value means the number of pcie_root_port diff --git a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in index c16c6e2ff3..edb888ea43 100644 --- a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in @@ -365,6 +365,12 @@ disable_image_nvdimm = true # Default false hotplug_vfio_on_root_bus = false +# Enable cold-plugging of VFIO devices to a PCIe port type. +# Accepted values: "no-port" (disabled), "root-port". +# When set to "root-port", devices discovered via CDI / Pod Resources +# are cold-plugged before VM boot. +cold_plug_vfio = "root-port" + # Before hot plugging a PCIe device, you need to add a pcie_root_port device. 
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU # The value means the number of pcie_root_port diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 7bec687abe..d29173f63a 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{ use tokio::sync::{Mutex, RwLock}; use crate::{ - vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig, - BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor, - NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig, - VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, - KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO, - VIRTIO_BLOCK_PCI, VIRTIO_PMEM, + vfio_device::{VfioDeviceModernHandle, VfioDeviceType}, + vhost_user_blk::VhostUserBlkDevice, + BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, + Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, + VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, + KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, + VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM, }; use super::{ @@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock) -> SharedFsInfo { d.read().await.get_shared_fs_info().await } +/// Returns the APQN list for a cold-plugged VFIO-AP device whose +/// `iommu_group_devnode` matches `host_path`, or `None` if no such device is +/// registered in the device manager. +/// +/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices +/// that were cold-plugged before VM boot. 
VFIO-AP devices have no PCIe BDF so +/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that +/// gap without touching reference counts or the QMP hot-plug path. +pub async fn find_cold_plugged_vfio_ap( + d: &RwLock, + host_path: &str, +) -> Option> { + // Avoid holding the DeviceManager read-lock across .await points. + let devices: Vec = { + let dm = d.read().await; + dm.devices.values().cloned().collect() + }; + for dev in devices { + if let DeviceType::VfioModern(inner) = dev.lock().await.get_device_info().await { + let guard = inner.lock().await; + if guard.device.device_type == VfioDeviceType::MediatedAp + && guard.config.iommu_group_devnode == Path::new(host_path) + { + return Some(guard.config.ap_devices.clone()); + } + } + } + None +} + #[cfg(test)] mod tests { use super::DeviceManager; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index b28ed7ca55..f8833ad30b 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -20,10 +20,10 @@ mod virtio_vsock; pub use port_device::{PCIePortDevice, PortDeviceConfig}; pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig}; pub use vfio::{ - bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig, - VfioDevice, + bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, + VfioConfig, VfioDevice, VfioDeviceType, }; -pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; +pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType}; pub use vhost_user_net::VhostUserNetDevice; pub use virtio_blk::{ diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs 
b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs index 88aa8b6720..631881f3e7 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs @@ -19,6 +19,7 @@ const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices"; const DEV_IOMMU: &str = "/dev/iommu"; const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices"; const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev"; +const SYS_VFIO_AP: &str = "/sys/devices/vfio_ap"; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VfioIommufdBackend { @@ -47,6 +48,10 @@ pub struct VfioDevice { pub primary: DeviceInfo, pub labels: BTreeMap, pub health: Health, + + /// APQNs (Adjunct Processor Queue Numbers) for MediatedAp devices, e.g. ["0a.0001", "0b.0002"]. + /// Populated by discover_vfio_ap_device(); empty for all non-AP device types. + pub ap_devices: Vec, } #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] @@ -656,6 +661,7 @@ fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Res primary: primary_device, labels, health, + ap_devices: Vec::new(), }) } @@ -770,3 +776,96 @@ pub fn is_dev_vfio_group_path(host_path: &str) -> bool { // Valid if remainder is non-empty and contains only digits !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit()) } + +/// Returns true if the VFIO group at `group_devnode` (e.g. `/dev/vfio/N`) contains +/// an s390x AP mediated device. Detection works by resolving the sysfs symlinks for +/// every entry in `/sys/kernel/iommu_groups/N/devices/` and checking whether any of +/// them resolves to a path under `/sys/devices/vfio_ap`. 
+pub fn is_vfio_ap_device(group_devnode: &Path) -> bool { + let gid = match parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) { + Some(id) => id, + None => return false, + }; + let group_devices_dir = Path::new(SYS_IOMMU_GROUPS) + .join(gid.to_string()) + .join("devices"); + let rd = match fs::read_dir(&group_devices_dir) { + Ok(r) => r, + Err(_) => return false, + }; + for ent in rd.flatten() { + let link_path = group_devices_dir.join(ent.file_name()); + if let Ok(resolved) = fs::canonicalize(&link_path) { + if resolved.starts_with(SYS_VFIO_AP) { + return true; + } + } + } + false +} + +/// Discovers an s390x VFIO-AP mediated device from its VFIO group path (`/dev/vfio/N`). +/// +/// Reads the `matrix` file from the mdev's sysfs path to obtain the list of APQNs +/// (Adjunct Processor Queue Numbers) assigned to this matrix device. +pub fn discover_vfio_ap_device(group_devnode: &Path) -> Result { + let gid = parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) + .ok_or_else(|| anyhow!("Invalid VFIO group path: {}", group_devnode.display()))?; + + let group_devices_dir = Path::new(SYS_IOMMU_GROUPS) + .join(gid.to_string()) + .join("devices"); + + // Enumerate IOMMU group entries and find the AP mdev symlink. + let mut ap_sysfs_path: Option = None; + for ent in fs::read_dir(&group_devices_dir) + .with_context(|| format!("Failed to read {}", group_devices_dir.display()))? + .flatten() + { + let link_path = group_devices_dir.join(ent.file_name()); + if let Ok(resolved) = fs::canonicalize(&link_path) { + if resolved.starts_with(SYS_VFIO_AP) { + ap_sysfs_path = Some(resolved); + break; + } + } + } + + let sysfs_dev = ap_sysfs_path + .ok_or_else(|| anyhow!("No VFIO-AP device found in IOMMU group {}", gid))?; + + // Read APQNs from the `matrix` sysfs attribute (one APQN per line, e.g. "0a.0001"). 
+ let matrix_path = sysfs_dev.join("matrix"); + let matrix_raw = fs::read_to_string(&matrix_path) + .with_context(|| format!("Failed to read {}", matrix_path.display()))?; + let ap_devices: Vec = matrix_raw + .lines() + .map(|l| l.trim().to_string()) + .filter(|l| !l.is_empty()) + .collect(); + + let primary = DeviceInfo { + addr: DeviceAddress::MdevUuid( + sysfs_dev + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_default(), + ), + sysfs_path: sysfs_dev.clone(), + ..Default::default() + }; + + Ok(VfioDevice { + id: format!("vfio-ap-{}", gid), + device_type: VfioDeviceType::MediatedAp, + bus_mode: VfioBusMode::Ccw, + iommu_group: None, + iommu_group_id: Some(gid), + iommufd: None, + devices: vec![primary.clone()], + primary, + labels: BTreeMap::new(), + health: Health::Healthy, + ap_devices, + }) +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs index e7db73679a..6519df4aba 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs @@ -13,7 +13,10 @@ use crate::device::pci_path::PciPath; use crate::device::topology::{PCIePort, PCIeTopology}; use crate::device::util::{do_decrease_count, do_increase_count}; use crate::device::{Device, DeviceType, PCIeDevice}; -use crate::vfio_device::core::{discover_vfio_device, discover_vfio_group_device, VfioDevice}; +use crate::vfio_device::core::{ + discover_vfio_ap_device, discover_vfio_device, discover_vfio_group_device, is_vfio_ap_device, + VfioDevice, VfioDeviceType, +}; use crate::Hypervisor; /// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id) @@ -64,6 +67,11 @@ pub struct VfioDeviceBase { /// - VFIO Volume: "vfio_vol_" /// - VFIO NVMe: "vfio_nvme_" pub hostdev_prefix: String, + + /// APQNs assigned to this device (s390x VFIO-AP only). 
+ /// Each entry is a string like "0a.0001" read from the mdev matrix sysfs file. + /// Empty for all non-AP device types. + pub ap_devices: Vec, } #[derive(Debug, Default, Clone)] @@ -95,18 +103,33 @@ fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf { impl VfioDeviceModern { pub fn new(device_id: String, base: &VfioDeviceBase) -> Result { - // For modern VFIO devices, we require the specific device cdev path to be provided in the configuration. - // This allows us to directly discover the device context without needing to resolve group devices. - // If the device node is not provided, we can optionally fallback to group device discovery, - // but this is less efficient and may not be supported in all environments. + let group_path = vfio_modern_group_discovery_path(base); + + // s390x VFIO-AP: mediated AP devices have no PCI BDF; discover them separately. + if is_vfio_ap_device(&group_path) { + let device = discover_vfio_ap_device(&group_path)?; + let mut config = base.clone(); + config.ap_devices = device.ap_devices.clone(); + return Ok(Self { + device_id, + device, + config, + device_options: Vec::new(), + is_allocated: false, + attach_count: 0, + }); + } + + // PCI / iommufd path: use the iommu_device_node cdev when available, otherwise + // fall back to group-device discovery. let device = if let Some(ref node) = base.iommu_device_node { if !node.as_os_str().is_empty() { discover_vfio_device(node)? } else { - discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + discover_vfio_group_device(group_path)? } } else { - discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + discover_vfio_group_device(group_path)? 
}; Ok(Self { device_id, @@ -196,21 +219,31 @@ impl Device for VfioDeviceModernHandle { return Ok(()); } - // Register the device in the virtual PCIe topology - let topo = pcie_topo.as_deref_mut().ok_or_else(|| { - anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided") - })?; - self.register(topo).await?; + let is_ap = self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await; - // Request Hypervisor to perform the actual hardware passthrough - if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { - error!(sl!(), "failed to attach vfio device: {:?}", e); - - // Rollback state on failure - self.decrease_attach_count().await?; - self.unregister(topo).await?; - return Err(e); + if is_ap { + // AP devices have no PCIe topology; call the hypervisor directly. + if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { + error!(sl!(), "failed to attach vfio-ap device: {:?}", e); + self.decrease_attach_count().await?; + return Err(e); + } + } else { + // PCI devices must be registered in the topology first. + let topo = pcie_topo.as_deref_mut().ok_or_else(|| { + anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided") + })?; + self.register(topo).await?; + if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { + error!(sl!(), "failed to attach vfio device: {:?}", e); + self.decrease_attach_count().await?; + self.unregister(topo).await?; + return Err(e); + } } + info!( sl!(), "vfio device {:?} attached successfully", @@ -247,9 +280,14 @@ impl Device for VfioDeviceModernHandle { let virt = self.with(|d| d.config.virt_path.clone()).await; let device_index = virt.map(|(idx, _)| idx); - // Unregister from PCIe topology - if let Some(topo) = pcie_topo { - self.unregister(topo).await?; + // AP devices have no PCIe topology to unregister from. 
+ let is_ap = self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await; + if !is_ap { + if let Some(topo) = pcie_topo { + self.unregister(topo).await?; + } } Ok(device_index) @@ -279,6 +317,14 @@ impl Device for VfioDeviceModernHandle { impl PCIeDevice for VfioDeviceModernHandle { /// Reserves a bus and port in the PCIe topology for this device. async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> { + // AP devices have no PCIe topology. + if self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await + { + return Ok(()); + } + let device_id = self.device_id().await; let port_type = self.with(|d| d.config.port).await; @@ -299,6 +345,14 @@ impl PCIeDevice for VfioDeviceModernHandle { /// Releases the reserved PCIe resources and resets attachment state. async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> { + // AP devices have no PCIe topology. + if self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await + { + return Ok(()); + } + let device_id = self.device_id().await; topo.release_bus_for_device(&device_id)?; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs index 0758e3d43d..41206f5156 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs @@ -6,7 +6,10 @@ mod core; mod device; -pub use core::{discover_vfio_group_device, VfioDevice}; +pub use core::{ + discover_vfio_ap_device, discover_vfio_group_device, is_vfio_ap_device, VfioDevice, + VfioDeviceType, +}; pub use device::VfioDeviceBase; pub use device::VfioDeviceModern; pub use device::VfioDeviceModernHandle; diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 5352a85a45..80a9ee7b47 100644 --- 
a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -2331,6 +2331,21 @@ impl PCIeVfioDevice { } } +/// s390x VFIO-AP device: `-device vfio-ap,sysfsdev={sysfs_path}` +struct VfioApDevice { + sysfs_path: String, +} + +#[async_trait] +impl ToQemuParams for VfioApDevice { + async fn qemu_params(&self) -> Result> { + Ok(vec![ + "-device".to_string(), + format!("vfio-ap,sysfsdev={}", self.sysfs_path), + ]) + } +} + #[async_trait] impl ToQemuParams for PCIeVfioDevice { async fn qemu_params(&self) -> Result> { @@ -3169,6 +3184,16 @@ impl<'a> QemuCmdLine<'a> { Ok(()) } + /// Adds an s390x VFIO-AP device to the QEMU command line. + /// + /// Generates: `-device vfio-ap,sysfsdev={sysfs_path}` + pub fn add_vfio_ap_device(&mut self, sysfs_path: &str) -> Result<()> { + self.devices.push(Box::new(VfioApDevice { + sysfs_path: sysfs_path.to_string(), + })); + Ok(()) + } + /// Batch adds multiple VFIO devices to the QEMU command line. pub fn add_vfio_devices(&mut self, configs: Vec) -> Result<()> { if configs.is_empty() { diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs index 9c6a52a51f..548b065b47 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs @@ -203,50 +203,59 @@ impl QemuInner { } } DeviceType::VfioModern(vfio_dev) => { - // To avoid holding the lock for too long, we first snapshot the necessary VFIO parameters, - // then release the lock before doing the coldplug via cmdline, - // and finally re-acquire the lock to update the guest PCI path after coldplug. - let (devices, bus_port_id) = { + // Snapshot parameters under the lock; release before doing cmdline work. 
+ let (device_type, ap_sysfs_path, devices, bus_port_id) = { let vfio_device = vfio_dev.lock().await; + let device_type = vfio_device.device.device_type.clone(); + let ap_sysfs_path = + vfio_device.device.primary.sysfs_path.display().to_string(); let devices = vfio_device .device .iommu_group .as_ref() .map(|g| g.clone().devices) .unwrap_or_default(); - - (devices, vfio_device.config.bus_port_id.clone()) + ( + device_type, + ap_sysfs_path, + devices, + vfio_device.config.bus_port_id.clone(), + ) }; - // Cold plug devices - for dev in devices.iter() { - let host_bdf = dev.addr.to_string(); + if device_type == VfioDeviceType::MediatedAp { + // s390x VFIO-AP: -device vfio-ap,sysfsdev={ap_sysfs_path} + // No PCIe root port, no guest_pci_path. + cmdline.add_vfio_ap_device(&ap_sysfs_path)?; + info!(sl!(), "Completed VFIOModern AP coldplug for sysfsdev: {}", ap_sysfs_path); + } else { + // PCI cold plug devices + for dev in devices.iter() { + let host_bdf = dev.addr.to_string(); - let vfio_cfg = VfioDeviceConfig::new( - host_bdf, - bus_port_id.1 as u16, - bus_port_id.1 + 1, - ) - .with_vfio_bus(bus_port_id.0.clone()); + let vfio_cfg = VfioDeviceConfig::new( + host_bdf, + bus_port_id.1 as u16, + bus_port_id.1 + 1, + ) + .with_vfio_bus(bus_port_id.0.clone()); - cmdline.add_pcie_vfio_device(vfio_cfg)?; + cmdline.add_pcie_vfio_device(vfio_cfg)?; + } + + // Write back guest PCI path + let pci_path = + PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?; + { + let mut vfio_device = vfio_dev.lock().await; + vfio_device.config.guest_pci_path = Some(pci_path.clone()); + } + info!( + sl!(), + "Completed VFIOModern coldplug with returned guest pci path: {:?}", + pci_path + ); } - - // Write back with lock - let pci_path = PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?; - - { - let mut vfio_device = vfio_dev.lock().await; - // Update the guest PCI path for the VFIO device after coldplug, - // which will be used for device mapping into from Guest to Container 
Environment. - vfio_device.config.guest_pci_path = Some(pci_path.clone()); - } - - info!( - sl!(), - "Completed VFIOModern coldplug with returned guest pci path: {:?}", - pci_path - ); } DeviceType::Vfio(vfio_dev) => { // Cold-plug physical-endpoint VFs (non-IOMMUFD VFIO) onto @@ -908,6 +917,7 @@ async fn log_qemu_stderr(stderr: ChildStderr, exit_notify: mpsc::Sender<()>) -> } use crate::device::DeviceType; +use crate::vfio_device::VfioDeviceType; // device manager part of Hypervisor impl QemuInner { @@ -1141,38 +1151,41 @@ impl QemuInner { } DeviceType::VfioModern(ref vfiodev) => { // Snapshot VFIO parameters inside the lock. - let (hostdev_id, sysfs_path, address, driver_type, bus) = { + let (hostdev_id, device_type, sysfs_path, address, driver_type, bus) = { let vfio_device = vfiodev.lock().await; let hostdev_id = vfio_device.device_id.clone(); let device = &vfio_device.device; + let device_type = device.device_type.clone(); - // FIXME: The first device in the group might not be the actual device intended for passthrough. - // Multi-function support is tracked via issue #11292. - let primary_device = device - .clone() - .iommu_group - .ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))? - .primary; + let sysfs_path = device.primary.sysfs_path.display().to_string(); - info!( - sl!(), - "QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr - ); + // For AP devices there is no IOMMU group or BDF; use empty strings. NOTE(review): sysfs_path above is now read from device.primary for every device type, while the removed code read it from iommu_group.primary - confirm both are identical for PCI devices, and confirm the hotplug call accepts an empty address and bus for vfio-ap. + let (address, driver_type, bus) = if device_type == VfioDeviceType::MediatedAp { + (String::new(), "vfio-ap".to_string(), String::new()) + } else { + // FIXME: The first device in the group might not be the actual device intended for passthrough. + // Multi-function support is tracked via issue #11292. + let primary_device = device + .clone() + .iommu_group + .ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))? 
+ .primary; - let sysfs_path = primary_device.sysfs_path.display().to_string(); - let driver_type = primary_device - .driver - .clone() - .ok_or_else(|| anyhow!("Driver type missing for primary device"))?; - let address = format!("{}", primary_device.addr); + info!( + sl!(), + "QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr + ); - ( - hostdev_id, - sysfs_path, - address, - driver_type, - vfio_device.config.bus_port_id.0.clone(), - ) + let driver_type = primary_device + .driver + .clone() + .ok_or_else(|| anyhow!("Driver type missing for primary device"))?; + let address = format!("{}", primary_device.addr); + let bus = vfio_device.config.bus_port_id.0.clone(); + (address, driver_type, bus) + }; + + (hostdev_id, device_type, sysfs_path, address, driver_type, bus) }; // Execute hotplug outside the lock. @@ -1184,12 +1197,13 @@ impl QemuInner { &bus, )?; - // Write the resulting Guest PCI Path back within the lock. + // Write the resulting Guest PCI Path back within the lock (PCI only). { let mut vfio_device = vfiodev.lock().await; - if let Some(p) = guest_pci_path { - // Very important to write back the guest pci path for VFIO devices. 
- vfio_device.config.guest_pci_path = Some(p); + if device_type != VfioDeviceType::MediatedAp { + if let Some(p) = guest_pci_path { + vfio_device.config.guest_pci_path = Some(p); + } } info!( sl!(), diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 8b573cff70..1122aa237f 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::{ device::{ - device_manager::{do_handle_device, get_block_device_info, DeviceManager}, + device_manager::{ + do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager, + }, util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR}, DeviceConfig, DeviceType, }, utils::uses_native_ccw_bus, + vfio_device::is_vfio_ap_device, BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig, }; use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR}; use kata_types::{ config::{hypervisor::TopologyConfigInfo, TomlConfig}, + device::DRIVER_VFIO_AP_COLD_TYPE, mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL}, }; use libc::NUD_PERMANENT; use oci::{Linux, LinuxCpu, LinuxResources}; use oci_spec::runtime::{self as oci, LinuxDeviceType}; use persist::sandbox_persist::Persist; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use tokio::{runtime, sync::RwLock}; use crate::{ @@ -661,6 +665,43 @@ impl ResourceManagerInner { continue; } + // VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above + // cannot catch them. If this device was registered in the + // device manager during cold-plug (prepare_coldplug_raw_vfio_devices + // or CDI), retrieve its APQN list and build the agent device + // directly — calling do_handle_device on an already-present + // device would attempt a QMP device_add and fail. 
+ if is_vfio_ap_device(Path::new(&host_path)) { + if let Some(ap_devs) = + find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await + { + let container_path = d.path().display().to_string(); + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let agent_device = Device { + id: group_num, + container_path, + field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(), + options: ap_devs, + ..Default::default() + }; + info!( + sl!(), + "vfio-ap cold-plugged agent device: {:?}", agent_device + ); + devices.push(ContainerDevice { + device_info: None, + device: agent_device, + }); + continue; + } + // Not registered as cold-plugged — fall through to do_handle_device. + } + let bus_type = if uses_native_ccw_bus() { "ccw".to_string() } else { @@ -681,6 +722,14 @@ impl ResourceManagerInner { if let DeviceType::VfioModern(vfio_dev) = device_info.clone() { info!(sl!(), "device info: {:?}", vfio_dev.lock().await); let vfio_device = vfio_dev.lock().await; + + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let guest_pci_path = vfio_device .config .guest_pci_path @@ -697,15 +746,8 @@ impl ResourceManagerInner { // vfio mode: vfio-pci and vfio-pci-gk for x86_64 // - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container. // - vfio-pci-gk, devices are managed by whatever driver in Guest kernel. - // - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices. let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() { - "vfio" => { - if bus_type == "ccw" { - "vfio-ap".to_string() - } else { - "vfio-pci".to_string() - } - } + "vfio" => "vfio-pci".to_string(), _ => "vfio-pci-gk".to_string(), }; let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)]; @@ -713,12 +755,6 @@ impl ResourceManagerInner { // filepath.Base(dev.ContainerPath), e.g. "vfio0". 
// The agent policy validates this with: // i_vfio_device.id == concat("", ["vfio", suffix]) - let group_num = d - .path() - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or_default() - .to_string(); let agent_device = Device { id: group_num, container_path: d.path().display().to_string().clone(), diff --git a/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs b/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs index 029ea2aecb..723ef792f0 100644 --- a/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs +++ b/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs @@ -47,6 +47,10 @@ fn kata_guest_root_shared_fs() -> String { /// If the container is a PodSandbox, it returns "pause". /// Otherwise, it attempts to find the image name using the appropriate Kubernetes /// annotation key. +/// +/// If no container type annotation is found (SingleContainer case), it falls back to +/// checking for image name annotations directly. This supports standalone container +/// runtimes like nerdctl that may only provide the image name annotation. pub fn get_image_reference(spec_annotations: &HashMap) -> Result<&str> { info!( sl!(), @@ -74,6 +78,26 @@ pub fn get_image_reference(spec_annotations: &HashMap) -> Result } } + // Fallback for SingleContainer case: if no container type annotation is found, + // try to get image name directly. This supports standalone container runtimes + // (e.g., nerdctl) that may only provide the image name annotation without the + // container type annotation. 
+ if let Some(image_name) = spec_annotations.get(KUBERNETES_CRI_IMAGE_NAME) { + info!( + sl!(), + "Found image name without container type annotation (SingleContainer): {}", image_name + ); + return Ok(image_name.as_str()); + } + + if let Some(image_name) = spec_annotations.get(KUBERNETES_CRIO_IMAGE_NAME) { + info!( + sl!(), + "Found CRI-O image name without container type annotation (SingleContainer): {}", image_name + ); + return Ok(image_name.as_str()); + } + Err(anyhow!("no target image reference found")) } @@ -256,6 +280,25 @@ mod tests { let image_ref_result_pod_sandbox = get_image_reference(&annotations_pod_sandbox); assert!(image_ref_result_pod_sandbox.is_ok()); assert_eq!(image_ref_result_pod_sandbox.unwrap(), "pause"); + // Test SingleContainer fallback (no container type annotation) + let mut annotations_single = HashMap::new(); + annotations_single.insert( + "io.kubernetes.cri.image-name".to_string(), + "example-image-single".to_string(), + ); + let image_ref_result_single = get_image_reference(&annotations_single); + assert!(image_ref_result_single.is_ok()); + assert_eq!(image_ref_result_single.unwrap(), "example-image-single"); + + // Test SingleContainer fallback with CRI-O annotation + let mut annotations_single_crio = HashMap::new(); + annotations_single_crio.insert( + "io.kubernetes.cri-o.ImageName".to_string(), + "example-image-single-crio".to_string(), + ); + let image_ref_result_single_crio = get_image_reference(&annotations_single_crio); + assert!(image_ref_result_single_crio.is_ok()); + assert_eq!(image_ref_result_single_crio.unwrap(), "example-image-single-crio"); } #[tokio::test] diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 9850a4cd4c..ef12d2e6ec 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -31,8 +31,9 @@ use 
containerd_shim_protos::events::task::{TaskExit, TaskOOM}; ))] use hypervisor::ch::CloudHypervisor; use hypervisor::device::topology::PCIePort; +use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR}; use hypervisor::remote::Remote; -use hypervisor::VfioDeviceBase; +use hypervisor::{is_vfio_ap_device, VfioDeviceBase}; use hypervisor::VsockConfig; use hypervisor::HYPERVISOR_REMOTE; #[cfg(all( @@ -262,7 +263,23 @@ impl VirtSandbox { None }; - let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + // Cold-plug VFIO devices using two mutually exclusive paths: + // 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins + // (typical in K8s environments with device plugins) + // 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices + // (typical in standalone containers like `ctr --device /dev/vfio/0`) + // + // These paths are mutually exclusive from a user perspective: + // - In K8s, devices come through device plugins, not raw OCI device specs + // - In standalone containers, there's no Pod Resources API available + // + // Therefore, we only attempt the raw VFIO path if CDI finds no devices, + // avoiding unnecessary file I/O and OCI spec parsing in the common K8s case. + let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + if vfio_devices.is_empty() { + let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?; + vfio_devices.extend(raw_vfio); + } if !vfio_devices.is_empty() { info!( sl!(), @@ -387,6 +404,97 @@ impl VirtSandbox { .collect()) } + // Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`). + // Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in + // linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO(). + // Returns empty when the pod resources API path already handles devices (K8s) or + // when cold_plug_vfio is not configured. 
+ async fn prepare_coldplug_raw_vfio_devices( + &self, + sandbox_config: &SandboxConfig, + ) -> Result> { + let hypervisor_config = self.hypervisor.hypervisor_config().await; + let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio; + if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" { + return Ok(Vec::new()); + } + + let port = match cold_plug_vfio.as_str() { + "root-port" => PCIePort::RootPort, + other => { + return Err(anyhow!( + "unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported", + other + )) + } + }; + + let bundle = &sandbox_config.state.bundle; + if bundle.is_empty() { + return Ok(Vec::new()); + } + + let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME); + let oci_spec = match oci::Spec::load(&spec_path) { + Ok(s) => s, + Err(e) => { + info!( + sl!(), + "no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e + ); + return Ok(Vec::new()); + } + }; + + let linux_devices = oci_spec + .linux() + .as_ref() + .and_then(|l| l.devices().as_ref()) + .cloned() + .unwrap_or_default(); + + let mut vfio_configs = Vec::new(); + for d in linux_devices.iter() { + if d.typ() != oci::LinuxDeviceType::C { + continue; + } + let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) { + Ok(p) => p, + Err(e) => { + warn!( + sl!(), + "failed to resolve host path for {:?}: {:?}", d.path(), e + ); + continue; + } + }; + // Only process VFIO passthrough devices under /dev/vfio/*. + // Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio). 
+ if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" { + continue; + } + let device_port = if is_vfio_ap_device(Path::new(&host_path)) { + PCIePort::NoPort + } else { + port + }; + vfio_configs.push(VfioDeviceBase { + host_path: host_path.clone(), + iommu_group_devnode: PathBuf::from(&host_path), + dev_type: "c".to_string(), + port: device_port, + hostdev_prefix: "vfio_device".to_owned(), + ..Default::default() + }); + } + info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs); + + Ok(vfio_configs + .into_iter() + .map(ResourceConfig::VfioDeviceModern) + .collect()) + } + async fn prepare_network_resource( &self, network_env: &SandboxNetworkEnv, diff --git a/tests/functional/vfio-ap/run.sh b/tests/functional/vfio-ap/run.sh index dd8c1adf3c..9d29def1e3 100755 --- a/tests/functional/vfio-ap/run.sh +++ b/tests/functional/vfio-ap/run.sh @@ -63,6 +63,7 @@ setup_hotplug() { show_config_file elif [[ "${runtime}" == "runtime-rs" ]]; then setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs" + setup_config_file "cold_plug_vfio" "replace" "no-port" "runtime-rs" show_config_file "runtime-rs" else echo "Invalid runtime: ${runtime}" >&2 @@ -79,8 +80,9 @@ setup_coldplug() { setup_config_file "cold_plug_vfio" "replace" "bridge-port" show_config_file elif [[ "${runtime}" == "runtime-rs" ]]; then - echo "Coldplug is not supported for runtime-rs" >&2 - exit 1 + setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs" + setup_config_file "cold_plug_vfio" "replace" "root-port" "runtime-rs" + show_config_file "runtime-rs" else echo "Invalid runtime: ${runtime}" >&2 exit 1 @@ -296,6 +298,9 @@ run_tests() { setup_hotplug "runtime-rs" run_test "3" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Hotplug" "&& zcrypttest -a -v" + + setup_coldplug "runtime-rs" + run_test "4" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Coldplug" "&& zcrypttest -a -v" } main() {