From aad3bfa7aadfd4b60455a7093a0bb5562c71f73c Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Tue, 2 Jun 2026 13:12:15 +0200
Subject: [PATCH 1/4] runtime-rs: add fallback for image reference for single
 container

This commit adds a fallback mechanism in `get_image_reference()` to
handle SingleContainer scenarios where no container type annotation is
present.

The issue occurs when standalone container runtimes (like nerdctl)
create containers without the Kubernetes container type annotation. In
such cases, `container_type_with_id()` returns `SingleContainer` as the
default, but `get_image_reference()` previously only checked for image
names when a container type annotation was explicitly set to either
PodSandbox or PodContainer.

This caused image reference lookups to fail for standalone containers,
even though the image name annotations (io.kubernetes.cri.image-name or
io.kubernetes.cri-o.ImageName) were present in the spec.

The fallback logic now checks for image name annotations directly when
no container type annotation is found, supporting both:
- io.kubernetes.cri.image-name (standard CRI)
- io.kubernetes.cri-o.ImageName (CRI-O specific)

This maintains backward compatibility with Kubernetes pod scenarios
while enabling support for standalone container runtimes that don't
provide container type annotations.

Test cases have been added to verify the fallback behavior for both CRI
and CRI-O image name annotations in SingleContainer scenarios.

Signed-off-by: Hyounggyu Choi --- .../resource/src/rootfs/virtual_volume.rs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs b/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs index 029ea2aecb..723ef792f0 100644 --- a/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs +++ b/src/runtime-rs/crates/resource/src/rootfs/virtual_volume.rs @@ -47,6 +47,10 @@ fn kata_guest_root_shared_fs() -> String { /// If the container is a PodSandbox, it returns "pause". /// Otherwise, it attempts to find the image name using the appropriate Kubernetes /// annotation key. +/// +/// If no container type annotation is found (SingleContainer case), it falls back to +/// checking for image name annotations directly. This supports standalone container +/// runtimes like nerdctl that may only provide the image name annotation. pub fn get_image_reference(spec_annotations: &HashMap) -> Result<&str> { info!( sl!(), @@ -74,6 +78,26 @@ pub fn get_image_reference(spec_annotations: &HashMap) -> Result } } + // Fallback for SingleContainer case: if no container type annotation is found, + // try to get image name directly. This supports standalone container runtimes + // (e.g., nerdctl) that may only provide the image name annotation without the + // container type annotation. 
+ if let Some(image_name) = spec_annotations.get(KUBERNETES_CRI_IMAGE_NAME) { + info!( + sl!(), + "Found image name without container type annotation (SingleContainer): {}", image_name + ); + return Ok(image_name.as_str()); + } + + if let Some(image_name) = spec_annotations.get(KUBERNETES_CRIO_IMAGE_NAME) { + info!( + sl!(), + "Found CRI-O image name without container type annotation (SingleContainer): {}", image_name + ); + return Ok(image_name.as_str()); + } + Err(anyhow!("no target image reference found")) } @@ -256,6 +280,25 @@ mod tests { let image_ref_result_pod_sandbox = get_image_reference(&annotations_pod_sandbox); assert!(image_ref_result_pod_sandbox.is_ok()); assert_eq!(image_ref_result_pod_sandbox.unwrap(), "pause"); + // Test SingleContainer fallback (no container type annotation) + let mut annotations_single = HashMap::new(); + annotations_single.insert( + "io.kubernetes.cri.image-name".to_string(), + "example-image-single".to_string(), + ); + let image_ref_result_single = get_image_reference(&annotations_single); + assert!(image_ref_result_single.is_ok()); + assert_eq!(image_ref_result_single.unwrap(), "example-image-single"); + + // Test SingleContainer fallback with CRI-O annotation + let mut annotations_single_crio = HashMap::new(); + annotations_single_crio.insert( + "io.kubernetes.cri-o.ImageName".to_string(), + "example-image-single-crio".to_string(), + ); + let image_ref_result_single_crio = get_image_reference(&annotations_single_crio); + assert!(image_ref_result_single_crio.is_ok()); + assert_eq!(image_ref_result_single_crio.unwrap(), "example-image-single-crio"); } #[tokio::test] From 534db34e7a1122903947b4b46f06782cdb8c486f Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Thu, 28 May 2026 13:21:17 +0200 Subject: [PATCH 2/4] runtime-rs: add s390x VFIO-AP mediated device support Add runtime-rs support for s390x VFIO-AP mediated devices across QEMU device creation, hotplug/coldplug flows, and agent device translation. 
Introduce a dedicated vfio-ap QEMU device generator using the sysfsdev path and route mediated AP devices through it instead of the PCI VFIO path. Skip PCI-specific guest path handling for these devices since they do not use PCI root ports or guest PCI addresses. Signed-off-by: Hyounggyu Choi --- .../src/device/driver/vfio_device/core.rs | 99 +++++++++++++ .../src/device/driver/vfio_device/device.rs | 100 ++++++++++--- .../src/device/driver/vfio_device/mod.rs | 5 +- .../hypervisor/src/qemu/cmdline_generator.rs | 25 ++++ .../crates/hypervisor/src/qemu/inner.rs | 136 ++++++++++-------- 5 files changed, 280 insertions(+), 85 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs index 88aa8b6720..631881f3e7 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs @@ -19,6 +19,7 @@ const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices"; const DEV_IOMMU: &str = "/dev/iommu"; const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices"; const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev"; +const SYS_VFIO_AP: &str = "/sys/devices/vfio_ap"; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VfioIommufdBackend { @@ -47,6 +48,10 @@ pub struct VfioDevice { pub primary: DeviceInfo, pub labels: BTreeMap, pub health: Health, + + /// APQNs (Adjunct Processor Queue Numbers) for MediatedAp devices, e.g. ["0a.0001", "0b.0002"]. + /// Populated by discover_vfio_ap_device(); empty for all non-AP device types. 
+ pub ap_devices: Vec, } #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] @@ -656,6 +661,7 @@ fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Res primary: primary_device, labels, health, + ap_devices: Vec::new(), }) } @@ -770,3 +776,96 @@ pub fn is_dev_vfio_group_path(host_path: &str) -> bool { // Valid if remainder is non-empty and contains only digits !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit()) } + +/// Returns true if the VFIO group at `group_devnode` (e.g. `/dev/vfio/N`) contains +/// an s390x AP mediated device. Detection works by resolving the sysfs symlinks for +/// every entry in `/sys/kernel/iommu_groups//devices/` and checking whether any of +/// them resolves to a path under `/sys/devices/vfio_ap`. +pub fn is_vfio_ap_device(group_devnode: &Path) -> bool { + let gid = match parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) { + Some(id) => id, + None => return false, + }; + let group_devices_dir = Path::new(SYS_IOMMU_GROUPS) + .join(gid.to_string()) + .join("devices"); + let rd = match fs::read_dir(&group_devices_dir) { + Ok(r) => r, + Err(_) => return false, + }; + for ent in rd.flatten() { + let link_path = group_devices_dir.join(ent.file_name()); + if let Ok(resolved) = fs::canonicalize(&link_path) { + if resolved.starts_with(SYS_VFIO_AP) { + return true; + } + } + } + false +} + +/// Discovers an s390x VFIO-AP mediated device from its VFIO group path (`/dev/vfio/N`). +/// +/// Reads the `matrix` file from the mdev's sysfs path to obtain the list of APQNs +/// (Adjunct Processor Queue Numbers) assigned to this matrix device. 
+pub fn discover_vfio_ap_device(group_devnode: &Path) -> Result { + let gid = parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) + .ok_or_else(|| anyhow!("Invalid VFIO group path: {}", group_devnode.display()))?; + + let group_devices_dir = Path::new(SYS_IOMMU_GROUPS) + .join(gid.to_string()) + .join("devices"); + + // Enumerate IOMMU group entries and find the AP mdev symlink. + let mut ap_sysfs_path: Option = None; + for ent in fs::read_dir(&group_devices_dir) + .with_context(|| format!("Failed to read {}", group_devices_dir.display()))? + .flatten() + { + let link_path = group_devices_dir.join(ent.file_name()); + if let Ok(resolved) = fs::canonicalize(&link_path) { + if resolved.starts_with(SYS_VFIO_AP) { + ap_sysfs_path = Some(resolved); + break; + } + } + } + + let sysfs_dev = ap_sysfs_path + .ok_or_else(|| anyhow!("No VFIO-AP device found in IOMMU group {}", gid))?; + + // Read APQNs from the `matrix` sysfs attribute (one APQN per line, e.g. "0a.0001"). + let matrix_path = sysfs_dev.join("matrix"); + let matrix_raw = fs::read_to_string(&matrix_path) + .with_context(|| format!("Failed to read {}", matrix_path.display()))?; + let ap_devices: Vec = matrix_raw + .lines() + .map(|l| l.trim().to_string()) + .filter(|l| !l.is_empty()) + .collect(); + + let primary = DeviceInfo { + addr: DeviceAddress::MdevUuid( + sysfs_dev + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_default(), + ), + sysfs_path: sysfs_dev.clone(), + ..Default::default() + }; + + Ok(VfioDevice { + id: format!("vfio-ap-{}", gid), + device_type: VfioDeviceType::MediatedAp, + bus_mode: VfioBusMode::Ccw, + iommu_group: None, + iommu_group_id: Some(gid), + iommufd: None, + devices: vec![primary.clone()], + primary, + labels: BTreeMap::new(), + health: Health::Healthy, + ap_devices, + }) +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs index 
e7db73679a..6519df4aba 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs @@ -13,7 +13,10 @@ use crate::device::pci_path::PciPath; use crate::device::topology::{PCIePort, PCIeTopology}; use crate::device::util::{do_decrease_count, do_increase_count}; use crate::device::{Device, DeviceType, PCIeDevice}; -use crate::vfio_device::core::{discover_vfio_device, discover_vfio_group_device, VfioDevice}; +use crate::vfio_device::core::{ + discover_vfio_ap_device, discover_vfio_device, discover_vfio_group_device, is_vfio_ap_device, + VfioDevice, VfioDeviceType, +}; use crate::Hypervisor; /// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id) @@ -64,6 +67,11 @@ pub struct VfioDeviceBase { /// - VFIO Volume: "vfio_vol_" /// - VFIO NVMe: "vfio_nvme_" pub hostdev_prefix: String, + + /// APQNs assigned to this device (s390x VFIO-AP only). + /// Each entry is a string like "0a.0001" read from the mdev matrix sysfs file. + /// Empty for all non-AP device types. + pub ap_devices: Vec, } #[derive(Debug, Default, Clone)] @@ -95,18 +103,33 @@ fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf { impl VfioDeviceModern { pub fn new(device_id: String, base: &VfioDeviceBase) -> Result { - // For modern VFIO devices, we require the specific device cdev path to be provided in the configuration. - // This allows us to directly discover the device context without needing to resolve group devices. - // If the device node is not provided, we can optionally fallback to group device discovery, - // but this is less efficient and may not be supported in all environments. + let group_path = vfio_modern_group_discovery_path(base); + + // s390x VFIO-AP: mediated AP devices have no PCI BDF; discover them separately. 
+ if is_vfio_ap_device(&group_path) { + let device = discover_vfio_ap_device(&group_path)?; + let mut config = base.clone(); + config.ap_devices = device.ap_devices.clone(); + return Ok(Self { + device_id, + device, + config, + device_options: Vec::new(), + is_allocated: false, + attach_count: 0, + }); + } + + // PCI / iommufd path: use the iommu_device_node cdev when available, otherwise + // fall back to group-device discovery. let device = if let Some(ref node) = base.iommu_device_node { if !node.as_os_str().is_empty() { discover_vfio_device(node)? } else { - discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + discover_vfio_group_device(group_path)? } } else { - discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + discover_vfio_group_device(group_path)? }; Ok(Self { device_id, @@ -196,21 +219,31 @@ impl Device for VfioDeviceModernHandle { return Ok(()); } - // Register the device in the virtual PCIe topology - let topo = pcie_topo.as_deref_mut().ok_or_else(|| { - anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided") - })?; - self.register(topo).await?; + let is_ap = self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await; - // Request Hypervisor to perform the actual hardware passthrough - if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { - error!(sl!(), "failed to attach vfio device: {:?}", e); - - // Rollback state on failure - self.decrease_attach_count().await?; - self.unregister(topo).await?; - return Err(e); + if is_ap { + // AP devices have no PCIe topology; call the hypervisor directly. + if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { + error!(sl!(), "failed to attach vfio-ap device: {:?}", e); + self.decrease_attach_count().await?; + return Err(e); + } + } else { + // PCI devices must be registered in the topology first. 
+ let topo = pcie_topo.as_deref_mut().ok_or_else(|| { + anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided") + })?; + self.register(topo).await?; + if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await { + error!(sl!(), "failed to attach vfio device: {:?}", e); + self.decrease_attach_count().await?; + self.unregister(topo).await?; + return Err(e); + } } + info!( sl!(), "vfio device {:?} attached successfully", @@ -247,9 +280,14 @@ impl Device for VfioDeviceModernHandle { let virt = self.with(|d| d.config.virt_path.clone()).await; let device_index = virt.map(|(idx, _)| idx); - // Unregister from PCIe topology - if let Some(topo) = pcie_topo { - self.unregister(topo).await?; + // AP devices have no PCIe topology to unregister from. + let is_ap = self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await; + if !is_ap { + if let Some(topo) = pcie_topo { + self.unregister(topo).await?; + } } Ok(device_index) @@ -279,6 +317,14 @@ impl Device for VfioDeviceModernHandle { impl PCIeDevice for VfioDeviceModernHandle { /// Reserves a bus and port in the PCIe topology for this device. async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> { + // AP devices have no PCIe topology. + if self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await + { + return Ok(()); + } + let device_id = self.device_id().await; let port_type = self.with(|d| d.config.port).await; @@ -299,6 +345,14 @@ impl PCIeDevice for VfioDeviceModernHandle { /// Releases the reserved PCIe resources and resets attachment state. async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> { + // AP devices have no PCIe topology. 
+ if self + .with(|d| d.device.device_type == VfioDeviceType::MediatedAp) + .await + { + return Ok(()); + } + let device_id = self.device_id().await; topo.release_bus_for_device(&device_id)?; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs index 0758e3d43d..41206f5156 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs @@ -6,7 +6,10 @@ mod core; mod device; -pub use core::{discover_vfio_group_device, VfioDevice}; +pub use core::{ + discover_vfio_ap_device, discover_vfio_group_device, is_vfio_ap_device, VfioDevice, + VfioDeviceType, +}; pub use device::VfioDeviceBase; pub use device::VfioDeviceModern; pub use device::VfioDeviceModernHandle; diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 5352a85a45..80a9ee7b47 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -2331,6 +2331,21 @@ impl PCIeVfioDevice { } } +/// s390x VFIO-AP device: `-device vfio-ap,sysfsdev=` +struct VfioApDevice { + sysfs_path: String, +} + +#[async_trait] +impl ToQemuParams for VfioApDevice { + async fn qemu_params(&self) -> Result> { + Ok(vec![ + "-device".to_string(), + format!("vfio-ap,sysfsdev={}", self.sysfs_path), + ]) + } +} + #[async_trait] impl ToQemuParams for PCIeVfioDevice { async fn qemu_params(&self) -> Result> { @@ -3169,6 +3184,16 @@ impl<'a> QemuCmdLine<'a> { Ok(()) } + /// Adds an s390x VFIO-AP device to the QEMU command line. 
+ /// + /// Generates: `-device vfio-ap,sysfsdev=` + pub fn add_vfio_ap_device(&mut self, sysfs_path: &str) -> Result<()> { + self.devices.push(Box::new(VfioApDevice { + sysfs_path: sysfs_path.to_string(), + })); + Ok(()) + } + /// Batch adds multiple VFIO devices to the QEMU command line. pub fn add_vfio_devices(&mut self, configs: Vec) -> Result<()> { if configs.is_empty() { diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs index 9c6a52a51f..548b065b47 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs @@ -203,50 +203,59 @@ impl QemuInner { } } DeviceType::VfioModern(vfio_dev) => { - // To avoid holding the lock for too long, we first snapshot the necessary VFIO parameters, - // then release the lock before doing the coldplug via cmdline, - // and finally re-acquire the lock to update the guest PCI path after coldplug. - let (devices, bus_port_id) = { + // Snapshot parameters under the lock; release before doing cmdline work. + let (device_type, ap_sysfs_path, devices, bus_port_id) = { let vfio_device = vfio_dev.lock().await; + let device_type = vfio_device.device.device_type.clone(); + let ap_sysfs_path = + vfio_device.device.primary.sysfs_path.display().to_string(); let devices = vfio_device .device .iommu_group .as_ref() .map(|g| g.clone().devices) .unwrap_or_default(); - - (devices, vfio_device.config.bus_port_id.clone()) + ( + device_type, + ap_sysfs_path, + devices, + vfio_device.config.bus_port_id.clone(), + ) }; - // Cold plug devices - for dev in devices.iter() { - let host_bdf = dev.addr.to_string(); + if device_type == VfioDeviceType::MediatedAp { + // s390x VFIO-AP: -device vfio-ap,sysfsdev= + // No PCIe root port, no guest_pci_path. 
+ cmdline.add_vfio_ap_device(&ap_sysfs_path)?; + info!(sl!(), "Completed VFIOModern AP coldplug for sysfsdev: {}", ap_sysfs_path); + } else { + // PCI cold plug devices + for dev in devices.iter() { + let host_bdf = dev.addr.to_string(); - let vfio_cfg = VfioDeviceConfig::new( - host_bdf, - bus_port_id.1 as u16, - bus_port_id.1 + 1, - ) - .with_vfio_bus(bus_port_id.0.clone()); + let vfio_cfg = VfioDeviceConfig::new( + host_bdf, + bus_port_id.1 as u16, + bus_port_id.1 + 1, + ) + .with_vfio_bus(bus_port_id.0.clone()); - cmdline.add_pcie_vfio_device(vfio_cfg)?; + cmdline.add_pcie_vfio_device(vfio_cfg)?; + } + + // Write back guest PCI path + let pci_path = + PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?; + { + let mut vfio_device = vfio_dev.lock().await; + vfio_device.config.guest_pci_path = Some(pci_path.clone()); + } + info!( + sl!(), + "Completed VFIOModern coldplug with returned guest pci path: {:?}", + pci_path + ); } - - // Write back with lock - let pci_path = PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?; - - { - let mut vfio_device = vfio_dev.lock().await; - // Update the guest PCI path for the VFIO device after coldplug, - // which will be used for device mapping into from Guest to Container Environment. - vfio_device.config.guest_pci_path = Some(pci_path.clone()); - } - - info!( - sl!(), - "Completed VFIOModern coldplug with returned guest pci path: {:?}", - pci_path - ); } DeviceType::Vfio(vfio_dev) => { // Cold-plug physical-endpoint VFs (non-IOMMUFD VFIO) onto @@ -908,6 +917,7 @@ async fn log_qemu_stderr(stderr: ChildStderr, exit_notify: mpsc::Sender<()>) -> } use crate::device::DeviceType; +use crate::vfio_device::VfioDeviceType; // device manager part of Hypervisor impl QemuInner { @@ -1141,38 +1151,41 @@ impl QemuInner { } DeviceType::VfioModern(ref vfiodev) => { // Snapshot VFIO parameters inside the lock. 
- let (hostdev_id, sysfs_path, address, driver_type, bus) = { + let (hostdev_id, device_type, sysfs_path, address, driver_type, bus) = { let vfio_device = vfiodev.lock().await; let hostdev_id = vfio_device.device_id.clone(); let device = &vfio_device.device; + let device_type = device.device_type.clone(); - // FIXME: The first device in the group might not be the actual device intended for passthrough. - // Multi-function support is tracked via issue #11292. - let primary_device = device - .clone() - .iommu_group - .ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))? - .primary; + let sysfs_path = device.primary.sysfs_path.display().to_string(); - info!( - sl!(), - "QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr - ); + // For AP devices there is no IOMMU group or BDF; use empty strings. + let (address, driver_type, bus) = if device_type == VfioDeviceType::MediatedAp { + (String::new(), "vfio-ap".to_string(), String::new()) + } else { + // FIXME: The first device in the group might not be the actual device intended for passthrough. + // Multi-function support is tracked via issue #11292. + let primary_device = device + .clone() + .iommu_group + .ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))? 
+ .primary; - let sysfs_path = primary_device.sysfs_path.display().to_string(); - let driver_type = primary_device - .driver - .clone() - .ok_or_else(|| anyhow!("Driver type missing for primary device"))?; - let address = format!("{}", primary_device.addr); + info!( + sl!(), + "QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr + ); - ( - hostdev_id, - sysfs_path, - address, - driver_type, - vfio_device.config.bus_port_id.0.clone(), - ) + let driver_type = primary_device + .driver + .clone() + .ok_or_else(|| anyhow!("Driver type missing for primary device"))?; + let address = format!("{}", primary_device.addr); + let bus = vfio_device.config.bus_port_id.0.clone(); + (address, driver_type, bus) + }; + + (hostdev_id, device_type, sysfs_path, address, driver_type, bus) }; // Execute hotplug outside the lock. @@ -1184,12 +1197,13 @@ impl QemuInner { &bus, )?; - // Write the resulting Guest PCI Path back within the lock. + // Write the resulting Guest PCI Path back within the lock (PCI only). { let mut vfio_device = vfiodev.lock().await; - if let Some(p) = guest_pci_path { - // Very important to write back the guest pci path for VFIO devices. - vfio_device.config.guest_pci_path = Some(p); + if device_type != VfioDeviceType::MediatedAp { + if let Some(p) = guest_pci_path { + vfio_device.config.guest_pci_path = Some(p); + } } info!( sl!(), From f223199b3ddd73c48b59b2b0a83d8a39e2ff0aa3 Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Thu, 28 May 2026 14:38:32 +0200 Subject: [PATCH 3/4] runtime-rs: add raw VFIO device cold-plug support Add support for cold-plugging raw VFIO devices in standalone container scenarios (e.g., `ctr --device /dev/vfio/0`) where devices are specified directly in the OCI spec rather than through K8S CDI/device plugins. 
This implements a fallback path that: - Reads linux.devices from the OCI spec in the container bundle - Identifies character devices under /dev/vfio - Cold-plugs them before VM boot using the configured cold_plug_vfio mode - Handles VFIO-AP devices specially by using NoPort topology - Mirrors the Go runtime's coldOrHotPlugVFIO() behavior The implementation: - Adds prepare_coldplug_raw_vfio_devices() method to VirtSandbox - Exports is_vfio_ap_device() helper for device type detection - Integrates with existing CDI device cold-plug flow - Only activates when cold_plug_vfio is configured (not "no-port") - Skips processing when bundle path is unavailable or OCI spec missing Plus, update resource manager: - Skipping hotplug for cold-plugged mediated AP devices - Handling the AP devices to get exposed to the agent This enables VFIO device passthrough for standalone containers while maintaining compatibility with Kubernetes pod resource API workflows. Signed-off-by: Hyounggyu Choi --- .../hypervisor/src/device/device_manager.rs | 43 ++++++- .../hypervisor/src/device/driver/mod.rs | 6 +- .../crates/resource/src/manager_inner.rs | 68 ++++++++--- .../runtimes/virt_container/src/sandbox.rs | 112 +++++++++++++++++- 4 files changed, 202 insertions(+), 27 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 7bec687abe..d29173f63a 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{ use tokio::sync::{Mutex, RwLock}; use crate::{ - vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig, - BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor, - NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig, - VhostUserNetDevice, VsockDevice, 
KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, - KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO, - VIRTIO_BLOCK_PCI, VIRTIO_PMEM, + vfio_device::{VfioDeviceModernHandle, VfioDeviceType}, + vhost_user_blk::VhostUserBlkDevice, + BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, + Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, + VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, + KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, + VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM, }; use super::{ @@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock) -> SharedFsInfo { d.read().await.get_shared_fs_info().await } +/// Returns the APQN list for a cold-plugged VFIO-AP device whose +/// `iommu_group_devnode` matches `host_path`, or `None` if no such device is +/// registered in the device manager. +/// +/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices +/// that were cold-plugged before VM boot. VFIO-AP devices have no PCIe BDF so +/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that +/// gap without touching reference counts or the QMP hot-plug path. +pub async fn find_cold_plugged_vfio_ap( + d: &RwLock, + host_path: &str, +) -> Option> { + // Avoid holding the DeviceManager read-lock across .await points. 
+ let devices: Vec = { + let dm = d.read().await; + dm.devices.values().cloned().collect() + }; + for dev in devices { + if let DeviceType::VfioModern(inner) = dev.lock().await.get_device_info().await { + let guard = inner.lock().await; + if guard.device.device_type == VfioDeviceType::MediatedAp + && guard.config.iommu_group_devnode == Path::new(host_path) + { + return Some(guard.config.ap_devices.clone()); + } + } + } + None +} + #[cfg(test)] mod tests { use super::DeviceManager; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index b28ed7ca55..f8833ad30b 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -20,10 +20,10 @@ mod virtio_vsock; pub use port_device::{PCIePortDevice, PortDeviceConfig}; pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig}; pub use vfio::{ - bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig, - VfioDevice, + bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, + VfioConfig, VfioDevice, VfioDeviceType, }; -pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; +pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType}; pub use vhost_user_net::VhostUserNetDevice; pub use virtio_blk::{ diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 8b573cff70..1122aa237f 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::{ device::{ - device_manager::{do_handle_device, get_block_device_info, 
DeviceManager}, + device_manager::{ + do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager, + }, util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR}, DeviceConfig, DeviceType, }, utils::uses_native_ccw_bus, + vfio_device::is_vfio_ap_device, BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig, }; use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR}; use kata_types::{ config::{hypervisor::TopologyConfigInfo, TomlConfig}, + device::DRIVER_VFIO_AP_COLD_TYPE, mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL}, }; use libc::NUD_PERMANENT; use oci::{Linux, LinuxCpu, LinuxResources}; use oci_spec::runtime::{self as oci, LinuxDeviceType}; use persist::sandbox_persist::Persist; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use tokio::{runtime, sync::RwLock}; use crate::{ @@ -661,6 +665,43 @@ impl ResourceManagerInner { continue; } + // VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above + // cannot catch them. If this device was registered in the + // device manager during cold-plug (prepare_coldplug_raw_vfio_devices + // or CDI), retrieve its APQN list and build the agent device + // directly — calling do_handle_device on an already-present + // device would attempt a QMP device_add and fail. 
+ if is_vfio_ap_device(Path::new(&host_path)) { + if let Some(ap_devs) = + find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await + { + let container_path = d.path().display().to_string(); + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let agent_device = Device { + id: group_num, + container_path, + field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(), + options: ap_devs, + ..Default::default() + }; + info!( + sl!(), + "vfio-ap cold-plugged agent device: {:?}", agent_device + ); + devices.push(ContainerDevice { + device_info: None, + device: agent_device, + }); + continue; + } + // Not registered as cold-plugged — fall through to do_handle_device. + } + let bus_type = if uses_native_ccw_bus() { "ccw".to_string() } else { @@ -681,6 +722,14 @@ impl ResourceManagerInner { if let DeviceType::VfioModern(vfio_dev) = device_info.clone() { info!(sl!(), "device info: {:?}", vfio_dev.lock().await); let vfio_device = vfio_dev.lock().await; + + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let guest_pci_path = vfio_device .config .guest_pci_path @@ -697,15 +746,8 @@ impl ResourceManagerInner { // vfio mode: vfio-pci and vfio-pci-gk for x86_64 // - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container. // - vfio-pci-gk, devices are managed by whatever driver in Guest kernel. - // - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices. let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() { - "vfio" => { - if bus_type == "ccw" { - "vfio-ap".to_string() - } else { - "vfio-pci".to_string() - } - } + "vfio" => "vfio-pci".to_string(), _ => "vfio-pci-gk".to_string(), }; let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)]; @@ -713,12 +755,6 @@ impl ResourceManagerInner { // filepath.Base(dev.ContainerPath), e.g. "vfio0". 
// The agent policy validates this with: // i_vfio_device.id == concat("", ["vfio", suffix]) - let group_num = d - .path() - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or_default() - .to_string(); let agent_device = Device { id: group_num, container_path: d.path().display().to_string().clone(), diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 9850a4cd4c..ef12d2e6ec 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM}; ))] use hypervisor::ch::CloudHypervisor; use hypervisor::device::topology::PCIePort; +use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR}; use hypervisor::remote::Remote; -use hypervisor::VfioDeviceBase; +use hypervisor::{is_vfio_ap_device, VfioDeviceBase}; use hypervisor::VsockConfig; use hypervisor::HYPERVISOR_REMOTE; #[cfg(all( @@ -262,7 +263,23 @@ impl VirtSandbox { None }; - let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + // Cold-plug VFIO devices using two mutually exclusive paths: + // 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins + // (typical in K8s environments with device plugins) + // 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices + // (typical in standalone containers like `ctr --device /dev/vfio/0`) + // + // These paths are mutually exclusive from a user perspective: + // - In K8s, devices come through device plugins, not raw OCI device specs + // - In standalone containers, there's no Pod Resources API available + // + // Therefore, we only attempt the raw VFIO path if CDI finds no devices, + // avoiding unnecessary file I/O and OCI spec parsing in the common K8s case. 
+ let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + if vfio_devices.is_empty() { + let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?; + vfio_devices.extend(raw_vfio); + } if !vfio_devices.is_empty() { info!( sl!(), @@ -387,6 +404,97 @@ impl VirtSandbox { .collect()) } + // Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`). + // Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in + // linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO(). + // Called only when the CDI / pod resources path found no devices. Returns empty when + // cold_plug_vfio is unset or "no-port", the bundle is empty, or the OCI spec fails to load. + async fn prepare_coldplug_raw_vfio_devices( + &self, + sandbox_config: &SandboxConfig, + ) -> Result<Vec<ResourceConfig>> { + let hypervisor_config = self.hypervisor.hypervisor_config().await; + let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio; + if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" { + return Ok(Vec::new()); + } + + let port = match cold_plug_vfio.as_str() { + "root-port" => PCIePort::RootPort, + other => { + return Err(anyhow!( + "unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported", + other + )) + } + }; + + let bundle = &sandbox_config.state.bundle; + if bundle.is_empty() { + return Ok(Vec::new()); + } + + let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME); + let oci_spec = match oci::Spec::load(&spec_path) { + Ok(s) => s, + Err(e) => { + info!( + sl!(), + "no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e + ); + return Ok(Vec::new()); + } + }; + + let linux_devices = oci_spec + .linux() + .as_ref() + .and_then(|l| l.devices().as_ref()) + .cloned() + .unwrap_or_default(); + + let mut vfio_configs = Vec::new(); + for d in linux_devices.iter() { + if d.typ() != oci::LinuxDeviceType::C { + continue; + } + let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) { + Ok(p) 
=> p, + Err(e) => { + warn!( + sl!(), + "failed to resolve host path for {:?}: {:?}", d.path(), e + ); + continue; + } + }; + // Only process VFIO passthrough devices under /dev/vfio/*. + // Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio). + if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" { + continue; + } + let device_port = if is_vfio_ap_device(Path::new(&host_path)) { + PCIePort::NoPort + } else { + port + }; + vfio_configs.push(VfioDeviceBase { + host_path: host_path.clone(), + iommu_group_devnode: PathBuf::from(&host_path), + dev_type: "c".to_string(), + port: device_port, + hostdev_prefix: "vfio_device".to_owned(), + ..Default::default() + }); + } + info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs); + + Ok(vfio_configs + .into_iter() + .map(ResourceConfig::VfioDeviceModern) + .collect()) + } + async fn prepare_network_resource( &self, network_env: &SandboxNetworkEnv, From 038f9512068f9ee7d07cc8a387d9af080f3e2230 Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Thu, 28 May 2026 15:58:01 +0200 Subject: [PATCH 4/4] test: add VFIO device cold-plug tests to nightly tests This commit enhances the VFIO-AP testing suite by adding support for cold-plug testing in runtime-rs and updating configuration files to support the new cold_plug_vfio parameter. Changes include: 1. Configuration updates: - Added cold_plug_vfio parameter to configuration-qemu-runtime-rs.toml.in with default value "no-port" (disabled) - Added cold_plug_vfio parameter to configuration-qemu-se-runtime-rs.toml.in with value "root-port" (enabled for Secure Execution) 2. 
Test enhancements: - Updated setup_hotplug() to explicitly set cold_plug_vfio to "no-port" for runtime-rs to ensure hotplug-only mode - Implemented setup_coldplug() for runtime-rs (previously unsupported) to enable cold-plug testing with "root-port" configuration - Added new test case (Test 4) for runtime-rs VFIO-AP cold-plug functionality using zcrypttest validation This enables comprehensive testing of both hot-plug and cold-plug VFIO device assignment scenarios for s390x CEX devices in runtime-rs. Signed-off-by: Hyounggyu Choi --- .../config/configuration-qemu-runtime-rs.toml.in | 6 ++++++ .../config/configuration-qemu-se-runtime-rs.toml.in | 6 ++++++ tests/functional/vfio-ap/run.sh | 9 +++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in index fdf9f7a585..8e468795c3 100644 --- a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in @@ -381,6 +381,12 @@ disable_image_nvdimm = false # Default false hotplug_vfio_on_root_bus = false +# Enable cold-plugging of VFIO devices to a PCIe port type. +# Accepted values: "no-port" (default, disabled), "root-port". +# When set to "root-port", devices discovered via CDI / Pod Resources, or raw VFIO +# devices listed in the OCI spec's linux.devices, are cold-plugged before VM boot. +cold_plug_vfio = "no-port" + # Before hot plugging a PCIe device, you need to add a pcie_root_port device. 
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU # The value means the number of pcie_root_port diff --git a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in index c16c6e2ff3..edb888ea43 100644 --- a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in @@ -365,6 +365,12 @@ disable_image_nvdimm = true # Default false hotplug_vfio_on_root_bus = false +# Enable cold-plugging of VFIO devices to a PCIe port type. +# Accepted values: "no-port" (disabled), "root-port". +# When set to "root-port", devices discovered via CDI / Pod Resources, or raw VFIO +# devices listed in the OCI spec's linux.devices, are cold-plugged before VM boot. +cold_plug_vfio = "root-port" + # Before hot plugging a PCIe device, you need to add a pcie_root_port device. # Use this parameter when using some large PCI bar devices, such as Nvidia GPU # The value means the number of pcie_root_port diff --git a/tests/functional/vfio-ap/run.sh b/tests/functional/vfio-ap/run.sh index dd8c1adf3c..9d29def1e3 100755 --- a/tests/functional/vfio-ap/run.sh +++ b/tests/functional/vfio-ap/run.sh @@ -63,6 +63,7 @@ setup_hotplug() { show_config_file elif [[ "${runtime}" == "runtime-rs" ]]; then setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs" + setup_config_file "cold_plug_vfio" "replace" "no-port" "runtime-rs" show_config_file "runtime-rs" else echo "Invalid runtime: ${runtime}" >&2 @@ -79,8 +80,9 @@ setup_coldplug() { setup_config_file "cold_plug_vfio" "replace" "bridge-port" show_config_file elif [[ "${runtime}" == "runtime-rs" ]]; then - echo "Coldplug is not supported for runtime-rs" >&2 - exit 1 + setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs" + setup_config_file "cold_plug_vfio" "replace" "root-port" "runtime-rs" + show_config_file "runtime-rs" else echo "Invalid runtime: ${runtime}" >&2 exit 1 @@ -296,6 +298,9 @@ run_tests() { setup_hotplug "runtime-rs" 
run_test "3" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Hotplug" "&& zcrypttest -a -v" + + setup_coldplug "runtime-rs" + run_test "4" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Coldplug" "&& zcrypttest -a -v" } main() {