From f223199b3ddd73c48b59b2b0a83d8a39e2ff0aa3 Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Thu, 28 May 2026 14:38:32 +0200 Subject: [PATCH] runtime-rs: add raw VFIO device cold-plug support Add support for cold-plugging raw VFIO devices in standalone container scenarios (e.g., `ctr --device /dev/vfio/0`) where devices are specified directly in the OCI spec rather than through Kubernetes CDI/device plugins. This implements a fallback path that: - Reads linux.devices from the OCI spec in the container bundle - Identifies VFIO character devices under /dev/vfio (excluding the /dev/vfio/vfio control node) - Cold-plugs them before VM boot using the configured cold_plug_vfio mode - Handles VFIO-AP devices specially by using NoPort topology - Mirrors the Go runtime's coldOrHotPlugVFIO() behavior The implementation: - Adds a prepare_coldplug_raw_vfio_devices() method to VirtSandbox - Exports the is_vfio_ap_device() helper for device type detection - Integrates with the existing CDI device cold-plug flow as a fallback that runs only when the CDI path finds no devices - Only activates when cold_plug_vfio is set to "root-port"; it is skipped when the option is empty or "no-port", and any other value is rejected with an error - Skips processing when the bundle path is empty or the OCI spec cannot be loaded Additionally, update the resource manager to: - Skip hot-plug for cold-plugged mediated AP devices - Expose cold-plugged AP devices to the agent This enables VFIO device passthrough for standalone containers while maintaining compatibility with Kubernetes pod resource API workflows. 
Signed-off-by: Hyounggyu Choi --- .../hypervisor/src/device/device_manager.rs | 43 ++++++- .../hypervisor/src/device/driver/mod.rs | 6 +- .../crates/resource/src/manager_inner.rs | 68 ++++++++--- .../runtimes/virt_container/src/sandbox.rs | 112 +++++++++++++++++- 4 files changed, 202 insertions(+), 27 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 7bec687abe..d29173f63a 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{ use tokio::sync::{Mutex, RwLock}; use crate::{ - vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig, - BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor, - NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig, - VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, - KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO, - VIRTIO_BLOCK_PCI, VIRTIO_PMEM, + vfio_device::{VfioDeviceModernHandle, VfioDeviceType}, + vhost_user_blk::VhostUserBlkDevice, + BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, + Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, + VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, + KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, + VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM, }; use super::{ @@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock) -> SharedFsInfo { d.read().await.get_shared_fs_info().await } +/// Returns the APQN list for a cold-plugged VFIO-AP device whose +/// `iommu_group_devnode` matches `host_path`, or `None` if no such 
device is +/// registered in the device manager. +/// +/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices +/// that were cold-plugged before VM boot. VFIO-AP devices have no PCIe BDF so +/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that +/// gap without touching reference counts or the QMP hot-plug path. +pub async fn find_cold_plugged_vfio_ap( + d: &RwLock, + host_path: &str, +) -> Option> { + // Avoid holding the DeviceManager read-lock across .await points. + let devices: Vec = { + let dm = d.read().await; + dm.devices.values().cloned().collect() + }; + for dev in devices { + if let DeviceType::VfioModern(inner) = dev.lock().await.get_device_info().await { + let guard = inner.lock().await; + if guard.device.device_type == VfioDeviceType::MediatedAp + && guard.config.iommu_group_devnode == Path::new(host_path) + { + return Some(guard.config.ap_devices.clone()); + } + } + } + None +} + #[cfg(test)] mod tests { use super::DeviceManager; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index b28ed7ca55..f8833ad30b 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -20,10 +20,10 @@ mod virtio_vsock; pub use port_device::{PCIePortDevice, PortDeviceConfig}; pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig}; pub use vfio::{ - bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig, - VfioDevice, + bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, + VfioConfig, VfioDevice, VfioDeviceType, }; -pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; +pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; pub use vhost_user::{VhostUserConfig, VhostUserDevice, 
VhostUserType}; pub use vhost_user_net::VhostUserNetDevice; pub use virtio_blk::{ diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 8b573cff70..1122aa237f 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::{ device::{ - device_manager::{do_handle_device, get_block_device_info, DeviceManager}, + device_manager::{ + do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager, + }, util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR}, DeviceConfig, DeviceType, }, utils::uses_native_ccw_bus, + vfio_device::is_vfio_ap_device, BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig, }; use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR}; use kata_types::{ config::{hypervisor::TopologyConfigInfo, TomlConfig}, + device::DRIVER_VFIO_AP_COLD_TYPE, mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL}, }; use libc::NUD_PERMANENT; use oci::{Linux, LinuxCpu, LinuxResources}; use oci_spec::runtime::{self as oci, LinuxDeviceType}; use persist::sandbox_persist::Persist; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use tokio::{runtime, sync::RwLock}; use crate::{ @@ -661,6 +665,43 @@ impl ResourceManagerInner { continue; } + // VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above + // cannot catch them. If this device was registered in the + // device manager during cold-plug (prepare_coldplug_raw_vfio_devices + // or CDI), retrieve its APQN list and build the agent device + // directly — calling do_handle_device on an already-present + // device would attempt a QMP device_add and fail. 
+ if is_vfio_ap_device(Path::new(&host_path)) { + if let Some(ap_devs) = + find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await + { + let container_path = d.path().display().to_string(); + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let agent_device = Device { + id: group_num, + container_path, + field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(), + options: ap_devs, + ..Default::default() + }; + info!( + sl!(), + "vfio-ap cold-plugged agent device: {:?}", agent_device + ); + devices.push(ContainerDevice { + device_info: None, + device: agent_device, + }); + continue; + } + // Not registered as cold-plugged — fall through to do_handle_device. + } + let bus_type = if uses_native_ccw_bus() { "ccw".to_string() } else { @@ -681,6 +722,14 @@ impl ResourceManagerInner { if let DeviceType::VfioModern(vfio_dev) = device_info.clone() { info!(sl!(), "device info: {:?}", vfio_dev.lock().await); let vfio_device = vfio_dev.lock().await; + + let group_num = d + .path() + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or_default() + .to_string(); + let guest_pci_path = vfio_device .config .guest_pci_path @@ -697,15 +746,8 @@ impl ResourceManagerInner { // vfio mode: vfio-pci and vfio-pci-gk for x86_64 // - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container. // - vfio-pci-gk, devices are managed by whatever driver in Guest kernel. - // - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices. let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() { - "vfio" => { - if bus_type == "ccw" { - "vfio-ap".to_string() - } else { - "vfio-pci".to_string() - } - } + "vfio" => "vfio-pci".to_string(), _ => "vfio-pci-gk".to_string(), }; let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)]; @@ -713,12 +755,6 @@ impl ResourceManagerInner { // filepath.Base(dev.ContainerPath), e.g. "vfio0". 
// The agent policy validates this with: // i_vfio_device.id == concat("", ["vfio", suffix]) - let group_num = d - .path() - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or_default() - .to_string(); let agent_device = Device { id: group_num, container_path: d.path().display().to_string().clone(), diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 9850a4cd4c..ef12d2e6ec 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM}; ))] use hypervisor::ch::CloudHypervisor; use hypervisor::device::topology::PCIePort; +use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR}; use hypervisor::remote::Remote; -use hypervisor::VfioDeviceBase; +use hypervisor::{is_vfio_ap_device, VfioDeviceBase}; use hypervisor::VsockConfig; use hypervisor::HYPERVISOR_REMOTE; #[cfg(all( @@ -262,7 +263,23 @@ impl VirtSandbox { None }; - let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + // Cold-plug VFIO devices using two mutually exclusive paths: + // 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins + // (typical in K8s environments with device plugins) + // 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices + // (typical in standalone containers like `ctr --device /dev/vfio/0`) + // + // These paths are mutually exclusive from a user perspective: + // - In K8s, devices come through device plugins, not raw OCI device specs + // - In standalone containers, there's no Pod Resources API available + // + // Therefore, we only attempt the raw VFIO path if CDI finds no devices, + // avoiding unnecessary file I/O and OCI spec parsing in the common K8s case. 
+ let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + if vfio_devices.is_empty() { + let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?; + vfio_devices.extend(raw_vfio); + } if !vfio_devices.is_empty() { info!( sl!(), @@ -387,6 +404,97 @@ impl VirtSandbox { .collect()) } + // Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`). + // Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in + // linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO(). + // Returns empty when the pod resources API path already handles devices (K8s) or + // when cold_plug_vfio is not configured. + async fn prepare_coldplug_raw_vfio_devices( + &self, + sandbox_config: &SandboxConfig, + ) -> Result> { + let hypervisor_config = self.hypervisor.hypervisor_config().await; + let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio; + if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" { + return Ok(Vec::new()); + } + + let port = match cold_plug_vfio.as_str() { + "root-port" => PCIePort::RootPort, + other => { + return Err(anyhow!( + "unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported", + other + )) + } + }; + + let bundle = &sandbox_config.state.bundle; + if bundle.is_empty() { + return Ok(Vec::new()); + } + + let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME); + let oci_spec = match oci::Spec::load(&spec_path) { + Ok(s) => s, + Err(e) => { + info!( + sl!(), + "no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e + ); + return Ok(Vec::new()); + } + }; + + let linux_devices = oci_spec + .linux() + .as_ref() + .and_then(|l| l.devices().as_ref()) + .cloned() + .unwrap_or_default(); + + let mut vfio_configs = Vec::new(); + for d in linux_devices.iter() { + if d.typ() != oci::LinuxDeviceType::C { + continue; + } + let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) { + Ok(p) 
=> p, + Err(e) => { + warn!( + sl!(), + "failed to resolve host path for {:?}: {:?}", d.path(), e + ); + continue; + } + }; + // Only process VFIO passthrough devices under /dev/vfio/*. + // Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio). + if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" { + continue; + } + let device_port = if is_vfio_ap_device(Path::new(&host_path)) { + PCIePort::NoPort + } else { + port + }; + vfio_configs.push(VfioDeviceBase { + host_path: host_path.clone(), + iommu_group_devnode: PathBuf::from(&host_path), + dev_type: "c".to_string(), + port: device_port, + hostdev_prefix: "vfio_device".to_owned(), + ..Default::default() + }); + } + info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs); + + Ok(vfio_configs + .into_iter() + .map(ResourceConfig::VfioDeviceModern) + .collect()) + } + async fn prepare_network_resource( &self, network_env: &SandboxNetworkEnv,