runtime-rs: add raw VFIO device cold-plug support

Add support for cold-plugging raw VFIO devices in standalone
container scenarios (e.g., `ctr --device /dev/vfio/0`) where
devices are specified directly in the OCI spec rather than
through K8S CDI/device plugins.

This implements a fallback path that:
- Reads linux.devices from the OCI spec in the container bundle
- Identifies character devices under /dev/vfio
- Cold-plugs them before VM boot using the configured cold_plug_vfio mode
- Handles VFIO-AP devices specially by using NoPort topology
- Mirrors the Go runtime's coldOrHotPlugVFIO() behavior

The implementation:
- Adds prepare_coldplug_raw_vfio_devices() method to VirtSandbox
- Exports is_vfio_ap_device() helper for device type detection
- Integrates with existing CDI device cold-plug flow
- Only activates when cold_plug_vfio is set to "root-port"; an empty value or "no-port" disables it, and any other value is rejected with an error
- Skips processing when bundle path is unavailable or OCI spec missing

Additionally, update the resource manager to:
- Skip hot-plug for mediated AP devices that were already cold-plugged
- Expose cold-plugged AP devices to the agent

This enables VFIO device passthrough for standalone containers while
maintaining compatibility with Kubernetes pod resource API workflows.

Signed-off-by: Hyounggyu Choi <Hyounggyu.Choi@ibm.com>
This commit is contained in:
Hyounggyu Choi
2026-05-28 14:38:32 +02:00
parent 534db34e7a
commit f223199b3d
4 changed files with 202 additions and 27 deletions

View File

@@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{
use tokio::sync::{Mutex, RwLock};
use crate::{
vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig,
BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor,
NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig,
VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO,
VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
vfio_device::{VfioDeviceModernHandle, VfioDeviceType},
vhost_user_blk::VhostUserBlkDevice,
BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice,
Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice,
VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE,
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
};
use super::{
@@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock<DeviceManager>) -> SharedFsInfo {
d.read().await.get_shared_fs_info().await
}
/// Looks up a cold-plugged VFIO-AP device by its host device node.
///
/// Walks the devices registered in the device manager and returns a copy of
/// the `ap_devices` (APQN) list of the first `VfioModern` device encountered
/// whose type is `MediatedAp` and whose `iommu_group_devnode` equals
/// `host_path`. Returns `None` when no registered device matches.
///
/// `handler_devices` uses this to skip `do_handle_device` for VFIO-AP devices
/// that were cold-plugged before VM boot: such devices have no PCIe BDF, so
/// the BDF-keyed `cold_plug_bdfs` map cannot identify them. The lookup only
/// reads device state.
pub async fn find_cold_plugged_vfio_ap(
    d: &RwLock<DeviceManager>,
    host_path: &str,
) -> Option<Vec<String>> {
    let wanted = Path::new(host_path);

    // Snapshot the device handles in a single statement so the DeviceManager
    // read-lock is released before any per-device lock is awaited.
    let snapshot: Vec<ArcMutexDevice> = d.read().await.devices.values().cloned().collect();

    for entry in snapshot {
        if let DeviceType::VfioModern(handle) = entry.lock().await.get_device_info().await {
            let vfio = handle.lock().await;
            if vfio.device.device_type != VfioDeviceType::MediatedAp {
                continue;
            }
            if vfio.config.iommu_group_devnode == wanted {
                return Some(vfio.config.ap_devices.clone());
            }
        }
    }

    None
}
#[cfg(test)]
mod tests {
use super::DeviceManager;

View File

@@ -20,10 +20,10 @@ mod virtio_vsock;
pub use port_device::{PCIePortDevice, PortDeviceConfig};
pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
pub use vfio::{
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig,
VfioDevice,
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode,
VfioConfig, VfioDevice, VfioDeviceType,
};
pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
pub use vhost_user_net::VhostUserNetDevice;
pub use virtio_blk::{

View File

@@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::{
device::{
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
device_manager::{
do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager,
},
util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR},
DeviceConfig, DeviceType,
},
utils::uses_native_ccw_bus,
vfio_device::is_vfio_ap_device,
BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig,
};
use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR};
use kata_types::{
config::{hypervisor::TopologyConfigInfo, TomlConfig},
device::DRIVER_VFIO_AP_COLD_TYPE,
mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL},
};
use libc::NUD_PERMANENT;
use oci::{Linux, LinuxCpu, LinuxResources};
use oci_spec::runtime::{self as oci, LinuxDeviceType};
use persist::sandbox_persist::Persist;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use tokio::{runtime, sync::RwLock};
use crate::{
@@ -661,6 +665,43 @@ impl ResourceManagerInner {
continue;
}
// VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above
// cannot catch them. If this device was registered in the
// device manager during cold-plug (prepare_coldplug_raw_vfio_devices
// or CDI), retrieve its APQN list and build the agent device
// directly — calling do_handle_device on an already-present
// device would attempt a QMP device_add and fail.
if is_vfio_ap_device(Path::new(&host_path)) {
if let Some(ap_devs) =
find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await
{
let container_path = d.path().display().to_string();
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let agent_device = Device {
id: group_num,
container_path,
field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(),
options: ap_devs,
..Default::default()
};
info!(
sl!(),
"vfio-ap cold-plugged agent device: {:?}", agent_device
);
devices.push(ContainerDevice {
device_info: None,
device: agent_device,
});
continue;
}
// Not registered as cold-plugged — fall through to do_handle_device.
}
let bus_type = if uses_native_ccw_bus() {
"ccw".to_string()
} else {
@@ -681,6 +722,14 @@ impl ResourceManagerInner {
if let DeviceType::VfioModern(vfio_dev) = device_info.clone() {
info!(sl!(), "device info: {:?}", vfio_dev.lock().await);
let vfio_device = vfio_dev.lock().await;
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let guest_pci_path = vfio_device
.config
.guest_pci_path
@@ -697,15 +746,8 @@ impl ResourceManagerInner {
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
"vfio" => {
if bus_type == "ccw" {
"vfio-ap".to_string()
} else {
"vfio-pci".to_string()
}
}
"vfio" => "vfio-pci".to_string(),
_ => "vfio-pci-gk".to_string(),
};
let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)];
@@ -713,12 +755,6 @@ impl ResourceManagerInner {
// filepath.Base(dev.ContainerPath), e.g. "vfio0".
// The agent policy validates this with:
// i_vfio_device.id == concat("", ["vfio", suffix])
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let agent_device = Device {
id: group_num,
container_path: d.path().display().to_string().clone(),

View File

@@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
))]
use hypervisor::ch::CloudHypervisor;
use hypervisor::device::topology::PCIePort;
use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR};
use hypervisor::remote::Remote;
use hypervisor::VfioDeviceBase;
use hypervisor::{is_vfio_ap_device, VfioDeviceBase};
use hypervisor::VsockConfig;
use hypervisor::HYPERVISOR_REMOTE;
#[cfg(all(
@@ -262,7 +263,23 @@ impl VirtSandbox {
None
};
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
// Cold-plug VFIO devices using two mutually exclusive paths:
// 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins
// (typical in K8s environments with device plugins)
// 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices
// (typical in standalone containers like `ctr --device /dev/vfio/0`)
//
// These paths are mutually exclusive from a user perspective:
// - In K8s, devices come through device plugins, not raw OCI device specs
// - In standalone containers, there's no Pod Resources API available
//
// Therefore, we only attempt the raw VFIO path if CDI finds no devices,
// avoiding unnecessary file I/O and OCI spec parsing in the common K8s case.
let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
if vfio_devices.is_empty() {
let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?;
vfio_devices.extend(raw_vfio);
}
if !vfio_devices.is_empty() {
info!(
sl!(),
@@ -387,6 +404,97 @@ impl VirtSandbox {
.collect())
}
// Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`).
// Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in
// linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO().
// Returns empty when the pod resources API path already handles devices (K8s) or
// when cold_plug_vfio is not configured.
async fn prepare_coldplug_raw_vfio_devices(
&self,
sandbox_config: &SandboxConfig,
) -> Result<Vec<ResourceConfig>> {
let hypervisor_config = self.hypervisor.hypervisor_config().await;
let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio;
if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" {
return Ok(Vec::new());
}
let port = match cold_plug_vfio.as_str() {
"root-port" => PCIePort::RootPort,
other => {
return Err(anyhow!(
"unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported",
other
))
}
};
let bundle = &sandbox_config.state.bundle;
if bundle.is_empty() {
return Ok(Vec::new());
}
let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME);
let oci_spec = match oci::Spec::load(&spec_path) {
Ok(s) => s,
Err(e) => {
info!(
sl!(),
"no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e
);
return Ok(Vec::new());
}
};
let linux_devices = oci_spec
.linux()
.as_ref()
.and_then(|l| l.devices().as_ref())
.cloned()
.unwrap_or_default();
let mut vfio_configs = Vec::new();
for d in linux_devices.iter() {
if d.typ() != oci::LinuxDeviceType::C {
continue;
}
let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) {
Ok(p) => p,
Err(e) => {
warn!(
sl!(),
"failed to resolve host path for {:?}: {:?}", d.path(), e
);
continue;
}
};
// Only process VFIO passthrough devices under /dev/vfio/*.
// Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio).
if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" {
continue;
}
let device_port = if is_vfio_ap_device(Path::new(&host_path)) {
PCIePort::NoPort
} else {
port
};
vfio_configs.push(VfioDeviceBase {
host_path: host_path.clone(),
iommu_group_devnode: PathBuf::from(&host_path),
dev_type: "c".to_string(),
port: device_port,
hostdev_prefix: "vfio_device".to_owned(),
..Default::default()
});
}
info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs);
Ok(vfio_configs
.into_iter()
.map(ResourceConfig::VfioDeviceModern)
.collect())
}
async fn prepare_network_resource(
&self,
network_env: &SandboxNetworkEnv,