mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 14:38:33 +00:00
runtime-rs: add raw VFIO device cold-plug support
Add support for cold-plugging raw VFIO devices in standalone container scenarios (e.g., `ctr --device /dev/vfio/0`), where devices are specified directly in the OCI spec rather than through K8S CDI/device plugins.

This implements a fallback path that:
- Reads linux.devices from the OCI spec in the container bundle
- Identifies character devices under /dev/vfio
- Cold-plugs them before VM boot using the configured cold_plug_vfio mode
- Handles VFIO-AP devices specially by using NoPort topology
- Mirrors the Go runtime's coldOrHotPlugVFIO() behavior

The implementation:
- Adds the prepare_coldplug_raw_vfio_devices() method to VirtSandbox
- Exports the is_vfio_ap_device() helper for device type detection
- Integrates with the existing CDI device cold-plug flow
- Only activates when cold_plug_vfio is configured (and is not "no-port")
- Skips processing when the bundle path is unavailable or the OCI spec is missing

In addition, the resource manager is updated to:
- Skip hot-plug for mediated AP devices that were already cold-plugged
- Expose those cold-plugged AP devices to the agent

This enables VFIO device passthrough for standalone containers while maintaining compatibility with Kubernetes pod resource API workflows.

Signed-off-by: Hyounggyu Choi <Hyounggyu.Choi@ibm.com>
This commit is contained in:
@@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
||||
use crate::{
|
||||
vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig,
|
||||
BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor,
|
||||
NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig,
|
||||
VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
|
||||
KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO,
|
||||
VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
vfio_device::{VfioDeviceModernHandle, VfioDeviceType},
|
||||
vhost_user_blk::VhostUserBlkDevice,
|
||||
BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice,
|
||||
Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice,
|
||||
VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE,
|
||||
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
|
||||
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock<DeviceManager>) -> SharedFsInfo {
|
||||
d.read().await.get_shared_fs_info().await
|
||||
}
|
||||
|
||||
/// Returns the APQN list for a cold-plugged VFIO-AP device whose
|
||||
/// `iommu_group_devnode` matches `host_path`, or `None` if no such device is
|
||||
/// registered in the device manager.
|
||||
///
|
||||
/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices
|
||||
/// that were cold-plugged before VM boot. VFIO-AP devices have no PCIe BDF so
|
||||
/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that
|
||||
/// gap without touching reference counts or the QMP hot-plug path.
|
||||
pub async fn find_cold_plugged_vfio_ap(
|
||||
d: &RwLock<DeviceManager>,
|
||||
host_path: &str,
|
||||
) -> Option<Vec<String>> {
|
||||
// Avoid holding the DeviceManager read-lock across .await points.
|
||||
let devices: Vec<ArcMutexDevice> = {
|
||||
let dm = d.read().await;
|
||||
dm.devices.values().cloned().collect()
|
||||
};
|
||||
for dev in devices {
|
||||
if let DeviceType::VfioModern(inner) = dev.lock().await.get_device_info().await {
|
||||
let guard = inner.lock().await;
|
||||
if guard.device.device_type == VfioDeviceType::MediatedAp
|
||||
&& guard.config.iommu_group_devnode == Path::new(host_path)
|
||||
{
|
||||
return Some(guard.config.ap_devices.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::DeviceManager;
|
||||
|
||||
@@ -20,10 +20,10 @@ mod virtio_vsock;
|
||||
pub use port_device::{PCIePortDevice, PortDeviceConfig};
|
||||
pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
|
||||
pub use vfio::{
|
||||
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig,
|
||||
VfioDevice,
|
||||
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode,
|
||||
VfioConfig, VfioDevice, VfioDeviceType,
|
||||
};
|
||||
pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
|
||||
pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
|
||||
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
|
||||
pub use vhost_user_net::VhostUserNetDevice;
|
||||
pub use virtio_blk::{
|
||||
|
||||
@@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use hypervisor::{
|
||||
device::{
|
||||
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
|
||||
device_manager::{
|
||||
do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager,
|
||||
},
|
||||
util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR},
|
||||
DeviceConfig, DeviceType,
|
||||
},
|
||||
utils::uses_native_ccw_bus,
|
||||
vfio_device::is_vfio_ap_device,
|
||||
BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig,
|
||||
};
|
||||
use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR};
|
||||
use kata_types::{
|
||||
config::{hypervisor::TopologyConfigInfo, TomlConfig},
|
||||
device::DRIVER_VFIO_AP_COLD_TYPE,
|
||||
mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL},
|
||||
};
|
||||
use libc::NUD_PERMANENT;
|
||||
use oci::{Linux, LinuxCpu, LinuxResources};
|
||||
use oci_spec::runtime::{self as oci, LinuxDeviceType};
|
||||
use persist::sandbox_persist::Persist;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::{runtime, sync::RwLock};
|
||||
|
||||
use crate::{
|
||||
@@ -661,6 +665,43 @@ impl ResourceManagerInner {
|
||||
continue;
|
||||
}
|
||||
|
||||
// VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above
|
||||
// cannot catch them. If this device was registered in the
|
||||
// device manager during cold-plug (prepare_coldplug_raw_vfio_devices
|
||||
// or CDI), retrieve its APQN list and build the agent device
|
||||
// directly — calling do_handle_device on an already-present
|
||||
// device would attempt a QMP device_add and fail.
|
||||
if is_vfio_ap_device(Path::new(&host_path)) {
|
||||
if let Some(ap_devs) =
|
||||
find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await
|
||||
{
|
||||
let container_path = d.path().display().to_string();
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let agent_device = Device {
|
||||
id: group_num,
|
||||
container_path,
|
||||
field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(),
|
||||
options: ap_devs,
|
||||
..Default::default()
|
||||
};
|
||||
info!(
|
||||
sl!(),
|
||||
"vfio-ap cold-plugged agent device: {:?}", agent_device
|
||||
);
|
||||
devices.push(ContainerDevice {
|
||||
device_info: None,
|
||||
device: agent_device,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
// Not registered as cold-plugged — fall through to do_handle_device.
|
||||
}
|
||||
|
||||
let bus_type = if uses_native_ccw_bus() {
|
||||
"ccw".to_string()
|
||||
} else {
|
||||
@@ -681,6 +722,14 @@ impl ResourceManagerInner {
|
||||
if let DeviceType::VfioModern(vfio_dev) = device_info.clone() {
|
||||
info!(sl!(), "device info: {:?}", vfio_dev.lock().await);
|
||||
let vfio_device = vfio_dev.lock().await;
|
||||
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
|
||||
let guest_pci_path = vfio_device
|
||||
.config
|
||||
.guest_pci_path
|
||||
@@ -697,15 +746,8 @@ impl ResourceManagerInner {
|
||||
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
|
||||
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
|
||||
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
|
||||
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
|
||||
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
|
||||
"vfio" => {
|
||||
if bus_type == "ccw" {
|
||||
"vfio-ap".to_string()
|
||||
} else {
|
||||
"vfio-pci".to_string()
|
||||
}
|
||||
}
|
||||
"vfio" => "vfio-pci".to_string(),
|
||||
_ => "vfio-pci-gk".to_string(),
|
||||
};
|
||||
let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)];
|
||||
@@ -713,12 +755,6 @@ impl ResourceManagerInner {
|
||||
// filepath.Base(dev.ContainerPath), e.g. "vfio0".
|
||||
// The agent policy validates this with:
|
||||
// i_vfio_device.id == concat("", ["vfio", suffix])
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let agent_device = Device {
|
||||
id: group_num,
|
||||
container_path: d.path().display().to_string().clone(),
|
||||
|
||||
@@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
|
||||
))]
|
||||
use hypervisor::ch::CloudHypervisor;
|
||||
use hypervisor::device::topology::PCIePort;
|
||||
use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR};
|
||||
use hypervisor::remote::Remote;
|
||||
use hypervisor::VfioDeviceBase;
|
||||
use hypervisor::{is_vfio_ap_device, VfioDeviceBase};
|
||||
use hypervisor::VsockConfig;
|
||||
use hypervisor::HYPERVISOR_REMOTE;
|
||||
#[cfg(all(
|
||||
@@ -262,7 +263,23 @@ impl VirtSandbox {
|
||||
None
|
||||
};
|
||||
|
||||
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
// Cold-plug VFIO devices using two mutually exclusive paths:
|
||||
// 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins
|
||||
// (typical in K8s environments with device plugins)
|
||||
// 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices
|
||||
// (typical in standalone containers like `ctr --device /dev/vfio/0`)
|
||||
//
|
||||
// These paths are mutually exclusive from a user perspective:
|
||||
// - In K8s, devices come through device plugins, not raw OCI device specs
|
||||
// - In standalone containers, there's no Pod Resources API available
|
||||
//
|
||||
// Therefore, we only attempt the raw VFIO path if CDI finds no devices,
|
||||
// avoiding unnecessary file I/O and OCI spec parsing in the common K8s case.
|
||||
let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
if vfio_devices.is_empty() {
|
||||
let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?;
|
||||
vfio_devices.extend(raw_vfio);
|
||||
}
|
||||
if !vfio_devices.is_empty() {
|
||||
info!(
|
||||
sl!(),
|
||||
@@ -387,6 +404,97 @@ impl VirtSandbox {
|
||||
.collect())
|
||||
}
|
||||
|
||||
// Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`).
|
||||
// Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in
|
||||
// linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO().
|
||||
// Returns empty when the pod resources API path already handles devices (K8s) or
|
||||
// when cold_plug_vfio is not configured.
|
||||
async fn prepare_coldplug_raw_vfio_devices(
|
||||
&self,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let hypervisor_config = self.hypervisor.hypervisor_config().await;
|
||||
let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio;
|
||||
if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let port = match cold_plug_vfio.as_str() {
|
||||
"root-port" => PCIePort::RootPort,
|
||||
other => {
|
||||
return Err(anyhow!(
|
||||
"unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported",
|
||||
other
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
let bundle = &sandbox_config.state.bundle;
|
||||
if bundle.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME);
|
||||
let oci_spec = match oci::Spec::load(&spec_path) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
info!(
|
||||
sl!(),
|
||||
"no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e
|
||||
);
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
};
|
||||
|
||||
let linux_devices = oci_spec
|
||||
.linux()
|
||||
.as_ref()
|
||||
.and_then(|l| l.devices().as_ref())
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut vfio_configs = Vec::new();
|
||||
for d in linux_devices.iter() {
|
||||
if d.typ() != oci::LinuxDeviceType::C {
|
||||
continue;
|
||||
}
|
||||
let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
sl!(),
|
||||
"failed to resolve host path for {:?}: {:?}", d.path(), e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
// Only process VFIO passthrough devices under /dev/vfio/*.
|
||||
// Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio).
|
||||
if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" {
|
||||
continue;
|
||||
}
|
||||
let device_port = if is_vfio_ap_device(Path::new(&host_path)) {
|
||||
PCIePort::NoPort
|
||||
} else {
|
||||
port
|
||||
};
|
||||
vfio_configs.push(VfioDeviceBase {
|
||||
host_path: host_path.clone(),
|
||||
iommu_group_devnode: PathBuf::from(&host_path),
|
||||
dev_type: "c".to_string(),
|
||||
port: device_port,
|
||||
hostdev_prefix: "vfio_device".to_owned(),
|
||||
..Default::default()
|
||||
});
|
||||
}
|
||||
info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs);
|
||||
|
||||
Ok(vfio_configs
|
||||
.into_iter()
|
||||
.map(ResourceConfig::VfioDeviceModern)
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn prepare_network_resource(
|
||||
&self,
|
||||
network_env: &SandboxNetworkEnv,
|
||||
|
||||
Reference in New Issue
Block a user