From 241f33dd53a54ad54bb7ece6898fa1364a925a8f Mon Sep 17 00:00:00 2001 From: Alex Lyn Date: Sun, 12 Apr 2026 16:30:10 +0200 Subject: [PATCH] runtime-rs: Add Pod Resources CDI discovery in sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Query the kubelet Pod Resources API during sandbox setup to discover which GPU devices have been allocated to the pod. When cold_plug_vfio is enabled, the sandbox resolves CDI device specs, extracts host PCI addresses and IOMMU groups from sysfs, and creates VfioModernCfg device entries that get passed to the hypervisor for cold-plug. Add pod-resources and cdi crate dependencies to the runtimes and virt_container workspace members. Signed-off-by: Alex Lyn Signed-off-by: Fabiano Fidêncio --- Cargo.lock | 1 + .../kata-types/src/config/hypervisor/mod.rs | 8 ++ .../crates/runtimes/virt_container/Cargo.toml | 1 + .../runtimes/virt_container/src/sandbox.rs | 96 +++++++++++++++++-- src/tools/agent-ctl/Cargo.lock | 2 + 5 files changed, 102 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d6a6d20be..f45ee5b033 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8615,6 +8615,7 @@ dependencies = [ "nix 0.26.4", "oci-spec 0.8.4", "persist", + "pod-resources-rs", "protobuf", "resource", "runtime-spec", diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index 776b781a4c..8edfeaacd7 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -787,6 +787,14 @@ pub struct DeviceInfo { #[serde(default)] pub hotplug_vfio_on_root_bus: bool, + /// Cold-plug VFIO devices to a PCIe port type. + /// + /// Accepted values: `"no-port"` (default, disabled), `"root-port"`. + /// In confidential compute environments hot-plugging can compromise + /// security, so devices are cold-plugged instead. 
+ #[serde(default)] + pub cold_plug_vfio: String, + /// Number of PCIe root ports to create during VM creation. /// /// Valid when `hotplug_vfio_on_root_bus = true` and `machine_type = "q35"`. diff --git a/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml index 13e9524c40..74ff8a3163 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml +++ b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml @@ -30,6 +30,7 @@ agent = { workspace = true } common = { workspace = true } hypervisor = { workspace = true, features = ["cloud-hypervisor"] } kata-sys-util = { workspace = true } +pod-resources-rs = { workspace = true } kata-types = { workspace = true } logging = { workspace = true } runtime-spec = { workspace = true } diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index d887de1ece..4de5cf1de9 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -24,6 +24,7 @@ use common::{ }; use containerd_shim_protos::events::task::{TaskExit, TaskOOM}; +use hypervisor::device::topology::PCIePort; use hypervisor::VsockConfig; #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] use hypervisor::{firecracker::Firecracker, HYPERVISOR_FIRECRACKER}; @@ -47,6 +48,7 @@ use hypervisor::{ use hypervisor::{BlockConfig, Hypervisor}; use hypervisor::{BlockDeviceAio, PortDeviceConfig}; use hypervisor::{ProtectionDeviceConfig, SevSnpConfig, TdxConfig}; +use hypervisor::VfioDeviceBase; use kata_sys_util::hooks::HookStates; use kata_sys_util::protection::{available_guest_protection, GuestProtection}; use kata_sys_util::spec::load_oci_spec; @@ -61,6 +63,7 @@ use kata_types::config::{hypervisor::Factory, TomlConfig}; use kata_types::initdata::{calculate_initdata_digest, ProtectedPlatform}; use oci_spec::runtime as oci; use 
persist::{self, sandbox_persist::Persist}; +use pod_resources_rs::handle_cdi_devices; use protobuf::SpecialFields; use resource::coco_data::initdata::{ kata_shared_init_data_path, InitDataConfig, KATA_INIT_DATA_IMAGE, @@ -70,7 +73,7 @@ use resource::manager::ManagerArgs; use resource::network::{dan_config_path, DanNetworkConfig, NetworkConfig, NetworkWithNetNsConfig}; use resource::{ResourceConfig, ResourceManager}; use runtime_spec as spec; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::Arc; use strum::Display; use tokio::sync::{mpsc::Sender, Mutex, RwLock}; @@ -175,7 +178,7 @@ impl VirtSandbox { async fn prepare_for_start_sandbox( &self, id: &str, - network_env: SandboxNetworkEnv, + sandbox_config: &SandboxConfig, ) -> Result> { let mut resource_configs = vec![]; @@ -186,6 +189,7 @@ impl VirtSandbox { .context("failed to prepare vm socket config")?; resource_configs.push(vm_socket_config); + let network_env: SandboxNetworkEnv = sandbox_config.network_env.clone(); // prepare network config if !network_env.network_created { if let Some(network_resource) = self.prepare_network_resource(&network_env).await { @@ -221,6 +225,17 @@ impl VirtSandbox { None }; + let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?; + if !vfio_devices.is_empty() { + info!( + sl!(), + "prepare pod devices {vfio_devices:?} for sandbox done." 
+ ); + resource_configs.extend(vfio_devices); + } else { + info!(sl!(), "no pod devices to prepare for sandbox."); + } + // prepare protection device config if let Some(protection_dev_config) = self .prepare_protection_device_config(&self.hypervisor.hypervisor_config().await, init_data) @@ -266,6 +281,77 @@ impl VirtSandbox { } } + async fn prepare_coldplug_cdi_devices( + &self, + sandbox_config: &SandboxConfig, + ) -> Result> { + let hypervisor_config = self.hypervisor.hypervisor_config().await; + let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio; + if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" { + return Ok(Vec::new()); + } + + let port = match cold_plug_vfio.as_str() { + "root-port" => PCIePort::RootPort, + other => { + return Err(anyhow!( + "unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported", + other + )) + } + }; + + let config = self.resource_manager.config().await; + let pod_resource_socket = &config.runtime.pod_resource_api_sock; + info!( + sl!(), + "sandbox pod_resource_socket: {:?}", pod_resource_socket + ); + if pod_resource_socket.is_empty() || !Path::new(pod_resource_socket).exists() { + return Ok(Vec::new()); + } + + let annotations = &sandbox_config.annotations; + debug!( + sl!(), + "cold-plug: sandbox-name={:?} sandbox-namespace={:?}", + annotations.get("io.kubernetes.cri.sandbox-name"), + annotations.get("io.kubernetes.cri.sandbox-namespace") + ); + + let cdi_devices = pod_resources_rs::pod_resources::get_pod_cdi_devices( + pod_resource_socket, + annotations, + ) + .await + .context("failed to query Pod Resources CDI devices")?; + info!(sl!(), "pod cdi devices: {:?}", cdi_devices); + + let device_nodes = handle_cdi_devices(&cdi_devices).await?; + let paths: Vec = device_nodes + .iter() + .filter_map(pod_resources_rs::device_node_host_path) + .collect(); + + let mut vfio_configs = Vec::new(); + for path in paths.iter() { + let dev_info = VfioDeviceBase { + host_path: path.clone(), + 
iommu_group_devnode: PathBuf::from(path), + dev_type: "c".to_string(), + port, + hostdev_prefix: "vfio_device".to_owned(), + ..Default::default() + }; + vfio_configs.push(dev_info); + } + + Ok(vfio_configs + .into_iter() + .map(ResourceConfig::VfioDeviceModern) + .collect()) + } + async fn prepare_network_resource( &self, network_env: &SandboxNetworkEnv, @@ -656,9 +742,7 @@ impl Sandbox for VirtSandbox { // generate device and setup before start vm // should after hypervisor.prepare_vm - let resources = self - .prepare_for_start_sandbox(id, sandbox_config.network_env.clone()) - .await?; + let resources = self.prepare_for_start_sandbox(id, sandbox_config).await?; self.resource_manager .prepare_before_start_vm(resources) @@ -841,7 +925,7 @@ impl Sandbox for VirtSandbox { // generate device and setup before start vm // should after hypervisor.prepare_vm let resources = self - .prepare_for_start_sandbox(id, sandbox_config.network_env.clone()) + .prepare_for_start_sandbox(id, sandbox_config) .await .context("prepare resources before start vm")?; diff --git a/src/tools/agent-ctl/Cargo.lock b/src/tools/agent-ctl/Cargo.lock index 8522a5a35c..72be9c7019 100644 --- a/src/tools/agent-ctl/Cargo.lock +++ b/src/tools/agent-ctl/Cargo.lock @@ -2139,6 +2139,7 @@ dependencies = [ "libc", "logging", "nix 0.26.4", + "once_cell", "path-clean", "persist", "protocols", @@ -2146,6 +2147,7 @@ dependencies = [ "qapi-qmp", "qapi-spec", "rand 0.10.1", + "regex", "rust-ini", "safe-path 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "seccompiler",