mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-12 23:04:33 +00:00
runtime-rs: Add Pod Resources CDI discovery in sandbox
Query the kubelet Pod Resources API during sandbox setup to discover which GPU devices have been allocated to the pod. When cold_plug_vfio is enabled, the sandbox resolves CDI device specs, extracts host PCI addresses and IOMMU groups from sysfs, and creates VfioModernCfg device entries that get passed to the hypervisor for cold-plug. Add pod-resources and cdi crate dependencies to the runtimes and virt_container workspace members. Signed-off-by: Alex Lyn <alex.lyn@antgroup.com> Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
0d331cfe9f
commit
e04444da8e
@@ -29,3 +29,4 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
shim = { path = "crates/shim" }
|
||||
common = { workspace = true }
|
||||
runtimes = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
|
||||
@@ -41,6 +41,7 @@ kata-types = { workspace = true }
|
||||
protocols = { workspace = true }
|
||||
protobuf = { workspace = true }
|
||||
kata-sys-util = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
logging = { workspace = true }
|
||||
runtime-spec = { workspace = true }
|
||||
shim-interface = { workspace = true }
|
||||
|
||||
@@ -8,6 +8,7 @@ license = { workspace = true }
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
container-device-interface = "0.1.2"
|
||||
awaitgroup = "0.6.0"
|
||||
containerd-shim-protos = { workspace = true }
|
||||
libc = { workspace = true }
|
||||
@@ -30,6 +31,7 @@ agent = { workspace = true }
|
||||
common = { workspace = true }
|
||||
hypervisor = { workspace = true, features = ["cloud-hypervisor"] }
|
||||
kata-sys-util = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
kata-types = { workspace = true }
|
||||
logging = { workspace = true }
|
||||
runtime-spec = { workspace = true }
|
||||
|
||||
@@ -23,8 +23,9 @@ use common::{
|
||||
ContainerManager, Sandbox, SandboxNetworkEnv,
|
||||
};
|
||||
|
||||
use container_device_interface::specs::config::DeviceNode as CdiSpecDeviceNode;
|
||||
use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
|
||||
use hypervisor::VsockConfig;
|
||||
use hypervisor::device::topology::PCIePort;
|
||||
use hypervisor::HYPERVISOR_FIRECRACKER;
|
||||
use hypervisor::HYPERVISOR_REMOTE;
|
||||
#[cfg(feature = "dragonball")]
|
||||
@@ -37,6 +38,7 @@ use hypervisor::{
|
||||
use hypervisor::{BlockConfig, Hypervisor};
|
||||
use hypervisor::{BlockDeviceAio, PortDeviceConfig};
|
||||
use hypervisor::{ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
|
||||
use hypervisor::{VfioDeviceBase, VsockConfig};
|
||||
use kata_sys_util::hooks::HookStates;
|
||||
use kata_sys_util::protection::{available_guest_protection, GuestProtection};
|
||||
use kata_sys_util::spec::load_oci_spec;
|
||||
@@ -47,6 +49,7 @@ use kata_types::config::{hypervisor::Factory, TomlConfig};
|
||||
use kata_types::initdata::{calculate_initdata_digest, ProtectedPlatform};
|
||||
use oci_spec::runtime as oci;
|
||||
use persist::{self, sandbox_persist::Persist};
|
||||
use pod_resources_rs::handle_cdi_devices;
|
||||
use protobuf::SpecialFields;
|
||||
use resource::coco_data::initdata::{
|
||||
kata_shared_init_data_path, InitDataConfig, KATA_INIT_DATA_IMAGE,
|
||||
@@ -56,14 +59,26 @@ use resource::manager::ManagerArgs;
|
||||
use resource::network::{dan_config_path, DanNetworkConfig, NetworkConfig, NetworkWithNetNsConfig};
|
||||
use resource::{ResourceConfig, ResourceManager};
|
||||
use runtime_spec as spec;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use strum::Display;
|
||||
use tokio::sync::{mpsc::Sender, Mutex, RwLock};
|
||||
use tokio::time;
|
||||
use tracing::instrument;
|
||||
|
||||
pub(crate) const VIRTCONTAINER: &str = "virt_container";
|
||||
|
||||
/// Host path for CDI device nodes (`hostPath` when set, else `path`). Needed because
|
||||
/// `container_device_interface::specs::config::DeviceNode` fields are crate-private.
|
||||
fn cdi_device_node_host_path(dn: &CdiSpecDeviceNode) -> Option<String> {
|
||||
serde_json::to_value(dn).ok().and_then(|v| {
|
||||
v.get("hostPath")
|
||||
.or_else(|| v.get("path"))
|
||||
.and_then(|p| p.as_str())
|
||||
.map(String::from)
|
||||
})
|
||||
}
|
||||
|
||||
pub struct SandboxRestoreArgs {
|
||||
pub sid: String,
|
||||
pub toml_config: TomlConfig,
|
||||
@@ -161,7 +176,7 @@ impl VirtSandbox {
|
||||
async fn prepare_for_start_sandbox(
|
||||
&self,
|
||||
id: &str,
|
||||
network_env: SandboxNetworkEnv,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let mut resource_configs = vec![];
|
||||
|
||||
@@ -172,6 +187,7 @@ impl VirtSandbox {
|
||||
.context("failed to prepare vm socket config")?;
|
||||
resource_configs.push(vm_socket_config);
|
||||
|
||||
let network_env: SandboxNetworkEnv = sandbox_config.network_env.clone();
|
||||
// prepare network config
|
||||
if !network_env.network_created {
|
||||
if let Some(network_resource) = self.prepare_network_resource(&network_env).await {
|
||||
@@ -207,6 +223,17 @@ impl VirtSandbox {
|
||||
None
|
||||
};
|
||||
|
||||
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
if !vfio_devices.is_empty() {
|
||||
info!(
|
||||
sl!(),
|
||||
"prepare pod devices {vfio_devices:?} for sandbox done."
|
||||
);
|
||||
resource_configs.extend(vfio_devices);
|
||||
} else {
|
||||
info!(sl!(), "no pod devices to prepare for sandbox.");
|
||||
}
|
||||
|
||||
// prepare protection device config
|
||||
if let Some(protection_dev_config) = self
|
||||
.prepare_protection_device_config(&self.hypervisor.hypervisor_config().await, init_data)
|
||||
@@ -252,6 +279,62 @@ impl VirtSandbox {
|
||||
}
|
||||
}
|
||||
|
||||
async fn prepare_coldplug_cdi_devices(
|
||||
&self,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let config = self.resource_manager.config().await;
|
||||
let pod_resource_socket = &config.runtime.pod_resource_api_sock;
|
||||
info!(
|
||||
sl!(),
|
||||
"sandbox pod_resource_socket: {:?}", pod_resource_socket
|
||||
);
|
||||
// If pod_resource_socket is empty, we should treat it as not support such function.
|
||||
if !Path::new(pod_resource_socket).exists() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let annotations = &sandbox_config.annotations;
|
||||
info!(sl!(), "sandbox annotations: {:?}", annotations);
|
||||
|
||||
let cdi_devices = (pod_resources_rs::pod_resources::get_pod_cdi_devices(
|
||||
pod_resource_socket,
|
||||
annotations,
|
||||
)
|
||||
.await)
|
||||
.unwrap_or_default();
|
||||
info!(sl!(), "pod cdi devices: {:?}", cdi_devices);
|
||||
|
||||
let device_nodes = handle_cdi_devices(&cdi_devices, time::Duration::from_secs(100)).await?;
|
||||
let paths: Vec<String> = device_nodes
|
||||
.iter()
|
||||
.filter_map(cdi_device_node_host_path)
|
||||
.collect();
|
||||
|
||||
// FQN: nvidia.com/gpu=X
|
||||
let mut vfio_configs = Vec::new();
|
||||
for path in paths.iter() {
|
||||
let dev_info = VfioDeviceBase {
|
||||
host_path: path.clone(),
|
||||
// CDI passes the per-device cdev (e.g. /dev/vfio/devices/vfio0); device_manager
|
||||
// also copies host_path here — set early so configs are self-consistent in logs
|
||||
// and any code path that runs before that assignment still discovers VFIO correctly.
|
||||
iommu_group_devnode: PathBuf::from(path),
|
||||
dev_type: "c".to_string(),
|
||||
// bus_type: bus_type.clone(),
|
||||
port: PCIePort::RootPort,
|
||||
hostdev_prefix: "vfio_device".to_owned(),
|
||||
..Default::default()
|
||||
};
|
||||
vfio_configs.push(dev_info);
|
||||
}
|
||||
|
||||
Ok(vfio_configs
|
||||
.into_iter()
|
||||
.map(ResourceConfig::VfioDeviceModern)
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn prepare_network_resource(
|
||||
&self,
|
||||
network_env: &SandboxNetworkEnv,
|
||||
@@ -602,9 +685,7 @@ impl Sandbox for VirtSandbox {
|
||||
|
||||
// generate device and setup before start vm
|
||||
// must run after hypervisor.prepare_vm
|
||||
let resources = self
|
||||
.prepare_for_start_sandbox(id, sandbox_config.network_env.clone())
|
||||
.await?;
|
||||
let resources = self.prepare_for_start_sandbox(id, sandbox_config).await?;
|
||||
|
||||
self.resource_manager
|
||||
.prepare_before_start_vm(resources)
|
||||
@@ -786,7 +867,7 @@ impl Sandbox for VirtSandbox {
|
||||
// generate device and setup before start vm
|
||||
// must run after hypervisor.prepare_vm
|
||||
let resources = self
|
||||
.prepare_for_start_sandbox(id, sandbox_config.network_env.clone())
|
||||
.prepare_for_start_sandbox(id, sandbox_config)
|
||||
.await
|
||||
.context("prepare resources before start vm")?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user