feat(runtime-rs): expose visible_cdi_devices in config

Declare the `visible_cdi_devices` agent option (kernel param
agent.visible_cdi_devices) in kata-types so runtime-rs can opt into
emitting it to the guest, and expose it in the three NVIDIA GPU
configuration templates (qemu, qemu-snp, qemu-tdx) at runtime-rs/config/.

The agent consumes the corresponding VISIBLE_CDI_DEVICES env var to
drive CDI device requests.

Signed-off-by: LandonTClipp <lclipp@coreweave.com>
This commit is contained in:
LandonTClipp
2026-06-15 16:34:21 +00:00
committed by Fabiano Fidêncio
parent 676fc90d0b
commit b49eb577b2
5 changed files with 83 additions and 0 deletions

View File

@@ -93,6 +93,12 @@ pub struct Agent {
#[serde(default)]
pub debug_console_enabled: bool,
/// When enabled, the runtime emits the `agent.visible_cdi_devices=true`
/// kernel parameter and the agent translates a container's
/// VISIBLE_CDI_DEVICES environment variable into CDI GPU device requests
/// (nvidia.com/gpu) so that the container sees the matching GPUs present in
/// the VM. Defaults to `false`.
#[serde(default)]
pub visible_cdi_devices: bool,
/// Agent server port
#[serde(default = "default_server_port")]
pub server_port: u32,
@@ -180,6 +186,7 @@ impl std::default::Default for Agent {
log_level: "info".to_string(),
enable_tracing: false,
debug_console_enabled: false,
visible_cdi_devices: false,
server_port: DEFAULT_AGENT_VSOCK_PORT,
log_port: DEFAULT_AGENT_LOG_PORT,
passfd_listener_port: DEFAULT_PASSFD_LISTENER_PORT,

View File

@@ -61,6 +61,8 @@ pub const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
pub const LAUNCH_PROCESS_TIMEOUT_OPTION: &str = "agent.launch_process_timeout";
/// Option of setting the fd passthrough io listener port
pub const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port";
/// Option enabling translation of VISIBLE_CDI_DEVICES into CDI GPU requests
pub const VISIBLE_CDI_DEVICES_OPTION: &str = "agent.visible_cdi_devices";
/// Trait to manipulate global Kata configuration information.
pub trait ConfigPlugin: Send + Sync {
@@ -246,6 +248,9 @@ impl TomlConfig {
DEFAULT_AGENT_DBG_CONSOLE_PORT.to_string(),
);
}
if cfg.visible_cdi_devices {
kv.insert(VISIBLE_CDI_DEVICES_OPTION.to_string(), "true".to_string());
}
if cfg.mem_agent.enable {
kv.insert("psi".to_string(), "1".to_string());
kv.insert("agent.mem_agent_enable".to_string(), "1".to_string());
@@ -500,6 +505,7 @@ mod tests {
container_pipe_size: 20,
debug_console_enabled: true,
launch_process_timeout: 60,
visible_cdi_devices: true,
..Default::default()
};
let agent_name = "test_agent";
@@ -513,5 +519,6 @@ mod tests {
kv.get("agent.debug_console").unwrap();
assert_eq!(kv.get("agent.debug_console_vport").unwrap(), "1026"); // 1026 is the default port
assert_eq!(kv.get("agent.launch_process_timeout").unwrap(), "60");
assert_eq!(kv.get("agent.visible_cdi_devices").unwrap(), "true");
}
}

View File

@@ -565,6 +565,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
# Defaults to @DEFAULTTIMEOUT_NV@ second(s)
create_container_timeout = @DEFAULTTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI device requests (for example nvidia.com/gpu),
# resolved against the devices present in the VM via the CDI spec generated in
# the guest at /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This parameter is useful in the case where multiple containers in a pod need
# access to the same GPU and do not want to request additional GPUs from the
# outer runtime. This is especially useful with GPU observability where one
# workload container performs the CDI request to the outer runtime, and the
# sidecar observability containers would get access to the same resources by
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[agent.@PROJECT_TYPE@.mem_agent]
# Control the mem-agent function enable or disable.
# Default to false

View File

@@ -595,6 +595,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
# Defaults to @DEFAULTTIMEOUT_NV@ second(s)
create_container_timeout = @DEFAULTTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI device requests (for example nvidia.com/gpu),
# resolved against the devices present in the VM via the CDI spec generated in
# the guest at /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This parameter is useful in the case where multiple containers in a pod need
# access to the same GPU and do not want to request additional GPUs from the
# outer runtime. This is especially useful with GPU observability where one
# workload container performs the CDI request to the outer runtime, and the
# sidecar observability containers would get access to the same resources by
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -571,6 +571,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
# Defaults to @DEFAULTTIMEOUT_NV@ second(s)
create_container_timeout = @DEFAULTTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI device requests (for example nvidia.com/gpu),
# resolved against the devices present in the VM via the CDI spec generated in
# the guest at /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This parameter is useful in the case where multiple containers in a pod need
# access to the same GPU and do not want to request additional GPUs from the
# outer runtime. This is especially useful with GPU observability where one
# workload container performs the CDI request to the outer runtime, and the
# sidecar observability containers would get access to the same resources by
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log