mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 06:28:11 +00:00
feat(runtime-rs): expose visible_cdi_devices in config
Declare the `visible_cdi_devices` agent option (kernel param agent.visible_cdi_devices) in kata-types so runtime-rs can opt into emitting it to the guest, and expose it in the three NVIDIA GPU configuration templates (qemu, qemu-snp, qemu-tdx) at runtime-rs/config/. The agent consumes the corresponding VISIBLE_CDI_DEVICES env var to drive CDI device requests. Signed-off-by: LandonTClipp <lclipp@coreweave.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
676fc90d0b
commit
b49eb577b2
@@ -93,6 +93,12 @@ pub struct Agent {
|
||||
#[serde(default)]
|
||||
pub debug_console_enabled: bool,
|
||||
|
||||
/// When enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
/// environment variable into CDI GPU device requests (nvidia.com/gpu) so
|
||||
/// that the container sees the matching GPUs present in the VM.
|
||||
#[serde(default)]
|
||||
pub visible_cdi_devices: bool,
|
||||
|
||||
/// Agent server port
|
||||
#[serde(default = "default_server_port")]
|
||||
pub server_port: u32,
|
||||
@@ -180,6 +186,7 @@ impl std::default::Default for Agent {
|
||||
log_level: "info".to_string(),
|
||||
enable_tracing: false,
|
||||
debug_console_enabled: false,
|
||||
visible_cdi_devices: false,
|
||||
server_port: DEFAULT_AGENT_VSOCK_PORT,
|
||||
log_port: DEFAULT_AGENT_LOG_PORT,
|
||||
passfd_listener_port: DEFAULT_PASSFD_LISTENER_PORT,
|
||||
|
||||
@@ -61,6 +61,8 @@ pub const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
|
||||
pub const LAUNCH_PROCESS_TIMEOUT_OPTION: &str = "agent.launch_process_timeout";
|
||||
/// Option of setting the fd passthrough io listener port
|
||||
pub const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port";
|
||||
/// Option enabling translation of VISIBLE_CDI_DEVICES into CDI GPU requests
|
||||
pub const VISIBLE_CDI_DEVICES_OPTION: &str = "agent.visible_cdi_devices";
|
||||
|
||||
/// Trait to manipulate global Kata configuration information.
|
||||
pub trait ConfigPlugin: Send + Sync {
|
||||
@@ -246,6 +248,9 @@ impl TomlConfig {
|
||||
DEFAULT_AGENT_DBG_CONSOLE_PORT.to_string(),
|
||||
);
|
||||
}
|
||||
if cfg.visible_cdi_devices {
|
||||
kv.insert(VISIBLE_CDI_DEVICES_OPTION.to_string(), "true".to_string());
|
||||
}
|
||||
if cfg.mem_agent.enable {
|
||||
kv.insert("psi".to_string(), "1".to_string());
|
||||
kv.insert("agent.mem_agent_enable".to_string(), "1".to_string());
|
||||
@@ -500,6 +505,7 @@ mod tests {
|
||||
container_pipe_size: 20,
|
||||
debug_console_enabled: true,
|
||||
launch_process_timeout: 60,
|
||||
visible_cdi_devices: true,
|
||||
..Default::default()
|
||||
};
|
||||
let agent_name = "test_agent";
|
||||
@@ -513,5 +519,6 @@ mod tests {
|
||||
kv.get("agent.debug_console").unwrap();
|
||||
assert_eq!(kv.get("agent.debug_console_vport").unwrap(), "1026"); // 1026 is the default port
|
||||
assert_eq!(kv.get("agent.launch_process_timeout").unwrap(), "60");
|
||||
assert_eq!(kv.get("agent.visible_cdi_devices").unwrap(), "true");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -565,6 +565,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI device requests (e.g. nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This parameter is useful when multiple containers in a pod need
|
||||
# access to the same GPU and do not want to request additional GPUs from the
|
||||
# outer runtime. This is especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[agent.@PROJECT_TYPE@.mem_agent]
|
||||
# Control the mem-agent function enable or disable.
|
||||
# Default to false
|
||||
|
||||
@@ -595,6 +595,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI device requests (e.g. nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This parameter is useful when multiple containers in a pod need
|
||||
# access to the same GPU and do not want to request additional GPUs from the
|
||||
# outer runtime. This is especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
|
||||
@@ -571,6 +571,29 @@ reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI device requests (e.g. nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This parameter is useful when multiple containers in a pod need
|
||||
# access to the same GPU and do not want to request additional GPUs from the
|
||||
# outer runtime. This is especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
|
||||
Reference in New Issue
Block a user