feat(runtime): plumb VISIBLE_CDI_DEVICES through the Go runtime

Add a `visible_cdi_devices` TOML option to the Go runtime so the
agent.visible_cdi_devices=true kernel parameter is emitted to the guest
when enabled. Wire the option through the NVIDIA GPU configuration
templates and add tests verifying the kernel-params flow.

Signed-off-by: LandonTClipp <lclipp@coreweave.com>
This commit is contained in:
LandonTClipp
2026-06-15 16:35:11 +00:00
committed by Fabiano Fidêncio
parent b49eb577b2
commit a1dd28cb52
7 changed files with 121 additions and 38 deletions

View File

@@ -612,6 +612,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
# against the GPUs present in the VM via the CDI spec generated in the guest at
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This option is useful when multiple containers in a pod need access to the
# same GPU without requesting additional GPUs from the outer runtime. It is
# especially useful for GPU observability: one workload container performs the
# CDI request to the outer runtime, and the sidecar observability containers
# get access to the same resources by setting
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -589,6 +589,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
# against the GPUs present in the VM via the CDI spec generated in the guest at
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This option is useful when multiple containers in a pod need access to the
# same GPU without requesting additional GPUs from the outer runtime. It is
# especially useful for GPU observability: one workload container performs the
# CDI request to the outer runtime, and the sidecar observability containers
# get access to the same resources by setting
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -591,6 +591,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
# against the GPUs present in the VM via the CDI spec generated in the guest at
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
#
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
#
# For example, you may set something like:
#
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
#
# The devices can be referenced by explicit CDI index or through the "all"
# keyword.
#
# This option is useful when multiple containers in a pod need access to the
# same GPU without requesting additional GPUs from the outer runtime. It is
# especially useful for GPU observability: one workload container performs the
# CDI request to the outer runtime, and the sidecar observability containers
# get access to the same resources by setting
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
# (default: false)
visible_cdi_devices = false
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -229,6 +229,7 @@ type agent struct {
DialTimeout uint32 `toml:"dial_timeout"`
CdhApiTimeout uint32 `toml:"cdh_api_timeout"`
LaunchProcessTimeout uint32 `toml:"launch_process_timeout"`
VisibleCdiDevices bool `toml:"visible_cdi_devices"`
}
func (orig *tomlConfig) Clone() tomlConfig {
@@ -801,6 +802,10 @@ func (a agent) launchProcessTimeout() uint32 {
return a.LaunchProcessTimeout
}
// visibleCdiDevices reports the value of the agent's VisibleCdiDevices
// field, which is populated from the visible_cdi_devices TOML key.
func (a agent) visibleCdiDevices() bool {
	enabled := a.VisibleCdiDevices
	return enabled
}
// debug reports the value of the agent's Debug field.
func (a agent) debug() bool {
	enabled := a.Debug
	return enabled
}
@@ -1472,6 +1477,7 @@ func updateRuntimeConfigAgent(configPath string, tomlConf tomlConfig, config *oc
DialTimeout: agent.dialTimout(),
CdhApiTimeout: agent.cdhApiTimout(),
LaunchProcessTimeout: agent.launchProcessTimeout(),
VisibleCdiDevices: agent.visibleCdiDevices(),
}
}

View File

@@ -303,6 +303,7 @@ type KataAgentConfig struct {
Debug bool
Trace bool
EnableDebugConsole bool
VisibleCdiDevices bool
Policy string
}
@@ -374,6 +375,10 @@ func KataAgentKernelParams(config KataAgentConfig) []Param {
params = append(params, Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: launchProcessTimeout})
}
if config.VisibleCdiDevices {
params = append(params, Param{Key: vcAnnotations.VisibleCdiDevicesKernelParam, Value: "true"})
}
return params
}

View File

@@ -1083,14 +1083,14 @@ func TestKataCleanupSandbox(t *testing.T) {
}
func TestKataAgentKernelParams(t *testing.T) {
assert := assert.New(t)
// nolint: govet
type testData struct {
name string
debug bool
trace bool
containerPipeSize uint32
launchProcessTimeout uint32
visibleCdiDevices bool
expectedParams []Param
}
@@ -1099,60 +1099,57 @@ func TestKataAgentKernelParams(t *testing.T) {
containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"}
launchProcessTimeoutParam := Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: "60"}
visibleCdiDevicesParam := Param{Key: "agent.visible_cdi_devices", Value: "true"}
data := []testData{
{false, false, 0, 0, []Param{}},
{name: "no options", expectedParams: []Param{}},
// Debug
{true, false, 0, 0, []Param{debugParam}},
{name: "debug", debug: true, expectedParams: []Param{debugParam}},
// Tracing
{false, true, 0, 0, []Param{traceParam}},
{name: "tracing", trace: true, expectedParams: []Param{traceParam}},
// Debug + Tracing
{true, true, 0, 0, []Param{debugParam, traceParam}},
{name: "debug and tracing", debug: true, trace: true, expectedParams: []Param{debugParam, traceParam}},
// pipesize
{false, false, 2097152, 0, []Param{containerPipeSizeParam}},
{name: "pipesize", containerPipeSize: 2097152, expectedParams: []Param{containerPipeSizeParam}},
// Debug + pipesize
{true, false, 2097152, 0, []Param{debugParam, containerPipeSizeParam}},
{name: "debug and pipesize", debug: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, containerPipeSizeParam}},
// Tracing + pipesize
{false, true, 2097152, 0, []Param{traceParam, containerPipeSizeParam}},
{name: "tracing and pipesize", trace: true, containerPipeSize: 2097152, expectedParams: []Param{traceParam, containerPipeSizeParam}},
// Debug + Tracing + pipesize
{true, true, 2097152, 0, []Param{debugParam, traceParam, containerPipeSizeParam}},
{name: "debug, tracing and pipesize", debug: true, trace: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, traceParam, containerPipeSizeParam}},
// LaunchProcessTimeout
{false, false, 0, 60, []Param{launchProcessTimeoutParam}},
{name: "launch process timeout", launchProcessTimeout: 60, expectedParams: []Param{launchProcessTimeoutParam}},
// Debug + LaunchProcessTimeout
{true, false, 0, 60, []Param{debugParam, launchProcessTimeoutParam}},
{name: "debug and launch process timeout", debug: true, launchProcessTimeout: 60, expectedParams: []Param{debugParam, launchProcessTimeoutParam}},
{name: "visible cdi devices", visibleCdiDevices: true, expectedParams: []Param{visibleCdiDevicesParam}},
}
for i, d := range data {
config := KataAgentConfig{
Debug: d.debug,
Trace: d.trace,
ContainerPipeSize: d.containerPipeSize,
LaunchProcessTimeout: d.launchProcessTimeout,
}
for _, d := range data {
t.Run(d.name, func(t *testing.T) {
assert := assert.New(t)
count := len(d.expectedParams)
config := KataAgentConfig{
Debug: d.debug,
Trace: d.trace,
ContainerPipeSize: d.containerPipeSize,
LaunchProcessTimeout: d.launchProcessTimeout,
VisibleCdiDevices: d.visibleCdiDevices,
}
params := KataAgentKernelParams(config)
params := KataAgentKernelParams(config)
if count == 0 {
assert.Emptyf(params, "test %d (%+v)", i, d)
continue
}
if len(d.expectedParams) == 0 {
assert.Empty(params)
return
}
assert.Len(params, count)
assert.Len(params, len(d.expectedParams))
for _, p := range d.expectedParams {
assert.Containsf(params, p, "test %d (%+v)", i, d)
}
for _, p := range d.expectedParams {
assert.Contains(params, p)
}
})
}
}

View File

@@ -339,6 +339,12 @@ const (
LaunchProcessTimeoutOption = "launch_process_timeout"
LaunchProcessTimeoutKernelParam = "agent." + LaunchProcessTimeoutOption
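// VisibleCdiDevicesOption is the agent option that, when enabled, lets the
// agent translate a container's VISIBLE_CDI_DEVICES environment variable
// into CDI GPU device requests inside the guest.
// VisibleCdiDevicesKernelParam is the matching "agent."-prefixed kernel
// parameter name.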
VisibleCdiDevicesOption = "visible_cdi_devices"
VisibleCdiDevicesKernelParam = "agent." + VisibleCdiDevicesOption
// Policy is an annotation containing the contents of an agent policy file, base64 encoded.
Policy = kataAnnotAgentPrefix + "policy"
)