mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-06-30 22:21:05 +00:00
feat(runtime): plumb VISIBLE_CDI_DEVICES through the Go runtime
Add a `visible_cdi_devices` TOML option to the Go runtime so the agent.visible_cdi_devices=true kernel parameter is emitted to the guest when enabled. Wire the option through the NVIDIA GPU configuration templates and add tests verifying the kernel-params flow. Signed-off-by: LandonTClipp <lclipp@coreweave.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
b49eb577b2
commit
a1dd28cb52
@@ -612,6 +612,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
|
||||
# (agent default when unset: 6)
|
||||
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This option is useful when multiple containers in a pod need access to the
|
||||
# same GPU without requesting additional GPUs from the outer runtime. This is
|
||||
# especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
|
||||
@@ -589,6 +589,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
|
||||
# (agent default when unset: 6)
|
||||
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This option is useful when multiple containers in a pod need access to the
|
||||
# same GPU without requesting additional GPUs from the outer runtime. This is
|
||||
# especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
|
||||
@@ -591,6 +591,29 @@ dial_timeout = @DEFAULTTIMEOUT_NV@
|
||||
# (agent default when unset: 6)
|
||||
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
|
||||
|
||||
# If enabled, the agent translates a container's VISIBLE_CDI_DEVICES
|
||||
# environment variable into CDI GPU device requests (nvidia.com/gpu), resolved
|
||||
# against the GPUs present in the VM via the CDI spec generated in the guest at
|
||||
# /var/run/cdi/. The format of VISIBLE_CDI_DEVICES is:
|
||||
#
|
||||
# <cdi-kind>=<devices>[:<cdi-kind>=<devices>[...]]
|
||||
#
|
||||
# For example, you may set something like:
|
||||
#
|
||||
# VISIBLE_CDI_DEVICES="nvidia.com/gpu=all:nvidia.com/ib=0,1"
|
||||
#
|
||||
# The devices can be referenced by explicit CDI index or through the "all"
|
||||
# keyword.
|
||||
#
|
||||
# This option is useful when multiple containers in a pod need access to the
|
||||
# same GPU without requesting additional GPUs from the outer runtime. This is
|
||||
# especially useful for GPU observability, where one
|
||||
# workload container performs the CDI request to the outer runtime, and the
|
||||
# sidecar observability containers would get access to the same resources by
|
||||
# setting VISIBLE_CDI_DEVICES="nvidia.com/gpu=all".
|
||||
# (default: false)
|
||||
visible_cdi_devices = false
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
|
||||
@@ -229,6 +229,7 @@ type agent struct {
|
||||
DialTimeout uint32 `toml:"dial_timeout"`
|
||||
CdhApiTimeout uint32 `toml:"cdh_api_timeout"`
|
||||
LaunchProcessTimeout uint32 `toml:"launch_process_timeout"`
|
||||
VisibleCdiDevices bool `toml:"visible_cdi_devices"`
|
||||
}
|
||||
|
||||
func (orig *tomlConfig) Clone() tomlConfig {
|
||||
@@ -801,6 +802,10 @@ func (a agent) launchProcessTimeout() uint32 {
|
||||
return a.LaunchProcessTimeout
|
||||
}
|
||||
|
||||
// visibleCdiDevices returns the VisibleCdiDevices field of the agent
// configuration unchanged.
func (a agent) visibleCdiDevices() bool {
|
||||
return a.VisibleCdiDevices
|
||||
}
|
||||
|
||||
// debug returns the Debug field of the agent configuration unchanged.
func (a agent) debug() bool {
|
||||
return a.Debug
|
||||
}
|
||||
@@ -1472,6 +1477,7 @@ func updateRuntimeConfigAgent(configPath string, tomlConf tomlConfig, config *oc
|
||||
DialTimeout: agent.dialTimout(),
|
||||
CdhApiTimeout: agent.cdhApiTimout(),
|
||||
LaunchProcessTimeout: agent.launchProcessTimeout(),
|
||||
VisibleCdiDevices: agent.visibleCdiDevices(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -303,6 +303,7 @@ type KataAgentConfig struct {
|
||||
Debug bool
|
||||
Trace bool
|
||||
EnableDebugConsole bool
|
||||
VisibleCdiDevices bool
|
||||
Policy string
|
||||
}
|
||||
|
||||
@@ -374,6 +375,10 @@ func KataAgentKernelParams(config KataAgentConfig) []Param {
|
||||
params = append(params, Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: launchProcessTimeout})
|
||||
}
|
||||
|
||||
if config.VisibleCdiDevices {
|
||||
params = append(params, Param{Key: vcAnnotations.VisibleCdiDevicesKernelParam, Value: "true"})
|
||||
}
|
||||
|
||||
return params
|
||||
}
|
||||
|
||||
|
||||
@@ -1083,14 +1083,14 @@ func TestKataCleanupSandbox(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestKataAgentKernelParams(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// nolint: govet
|
||||
type testData struct {
|
||||
name string
|
||||
debug bool
|
||||
trace bool
|
||||
containerPipeSize uint32
|
||||
launchProcessTimeout uint32
|
||||
visibleCdiDevices bool
|
||||
expectedParams []Param
|
||||
}
|
||||
|
||||
@@ -1099,60 +1099,57 @@ func TestKataAgentKernelParams(t *testing.T) {
|
||||
|
||||
containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"}
|
||||
launchProcessTimeoutParam := Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: "60"}
|
||||
visibleCdiDevicesParam := Param{Key: "agent.visible_cdi_devices", Value: "true"}
|
||||
|
||||
data := []testData{
|
||||
{false, false, 0, 0, []Param{}},
|
||||
{name: "no options", expectedParams: []Param{}},
|
||||
|
||||
// Debug
|
||||
{true, false, 0, 0, []Param{debugParam}},
|
||||
{name: "debug", debug: true, expectedParams: []Param{debugParam}},
|
||||
|
||||
// Tracing
|
||||
{false, true, 0, 0, []Param{traceParam}},
|
||||
{name: "tracing", trace: true, expectedParams: []Param{traceParam}},
|
||||
|
||||
// Debug + Tracing
|
||||
{true, true, 0, 0, []Param{debugParam, traceParam}},
|
||||
{name: "debug and tracing", debug: true, trace: true, expectedParams: []Param{debugParam, traceParam}},
|
||||
|
||||
// pipesize
|
||||
{false, false, 2097152, 0, []Param{containerPipeSizeParam}},
|
||||
{name: "pipesize", containerPipeSize: 2097152, expectedParams: []Param{containerPipeSizeParam}},
|
||||
|
||||
// Debug + pipesize
|
||||
{true, false, 2097152, 0, []Param{debugParam, containerPipeSizeParam}},
|
||||
{name: "debug and pipesize", debug: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, containerPipeSizeParam}},
|
||||
|
||||
// Tracing + pipesize
|
||||
{false, true, 2097152, 0, []Param{traceParam, containerPipeSizeParam}},
|
||||
{name: "tracing and pipesize", trace: true, containerPipeSize: 2097152, expectedParams: []Param{traceParam, containerPipeSizeParam}},
|
||||
|
||||
// Debug + Tracing + pipesize
|
||||
{true, true, 2097152, 0, []Param{debugParam, traceParam, containerPipeSizeParam}},
|
||||
{name: "debug, tracing and pipesize", debug: true, trace: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, traceParam, containerPipeSizeParam}},
|
||||
|
||||
// LaunchProcessTimeout
|
||||
{false, false, 0, 60, []Param{launchProcessTimeoutParam}},
|
||||
{name: "launch process timeout", launchProcessTimeout: 60, expectedParams: []Param{launchProcessTimeoutParam}},
|
||||
|
||||
// Debug + LaunchProcessTimeout
|
||||
{true, false, 0, 60, []Param{debugParam, launchProcessTimeoutParam}},
|
||||
{name: "debug and launch process timeout", debug: true, launchProcessTimeout: 60, expectedParams: []Param{debugParam, launchProcessTimeoutParam}},
|
||||
|
||||
{name: "visible cdi devices", visibleCdiDevices: true, expectedParams: []Param{visibleCdiDevicesParam}},
|
||||
}
|
||||
|
||||
for i, d := range data {
|
||||
config := KataAgentConfig{
|
||||
Debug: d.debug,
|
||||
Trace: d.trace,
|
||||
ContainerPipeSize: d.containerPipeSize,
|
||||
LaunchProcessTimeout: d.launchProcessTimeout,
|
||||
}
|
||||
for _, d := range data {
|
||||
t.Run(d.name, func(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
count := len(d.expectedParams)
|
||||
config := KataAgentConfig{
|
||||
Debug: d.debug,
|
||||
Trace: d.trace,
|
||||
ContainerPipeSize: d.containerPipeSize,
|
||||
LaunchProcessTimeout: d.launchProcessTimeout,
|
||||
VisibleCdiDevices: d.visibleCdiDevices,
|
||||
}
|
||||
|
||||
params := KataAgentKernelParams(config)
|
||||
params := KataAgentKernelParams(config)
|
||||
|
||||
if count == 0 {
|
||||
assert.Emptyf(params, "test %d (%+v)", i, d)
|
||||
continue
|
||||
}
|
||||
if len(d.expectedParams) == 0 {
|
||||
assert.Empty(params)
|
||||
return
|
||||
}
|
||||
|
||||
assert.Len(params, count)
|
||||
assert.Len(params, len(d.expectedParams))
|
||||
|
||||
for _, p := range d.expectedParams {
|
||||
assert.Containsf(params, p, "test %d (%+v)", i, d)
|
||||
}
|
||||
for _, p := range d.expectedParams {
|
||||
assert.Contains(params, p)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -339,6 +339,12 @@ const (
|
||||
LaunchProcessTimeoutOption = "launch_process_timeout"
|
||||
LaunchProcessTimeoutKernelParam = "agent." + LaunchProcessTimeoutOption
|
||||
|
||||
// VisibleCdiDevicesOption is the agent option that, when enabled, lets the
|
||||
// agent translate a container's VISIBLE_CDI_DEVICES environment variable
|
||||
// into CDI GPU device requests inside the guest.
|
||||
VisibleCdiDevicesOption = "visible_cdi_devices"
|
||||
VisibleCdiDevicesKernelParam = "agent." + VisibleCdiDevicesOption
|
||||
|
||||
// Policy is an annotation containing the contents of an agent policy file, base64 encoded.
|
||||
Policy = kataAnnotAgentPrefix + "policy"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user