agent: Make launch_process_timeout configurable

The hardcoded DEFAULT_LAUNCH_PROCESS_TIMEOUT of 6 seconds in the kata
agent is insufficient for environments with NVIDIA GPUs and NVSwitches,
where the attestation-agent needs significantly more time to collect
evidence during initialization (e.g. ~2 seconds per NVSwitch).

When the timeout expires, the agent (PID 1) exits with an error, causing
the guest kernel to perform an orderly shutdown before the
attestation-agent has finished starting.

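Make this timeout configurable via the kernel parameter
agent.launch_process_timeout (in seconds), preserving the 6-second
default for backward compatibility. The Go runtime is wired up to pass
this value from the TOML config's [agent.kata] section through to the
kernel command line, and the Rust TOML configuration code (TomlConfig)
gains the same launch_process_timeout agent option. When the option is
unset or 0, no kernel parameter is emitted and the agent keeps its
built-in default.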

The NVIDIA GPU configs set the new default to 15 seconds.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Made-with: Cursor
This commit is contained in:
Fabiano Fidêncio
2026-04-09 18:52:33 +02:00
parent fd6375d8d5
commit 36a2d8e7f2
21 changed files with 170 additions and 48 deletions

View File

@@ -25,6 +25,7 @@ const HOTPLUG_TIMOUT_OPTION: &str = "agent.hotplug_timeout";
const CDH_API_TIMOUT_OPTION: &str = "agent.cdh_api_timeout";
const CDH_IMAGE_PULL_TIMEOUT_OPTION: &str = "agent.image_pull_timeout";
const CDI_TIMEOUT_OPTION: &str = "agent.cdi_timeout";
const LAUNCH_PROCESS_TIMEOUT_OPTION: &str = "agent.launch_process_timeout";
const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport";
const LOG_VPORT_OPTION: &str = "agent.log_vport";
const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
@@ -66,6 +67,7 @@ const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3);
const DEFAULT_CDH_API_TIMEOUT: time::Duration = time::Duration::from_secs(50);
const DEFAULT_IMAGE_PULL_TIMEOUT: time::Duration = time::Duration::from_secs(1200);
const DEFAULT_CDI_TIMEOUT: time::Duration = time::Duration::from_secs(100);
// Default number of seconds the agent waits for a launched guest component
// (attestation-agent, confidential-data-hub) to create its Unix socket; it can
// be overridden with the agent.launch_process_timeout kernel parameter.
const DEFAULT_LAUNCH_PROCESS_TIMEOUT: time::Duration = time::Duration::from_secs(6);
const DEFAULT_CONTAINER_PIPE_SIZE: i32 = 0;
const VSOCK_ADDR: &str = "vsock://-1";
@@ -130,6 +132,7 @@ pub struct AgentConfig {
pub cdh_api_timeout: time::Duration,
pub image_pull_timeout: time::Duration,
pub cdi_timeout: time::Duration,
pub launch_process_timeout: time::Duration,
pub debug_console_vport: i32,
pub log_vport: i32,
pub container_pipe_size: i32,
@@ -163,6 +166,7 @@ pub struct AgentConfigBuilder {
pub cdh_api_timeout: Option<time::Duration>,
pub image_pull_timeout: Option<time::Duration>,
pub cdi_timeout: Option<time::Duration>,
pub launch_process_timeout: Option<time::Duration>,
pub debug_console_vport: Option<i32>,
pub log_vport: Option<i32>,
pub container_pipe_size: Option<i32>,
@@ -257,6 +261,7 @@ impl Default for AgentConfig {
cdh_api_timeout: DEFAULT_CDH_API_TIMEOUT,
image_pull_timeout: DEFAULT_IMAGE_PULL_TIMEOUT,
cdi_timeout: DEFAULT_CDI_TIMEOUT,
launch_process_timeout: DEFAULT_LAUNCH_PROCESS_TIMEOUT,
debug_console_vport: 0,
log_vport: 0,
container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE,
@@ -298,6 +303,7 @@ impl FromStr for AgentConfig {
config_override!(agent_config_builder, agent_config, cdh_api_timeout);
config_override!(agent_config_builder, agent_config, image_pull_timeout);
config_override!(agent_config_builder, agent_config, cdi_timeout);
config_override!(agent_config_builder, agent_config, launch_process_timeout);
config_override!(agent_config_builder, agent_config, debug_console_vport);
config_override!(agent_config_builder, agent_config, log_vport);
config_override!(agent_config_builder, agent_config, container_pipe_size);
@@ -481,6 +487,14 @@ impl AgentConfig {
|cdi_timeout: &time::Duration| cdi_timeout.as_secs() > 0
);
parse_cmdline_param!(
param,
LAUNCH_PROCESS_TIMEOUT_OPTION,
config.launch_process_timeout,
get_timeout,
|launch_process_timeout: &time::Duration| launch_process_timeout.as_secs() > 0
);
// vsock port should be a positive value
parse_cmdline_param!(
param,
@@ -742,6 +756,7 @@ fn get_timeout(param: &str) -> Result<time::Duration> {
| CDH_API_TIMOUT_OPTION
| CDH_IMAGE_PULL_TIMEOUT_OPTION
| CDI_TIMEOUT_OPTION
| LAUNCH_PROCESS_TIMEOUT_OPTION
),
ERR_INVALID_TIMEOUT_KEY
);
@@ -1630,6 +1645,7 @@ Caused by:
#[case("agent.cdh_api_timeout=600", Ok(time::Duration::from_secs(600)))]
#[case("agent.image_pull_timeout=1200", Ok(time::Duration::from_secs(1200)))]
#[case("agent.cdi_timeout=320", Ok(time::Duration::from_secs(320)))]
#[case("agent.launch_process_timeout=60", Ok(time::Duration::from_secs(60)))]
fn test_timeout(#[case] param: &str, #[case] expected: Result<time::Duration>) {
let result = get_timeout(param);
let msg = format!("expected: {expected:?}, result: {result:?}");

View File

@@ -111,8 +111,6 @@ const API_SERVER_PATH: &str = "/usr/local/bin/api-server-rest";
/// TODO: remove this when we move the launch of CDH out of the kata-agent.
const OCICRYPT_CONFIG_PATH: &str = "/etc/ocicrypt_config.json";
const DEFAULT_LAUNCH_PROCESS_TIMEOUT: i32 = 6;
lazy_static! {
static ref AGENT_CONFIG: AgentConfig =
// Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
@@ -505,7 +503,7 @@ async fn launch_guest_component_procs(
aa_args,
Some(AA_CONFIG_PATH),
AA_ATTESTATION_SOCKET,
DEFAULT_LAUNCH_PROCESS_TIMEOUT,
config.launch_process_timeout.as_secs(),
&[],
)
.await
@@ -527,7 +525,7 @@ async fn launch_guest_component_procs(
vec![],
Some(CDH_CONFIG_PATH),
CDH_SOCKET,
DEFAULT_LAUNCH_PROCESS_TIMEOUT,
config.launch_process_timeout.as_secs(),
&[("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH)],
)
.await
@@ -587,7 +585,7 @@ async fn init_attestation_components(
Ok(())
}
async fn wait_for_path_to_exist(logger: &Logger, path: &str, timeout_secs: i32) -> Result<()> {
async fn wait_for_path_to_exist(logger: &Logger, path: &str, timeout_secs: u64) -> Result<()> {
let p = Path::new(path);
let mut attempts = 0;
loop {
@@ -614,7 +612,7 @@ async fn launch_process(
mut args: Vec<&str>,
config: Option<&str>,
unix_socket_path: &str,
timeout_secs: i32,
timeout_secs: u64,
envs: &[(&str, &str)],
) -> Result<()> {
if !Path::new(path).exists() {

View File

@@ -146,6 +146,11 @@ pub struct Agent {
#[serde(default)]
pub container_pipe_size: u32,
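/// Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
/// to create their Unix sockets after being spawned by the agent.
/// A value of 0 (the default) means the option is not passed to the agent,
/// which then uses its own built-in default.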
#[serde(default)]
pub launch_process_timeout: u32,
/// Memory agent configuration
#[serde(default)]
pub mem_agent: MemAgent,
@@ -180,6 +185,7 @@ impl std::default::Default for Agent {
health_check_request_timeout_ms: 90_000,
kernel_modules: Default::default(),
container_pipe_size: 0,
launch_process_timeout: 0,
mem_agent: MemAgent::default(),
policy: Default::default(),
}

View File

@@ -54,6 +54,8 @@ pub const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport";
pub const LOG_VPORT_OPTION: &str = "agent.log_vport";
/// Option of setting the container's pipe size
pub const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
/// Option of setting the guest component launch process timeout
pub const LAUNCH_PROCESS_TIMEOUT_OPTION: &str = "agent.launch_process_timeout";
/// Option of setting the fd passthrough io listener port
pub const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port";
@@ -219,6 +221,13 @@ impl TomlConfig {
let container_pipe_size = cfg.container_pipe_size.to_string();
kv.insert(CONTAINER_PIPE_SIZE_OPTION.to_string(), container_pipe_size);
}
if cfg.launch_process_timeout > 0 {
let launch_process_timeout = cfg.launch_process_timeout.to_string();
kv.insert(
LAUNCH_PROCESS_TIMEOUT_OPTION.to_string(),
launch_process_timeout,
);
}
if cfg.debug_console_enabled {
kv.insert(DEBUG_CONSOLE_FLAG.to_string(), "".to_string());
kv.insert(
@@ -479,6 +488,7 @@ mod tests {
enable_tracing: true,
container_pipe_size: 20,
debug_console_enabled: true,
launch_process_timeout: 60,
..Default::default()
};
let agent_name = "test_agent";
@@ -491,5 +501,6 @@ mod tests {
assert_eq!(kv.get("agent.container_pipe_size").unwrap(), "20");
kv.get("agent.debug_console").unwrap();
assert_eq!(kv.get("agent.debug_console_vport").unwrap(), "1026"); // 1026 is the default port
assert_eq!(kv.get("agent.launch_process_timeout").unwrap(), "60");
}
}

View File

@@ -541,6 +541,11 @@ dial_timeout_ms = 10
# (default: 3000)
reconnect_timeout_ms = 3000
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,

View File

@@ -523,6 +523,11 @@ dial_timeout_ms = 90
# (default: 3000)
reconnect_timeout_ms = 5000
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,

View File

@@ -565,6 +565,11 @@ dial_timeout_ms = 10
# (default: 3000)
reconnect_timeout_ms = 3000
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,

View File

@@ -541,6 +541,11 @@ dial_timeout_ms = 10
# (default: 3000)
reconnect_timeout_ms = 3000
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,

View File

@@ -491,6 +491,7 @@ ifneq (,$(QEMUCMD))
DEFAULTVCPUS_NV = 1
DEFAULTMEMORY_NV = 8192
DEFAULTTIMEOUT_NV = 1200
DEFAULTLAUNCHPROCESSTIMEOUT_NV = 15
DEFAULTVFIOPORT_NV = root-port
DEFAULTPCIEROOTPORT_NV = 8
@@ -678,6 +679,7 @@ USER_VARS += KERNELPARAMS_CONFIDENTIAL_NV
USER_VARS += KERNELVERITYPARAMS_NV
USER_VARS += KERNELVERITYPARAMS_CONFIDENTIAL_NV
USER_VARS += DEFAULTTIMEOUT_NV
USER_VARS += DEFAULTLAUNCHPROCESSTIMEOUT_NV
USER_VARS += DEFSANDBOXCGROUPONLY_NV
USER_VARS += DEFROOTFSTYPE
USER_VARS += MACHINETYPE

View File

@@ -537,6 +537,11 @@ debug_console_enabled = false
# (default: 90)
dial_timeout = 90
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -595,6 +595,11 @@ dial_timeout = 45
# (default: 50)
cdh_api_timeout = 50
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -611,6 +611,13 @@ debug_console_enabled = false
# (default: 90)
dial_timeout = @DEFAULTTIMEOUT_NV@
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# With NVIDIA GPUs and NVSwitches, the attestation-agent needs extra time
# to collect evidence during initialization.
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -588,6 +588,13 @@ debug_console_enabled = false
# (default: 90)
dial_timeout = @DEFAULTTIMEOUT_NV@
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# With NVIDIA GPUs and NVSwitches, the attestation-agent needs extra time
# to collect evidence during initialization.
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -590,6 +590,13 @@ debug_console_enabled = false
# (default: 90)
dial_timeout = @DEFAULTTIMEOUT_NV@
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# With NVIDIA GPUs and NVSwitches, the attestation-agent needs extra time
# to collect evidence during initialization.
# (agent default when unset: 6)
launch_process_timeout = @DEFAULTLAUNCHPROCESSTIMEOUT_NV@
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -573,6 +573,11 @@ debug_console_enabled = false
# (default: 30)
dial_timeout = 90
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -603,6 +603,11 @@ debug_console_enabled = false
# (default: 90)
dial_timeout = 90
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -580,6 +580,11 @@ debug_console_enabled = false
# (default: 60)
dial_timeout = 60
# Timeout in seconds for guest components (attestation-agent, confidential-data-hub)
# to create their Unix sockets after being spawned by the agent.
# (agent default when unset: 6)
launch_process_timeout = 6
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log

View File

@@ -224,12 +224,13 @@ func (r runtime) emptyDirMode() (string, error) {
}
// agent holds the options decoded from an agent section of the runtime's
// TOML configuration file; each field maps to the key named in its toml tag.
type agent struct {
KernelModules []string `toml:"kernel_modules"`
Debug bool `toml:"enable_debug"`
Tracing bool `toml:"enable_tracing"`
DebugConsoleEnabled bool `toml:"debug_console_enabled"`
DialTimeout uint32 `toml:"dial_timeout"`
CdhApiTimeout uint32 `toml:"cdh_api_timeout"`
KernelModules []string `toml:"kernel_modules"`
Debug bool `toml:"enable_debug"`
Tracing bool `toml:"enable_tracing"`
DebugConsoleEnabled bool `toml:"debug_console_enabled"`
DialTimeout uint32 `toml:"dial_timeout"`
CdhApiTimeout uint32 `toml:"cdh_api_timeout"`
LaunchProcessTimeout uint32 `toml:"launch_process_timeout"`
}
func (orig *tomlConfig) Clone() tomlConfig {
@@ -798,6 +799,10 @@ func (a agent) cdhApiTimout() uint32 {
return a.CdhApiTimeout
}
// launchProcessTimeout returns the launch_process_timeout value (in seconds)
// read from the agent section of the TOML configuration. It is 0 when the
// option is not set.
func (a agent) launchProcessTimeout() uint32 {
return a.LaunchProcessTimeout
}
// debug reports whether enable_debug is set in the agent section of the TOML
// configuration.
func (a agent) debug() bool {
return a.Debug
}
@@ -1464,13 +1469,14 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi
func updateRuntimeConfigAgent(configPath string, tomlConf tomlConfig, config *oci.RuntimeConfig) error {
for _, agent := range tomlConf.Agent {
config.AgentConfig = vc.KataAgentConfig{
LongLiveConn: true,
Debug: agent.debug(),
Trace: agent.trace(),
KernelModules: agent.kernelModules(),
EnableDebugConsole: agent.debugConsoleEnabled(),
DialTimeout: agent.dialTimout(),
CdhApiTimeout: agent.cdhApiTimout(),
LongLiveConn: true,
Debug: agent.debug(),
Trace: agent.trace(),
KernelModules: agent.kernelModules(),
EnableDebugConsole: agent.debugConsoleEnabled(),
DialTimeout: agent.dialTimout(),
CdhApiTimeout: agent.cdhApiTimout(),
LaunchProcessTimeout: agent.launchProcessTimeout(),
}
}

View File

@@ -292,15 +292,16 @@ func ephemeralPath() string {
// KataAgentConfig is a structure storing information needed
// to reach the Kata Containers agent.
//
// LaunchProcessTimeout is expressed in seconds; when it is greater than 0,
// KataAgentKernelParams forwards it to the agent as the
// agent.launch_process_timeout kernel parameter.
type KataAgentConfig struct {
KernelModules []string
ContainerPipeSize uint32
DialTimeout uint32
CdhApiTimeout uint32
LongLiveConn bool
Debug bool
Trace bool
EnableDebugConsole bool
Policy string
KernelModules []string
ContainerPipeSize uint32
DialTimeout uint32
CdhApiTimeout uint32
LaunchProcessTimeout uint32
LongLiveConn bool
Debug bool
Trace bool
EnableDebugConsole bool
Policy string
}
// KataAgentState is the structure describing the data stored from this
@@ -366,6 +367,11 @@ func KataAgentKernelParams(config KataAgentConfig) []Param {
params = append(params, Param{Key: vcAnnotations.CdhApiTimeoutKernelParam, Value: cdhApiTimeout})
}
if config.LaunchProcessTimeout > 0 {
launchProcessTimeout := strconv.FormatUint(uint64(config.LaunchProcessTimeout), 10)
params = append(params, Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: launchProcessTimeout})
}
return params
}

View File

@@ -1083,47 +1083,56 @@ func TestKataAgentKernelParams(t *testing.T) {
// nolint: govet
type testData struct {
debug bool
trace bool
containerPipeSize uint32
expectedParams []Param
debug bool
trace bool
containerPipeSize uint32
launchProcessTimeout uint32
expectedParams []Param
}
debugParam := Param{Key: "agent.log", Value: "debug"}
traceParam := Param{Key: "agent.trace", Value: "true"}
containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"}
launchProcessTimeoutParam := Param{Key: vcAnnotations.LaunchProcessTimeoutKernelParam, Value: "60"}
data := []testData{
{false, false, 0, []Param{}},
{false, false, 0, 0, []Param{}},
// Debug
{true, false, 0, []Param{debugParam}},
{true, false, 0, 0, []Param{debugParam}},
// Tracing
{false, true, 0, []Param{traceParam}},
{false, true, 0, 0, []Param{traceParam}},
// Debug + Tracing
{true, true, 0, []Param{debugParam, traceParam}},
{true, true, 0, 0, []Param{debugParam, traceParam}},
// pipesize
{false, false, 2097152, []Param{containerPipeSizeParam}},
{false, false, 2097152, 0, []Param{containerPipeSizeParam}},
// Debug + pipesize
{true, false, 2097152, []Param{debugParam, containerPipeSizeParam}},
{true, false, 2097152, 0, []Param{debugParam, containerPipeSizeParam}},
// Tracing + pipesize
{false, true, 2097152, []Param{traceParam, containerPipeSizeParam}},
{false, true, 2097152, 0, []Param{traceParam, containerPipeSizeParam}},
// Debug + Tracing + pipesize
{true, true, 2097152, []Param{debugParam, traceParam, containerPipeSizeParam}},
{true, true, 2097152, 0, []Param{debugParam, traceParam, containerPipeSizeParam}},
// LaunchProcessTimeout
{false, false, 0, 60, []Param{launchProcessTimeoutParam}},
// Debug + LaunchProcessTimeout
{true, false, 0, 60, []Param{debugParam, launchProcessTimeoutParam}},
}
for i, d := range data {
config := KataAgentConfig{
Debug: d.debug,
Trace: d.trace,
ContainerPipeSize: d.containerPipeSize,
Debug: d.debug,
Trace: d.trace,
ContainerPipeSize: d.containerPipeSize,
LaunchProcessTimeout: d.launchProcessTimeout,
}
count := len(d.expectedParams)

View File

@@ -334,11 +334,13 @@ const (
AgentTrace = kataAnnotAgentPrefix + "enable_tracing"
// AgentContainerPipeSize is an annotation to specify the size of the pipes created for containers
AgentContainerPipeSize = kataAnnotAgentPrefix + ContainerPipeSizeOption
ContainerPipeSizeOption = "container_pipe_size"
ContainerPipeSizeKernelParam = "agent." + ContainerPipeSizeOption
CdhApiTimeoutOption = "cdh_api_timeout"
CdhApiTimeoutKernelParam = "agent." + CdhApiTimeoutOption
AgentContainerPipeSize = kataAnnotAgentPrefix + ContainerPipeSizeOption
ContainerPipeSizeOption = "container_pipe_size"
ContainerPipeSizeKernelParam = "agent." + ContainerPipeSizeOption
CdhApiTimeoutOption = "cdh_api_timeout"
CdhApiTimeoutKernelParam = "agent." + CdhApiTimeoutOption
LaunchProcessTimeoutOption = "launch_process_timeout"
LaunchProcessTimeoutKernelParam = "agent." + LaunchProcessTimeoutOption
// Policy is an annotation containing the contents of an agent policy file, base64 encoded.
Policy = kataAnnotAgentPrefix + "policy"