From 95aa21f018ecf0eacac52b16d9a10a9d6a3794a1 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 13 Feb 2025 19:23:19 +0000 Subject: [PATCH] gpu: Add CDI timeout via kernel config Some systems, such as a DGX with 8 H100 or 8 H800 GPUs, need extended time to initialize. Make the CDI timeout configurable via the agent.cdi_timeout kernel parameter so that even systems with 16 GPUs can be supported. Signed-off-by: Zvonko Kaiser --- src/agent/src/config.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/agent/src/config.rs b/src/agent/src/config.rs index 81302c39a..f12fbcdad 100644 --- a/src/agent/src/config.rs +++ b/src/agent/src/config.rs @@ -23,6 +23,7 @@ const SERVER_ADDR_OPTION: &str = "agent.server_addr"; const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port"; const HOTPLUG_TIMOUT_OPTION: &str = "agent.hotplug_timeout"; const CDH_API_TIMOUT_OPTION: &str = "agent.cdh_api_timeout"; +const CDI_TIMEOUT_OPTION: &str = "agent.cdi_timeout"; const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport"; const LOG_VPORT_OPTION: &str = "agent.log_vport"; const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size"; @@ -70,6 +71,7 @@ const MEM_AGENT_COMPACT_FORCE_TIMES: &str = "agent.mem_agent_compact_force_times const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info; const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3); const DEFAULT_CDH_API_TIMEOUT: time::Duration = time::Duration::from_secs(50); +const DEFAULT_CDI_TIMEOUT: time::Duration = time::Duration::from_secs(100); const DEFAULT_CONTAINER_PIPE_SIZE: i32 = 0; const VSOCK_ADDR: &str = "vsock://-1"; @@ -132,6 +134,7 @@ pub struct AgentConfig { pub log_level: slog::Level, pub hotplug_timeout: time::Duration, pub cdh_api_timeout: time::Duration, + pub cdi_timeout: time::Duration, pub debug_console_vport: i32, pub log_vport: i32, pub container_pipe_size: i32, @@ -170,6 +173,7 @@ pub struct AgentConfigBuilder { pub log_level: Option, pub hotplug_timeout: 
Option, pub cdh_api_timeout: Option, + pub cdi_timeout: Option, pub debug_console_vport: Option, pub log_vport: Option, pub container_pipe_size: Option, @@ -268,6 +272,7 @@ impl Default for AgentConfig { log_level: DEFAULT_LOG_LEVEL, hotplug_timeout: DEFAULT_HOTPLUG_TIMEOUT, cdh_api_timeout: DEFAULT_CDH_API_TIMEOUT, + cdi_timeout: DEFAULT_CDI_TIMEOUT, debug_console_vport: 0, log_vport: 0, container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE, @@ -314,6 +319,7 @@ impl FromStr for AgentConfig { ); config_override!(agent_config_builder, agent_config, hotplug_timeout); config_override!(agent_config_builder, agent_config, cdh_api_timeout); + config_override!(agent_config_builder, agent_config, cdi_timeout); config_override!(agent_config_builder, agent_config, debug_console_vport); config_override!(agent_config_builder, agent_config, log_vport); config_override!(agent_config_builder, agent_config, container_pipe_size); @@ -489,6 +495,15 @@ impl AgentConfig { |cdh_api_timeout: &time::Duration| cdh_api_timeout.as_secs() > 0 ); + // ensure the timeout is a positive value + parse_cmdline_param!( + param, + CDI_TIMEOUT_OPTION, + config.cdi_timeout, + get_timeout, + |cdi_timeout: &time::Duration| cdi_timeout.as_secs() > 0 + ); + // vsock port should be positive values parse_cmdline_param!( param, @@ -765,7 +780,7 @@ fn get_timeout(param: &str) -> Result { let fields: Vec<&str> = param.split('=').collect(); ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT); ensure!( - matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION), + matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION | CDI_TIMEOUT_OPTION), ERR_INVALID_TIMEOUT_KEY ); @@ -1706,6 +1721,7 @@ Caused by: )))] #[case("agent.chd_api_timeout=1", Err(anyhow!(ERR_INVALID_TIMEOUT_KEY)))] #[case("agent.cdh_api_timeout=600", Ok(time::Duration::from_secs(600)))] + #[case("agent.cdi_timeout=320", Ok(time::Duration::from_secs(320)))] fn test_timeout(#[case] param: &str, #[case] expected: Result) { let result = 
get_timeout(param); let msg = format!("expected: {:?}, result: {:?}", expected, result);