gpu: Add CDI timeout via kernel command line

Some systems, such as a DGX with 8 H100 or 8 H800 GPUs, need
extended time to initialize their devices. Make the CDI timeout
configurable so that even systems with 16 GPUs can be supported.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Zvonko Kaiser 2025-02-13 19:23:19 +00:00
parent 95c63f4982
commit 95aa21f018

View File

@ -23,6 +23,7 @@ const SERVER_ADDR_OPTION: &str = "agent.server_addr";
const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port"; const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port";
const HOTPLUG_TIMOUT_OPTION: &str = "agent.hotplug_timeout"; const HOTPLUG_TIMOUT_OPTION: &str = "agent.hotplug_timeout";
const CDH_API_TIMOUT_OPTION: &str = "agent.cdh_api_timeout"; const CDH_API_TIMOUT_OPTION: &str = "agent.cdh_api_timeout";
const CDI_TIMEOUT_OPTION: &str = "agent.cdi_timeout";
const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport"; const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport";
const LOG_VPORT_OPTION: &str = "agent.log_vport"; const LOG_VPORT_OPTION: &str = "agent.log_vport";
const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size"; const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
@ -70,6 +71,7 @@ const MEM_AGENT_COMPACT_FORCE_TIMES: &str = "agent.mem_agent_compact_force_times
const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info; const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info;
const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3); const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3);
const DEFAULT_CDH_API_TIMEOUT: time::Duration = time::Duration::from_secs(50); const DEFAULT_CDH_API_TIMEOUT: time::Duration = time::Duration::from_secs(50);
const DEFAULT_CDI_TIMEOUT: time::Duration = time::Duration::from_secs(100);
const DEFAULT_CONTAINER_PIPE_SIZE: i32 = 0; const DEFAULT_CONTAINER_PIPE_SIZE: i32 = 0;
const VSOCK_ADDR: &str = "vsock://-1"; const VSOCK_ADDR: &str = "vsock://-1";
@ -132,6 +134,7 @@ pub struct AgentConfig {
pub log_level: slog::Level, pub log_level: slog::Level,
pub hotplug_timeout: time::Duration, pub hotplug_timeout: time::Duration,
pub cdh_api_timeout: time::Duration, pub cdh_api_timeout: time::Duration,
pub cdi_timeout: time::Duration,
pub debug_console_vport: i32, pub debug_console_vport: i32,
pub log_vport: i32, pub log_vport: i32,
pub container_pipe_size: i32, pub container_pipe_size: i32,
@ -170,6 +173,7 @@ pub struct AgentConfigBuilder {
pub log_level: Option<String>, pub log_level: Option<String>,
pub hotplug_timeout: Option<time::Duration>, pub hotplug_timeout: Option<time::Duration>,
pub cdh_api_timeout: Option<time::Duration>, pub cdh_api_timeout: Option<time::Duration>,
pub cdi_timeout: Option<time::Duration>,
pub debug_console_vport: Option<i32>, pub debug_console_vport: Option<i32>,
pub log_vport: Option<i32>, pub log_vport: Option<i32>,
pub container_pipe_size: Option<i32>, pub container_pipe_size: Option<i32>,
@ -268,6 +272,7 @@ impl Default for AgentConfig {
log_level: DEFAULT_LOG_LEVEL, log_level: DEFAULT_LOG_LEVEL,
hotplug_timeout: DEFAULT_HOTPLUG_TIMEOUT, hotplug_timeout: DEFAULT_HOTPLUG_TIMEOUT,
cdh_api_timeout: DEFAULT_CDH_API_TIMEOUT, cdh_api_timeout: DEFAULT_CDH_API_TIMEOUT,
cdi_timeout: DEFAULT_CDI_TIMEOUT,
debug_console_vport: 0, debug_console_vport: 0,
log_vport: 0, log_vport: 0,
container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE, container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE,
@ -314,6 +319,7 @@ impl FromStr for AgentConfig {
); );
config_override!(agent_config_builder, agent_config, hotplug_timeout); config_override!(agent_config_builder, agent_config, hotplug_timeout);
config_override!(agent_config_builder, agent_config, cdh_api_timeout); config_override!(agent_config_builder, agent_config, cdh_api_timeout);
config_override!(agent_config_builder, agent_config, cdi_timeout);
config_override!(agent_config_builder, agent_config, debug_console_vport); config_override!(agent_config_builder, agent_config, debug_console_vport);
config_override!(agent_config_builder, agent_config, log_vport); config_override!(agent_config_builder, agent_config, log_vport);
config_override!(agent_config_builder, agent_config, container_pipe_size); config_override!(agent_config_builder, agent_config, container_pipe_size);
@ -489,6 +495,15 @@ impl AgentConfig {
|cdh_api_timeout: &time::Duration| cdh_api_timeout.as_secs() > 0 |cdh_api_timeout: &time::Duration| cdh_api_timeout.as_secs() > 0
); );
// ensure the timeout is a positive value
parse_cmdline_param!(
param,
CDI_TIMEOUT_OPTION,
config.cdi_timeout,
get_timeout,
|cdi_timeout: &time::Duration| cdi_timeout.as_secs() > 0
);
// vsock port should be positive values // vsock port should be positive values
parse_cmdline_param!( parse_cmdline_param!(
param, param,
@ -765,7 +780,7 @@ fn get_timeout(param: &str) -> Result<time::Duration> {
let fields: Vec<&str> = param.split('=').collect(); let fields: Vec<&str> = param.split('=').collect();
ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT); ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT);
ensure!( ensure!(
matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION), matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION | CDI_TIMEOUT_OPTION),
ERR_INVALID_TIMEOUT_KEY ERR_INVALID_TIMEOUT_KEY
); );
@ -1706,6 +1721,7 @@ Caused by:
)))] )))]
#[case("agent.chd_api_timeout=1", Err(anyhow!(ERR_INVALID_TIMEOUT_KEY)))] #[case("agent.chd_api_timeout=1", Err(anyhow!(ERR_INVALID_TIMEOUT_KEY)))]
#[case("agent.cdh_api_timeout=600", Ok(time::Duration::from_secs(600)))] #[case("agent.cdh_api_timeout=600", Ok(time::Duration::from_secs(600)))]
#[case("agent.cdi_timeout=320", Ok(time::Duration::from_secs(320)))]
fn test_timeout(#[case] param: &str, #[case] expected: Result<time::Duration>) { fn test_timeout(#[case] param: &str, #[case] expected: Result<time::Duration>) {
let result = get_timeout(param); let result = get_timeout(param);
let msg = format!("expected: {:?}, result: {:?}", expected, result); let msg = format!("expected: {:?}, result: {:?}", expected, result);