diff --git a/src/agent/src/config.rs b/src/agent/src/config.rs index 59c3706566..c124d6015d 100644 --- a/src/agent/src/config.rs +++ b/src/agent/src/config.rs @@ -23,6 +23,7 @@ const SERVER_ADDR_OPTION: &str = "agent.server_addr"; const PASSFD_LISTENER_PORT: &str = "agent.passfd_listener_port"; const HOTPLUG_TIMOUT_OPTION: &str = "agent.hotplug_timeout"; const CDH_API_TIMOUT_OPTION: &str = "agent.cdh_api_timeout"; +const CDI_TIMEOUT_OPTION: &str = "agent.cdi_timeout"; const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport"; const LOG_VPORT_OPTION: &str = "agent.log_vport"; const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size"; @@ -70,6 +71,7 @@ const MEM_AGENT_COMPACT_FORCE_TIMES: &str = "agent.mem_agent_compact_force_times const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info; const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3); const DEFAULT_CDH_API_TIMEOUT: time::Duration = time::Duration::from_secs(50); +const DEFAULT_CDI_TIMEOUT: time::Duration = time::Duration::from_secs(100); const DEFAULT_CONTAINER_PIPE_SIZE: i32 = 0; const VSOCK_ADDR: &str = "vsock://-1"; @@ -132,6 +134,7 @@ pub struct AgentConfig { pub log_level: slog::Level, pub hotplug_timeout: time::Duration, pub cdh_api_timeout: time::Duration, + pub cdi_timeout: time::Duration, pub debug_console_vport: i32, pub log_vport: i32, pub container_pipe_size: i32, @@ -169,6 +172,7 @@ pub struct AgentConfigBuilder { pub log_level: Option, pub hotplug_timeout: Option, pub cdh_api_timeout: Option, + pub cdi_timeout: Option, pub debug_console_vport: Option, pub log_vport: Option, pub container_pipe_size: Option, @@ -267,6 +271,7 @@ impl Default for AgentConfig { log_level: DEFAULT_LOG_LEVEL, hotplug_timeout: DEFAULT_HOTPLUG_TIMEOUT, cdh_api_timeout: DEFAULT_CDH_API_TIMEOUT, + cdi_timeout: DEFAULT_CDI_TIMEOUT, debug_console_vport: 0, log_vport: 0, container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE, @@ -312,6 +317,7 @@ impl FromStr for AgentConfig { ); config_override!(agent_config_builder, agent_config, hotplug_timeout); config_override!(agent_config_builder, agent_config, cdh_api_timeout); + config_override!(agent_config_builder, agent_config, cdi_timeout); config_override!(agent_config_builder, agent_config, debug_console_vport); config_override!(agent_config_builder, agent_config, log_vport); config_override!(agent_config_builder, agent_config, container_pipe_size); @@ -487,6 +493,15 @@ impl AgentConfig { |cdh_api_timeout: &time::Duration| cdh_api_timeout.as_secs() > 0 ); + // ensure the timeout is a positive value + parse_cmdline_param!( + param, + CDI_TIMEOUT_OPTION, + config.cdi_timeout, + get_timeout, + |cdi_timeout: &time::Duration| cdi_timeout.as_secs() > 0 + ); + // vsock port should be positive values parse_cmdline_param!( param, @@ -763,7 +778,10 @@ fn get_timeout(param: &str) -> Result { let fields: Vec<&str> = param.split('=').collect(); ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT); ensure!( - matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION), + matches!( + fields[0], + HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION | CDI_TIMEOUT_OPTION + ), ERR_INVALID_TIMEOUT_KEY ); @@ -1704,6 +1722,7 @@ Caused by: )))] #[case("agent.chd_api_timeout=1", Err(anyhow!(ERR_INVALID_TIMEOUT_KEY)))] #[case("agent.cdh_api_timeout=600", Ok(time::Duration::from_secs(600)))] + #[case("agent.cdi_timeout=320", Ok(time::Duration::from_secs(320)))] fn test_timeout(#[case] param: &str, #[case] expected: Result) { let result = get_timeout(param); let msg = format!("expected: {:?}, result: {:?}", expected, result); diff --git a/src/agent/src/device/mod.rs b/src/agent/src/device/mod.rs index 400b6f1386..7bfa802f83 100644 --- a/src/agent/src/device/mod.rs +++ b/src/agent/src/device/mod.rs @@ -248,7 +248,7 @@ pub async fn handle_cdi_devices( logger: &Logger, spec: &mut Spec, spec_dir: &str, - cdi_timeout: u64, + cdi_timeout: time::Duration, ) -> Result<()> { if let Some(container_type) = spec .annotations() @@ -271,7 +271,7 @@ pub async fn handle_cdi_devices( let options: Vec = vec![with_auto_refresh(false), with_spec_dirs(&[spec_dir])]; let cache: Arc> = new_cache(options); - for _ in 0..=cdi_timeout { + for i in 0..=cdi_timeout.as_secs() { let inject_result = { // Lock cache within this scope, std::sync::Mutex has no Send // and await will not work with time::sleep @@ -294,15 +294,20 @@ pub async fn handle_cdi_devices( return Ok(()); } Err(e) => { - info!(logger, "error injecting devices: {:?}", e); - println!("error injecting devices: {:?}", e); + info!( + logger, + "waiting for CDI spec(s) to be generated ({} of {} max tries) {:?}", + i, + cdi_timeout.as_secs(), + e + ); } } - time::sleep(Duration::from_millis(1000)).await; + time::sleep(Duration::from_secs(1)).await; } Err(anyhow!( "failed to inject devices after CDI timeout of {} seconds", - cdi_timeout + cdi_timeout.as_secs() )) } @@ -1243,8 +1248,15 @@ mod tests { fs::write(&cdi_file, cdi_content).expect("Failed to write CDI file"); - let res = - handle_cdi_devices(&logger, &mut spec, temp_dir.path().to_str().unwrap(), 0).await; + let cdi_timeout = Duration::from_secs(0); + + let res = handle_cdi_devices( + &logger, + &mut spec, + temp_dir.path().to_str().unwrap(), + cdi_timeout, + ) + .await; println!("modfied spec {:?}", spec); assert!(res.is_ok(), "{}", res.err().unwrap()); diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index cbf4b5eefd..d032c08183 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -130,8 +130,6 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns"; // not available. const IPTABLES_RESTORE_WAIT_SEC: u64 = 5; -const CDI_TIMEOUT_LIMIT: u64 = 100; - // Convenience function to obtain the scope logger. fn sl() -> slog::Logger { slog_scope::logger() @@ -234,7 +232,7 @@ impl AgentService { // or other entities for a specifc device. // In Kata we only consider the directory "/var/run/cdi", "/etc" may be // readonly - handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", CDI_TIMEOUT_LIMIT).await?; + handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?; cdh_handler(&mut oci).await?;