gpu: Update handle_cdi_devices

AgentConfig now carries the cdi_timeout read from the kernel
cmdline. Update the handle_cdi_devices signature to take it as a
Duration (replacing the hard-coded CDI_TIMEOUT_LIMIT) and use it to
bound the retry loop.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Zvonko Kaiser 2025-02-13 19:38:19 +00:00
parent 95aa21f018
commit 2499d013bd
3 changed files with 12 additions and 9 deletions

View File

@ -780,7 +780,10 @@ fn get_timeout(param: &str) -> Result<time::Duration> {
let fields: Vec<&str> = param.split('=').collect(); let fields: Vec<&str> = param.split('=').collect();
ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT); ensure!(fields.len() == 2, ERR_INVALID_TIMEOUT);
ensure!( ensure!(
matches!(fields[0], HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION | CDI_TIMEOUT_OPTION), matches!(
fields[0],
HOTPLUG_TIMOUT_OPTION | CDH_API_TIMOUT_OPTION | CDI_TIMEOUT_OPTION
),
ERR_INVALID_TIMEOUT_KEY ERR_INVALID_TIMEOUT_KEY
); );

View File

@ -248,7 +248,7 @@ pub async fn handle_cdi_devices(
logger: &Logger, logger: &Logger,
spec: &mut Spec, spec: &mut Spec,
spec_dir: &str, spec_dir: &str,
cdi_timeout: u64, cdi_timeout: time::Duration,
) -> Result<()> { ) -> Result<()> {
if let Some(container_type) = spec if let Some(container_type) = spec
.annotations() .annotations()
@ -271,7 +271,7 @@ pub async fn handle_cdi_devices(
let options: Vec<CdiOption> = vec![with_auto_refresh(false), with_spec_dirs(&[spec_dir])]; let options: Vec<CdiOption> = vec![with_auto_refresh(false), with_spec_dirs(&[spec_dir])];
let cache: Arc<std::sync::Mutex<cdi::cache::Cache>> = new_cache(options); let cache: Arc<std::sync::Mutex<cdi::cache::Cache>> = new_cache(options);
for _ in 0..=cdi_timeout { for _ in 0..=cdi_timeout.as_secs() {
let inject_result = { let inject_result = {
// Lock cache within this scope, std::sync::Mutex has no Send // Lock cache within this scope, std::sync::Mutex has no Send
// and await will not work with time::sleep // and await will not work with time::sleep
@ -298,11 +298,11 @@ pub async fn handle_cdi_devices(
println!("error injecting devices: {:?}", e); println!("error injecting devices: {:?}", e);
} }
} }
time::sleep(Duration::from_millis(1000)).await; time::sleep(Duration::from_secs(1)).await;
} }
Err(anyhow!( Err(anyhow!(
"failed to inject devices after CDI timeout of {} seconds", "failed to inject devices after CDI timeout of {} seconds",
cdi_timeout cdi_timeout.as_secs()
)) ))
} }
@ -1243,8 +1243,10 @@ mod tests {
fs::write(&cdi_file, cdi_content).expect("Failed to write CDI file"); fs::write(&cdi_file, cdi_content).expect("Failed to write CDI file");
let cdi_timeout = Duration::from_secs(0);
let res = let res =
handle_cdi_devices(&logger, &mut spec, temp_dir.path().to_str().unwrap(), 0).await; handle_cdi_devices(&logger, &mut spec, temp_dir.path().to_str().unwrap(), cdi_timeout).await;
println!("modfied spec {:?}", spec); println!("modfied spec {:?}", spec);
assert!(res.is_ok(), "{}", res.err().unwrap()); assert!(res.is_ok(), "{}", res.err().unwrap());

View File

@ -130,8 +130,6 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns";
// not available. // not available.
const IPTABLES_RESTORE_WAIT_SEC: u64 = 5; const IPTABLES_RESTORE_WAIT_SEC: u64 = 5;
const CDI_TIMEOUT_LIMIT: u64 = 100;
// Convenience function to obtain the scope logger. // Convenience function to obtain the scope logger.
fn sl() -> slog::Logger { fn sl() -> slog::Logger {
slog_scope::logger() slog_scope::logger()
@ -234,7 +232,7 @@ impl AgentService {
// or other entities for a specifc device. // or other entities for a specifc device.
// In Kata we only consider the directory "/var/run/cdi", "/etc" may be // In Kata we only consider the directory "/var/run/cdi", "/etc" may be
// readonly // readonly
handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", CDI_TIMEOUT_LIMIT).await?; handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?;
cdh_handler(&mut oci).await?; cdh_handler(&mut oci).await?;