From feeb5d8ecc776d053d9bae3cfde0bfee6ff4a65d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Fri, 22 May 2026 14:48:42 +0200
Subject: [PATCH] runtime-rs: Fix vCPU pinning race with backoff retry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

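QEMU can report fewer vCPU threads during early startup, causing partial
affinity setup. Retry fetching the thread IDs with exponential backoff
(starting at 50 ms, up to 10 attempts in total) until the expected thread
count is visible. If the retry budget is exhausted, log a warning and
continue with best-effort pinning of the threads that are available.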

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
---
 .../resource/src/cgroups/resource_inner.rs    | 48 ++++++++++++++++---
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
index 7fbcdb2e06..d72dc73efe 100644
--- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
+++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
@@ -7,6 +7,7 @@
 use std::collections::{HashMap, HashSet};
 use std::process;
 use std::str::FromStr;
+use std::time::Duration;
 
 use anyhow::{anyhow, Context, Result};
 use cgroups_rs::manager::is_systemd_cgroup;
@@ -16,6 +17,7 @@ use kata_types::cpu::CpuSet;
 use nix::sched::{sched_setaffinity, CpuSet as NixCpuSet};
 use nix::unistd::Pid;
 use oci_spec::runtime::{LinuxCpu, LinuxCpuBuilder, LinuxResources, LinuxResourcesBuilder};
+use tokio::time::sleep;
 
 use crate::cgroups::utils::get_tgid_from_pid;
 use crate::cgroups::CgroupConfig;
@@ -187,12 +189,46 @@ impl CgroupsResourceInner {
         let needs_thread_ids = self.overhead_cgroup.is_some() || self.enable_vcpus_pinning;
 
         let thread_ids = if needs_thread_ids {
-            Some(
-                hypervisor
-                    .get_thread_ids()
-                    .await
-                    .context("get vCPU thread IDs")?,
-            )
+            let mut tids = hypervisor
+                .get_thread_ids()
+                .await
+                .context("get vCPU thread IDs")?;
+
+            // QEMU may not have spawned all vCPU threads yet. Retry with
+            // exponential backoff until we see the expected count.
+            let expected = hypervisor.hypervisor_config().await.cpu_info.default_vcpus.ceil() as usize;
+            if expected > 0 && tids.vcpus.len() < expected {
+                const MAX_ATTEMPTS: u32 = 10;
+                let mut backoff = Duration::from_millis(50);
+                for attempt in 2..=MAX_ATTEMPTS {
+                    if tids.vcpus.len() >= expected {
+                        break;
+                    }
+                    info!(
+                        sl!(),
+                        "waiting for all vCPU threads: have {}, want {}, attempt {}",
+                        tids.vcpus.len(),
+                        expected,
+                        attempt
+                    );
+                    sleep(backoff).await;
+                    backoff *= 2;
+                    tids = hypervisor
+                        .get_thread_ids()
+                        .await
+                        .context("get vCPU thread IDs (retry)")?;
+                }
+                if tids.vcpus.len() < expected {
+                    warn!(
+                        sl!(),
+                        "not all vCPU threads available after retries: have {}, want {}; pinning available ones",
+                        tids.vcpus.len(),
+                        expected
+                    );
+                }
+            }
+
+            Some(tids)
         } else {
             None
         };