From 6cd842494cdaaeea9d1d7b9fcfe444aad30604e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 5 May 2026 14:07:13 +0200 Subject: [PATCH] kata-deploy: cap the tokio worker pool to 2 threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default #[tokio::main] expands with flavor = "multi_thread" and worker_threads = num_cpus::get(). On a typical NVIDIA GPU node (200+ vCPUs) that allocates 200+ worker threads with ~2 MiB stacks each, which is the single largest contributor to the DaemonSet pod's VmData reservation — hundreds of MiB of address space mapped but never touched, easily reproducing the "kata-deploy is using ~400 MB" reports on any monitoring layer that surfaces VSZ / committed virtual memory. Switch to a fixed two-worker multi-thread runtime instead: #[tokio::main(flavor = "multi_thread", worker_threads = 2)] Two workers is exactly the right number for kata-deploy: - the install path is overwhelmingly I/O-bound and runs serially; one worker is enough to drive the install future itself, - install does shell out to `nsenter --target 1 systemctl restart containerd` (and friends) via the synchronous std::process::Command::output(), which wedges the worker thread it runs on for tens of seconds; the second worker keeps the spawned health-server task able to answer kubelet probes inside timeoutSeconds while the first is blocked. flavor = "current_thread" would be tighter still on stacks (~4 MiB saved) but is fundamentally unsafe here: with a single runtime thread, any blocking host_systemctl call freezes the health server too, the kubelet fails the readiness probe, and the pod is restarted long before install completes. The CI lifecycle test reliably reproduces this as a 15-minute timeout waiting for the kata-deploy DaemonSet pod to become Ready. Net result vs. 
upstream's num_cpus()-driven pool on a 200-vCPU node: ~200 fewer worker threads, ~400 MiB less VmData reservation, while keeping kubelet probes responsive across the entire install path. Add the "sync" tokio feature here too so subsequent commits in the series can use tokio::sync primitives (OnceCell) without another features bump. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- tools/packaging/kata-deploy/binary/Cargo.toml | 1 + tools/packaging/kata-deploy/binary/src/main.rs | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/binary/Cargo.toml b/tools/packaging/kata-deploy/binary/Cargo.toml index e68ff24c58..987bdfbb6a 100644 --- a/tools/packaging/kata-deploy/binary/Cargo.toml +++ b/tools/packaging/kata-deploy/binary/Cargo.toml @@ -38,6 +38,7 @@ tokio = { workspace = true, features = [ "rt-multi-thread", "macros", "signal", + "sync", "time", "net", "io-util", diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index b078a88757..11137d9821 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -28,7 +28,21 @@ enum Action { Reset, } -#[tokio::main] +// Cap the tokio runtime to a small fixed number of worker threads. The default +// multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB +// stack), which on a 200+ vCPU GPU node is the dominant contributor to the +// DaemonSet pod's VmData reservation (~440 MiB). Two workers is plenty: +// +// - the install path is overwhelmingly I/O-bound, +// - it shells out to `nsenter ... systemctl restart …` (synchronous, +// blocking calls that wedge the thread they run on for tens of seconds); +// a second worker keeps the health server able to answer kubelet probes +// within timeoutSeconds while the first is blocked. 
+// +// `current_thread` would be tighter still, but starves the health server the +// moment a host_systemctl call runs — the kubelet then fails the readiness +// probe and the pod is restarted before install can finish. +#[tokio::main(flavor = "multi_thread", worker_threads = 2)] async fn main() -> Result<()> { // Set log level based on DEBUG environment variable // This must be done before initializing the logger