From 6cd842494cdaaeea9d1d7b9fcfe444aad30604e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 5 May 2026 14:07:13 +0200 Subject: [PATCH] kata-deploy: cap the tokio worker pool to 2 threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default #[tokio::main] expands with flavor = "multi_thread" and worker_threads = num_cpus::get(). On a typical NVIDIA GPU node (200+ vCPUs) that allocates 200+ worker threads with ~2 MiB stacks each, which is the single largest contributor to the DaemonSet pod's VmData reservation — hundreds of MiB of address space mapped but never touched, easily reproducing the "kata-deploy is using ~400 MB" reports on any monitoring layer that surfaces VSZ / committed virtual memory. Switch to a fixed two-worker multi-thread runtime instead: #[tokio::main(flavor = "multi_thread", worker_threads = 2)] Two workers is exactly the right number for kata-deploy: - the install path is overwhelmingly I/O-bound and runs serially; one worker is enough to drive the install future itself, - install does shell out to `nsenter --target 1 systemctl restart containerd` (and friends) via the synchronous std::process::Command::output(), which wedges the worker thread it runs on for tens of seconds; the second worker keeps the spawned health-server task able to answer kubelet probes inside timeoutSeconds while the first is blocked. flavor = "current_thread" would be tighter still on stacks (~4 MiB saved) but is fundamentally unsafe here: with a single runtime thread, any blocking host_systemctl call freezes the health server too, the kubelet fails the readiness probe, and the pod is restarted long before install completes. The CI lifecycle test reliably reproduces this as a 15-minute timeout waiting for the kata-deploy DaemonSet pod to become Ready. Net result vs. 
upstream's num_cpus()-driven pool on a 200-vCPU node: ~200 fewer worker threads, ~400 MiB less VmData reservation, while keeping kubelet probes responsive across the entire install path. Add the "sync" tokio feature here too so subsequent commits in the series can use tokio::sync primitives (OnceCell) without another features bump. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- tools/packaging/kata-deploy/binary/Cargo.toml | 1 + tools/packaging/kata-deploy/binary/src/main.rs | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/binary/Cargo.toml b/tools/packaging/kata-deploy/binary/Cargo.toml index e68ff24c58..987bdfbb6a 100644 --- a/tools/packaging/kata-deploy/binary/Cargo.toml +++ b/tools/packaging/kata-deploy/binary/Cargo.toml @@ -38,6 +38,7 @@ tokio = { workspace = true, features = [ "rt-multi-thread", "macros", "signal", + "sync", "time", "net", "io-util", diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index b078a88757..11137d9821 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -28,7 +28,21 @@ enum Action { Reset, } -#[tokio::main] +// Cap the tokio runtime to a small fixed number of worker threads. The default +// multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB +// stack), which on a 200+ vCPU GPU node is the dominant contributor to the +// DaemonSet pod's VmData reservation (~440 MiB). Two workers is plenty: +// +// - the install path is overwhelmingly I/O-bound, +// - it shells out to `nsenter ... systemctl restart …` (synchronous, +// blocking calls that wedge the thread they run on for tens of seconds); +// a second worker keeps the health server able to answer kubelet probes +// within timeoutSeconds while the first is blocked. 
+// +// `current_thread` would be tighter still, but starves the health server the +// moment a host_systemctl call runs — the kubelet then fails the readiness +// probe and the pod is restarted before install can finish. +#[tokio::main(flavor = "multi_thread", worker_threads = 2)] async fn main() -> Result<()> { // Set log level based on DEBUG environment variable // This must be done before initializing the logger