From 87d27e0cc81e9a61adb9d7738a29ba35d6945981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 21:42:02 +0200
Subject: [PATCH 1/9] kata-deploy-job-dispatcher: add generic per-node Job
 dispatcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a small, deployment-agnostic dispatcher binary that runs exactly one
Kubernetes Job per selected node and paces the rollout, so callers get
guaranteed per-node coverage without encoding the fan-out in Helm.

Motivation: templating one Job per node into a Helm release does not
scale (the release Secret hits etcd's 1 MiB limit and hooks run
sequentially), and a single Indexed Job cannot guarantee per-node
coverage when paced - the scheduler ignores completed pods when
evaluating topology spread, so nodes get uneven numbers of pods. A tiny
dispatcher that enumerates nodes live and creates node-pinned Jobs itself
sidesteps both problems and keeps the Helm release O(1) in fleet size.

The dispatcher:
  - enumerates target nodes live (explicit --nodes list or
    --node-selector label selector), paginating the API;
  - stamps out one Job per node from a YAML template, pinning it via
    nodeName and tagging it with an owner label for server-side filtering;
  - keeps at most --parallelism Jobs in flight, refilling as they finish,
    and sets an OwnerReference to the owner Job so the per-node Jobs are
    garbage-collected with it;
  - is a plain API client (kube): it never touches the host, so it can
    run fully unprivileged.

Node membership is resolved live on each run, not frozen at Helm
template-render time: re-running the dispatcher (e.g. via `helm upgrade`)
picks up nodes added since the last run and skips ones already done, as
the per-node stages are idempotent. The dispatcher is one-shot, however:
it does not watch the API, so nodes added while it is not running are
only covered by the next run.

job.rs holds the pure helpers (node-name sanitization, deterministic Job
naming, template instantiation, status interpretation) with rstest unit
tests; main.rs wires up the CLI and the fan-out loop.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 Cargo.lock                                    |  15 +
 Cargo.toml                                    |   3 +
 .../kata-deploy/job-dispatcher/Cargo.toml     |  39 ++
 .../kata-deploy/job-dispatcher/src/job.rs     | 347 ++++++++++++++++
 .../kata-deploy/job-dispatcher/src/main.rs    | 373 ++++++++++++++++++
 5 files changed, 777 insertions(+)
 create mode 100644 tools/packaging/kata-deploy/job-dispatcher/Cargo.toml
 create mode 100644 tools/packaging/kata-deploy/job-dispatcher/src/job.rs
 create mode 100644 tools/packaging/kata-deploy/job-dispatcher/src/main.rs

diff --git a/Cargo.lock b/Cargo.lock
index 7d8a405b5b..2a6f8c29d1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3650,6 +3650,21 @@ dependencies = [
  "zstd 0.13.3",
 ]
 
+[[package]]
+name = "kata-deploy-job-dispatcher"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "env_logger",
+ "k8s-openapi",
+ "kube",
+ "log",
+ "rstest 0.18.2",
+ "serde_yaml 0.9.34+deprecated",
+ "tokio",
+]
+
 [[package]]
 name = "kata-sys-util"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 668c51eed2..7803baa1c5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,9 @@ members = [
   # kata-deploy (Kubernetes installer binary)
   "tools/packaging/kata-deploy/binary",
 
+  # kata-deploy-job-dispatcher (generic per-node Job dispatcher)
+  "tools/packaging/kata-deploy/job-dispatcher",
+
   # runtime-rs
   "src/runtime-rs",
   "src/runtime-rs/crates/agent",
diff --git a/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml b/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml
new file mode 100644
index 0000000000..dca8c223b5
--- /dev/null
+++ b/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "kata-deploy-job-dispatcher"
+version = "0.1.0"
+authors.workspace = true
+edition = "2021"
+license.workspace = true
+rust-version.workspace = true
+
+[[bin]]
+name = "kata-deploy-job-dispatcher"
+path = "src/main.rs"
+
+[dependencies]
+anyhow.workspace = true
+clap.workspace = true
+env_logger = "0.10"
+k8s-openapi = { version = "0.26", default-features = false, features = [
+    "v1_33",
+] }
+# Only the bare client is needed: this tool drives Jobs by polling with
+# Api::list, so kube::runtime (watchers/reflectors) and kube::derive are not
+# pulled in. `ring` matches kube's default rustls CryptoProvider and must stay
+# enabled, otherwise rustls panics at startup.
+kube = { version = "2.0", default-features = false, features = [
+    "client",
+    "rustls-tls",
+    "ring",
+] }
+log.workspace = true
+serde_yaml = "0.9"
+tokio = { workspace = true, features = [
+    "rt-multi-thread",
+    "macros",
+    "signal",
+    "time",
+] }
+
+[dev-dependencies]
+rstest.workspace = true
diff --git a/tools/packaging/kata-deploy/job-dispatcher/src/job.rs b/tools/packaging/kata-deploy/job-dispatcher/src/job.rs
new file mode 100644
index 0000000000..2fd6ddacbd
--- /dev/null
+++ b/tools/packaging/kata-deploy/job-dispatcher/src/job.rs
@@ -0,0 +1,347 @@
+// Copyright (c) 2026 NVIDIA Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+
+use k8s_openapi::api::batch::v1::Job;
+use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference;
+use std::collections::hash_map::DefaultHasher;
+use std::collections::BTreeMap;
+use std::hash::{Hash, Hasher};
+
+/// Label applied to every per-node Job, set to the dispatcher's name prefix.
+/// Used as a server-side selector so the dispatcher only ever sees the Jobs it
+/// created (and not unrelated Jobs in the namespace).
+pub const OWNER_LABEL: &str = "kata-deploy-job-dispatcher/owner";
+
+/// Label carrying the (sanitized) target node name, for human inspection.
+pub const NODE_LABEL: &str = "kata-deploy-job-dispatcher/node";
+
+/// Annotation carrying the full, unmodified target node name. Node names can
+/// exceed the 63-char label-value limit or contain characters invalid in a
+/// label value, so the authoritative value lives in an annotation.
+pub const NODE_ANNOTATION: &str = "kata-deploy-job-dispatcher/node-name";
+
+/// Maximum length of a DNS-1123 label and of a Kubernetes label value.
+pub const MAX_LABEL_LEN: usize = 63;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum JobOutcome {
+    Running,
+    Succeeded,
+    Failed,
+}
+
+/// Turn a node name into a DNS-1123-label-safe fragment.
+///
+/// ASCII letters are lowercased and every character outside `[a-z0-9-]`
+/// (including any non-ASCII character) becomes `-`; leading and trailing
+/// dashes are then stripped. Interior dash runs are kept and the length is not
+/// capped, so the result may be empty and distinct inputs can map to the same
+/// output (e.g. `a.b` and `a-b`).
+pub fn sanitize_node(node: &str) -> String {
+    let dashed: String = node
+        .chars()
+        .map(|ch| match ch.to_ascii_lowercase() {
+            lc if lc.is_ascii_alphanumeric() || lc == '-' => lc,
+            _ => '-',
+        })
+        .collect();
+    dashed.trim_matches('-').to_string()
+}
+
+/// Eight-hex-digit digest (low 32 bits of a `DefaultHasher` hash) that keeps
+/// truncated Job names distinct; std does not pin the algorithm across releases.
+fn short_hash(s: &str) -> String {
+    let mut state = DefaultHasher::new();
+    s.hash(&mut state);
+    format!("{:08x}", state.finish() as u32)
+}
+
+/// Deterministic Job name for `node`, at most [`MAX_LABEL_LEN`] characters.
+///
+/// `<prefix>-<sanitized node>` is returned unchanged when it fits. Otherwise it
+/// is cut short and suffixed with a hash of the full, unsanitized node name.
+pub fn job_name(prefix: &str, node: &str) -> String {
+    let full = format!("{prefix}-{}", sanitize_node(node));
+    if full.len() > MAX_LABEL_LEN {
+        let digest = short_hash(node);
+        // Leave room for the "-" separator and the digest.
+        let budget = MAX_LABEL_LEN.saturating_sub(digest.len() + 1);
+        let head: String = full.chars().take(budget).collect();
+        format!("{}-{digest}", head.trim_end_matches('-'))
+    } else {
+        full
+    }
+}
+
+/// Sanitize `value` for use both as the prefix of a DNS-1123 Job name and as a
+/// Kubernetes label value.
+///
+/// The input goes through [`sanitize_node`] (lowercased, anything outside
+/// `[a-z0-9-]` replaced by `-`, surrounding dashes trimmed) and is then capped
+/// at [`MAX_LABEL_LEN`] characters, dropping any trailing `-` the cut exposes.
+///
+/// The dispatcher stores its `--name-prefix` in [`OWNER_LABEL`] and reuses it
+/// as the Job-name prefix, so callers may pass a raw value (e.g. a Helm release
+/// name or suffix) without producing an invalid or over-long label.
+pub fn sanitize_label_value(value: &str) -> String {
+    let cleaned = sanitize_node(value);
+    if cleaned.len() > MAX_LABEL_LEN {
+        let capped: String = cleaned.chars().take(MAX_LABEL_LEN).collect();
+        capped.trim_end_matches('-').to_string()
+    } else {
+        cleaned
+    }
+}
+
+/// Report whether `job` has the [`OWNER_LABEL`] label set to exactly
+/// `owner_value`; a Job without labels, or without that label, is not owned.
+///
+/// Used to decide whether a pre-existing (409) Job is safe to adopt: the
+/// dispatcher only ever LISTs Jobs by that label, so a Job that lacks it would
+/// never show up in status polling.
+pub fn job_owned_by(job: &Job, owner_value: &str) -> bool {
+    let Some(labels) = job.metadata.labels.as_ref() else {
+        return false;
+    };
+    labels.get(OWNER_LABEL).map(String::as_str) == Some(owner_value)
+}
+
+/// Clone the template Job and specialize it for a single node:
+///   - set `metadata.name` to `name` and clear any `generateName`,
+///   - pin the pod to `node` via `spec.template.spec.nodeName`,
+///   - add owner/node tracking labels (+ a full-name annotation),
+///   - mirror the owner label onto the pod template,
+///   - optionally replace `ownerReferences` with `owner` for garbage collection.
+///
+/// `owner_value` is the dispatcher's name prefix, recorded in [`OWNER_LABEL`] so
+/// the dispatcher can list back only its own Jobs.
+pub fn build_node_job(
+    template: &Job,
+    name: &str,
+    node: &str,
+    owner_value: &str,
+    owner: Option<&OwnerReference>,
+) -> Job {
+    let mut job = template.clone();
+
+    job.metadata.name = Some(name.to_string());
+    // A template may carry generateName; an explicit name wins, drop it to
+    // avoid the apiserver rejecting both being set.
+    job.metadata.generate_name = None;
+
+    let labels = job.metadata.labels.get_or_insert_with(BTreeMap::new);
+    labels.insert(OWNER_LABEL.to_string(), owner_value.to_string());
+    // Node names may be longer than a label value allows (63 chars); cap the
+    // label so the apiserver accepts the Job. The annotation keeps the full name.
+    labels.insert(NODE_LABEL.to_string(), sanitize_label_value(node));
+
+    let annotations = job.metadata.annotations.get_or_insert_with(BTreeMap::new);
+    annotations.insert(NODE_ANNOTATION.to_string(), node.to_string());
+
+    if let Some(owner_ref) = owner {
+        job.metadata.owner_references = Some(vec![owner_ref.clone()]);
+    }
+
+    let spec = job.spec.get_or_insert_with(Default::default);
+    // Mirror the owner label onto the pod template so the pods are easy to find.
+    let tmpl_meta = spec.template.metadata.get_or_insert_with(Default::default);
+    let tmpl_labels = tmpl_meta.labels.get_or_insert_with(BTreeMap::new);
+    tmpl_labels.insert(OWNER_LABEL.to_string(), owner_value.to_string());
+
+    let pod_spec = spec.template.spec.get_or_insert_with(Default::default);
+    pod_spec.node_name = Some(node.to_string());
+    job
+}
+
+/// Interpret a Job's `.status` into a coarse outcome.
+///
+/// A `Failed` or `Complete` condition with status `"True"` decides the outcome
+/// (first match in list order wins). Without one, the Job counts as succeeded
+/// only once `status.succeeded` reaches `spec.completions` (1 when unset), so a
+/// multi-completion Job is not reported done after its first successful pod.
+pub fn interpret_status(job: &Job) -> JobOutcome {
+    let Some(status) = job.status.as_ref() else {
+        return JobOutcome::Running;
+    };
+
+    for c in status.conditions.iter().flatten() {
+        match (c.status.as_str(), c.type_.as_str()) {
+            ("True", "Failed") => return JobOutcome::Failed,
+            ("True", "Complete") => return JobOutcome::Succeeded,
+            _ => {}
+        }
+    }
+
+    // Counter fallback: require every requested completion, not just one pod.
+    let wanted = job.spec.as_ref().and_then(|s| s.completions).unwrap_or(1);
+    if status.succeeded.unwrap_or(0) >= wanted.max(1) {
+        return JobOutcome::Succeeded;
+    }
+    JobOutcome::Running
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rstest::rstest;
+
+    #[rstest]
+    #[case("worker-0", "worker-0")]
+    #[case("Worker.Example.COM", "worker-example-com")]
+    #[case("--node--", "node")]
+    #[case("a_b/c", "a-b-c")]
+    fn test_sanitize_node(#[case] input: &str, #[case] expected: &str) {
+        assert_eq!(sanitize_node(input), expected);
+    }
+
+    #[rstest]
+    #[case("kata-deploy-install", "kata-deploy-install")]
+    #[case("Kata_Deploy.Install", "kata-deploy-install")]
+    #[case("--weird--", "weird")]
+    fn test_sanitize_label_value_short(#[case] input: &str, #[case] expected: &str) {
+        assert_eq!(sanitize_label_value(input), expected);
+    }
+
+    #[test]
+    fn test_sanitize_label_value_truncates() {
+        let out = sanitize_label_value(&"a".repeat(100));
+        assert_eq!(out.len(), MAX_LABEL_LEN);
+        assert!(
+            !out.ends_with('-'),
+            "truncation must not leave a trailing dash"
+        );
+    }
+
+    #[test]
+    fn test_job_owned_by() {
+        let mut job = Job::default();
+        assert!(!job_owned_by(&job, "kata-deploy-install"));
+        job.metadata
+            .labels
+            .get_or_insert_with(BTreeMap::new)
+            .insert(OWNER_LABEL.to_string(), "kata-deploy-install".to_string());
+        assert!(job_owned_by(&job, "kata-deploy-install"));
+        assert!(!job_owned_by(&job, "other-owner"));
+    }
+
+    #[rstest]
+    #[case("kata-deploy-install", "worker-0", "kata-deploy-install-worker-0")]
+    #[case("kata-deploy-cleanup", "Worker.0", "kata-deploy-cleanup-worker-0")]
+    fn test_job_name_short(#[case] prefix: &str, #[case] node: &str, #[case] expected: &str) {
+        assert_eq!(job_name(prefix, node), expected);
+    }
+
+    #[test]
+    fn test_job_name_truncated_and_unique() {
+        let prefix = "kata-deploy-install";
+        let long_a = "node-with-a-really-really-really-really-really-long-name-aaaaaaa";
+        let long_b = "node-with-a-really-really-really-really-really-long-name-bbbbbbb";
+
+        let name_a = job_name(prefix, long_a);
+        let name_b = job_name(prefix, long_b);
+
+        assert!(
+            name_a.len() <= 63,
+            "name too long: {} ({})",
+            name_a,
+            name_a.len()
+        );
+        assert!(
+            name_b.len() <= 63,
+            "name too long: {} ({})",
+            name_b,
+            name_b.len()
+        );
+        assert_ne!(
+            name_a, name_b,
+            "different node names must yield different job names"
+        );
+    }
+
+    #[test]
+    fn test_build_node_job_pins_node_and_labels() {
+        let template: Job = serde_yaml::from_str(
+            r#"
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ignored
+spec:
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: c
+          image: busybox
+"#,
+        )
+        .unwrap();
+
+        let owner = OwnerReference {
+            api_version: "batch/v1".to_string(),
+            kind: "Job".to_string(),
+            name: "dispatcher".to_string(),
+            uid: "abc-123".to_string(),
+            controller: Some(false),
+            block_owner_deletion: Some(false),
+        };
+
+        let job = build_node_job(
+            &template,
+            "kata-deploy-install-node1",
+            "node1",
+            "kata-deploy-install",
+            Some(&owner),
+        );
+
+        assert_eq!(
+            job.metadata.name.as_deref(),
+            Some("kata-deploy-install-node1")
+        );
+        let labels = job.metadata.labels.unwrap();
+        assert_eq!(
+            labels.get(OWNER_LABEL).map(String::as_str),
+            Some("kata-deploy-install")
+        );
+        assert_eq!(labels.get(NODE_LABEL).map(String::as_str), Some("node1"));
+        let annotations = job.metadata.annotations.unwrap();
+        assert_eq!(
+            annotations.get(NODE_ANNOTATION).map(String::as_str),
+            Some("node1")
+        );
+        assert_eq!(job.metadata.owner_references.unwrap().len(), 1);
+        let pod_spec = job.spec.unwrap().template.spec.unwrap();
+        assert_eq!(pod_spec.node_name.as_deref(), Some("node1"));
+    }
+
+    fn job_with_status(status_yaml: &str) -> Job {
+        let yaml = format!(
+            "apiVersion: batch/v1\nkind: Job\nmetadata:\n  name: j\nstatus:\n{status_yaml}"
+        );
+        serde_yaml::from_str(&yaml).unwrap()
+    }
+
+    #[rstest]
+    #[case(
+        "  conditions:\n    - type: Complete\n      status: \"True\"\n",
+        JobOutcome::Succeeded
+    )]
+    #[case(
+        "  conditions:\n    - type: Failed\n      status: \"True\"\n",
+        JobOutcome::Failed
+    )]
+    #[case(
+        "  conditions:\n    - type: Complete\n      status: \"False\"\n",
+        JobOutcome::Running
+    )]
+    #[case("  succeeded: 1\n", JobOutcome::Succeeded)]
+    fn test_interpret_status(#[case] status_yaml: &str, #[case] expected: JobOutcome) {
+        assert_eq!(interpret_status(&job_with_status(status_yaml)), expected);
+    }
+
+    #[test]
+    fn test_interpret_status_running_when_unset() {
+        assert_eq!(interpret_status(&Job::default()), JobOutcome::Running);
+    }
+}
diff --git a/tools/packaging/kata-deploy/job-dispatcher/src/main.rs b/tools/packaging/kata-deploy/job-dispatcher/src/main.rs
new file mode 100644
index 0000000000..c5db2e5cf5
--- /dev/null
+++ b/tools/packaging/kata-deploy/job-dispatcher/src/main.rs
@@ -0,0 +1,373 @@
+// Copyright (c) 2026 NVIDIA Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//! kata-deploy-job-dispatcher: a small, deployment-agnostic dispatcher that runs exactly
+//! one node-pinned Job per selected node.
+//!
+//! Given a Job template (any `batch/v1` Job manifest) and a node selector, it
+//! creates one Job per node (pinned via `spec.template.spec.nodeName`), keeps
+//! at most `--parallelism` Jobs in flight at a time (refilling as they finish),
+//! and exits non-zero if any node's Job failed. This gives paced rollouts with
+//! *guaranteed per-node coverage*, which an Indexed Job / topology-spread
+//! cannot guarantee once `parallelism < completions` (the scheduler ignores
+//! completed pods when balancing the spread).
+//!
+//! It has no host dependencies and only needs RBAC to list nodes and to
+//! manage Jobs in its namespace.
+
+mod job;
+
+use anyhow::{bail, Context, Result};
+use clap::Parser;
+use job::{
+    build_node_job, interpret_status, job_name, job_owned_by, sanitize_label_value, JobOutcome,
+    OWNER_LABEL,
+};
+use k8s_openapi::api::batch::v1::Job;
+use k8s_openapi::api::core::v1::Node;
+use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference;
+use kube::api::{Api, ListParams, PostParams};
+use kube::Client;
+use log::{error, info};
+use std::collections::{HashMap, VecDeque};
+use std::time::Duration;
+
+#[derive(Parser, Debug)]
+#[command(
+    author,
+    version,
+    about = "Run one node-pinned Job per selected node, paced and with guaranteed coverage."
+)]
+struct Args {
+    /// Path to a YAML file containing the batch/v1 Job to run on each node.
+    /// The dispatcher clones it per node and sets metadata.name + nodeName.
+    #[arg(long)]
+    job_template: String,
+
+    /// Prefix for generated per-node Job names. Also recorded as the
+    /// "kata-deploy-job-dispatcher/owner" label so the dispatcher tracks only its own Jobs.
+    #[arg(long)]
+    name_prefix: String,
+
+    /// Namespace to create the per-node Jobs in. Defaults to $POD_NAMESPACE,
+    /// then the in-cluster service-account namespace, then "default".
+    #[arg(long)]
+    namespace: Option<String>,
+
+    /// Maximum number of per-node Jobs in flight at once.
+    #[arg(long, default_value_t = 100)]
+    parallelism: usize,
+
+    /// Server-side label selector used to pick target nodes, e.g.
+    /// "kubernetes.io/os=linux" or "node-role.kubernetes.io/control-plane".
+    /// Supports the full label-selector grammar (In/NotIn/Exists/DoesNotExist).
+    #[arg(long)]
+    node_selector: Option<String>,
+
+    /// Server-side field selector used to pick target nodes (ANDed with the
+    /// label selector).
+    #[arg(long)]
+    node_field_selector: Option<String>,
+
+    /// Explicit comma-separated node names. When set, the node selectors are
+    /// ignored and exactly these nodes are targeted.
+    #[arg(long)]
+    nodes: Option<String>,
+
+    /// Optional owner Job name (in the dispatcher's namespace). When set, every
+    /// per-node Job gets an ownerReference to it so they are garbage-collected
+    /// together with the owner.
+    #[arg(long)]
+    owner_job_name: Option<String>,
+
+    /// Seconds between status polls.
+    #[arg(long, default_value_t = 5)]
+    poll_interval_secs: u64,
+
+    /// Page size used when listing nodes (server-side pagination).
+    #[arg(long, default_value_t = 500)]
+    node_page_size: u32,
+}
+
+// The dispatcher is overwhelmingly I/O-bound (apiserver round-trips); two worker
+// threads are plenty and keep the footprint small.
+#[tokio::main(flavor = "multi_thread", worker_threads = 2)]
+async fn main() -> Result<()> {
+    // Default to `info` but let RUST_LOG win; `filter_level` would override it.
+    let log_env = env_logger::Env::default().default_filter_or("info");
+    env_logger::Builder::from_env(log_env).init();
+
+    let args = Args::parse();
+
+    let client = Client::try_default()
+        .await
+        .context("failed to create Kubernetes client")?;
+
+    let namespace = resolve_namespace(args.namespace.clone());
+    info!("kata-deploy-job-dispatcher starting (namespace: {namespace})");
+
+    let nodes = resolve_nodes(&client, &args).await?;
+    if nodes.is_empty() {
+        info!("no target nodes matched the selection; nothing to do");
+        return Ok(());
+    }
+
+    let template_raw = std::fs::read_to_string(&args.job_template)
+        .with_context(|| format!("failed to read job template {}", args.job_template))?;
+    let template: Job = serde_yaml::from_str(&template_raw)
+        .with_context(|| format!("failed to parse job template {}", args.job_template))?;
+
+    let owner = match args.owner_job_name.as_deref() {
+        Some(name) => Some(owner_ref_for_job(&client, &namespace, name).await?),
+        None => None,
+    };
+
+    let jobs: Api<Job> = Api::namespaced(client.clone(), &namespace);
+    // `nodes` is non-empty here, so the clamp bounds are valid (1 <= len).
+    let parallelism = args.parallelism.clamp(1, nodes.len());
+    info!(
+        "fanning out {} per-node Job(s) with parallelism {}",
+        nodes.len(),
+        parallelism
+    );
+
+    run_fanout(
+        &jobs,
+        &template,
+        &nodes,
+        &args,
+        &namespace,
+        parallelism,
+        owner.as_ref(),
+    )
+    .await
+}
+
+/// Resolve the namespace to create Jobs in: explicit flag, then $POD_NAMESPACE,
+/// then the in-cluster service-account namespace file, then "default". Every
+/// candidate is trimmed, and one that is blank after trimming is skipped.
+fn resolve_namespace(flag: Option<String>) -> String {
+    // Surrounding whitespace (e.g. a trailing newline) is never part of a name.
+    let usable = |raw: String| {
+        let ns = raw.trim();
+        (!ns.is_empty()).then(|| ns.to_string())
+    };
+    if let Some(ns) = flag.and_then(usable) {
+        return ns;
+    }
+    if let Some(ns) = std::env::var("POD_NAMESPACE").ok().and_then(usable) {
+        return ns;
+    }
+    let sa_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace";
+    if let Some(ns) = std::fs::read_to_string(sa_file).ok().and_then(usable) {
+        return ns;
+    }
+    "default".to_string()
+}
+
+/// Work out which nodes to target, returned sorted and de-duplicated.
+///
+/// With `--nodes`, the comma-separated list is used as given (entries trimmed,
+/// blanks dropped) and the API is not queried. Otherwise nodes are LISTed page
+/// by page, filtered server-side by the optional label and field selectors,
+/// following the `continue` token until the server stops returning one.
+///
+/// Fails if any LIST request fails.
+async fn resolve_nodes(client: &Client, args: &Args) -> Result<Vec<String>> {
+    let mut names: Vec<String> = Vec::new();
+
+    if let Some(csv) = args.nodes.as_deref() {
+        // Explicit list: take it verbatim, ignoring the selectors.
+        for entry in csv.split(',') {
+            let entry = entry.trim();
+            if !entry.is_empty() {
+                names.push(entry.to_string());
+            }
+        }
+    } else {
+        let api: Api<Node> = Api::all(client.clone());
+        // Reused across pages; only the continue token changes between requests.
+        let mut lp = ListParams {
+            limit: Some(args.node_page_size.max(1)),
+            label_selector: args.node_selector.clone(),
+            field_selector: args.node_field_selector.clone(),
+            ..Default::default()
+        };
+        loop {
+            let page = api.list(&lp).await.context("failed to list nodes")?;
+            names.extend(page.items.into_iter().filter_map(|n| n.metadata.name));
+            // An absent or empty token marks the last page.
+            match page.metadata.continue_ {
+                Some(token) if !token.is_empty() => lp.continue_token = Some(token),
+                _ => break,
+            }
+        }
+    }
+
+    // Sort so the result has a fixed order; dedup guards against repeats.
+    names.sort();
+    names.dedup();
+    Ok(names)
+}
+
+/// Look up the owner Job `name` in `namespace` and build an `ownerReference`
+/// pointing at it. The reference is non-controller and does not block owner
+/// deletion. Fails if the Job cannot be fetched or carries no UID.
+async fn owner_ref_for_job(client: &Client, namespace: &str, name: &str) -> Result<OwnerReference> {
+    let api: Api<Job> = Api::namespaced(client.clone(), namespace);
+    let owner_job = api
+        .get(name)
+        .await
+        .with_context(|| format!("failed to get owner job {name}"))?;
+    let Some(uid) = owner_job.metadata.uid else {
+        bail!("owner job {name} has no uid");
+    };
+    Ok(OwnerReference {
+        api_version: "batch/v1".to_string(),
+        kind: "Job".to_string(),
+        name: name.to_string(),
+        uid,
+        controller: Some(false),
+        block_owner_deletion: Some(false),
+    })
+}
+
+/// Create and watch per-node Jobs, keeping at most `parallelism` in flight.
+///
+/// A node is recorded as failed when its Job cannot be created or adopted, when
+/// its generated Job name collides with another node's, when the Job fails, or
+/// when the Job vanishes from the listing before an outcome was seen (so a
+/// deleted Job cannot stall the loop). Returns an error naming the failed nodes.
+async fn run_fanout(
+    jobs: &Api<Job>,
+    template: &Job,
+    nodes: &[String],
+    args: &Args,
+    namespace: &str,
+    parallelism: usize,
+    owner: Option<&OwnerReference>,
+) -> Result<()> {
+    let mut queue: VecDeque<&String> = nodes.iter().collect();
+    // job name -> node name
+    let mut in_flight: HashMap<String, String> = HashMap::new();
+    // Every Job name handed out so far; catches two nodes mapping to one name.
+    let mut claimed = std::collections::HashSet::new();
+    let mut succeeded = 0usize;
+    let mut failed: Vec<String> = Vec::new();
+
+    let post = PostParams::default();
+    let poll = Duration::from_secs(args.poll_interval_secs.max(1));
+    // The prefix doubles as the OWNER_LABEL value and the Job-name prefix;
+    // sanitize it once so any caller-supplied value is valid for both.
+    let owner_value = sanitize_label_value(&args.name_prefix);
+    let owner_selector = format!("{OWNER_LABEL}={owner_value}");
+
+    while !queue.is_empty() || !in_flight.is_empty() {
+        // Refill the in-flight set up to the parallelism cap.
+        while in_flight.len() < parallelism {
+            let Some(node) = queue.pop_front() else {
+                break;
+            };
+            let name = job_name(&owner_value, node);
+            // Sharing a Job between two nodes would leave one of them uncovered.
+            if !claimed.insert(name.clone()) {
+                error!("node {node}: job name {name} already belongs to another node");
+                failed.push(node.clone());
+                continue;
+            }
+            let node_job = build_node_job(template, &name, node, &owner_value, owner);
+            match jobs.create(&post, &node_job).await {
+                Ok(_) => info!("created job {name} (node {node})"),
+                // Already exists (e.g. left by an interrupted run): adopt it only
+                // if it carries our owner label, because polling LISTs by it.
+                Err(kube::Error::Api(e)) if e.code == 409 => match jobs.get(&name).await {
+                    Ok(existing) if job_owned_by(&existing, &owner_value) => {
+                        info!("job {name} (node {node}) already exists and is ours, adopting it");
+                    }
+                    Ok(_) => {
+                        error!(
+                            "job {name} (node {node}) already exists but is not labeled \
+                             {OWNER_LABEL}={owner_value}; refusing to adopt it"
+                        );
+                        failed.push(node.clone());
+                        continue;
+                    }
+                    Err(e) => {
+                        error!("failed to fetch pre-existing job {name} (node {node}): {e}");
+                        failed.push(node.clone());
+                        continue;
+                    }
+                },
+                Err(e) => {
+                    error!("failed to create job {name} (node {node}): {e}");
+                    failed.push(node.clone());
+                    continue;
+                }
+            }
+            in_flight.insert(name, node.clone());
+        }
+
+        if in_flight.is_empty() {
+            break;
+        }
+        tokio::time::sleep(poll).await;
+
+        // One LIST per poll returns the status of all our Jobs at once.
+        let lp = ListParams {
+            label_selector: Some(owner_selector.clone()),
+            ..Default::default()
+        };
+        let listed = jobs
+            .list(&lp)
+            .await
+            .context("failed to list per-node jobs")?;
+        let mut status_by_name: HashMap<&str, &Job> = HashMap::new();
+        for j in &listed.items {
+            if let Some(name) = j.metadata.name.as_deref() {
+                status_by_name.insert(name, j);
+            }
+        }
+
+        // Settle finished Jobs; one missing from the listing was deleted: fail it.
+        in_flight.retain(|name, node| {
+            let outcome = status_by_name.get(name.as_str()).copied();
+            match outcome.map(interpret_status) {
+                Some(JobOutcome::Running) => return true,
+                Some(JobOutcome::Succeeded) => {
+                    succeeded += 1;
+                    info!("node {node}: job {name} succeeded");
+                }
+                Some(JobOutcome::Failed) => {
+                    failed.push(node.clone());
+                    error!("node {node}: job {name} failed");
+                }
+                None => {
+                    failed.push(node.clone());
+                    error!("node {node}: job {name} disappeared before its outcome was seen");
+                }
+            }
+            false
+        });
+        info!(
+            "progress: {succeeded} succeeded, {} failed, {} in-flight, {} queued",
+            failed.len(),
+            in_flight.len(),
+            queue.len()
+        );
+    }
+
+    if !failed.is_empty() {
+        failed.sort();
+        failed.dedup();
+        bail!(
+            "{} node(s) failed: {}. Inspect the per-node Job logs with: \
+             kubectl logs -n {namespace} -l {OWNER_LABEL}={owner_value} --all-containers --prefix",
+            failed.len(),
+            failed.join(", ")
+        );
+    }
+
+    info!("all {succeeded} node(s) completed successfully");
+    Ok(())
+}

From d4205c7fccd9ec4cda6b916819bfe3d99a1b7d5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 21:42:15 +0200
Subject: [PATCH 2/9] kata-deploy: build and publish the
 kata-deploy-job-dispatcher image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Package and ship the dispatcher built in the previous commit so the
job-mode Helm chart has an image to run.

  - Dockerfile.components: build kata-deploy and kata-deploy-job-dispatcher
    from the same rust-builder stage (one compile), and run fmt/clippy/
    test for both crates.
  - job-dispatcher/Dockerfile: a minimal distroless/static image containing
    only the dispatcher binary and CA certs - it is an API client, so it
    needs nothing from the host.
  - local-build: kata-deploy-job-dispatcher becomes its own build component
    with its own static tarball
    (kata-deploy-static-kata-deploy-job-dispatcher.tar.zst); the shared
    rust-builder output is reused so the two components do not recompile
    the workspace locally. The payload script builds and pushes a separate
    "<kata-deploy registry>-job-dispatcher" image with the same tag scheme,
    and release.sh publishes its multi-arch manifest symmetrically.
  - CI: add kata-deploy-job-dispatcher to the build-kata-deploy-components
    matrices (its tarball is picked up by the existing kata-artifacts-*
    glob), and gate it in the kata-deploy rust static checks.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../build-kata-static-tarball-amd64.yaml      |  1 +
 .../build-kata-static-tarball-arm64.yaml      |  1 +
 .../build-kata-static-tarball-ppc64le.yaml    |  1 +
 .../build-kata-static-tarball-s390x.yaml      |  1 +
 .github/workflows/static-checks.yaml          |  3 ++
 .../kata-deploy/Dockerfile.components         |  9 +++-
 .../kata-deploy/job-dispatcher/Dockerfile     | 41 +++++++++++++++++++
 .../kata-deploy/local-build/Makefile          |  5 +++
 .../kata-deploy-build-and-upload-payload.sh   | 30 +++++++++++++-
 .../kata-deploy-build-components-tarballs.sh  | 38 +++++++++++++++--
 tools/packaging/release/release.sh            | 24 ++++++++++-
 11 files changed, 145 insertions(+), 9 deletions(-)
 create mode 100644 tools/packaging/kata-deploy/job-dispatcher/Dockerfile

diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml
index 40bc636de9..7d520f3cf8 100644
--- a/.github/workflows/build-kata-static-tarball-amd64.yaml
+++ b/.github/workflows/build-kata-static-tarball-amd64.yaml
@@ -162,6 +162,7 @@ jobs:
       matrix:
         component:
           - kata-deploy-binary
+          - kata-deploy-job-dispatcher
           - nydus-snapshotter-for-coco-guest-pull
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-amd64-${{ toJSON(matrix) }}
diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml
index 1431680ece..50087e0631 100644
--- a/.github/workflows/build-kata-static-tarball-arm64.yaml
+++ b/.github/workflows/build-kata-static-tarball-arm64.yaml
@@ -156,6 +156,7 @@ jobs:
       matrix:
         component:
           - kata-deploy-binary
+          - kata-deploy-job-dispatcher
           - nydus-snapshotter-for-coco-guest-pull
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-arm64-${{ toJSON(matrix) }}
diff --git a/.github/workflows/build-kata-static-tarball-ppc64le.yaml b/.github/workflows/build-kata-static-tarball-ppc64le.yaml
index 8596f757f9..ba98a6a691 100644
--- a/.github/workflows/build-kata-static-tarball-ppc64le.yaml
+++ b/.github/workflows/build-kata-static-tarball-ppc64le.yaml
@@ -101,6 +101,7 @@ jobs:
       matrix:
         component:
           - kata-deploy-binary
+          - kata-deploy-job-dispatcher
           - nydus-snapshotter-for-coco-guest-pull
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-ppc64le-${{ toJSON(matrix) }}
diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml
index 03bf46fc01..c0b5fe0b62 100644
--- a/.github/workflows/build-kata-static-tarball-s390x.yaml
+++ b/.github/workflows/build-kata-static-tarball-s390x.yaml
@@ -139,6 +139,7 @@ jobs:
       matrix:
         component:
           - kata-deploy-binary
+          - kata-deploy-job-dispatcher
           - nydus-snapshotter-for-coco-guest-pull
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-s390x-${{ toJSON(matrix) }}
diff --git a/.github/workflows/static-checks.yaml b/.github/workflows/static-checks.yaml
index 2c90dacc0c..f3ef50dbfb 100644
--- a/.github/workflows/static-checks.yaml
+++ b/.github/workflows/static-checks.yaml
@@ -121,6 +121,9 @@ jobs:
           cargo fmt -p kata-deploy --check
           cargo clippy -p kata-deploy --all-targets --all-features -- -D warnings
           RUSTFLAGS="-D warnings" cargo test -p kata-deploy -- --test-threads=1
+          cargo fmt -p kata-deploy-job-dispatcher --check
+          cargo clippy -p kata-deploy-job-dispatcher --all-targets --all-features -- -D warnings
+          RUSTFLAGS="-D warnings" cargo test -p kata-deploy-job-dispatcher -- --test-threads=1
 
   static-checks:
     name: static-checks
diff --git a/tools/packaging/kata-deploy/Dockerfile.components b/tools/packaging/kata-deploy/Dockerfile.components
index 122141a839..cc5c1e3a4f 100644
--- a/tools/packaging/kata-deploy/Dockerfile.components
+++ b/tools/packaging/kata-deploy/Dockerfile.components
@@ -57,6 +57,7 @@ WORKDIR /kata
 COPY Cargo.toml Cargo.lock ./
 COPY src ./src
 COPY tools/packaging/kata-deploy/binary ./tools/packaging/kata-deploy/binary
+COPY tools/packaging/kata-deploy/job-dispatcher ./tools/packaging/kata-deploy/job-dispatcher
 
 # Install target and run tests based on architecture
 # - AMD64/arm64: use musl for fully static binaries
@@ -98,9 +99,11 @@ RUN \
 	rust_target="$(cat /tmp/rust_target)" && \
 	echo "Checking code formatting..." && \
 	cargo fmt -p kata-deploy --check && \
+	cargo fmt -p kata-deploy-job-dispatcher --check && \
 	echo "Code formatting check passed!" && \
 	echo "Running cargo clippy with target ${rust_target}..." && \
 	cargo clippy -p kata-deploy --all-targets --all-features --release --locked --target "${rust_target}" -- -D warnings && \
+	cargo clippy -p kata-deploy-job-dispatcher --all-targets --all-features --release --locked --target "${rust_target}" -- -D warnings && \
 	echo "Cargo clippy passed!"
 
 # Run tests using --test-threads=1 to prevent environment variable pollution between tests,
@@ -109,14 +112,16 @@ RUN \
 	rust_target="$(cat /tmp/rust_target)"; \
 	echo "Running binary tests with target ${rust_target}..." && \
 	RUSTFLAGS="-D warnings" cargo test -p kata-deploy --target "${rust_target}" -- --test-threads=1 && \
+	RUSTFLAGS="-D warnings" cargo test -p kata-deploy-job-dispatcher --target "${rust_target}" -- --test-threads=1 && \
 	echo "All tests passed!"
 
 RUN \
 	rust_target="$(cat /tmp/rust_target)"; \
-	echo "Building kata-deploy binary for ${rust_target}..." && \
-	RUSTFLAGS="-D warnings" cargo build --release -p kata-deploy --target "${rust_target}" && \
+	echo "Building kata-deploy + kata-deploy-job-dispatcher binaries for ${rust_target}..." && \
+	RUSTFLAGS="-D warnings" cargo build --release -p kata-deploy -p kata-deploy-job-dispatcher --target "${rust_target}" && \
 	mkdir -p /kata-deploy/bin && \
 	cp "/kata/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \
+	cp "/kata/target/${rust_target}/release/kata-deploy-job-dispatcher" /kata-deploy/bin/kata-deploy-job-dispatcher && \
 	echo "Cleaning up build artifacts to save disk space..." && \
 	rm -rf /kata/target && \
 	cargo clean
diff --git a/tools/packaging/kata-deploy/job-dispatcher/Dockerfile b/tools/packaging/kata-deploy/job-dispatcher/Dockerfile
new file mode 100644
index 0000000000..36fa227e7d
--- /dev/null
+++ b/tools/packaging/kata-deploy/job-dispatcher/Dockerfile
@@ -0,0 +1,41 @@
+# Copyright (c) 2026 Kata Contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Minimal image for the job-mode dispatcher (kata-deploy-job-dispatcher).
+#
+# Unlike the kata-deploy image, this dispatcher never touches the host: it only
+# talks to the Kubernetes API (lists nodes, creates/watches per-node Jobs). It
+# therefore needs nothing but the statically-linked binary and CA certificates,
+# and ships on distroless/static.
+#
+# The binary is produced by the shared rust-builder stage and packaged into
+# kata-deploy-static-kata-deploy-job-dispatcher.tar.zst (see Dockerfile.components and
+# local-build/kata-deploy-build-components-tarballs.sh). Build from the repo
+# root so the tarball path resolves:
+#   docker build -f tools/packaging/kata-deploy/job-dispatcher/Dockerfile .
+
+#### Extract the dispatcher binary from its tarball
+FROM alpine:3.22 AS extract-stage
+
+ARG KATA_ARTIFACTS_DIR=tools/packaging/kata-deploy/kata-artifacts
+
+SHELL ["/bin/ash", "-eo", "pipefail", "-c"]
+
+RUN apk add --no-cache zstd
+
+COPY ${KATA_ARTIFACTS_DIR}/kata-deploy-static-kata-deploy-job-dispatcher.tar.zst /tmp/dispatcher.tar.zst
+
+RUN \
+	mkdir -p /opt/dispatcher && \
+	zstd -dc /tmp/dispatcher.tar.zst | tar -xf - -C /opt/dispatcher ./usr/bin/kata-deploy-job-dispatcher
+
+#### Dispatcher image
+# distroless does not publish pinned/versioned tags - only rolling ones
+# (latest, nonroot, debug) - so :latest is the intended way to consume it.
+# hadolint ignore=DL3007
+FROM gcr.io/distroless/static-debian13:latest
+
+COPY --from=extract-stage /opt/dispatcher/usr/bin/kata-deploy-job-dispatcher /usr/bin/kata-deploy-job-dispatcher
+
+ENTRYPOINT ["/usr/bin/kata-deploy-job-dispatcher"]
diff --git a/tools/packaging/kata-deploy/local-build/Makefile b/tools/packaging/kata-deploy/local-build/Makefile
index e6507f8522..5af4a6e989 100644
--- a/tools/packaging/kata-deploy/local-build/Makefile
+++ b/tools/packaging/kata-deploy/local-build/Makefile
@@ -70,6 +70,7 @@ endif
 
 PUBLISH_COMPONENT_TARBALLS = \
 	kata-deploy-binary-tarball \
+	kata-deploy-job-dispatcher-tarball \
 	nydus-snapshotter-for-coco-guest-pull-tarball
 
 ifeq ($(ARCH), x86_64)
@@ -106,6 +107,7 @@ endif
 # can consume a single nvgpu bundle without rebuilding extra components.
 NVGPU_FINAL_TARBALL_INPUTS = \
 	kata-deploy-static-kata-deploy-binary.tar.zst \
+	kata-deploy-static-kata-deploy-job-dispatcher.tar.zst \
 	kata-deploy-static-nydus-snapshotter-for-coco-guest-pull.tar.zst \
 	kata-static-kernel-nvidia-gpu.tar.zst \
 	kata-static-ovmf-sev.tar.zst \
@@ -313,6 +315,9 @@ virtiofsd-tarball:
 kata-deploy-binary-tarball:
 	$(call BUILD_KATA_DEPLOY_COMPONENT,kata-deploy-binary)
 
+kata-deploy-job-dispatcher-tarball:
+	$(call BUILD_KATA_DEPLOY_COMPONENT,kata-deploy-job-dispatcher)
+
 nydus-snapshotter-for-coco-guest-pull-tarball:
 	$(call BUILD_KATA_DEPLOY_COMPONENT,nydus-snapshotter-for-coco-guest-pull)
 
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh
index 580402c83e..040ce2b5fb 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh
@@ -17,6 +17,18 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 REGISTRY="${1:-"quay.io/kata-containers/kata-deploy"}"
 TAG="${2:-}"
 ARTIFACTS_BUILD_DIR="${3:-${REPO_ROOT}/tools/packaging/kata-deploy/local-build/build}"
+# Separate, minimal image for the job-mode dispatcher (kata-deploy-job-dispatcher).
+# Built from its own staged tarball, with the same tag scheme as the kata-deploy
+# image. The repo name mirrors the kata-deploy repo with "-job-dispatcher" inserted
+# before any "-ci" suffix, so the "-ci" stays last:
+#   .../kata-deploy     -> .../kata-deploy-job-dispatcher
+#   .../kata-deploy-ci  -> .../kata-deploy-job-dispatcher-ci
+if [[ "${REGISTRY}" == *-ci ]]; then
+	default_job_dispatcher_image_reference="${REGISTRY%-ci}-job-dispatcher-ci"
+else
+	default_job_dispatcher_image_reference="${REGISTRY}-job-dispatcher"
+fi
+JOB_DISPATCHER_IMAGE_REFERENCE="${4:-${default_job_dispatcher_image_reference}}"
 
 KATA_DEPLOY_DIR="${REPO_ROOT}/tools/packaging/kata-deploy"
 ARTIFACTS_STAGE_DIR="${KATA_DEPLOY_DIR}/kata-artifacts"
@@ -40,22 +52,36 @@ arch=$(uname -m)
 # Disable provenance and SBOM so each tag is a single image manifest. quay.io rejects
 # pushing multi-arch manifest lists that include attestation manifests ("manifest invalid").
 PLATFORM="linux/${arch}"
-IMAGE_TAG="${REGISTRY}:kata-containers-$(git -C "${REPO_ROOT}" rev-parse HEAD)-${arch}"
+COMMIT_TAG="kata-containers-$(git -C "${REPO_ROOT}" rev-parse HEAD)-${arch}"
+IMAGE_TAG="${REGISTRY}:${COMMIT_TAG}"
+JOB_DISPATCHER_IMAGE_TAG="${JOB_DISPATCHER_IMAGE_REFERENCE}:${COMMIT_TAG}"
 
 DOCKERFILE="${REPO_ROOT}/tools/packaging/kata-deploy/Dockerfile"
+JOB_DISPATCHER_DOCKERFILE="${REPO_ROOT}/tools/packaging/kata-deploy/job-dispatcher/Dockerfile"
 
-echo "Building the image"
+echo "Building the kata-deploy image"
 docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
 	-f "${DOCKERFILE}" \
 	--tag "${IMAGE_TAG}" --push .
 
+echo "Building the kata-deploy-job-dispatcher image"
+docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
+	-f "${JOB_DISPATCHER_DOCKERFILE}" \
+	--tag "${JOB_DISPATCHER_IMAGE_TAG}" --push .
+
 if [[ -n "${TAG}" ]]; then
 	ADDITIONAL_TAG="${REGISTRY}:${TAG}"
+	JOB_DISPATCHER_ADDITIONAL_TAG="${JOB_DISPATCHER_IMAGE_REFERENCE}:${TAG}"
 
 	echo "Building the ${ADDITIONAL_TAG} image"
 	docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
 		-f "${DOCKERFILE}" \
 		--tag "${ADDITIONAL_TAG}" --push .
+
+	echo "Building the ${JOB_DISPATCHER_ADDITIONAL_TAG} image"
+	docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
+		-f "${JOB_DISPATCHER_DOCKERFILE}" \
+		--tag "${JOB_DISPATCHER_ADDITIONAL_TAG}" --push .
 fi
 
 popd
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh
index f90abbdbf8..c156639a1d 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh
@@ -37,21 +37,49 @@ if [[ -z "${rust_toolchain}" ]]; then
 	exit 1
 fi
 
-build_kata_deploy_binary() {
+rust_builder_out="${build_dir}/kata-deploy-binary-out"
+
+# kata-deploy and kata-deploy-job-dispatcher are produced by the same rust-builder
+# stage. Build it once *per process* and let each component package its own
+# binary, so running both components in a single invocation does not compile the
+# workspace twice. The guard is process-local (not a directory check) on purpose:
+# a fresh invocation must always rebuild, otherwise a stale output dir from an
+# earlier run/commit would be silently reused.
+rust_binaries_built="false"
+build_rust_binaries() {
+	if [[ "${rust_binaries_built}" == "true" ]]; then
+		return
+	fi
+	rm -rf "${rust_builder_out}"
 	docker buildx build \
 		--target rust-builder \
 		--build-arg "RUST_TOOLCHAIN=${rust_toolchain}" \
-		--output "type=local,dest=${build_dir}/kata-deploy-binary-out" \
+		--output "type=local,dest=${rust_builder_out}" \
 		-f "${repo_root_dir}/tools/packaging/kata-deploy/Dockerfile.components" \
 		"${repo_root_dir}"
+	rust_binaries_built="true"
+}
+
+build_kata_deploy_binary() {
+	build_rust_binaries
 
 	mkdir -p "${build_dir}/kata-deploy-binary/usr/bin"
-	cp "${build_dir}/kata-deploy-binary-out/kata-deploy/bin/kata-deploy" \
+	cp "${rust_builder_out}/kata-deploy/bin/kata-deploy" \
 		"${build_dir}/kata-deploy-binary/usr/bin/kata-deploy"
 	tar --zstd -cf "${build_dir}/kata-deploy-static-kata-deploy-binary.tar.zst" \
 		-C "${build_dir}/kata-deploy-binary" .
 }
 
+build_kata_deploy_job_dispatcher() {
+	build_rust_binaries
+
+	mkdir -p "${build_dir}/kata-deploy-job-dispatcher/usr/bin"
+	cp "${rust_builder_out}/kata-deploy/bin/kata-deploy-job-dispatcher" \
+		"${build_dir}/kata-deploy-job-dispatcher/usr/bin/kata-deploy-job-dispatcher"
+	tar --zstd -cf "${build_dir}/kata-deploy-static-kata-deploy-job-dispatcher.tar.zst" \
+		-C "${build_dir}/kata-deploy-job-dispatcher" .
+}
+
 build_nydus_snapshotter_for_coco_guest_pull() {
 	docker buildx build \
 		--target nydus-binary-downloader \
@@ -70,13 +98,15 @@ build_nydus_snapshotter_for_coco_guest_pull() {
 
 case "${component}" in
 	kata-deploy-binary) build_kata_deploy_binary ;;
+	kata-deploy-job-dispatcher) build_kata_deploy_job_dispatcher ;;
 	nydus-snapshotter-for-coco-guest-pull) build_nydus_snapshotter_for_coco_guest_pull ;;
 	all)
 		build_kata_deploy_binary
+		build_kata_deploy_job_dispatcher
 		build_nydus_snapshotter_for_coco_guest_pull
 		;;
 	*)
-		echo "Unknown component '${component}'. Expected: kata-deploy-binary, nydus-snapshotter-for-coco-guest-pull, all" >&2
+		echo "Unknown component '${component}'. Expected: kata-deploy-binary, kata-deploy-job-dispatcher, nydus-snapshotter-for-coco-guest-pull, all" >&2
 		exit 1
 		;;
 esac
diff --git a/tools/packaging/release/release.sh b/tools/packaging/release/release.sh
index a66efed75f..d717977747 100755
--- a/tools/packaging/release/release.sh
+++ b/tools/packaging/release/release.sh
@@ -19,6 +19,11 @@ KATA_DEPLOY_IMAGE_TAGS="${KATA_DEPLOY_IMAGE_TAGS:-}"
 IFS=' ' read -r -a IMAGE_TAGS <<< "${KATA_DEPLOY_IMAGE_TAGS}"
 KATA_DEPLOY_REGISTRIES="${KATA_DEPLOY_REGISTRIES:-}"
 IFS=' ' read -r -a REGISTRIES <<< "${KATA_DEPLOY_REGISTRIES}"
+# Registries for the separate job-mode dispatcher image. When unset, derived
+# from KATA_DEPLOY_REGISTRIES by inserting "-job-dispatcher" before any "-ci"
+# suffix on each entry (so the "-ci" stays last).
+KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES="${KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES:-}"
+IFS=' ' read -r -a JOB_DISPATCHER_REGISTRIES <<< "${KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES}"
 GH_TOKEN="${GH_TOKEN:-}"
 ARCHITECTURE="${ARCHITECTURE:-}"
 KATA_STATIC_TARBALL="${KATA_STATIC_TARBALL:-}"
@@ -146,11 +151,28 @@ function _publish_multiarch_manifest()
 	_check_required_env_var "KATA_DEPLOY_IMAGE_TAGS"
 	_check_required_env_var "KATA_DEPLOY_REGISTRIES"
 
+	# The dispatcher is shipped as a separate, minimal image alongside kata-deploy
+	# with the same tags. When no dedicated registries are given, derive them from
+	# each kata-deploy registry by inserting "-job-dispatcher" before any "-ci"
+	# suffix, so the "-ci" stays last:
+	#   .../kata-deploy     -> .../kata-deploy-job-dispatcher
+	#   .../kata-deploy-ci  -> .../kata-deploy-job-dispatcher-ci
+	if [[ ${#JOB_DISPATCHER_REGISTRIES[@]} -eq 0 ]]; then
+		JOB_DISPATCHER_REGISTRIES=()
+		for registry in "${REGISTRIES[@]}"; do
+			if [[ "${registry}" == *-ci ]]; then
+				JOB_DISPATCHER_REGISTRIES+=("${registry%-ci}-job-dispatcher-ci")
+			else
+				JOB_DISPATCHER_REGISTRIES+=("${registry}-job-dispatcher")
+			fi
+		done
+	fi
+
 	# Per-arch images are built without provenance/SBOM so each tag is a single image manifest;
 	# quay.io rejects pushing multi-arch manifest lists that include attestation manifests
 	# ("manifest invalid"), so we do not enable them for this workflow.
 	# imagetools create pushes to --tag by default.
-	for registry in "${REGISTRIES[@]}"; do
+	for registry in "${REGISTRIES[@]}" "${JOB_DISPATCHER_REGISTRIES[@]}"; do
 		for tag in "${IMAGE_TAGS[@]}"; do
 			docker buildx imagetools create --tag "${registry}:${tag}" \
 				"${registry}:${tag}-amd64" \

From 225ff2209ed49353311d35da4b608936b1b96974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:06:16 +0200
Subject: [PATCH 3/9] kata-deploy: split install/cleanup into staged actions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 of migrating kata-deploy from a DaemonSet to a staged JobSet
workflow: refactor the binary's install/cleanup flows into discrete,
independently invocable stages while keeping the existing DaemonSet
path fully working.

Add new staged subcommands that each run one step and exit, so a JobSet
can drive them as ordered initContainers/Jobs per node:

  install: host-check -> artifacts -> cri -> label
  cleanup (reverse): unlabel -> revert-cri -> remove-artifacts

`install` becomes a compatibility wrapper composing the install stages
in the canonical order, so the DaemonSet deployment model is unchanged.
The DaemonSet `cleanup` (with its DaemonSet-presence gating) is left
intact; the staged cleanup actions are added alongside it and skip that
gating since the JobSet workflow only schedules them on a real uninstall.

Each stage has an idempotent skip check so reruns are safe:
  - install label / cleanup unlabel: short-circuit via the node label
  - cleanup remove-artifacts: skip when the install dir is already gone
  - cleanup revert-cri: skip the disruptive runtime restart when the CRI
    drop-ins are already absent (new cri_drop_in_present helper)

Introduce a shared KATA_RUNTIME_LABEL constant and add rstest-based
tests covering the subcommand-name -> Action mapping, rejection of
unknown actions, and the visible/hidden help semantics.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../packaging/kata-deploy/binary/src/main.rs  | 365 +++++++++++++++++-
 1 file changed, 348 insertions(+), 17 deletions(-)

diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs
index 6691fed0e9..d015fa05e1 100644
--- a/tools/packaging/kata-deploy/binary/src/main.rs
+++ b/tools/packaging/kata-deploy/binary/src/main.rs
@@ -56,6 +56,39 @@ enum Action {
     Install,
     Cleanup,
     Reset,
+    /// Stage 0 of a staged (JobSet) install: validate host/node prerequisites
+    /// without mutating the host. Fails fast with actionable diagnostics when
+    /// the node cannot support installation.
+    #[clap(name = "install-stage-host-check")]
+    InstallStageHostCheck,
+    /// Stage 1 of a staged (JobSet) install: install kata artifacts/config on
+    /// the host and set up configured snapshotters. Does not touch CRI
+    /// configuration, but is still privileged (host writes + snapshotter setup
+    /// shell into the host via nsenter).
+    #[clap(name = "install-stage-artifacts")]
+    InstallStageArtifacts,
+    /// Stage 2 of a staged (JobSet) install: write CRI drop-ins, restart the
+    /// runtime, and wait for node readiness. Privileged + short-lived.
+    #[clap(name = "install-stage-cri")]
+    InstallStageCri,
+    /// Stage 3 of a staged (JobSet) install: apply the kata-runtime node label.
+    /// Unprivileged, Kubernetes API only.
+    #[clap(name = "install-stage-label")]
+    InstallStageLabel,
+    /// Cleanup stage 1 of a staged (JobSet) uninstall: remove the kata-runtime
+    /// node label first so the scheduler stops placing kata workloads here.
+    /// Unprivileged, Kubernetes API only.
+    #[clap(name = "cleanup-stage-unlabel")]
+    CleanupStageUnlabel,
+    /// Cleanup stage 2 of a staged (JobSet) uninstall: remove CRI drop-ins,
+    /// restart the runtime, and wait for readiness. Privileged + short-lived.
+    #[clap(name = "cleanup-stage-revert-cri")]
+    CleanupStageRevertCri,
+    /// Cleanup stage 3 of a staged (JobSet) uninstall: remove kata
+    /// artifacts/config/symlinks from the host. Privileged (mutates the host
+    /// filesystem under the install dir).
+    #[clap(name = "cleanup-stage-remove-artifacts")]
+    CleanupStageRemoveArtifacts,
     /// Internal: entered via re-exec after install completes. Holds the
     /// DaemonSet pod alive waiting for SIGTERM, then runs cleanup. Hidden
     /// from `--help`; users should never invoke this directly.
@@ -63,6 +96,10 @@ enum Action {
     InternalPostInstallWait,
 }
 
+/// Node label applied to mark a node as kata-capable. Shared across the
+/// install/cleanup label stages so the key stays consistent.
+const KATA_RUNTIME_LABEL: &str = "katacontainers.io/kata-runtime";
+
 // Cap the tokio runtime to a small fixed number of worker threads. The default
 // multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB
 // stack), which on a 200+ vCPU GPU node is the dominant contributor to the
@@ -107,6 +144,13 @@ async fn main() -> Result<()> {
         Action::Install => "install",
         Action::Cleanup => "cleanup",
         Action::Reset => "reset",
+        Action::InstallStageHostCheck => "install-stage-host-check",
+        Action::InstallStageArtifacts => "install-stage-artifacts",
+        Action::InstallStageCri => "install-stage-cri",
+        Action::InstallStageLabel => "install-stage-label",
+        Action::CleanupStageUnlabel => "cleanup-stage-unlabel",
+        Action::CleanupStageRevertCri => "cleanup-stage-revert-cri",
+        Action::CleanupStageRemoveArtifacts => "cleanup-stage-remove-artifacts",
         Action::InternalPostInstallWait => "internal-post-install-wait",
     };
     config.print_info(action_str);
@@ -245,6 +289,42 @@ async fn main() -> Result<()> {
             // Exit after completion so the job can complete
             info!("Reset completed, exiting");
         }
+        // Staged (JobSet) install actions. Each runs one step of the install
+        // pipeline as a short-lived Job/initContainer and exits. The DaemonSet
+        // path does not use these directly; it goes through `install` above,
+        // which composes the same stage functions.
+        Action::InstallStageHostCheck => {
+            install_stage_host_check(&config, &runtime).await?;
+            info!("Install host-check stage completed, exiting");
+        }
+        Action::InstallStageArtifacts => {
+            install_stage_artifacts(&config, &runtime).await?;
+            info!("Install artifacts stage completed, exiting");
+        }
+        Action::InstallStageCri => {
+            install_stage_cri(&config, &runtime).await?;
+            info!("Install CRI stage completed, exiting");
+        }
+        Action::InstallStageLabel => {
+            install_stage_label(&config).await?;
+            info!("Install label stage completed, exiting");
+        }
+        // Staged (JobSet) cleanup actions. These run in reverse order
+        // (unlabel -> revert-cri -> remove-artifacts) and, unlike the DaemonSet
+        // `cleanup` above, do not perform DaemonSet-presence gating: the JobSet
+        // workflow only schedules these when an uninstall is actually intended.
+        Action::CleanupStageUnlabel => {
+            cleanup_stage_unlabel(&config).await?;
+            info!("Cleanup unlabel stage completed, exiting");
+        }
+        Action::CleanupStageRevertCri => {
+            cleanup_stage_revert_cri(&config, &runtime).await?;
+            info!("Cleanup revert-cri stage completed, exiting");
+        }
+        Action::CleanupStageRemoveArtifacts => {
+            cleanup_stage_remove_artifacts(&config).await?;
+            info!("Cleanup remove-artifacts stage completed, exiting");
+        }
     }
 
     Ok(())
@@ -273,20 +353,39 @@ fn reexec_into_post_install_wait(
     ))
 }
 
+/// Full install pipeline. Used by the DaemonSet deployment model. Composes the
+/// same per-stage functions the staged JobSet workflow invokes individually, in
+/// the canonical order: host-check -> artifacts -> cri -> label.
 async fn install(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Installing Kata Containers");
 
-    const SUPPORTED_RUNTIMES: &[&str] = &[
-        "crio",
-        "containerd",
-        "k3s",
-        "k3s-agent",
-        "rke2-agent",
-        "rke2-server",
-        "k0s-worker",
-        "k0s-controller",
-        "microk8s",
-    ];
+    install_stage_host_check(config, runtime).await?;
+    install_stage_artifacts(config, runtime).await?;
+    install_stage_cri(config, runtime).await?;
+    install_stage_label(config).await?;
+
+    info!("Kata Containers installation completed successfully");
+    Ok(())
+}
+
+const SUPPORTED_RUNTIMES: &[&str] = &[
+    "crio",
+    "containerd",
+    "k3s",
+    "k3s-agent",
+    "rke2-agent",
+    "rke2-server",
+    "k0s-worker",
+    "k0s-controller",
+    "microk8s",
+];
+
+/// Install stage 0 (host-check): validate that this node can support a Kata
+/// installation before any host mutation happens. This is read-only and safe
+/// to run repeatedly; it fails fast with actionable diagnostics so a staged
+/// JobSet can abort the per-node pipeline before the privileged stages run.
+async fn install_stage_host_check(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (host-check): validating node prerequisites for runtime {runtime}");
 
     if !SUPPORTED_RUNTIMES.contains(&runtime) {
         return Err(anyhow::anyhow!(
@@ -345,16 +444,44 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
         }
     }
 
-    runtime::containerd::setup_containerd_config_files(runtime, config).await?;
+    info!("install (host-check): node prerequisites satisfied");
+    Ok(())
+}
+
+/// Install stage 1 (artifacts): place kata artifacts/config on the host and set
+/// up any configured snapshotters. This does not touch CRI configuration, but it
+/// still needs privileged host access: both the writes under the host install
+/// dir and the snapshotter setup (e.g. nydus) shell into the host via nsenter.
+async fn install_stage_artifacts(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (artifacts): installing kata artifacts on host");
 
     artifacts::install_artifacts(config, runtime).await?;
 
+    if runtime != "crio" {
+        if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() {
+            for snapshotter in snapshotters {
+                artifacts::snapshotters::install_snapshotter(snapshotter, config).await?;
+            }
+        }
+    }
+
+    info!("install (artifacts): artifacts installed");
+    Ok(())
+}
+
+/// Install stage 2 (cri): write CRI drop-ins, configure snapshotters, restart
+/// the runtime, and wait for the node to become ready. This is the privileged,
+/// node-disrupting stage and is kept short-lived.
+async fn install_stage_cri(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (cri): configuring CRI runtime");
+
+    runtime::containerd::setup_containerd_config_files(runtime, config).await?;
+
     runtime::configure_cri_runtime(config, runtime).await?;
 
     if runtime != "crio" {
         if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() {
             for snapshotter in snapshotters {
-                artifacts::snapshotters::install_snapshotter(snapshotter, config).await?;
                 artifacts::snapshotters::configure_snapshotter(snapshotter, runtime, config)
                     .await?;
             }
@@ -365,9 +492,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
     runtime::lifecycle::restart_runtime(config, runtime).await?;
     info!("Runtime restart completed successfully");
 
-    label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?;
+    Ok(())
+}
+
+/// Install stage 3 (label): apply the kata-runtime node label. Unprivileged,
+/// Kubernetes API only. Skips re-applying when the label is already correct.
+async fn install_stage_label(config: &config::Config) -> Result<()> {
+    info!("install (label): applying node label");
+
+    match k8s::get_node_label(config, KATA_RUNTIME_LABEL).await {
+        Ok(Some(ref val)) if val == "true" => {
+            info!(
+                "install (label): node already labeled {}=true, skipping",
+                KATA_RUNTIME_LABEL
+            );
+            return Ok(());
+        }
+        // Any other state (absent, different value, or a transient read error)
+        // falls through to label_node_with_retry, which applies and verifies.
+        _ => {}
+    }
+
+    label_node_with_retry(config, KATA_RUNTIME_LABEL, "true").await?;
 
-    info!("Kata Containers installation completed successfully");
     Ok(())
 }
 
@@ -539,7 +686,7 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
     info!("No other kata-deploy DaemonSets found, performing full shared cleanup");
 
     info!("Removing kata-runtime label from node");
-    k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
+    k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
     info!("Successfully removed kata-runtime label");
 
     // Restart the CRI runtime last. On k3s/rke2 this restarts the entire
@@ -553,10 +700,111 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
     Ok(())
 }
 
+/// Cleanup stage 1 (unlabel): drop the kata-runtime node label up front so the
+/// scheduler stops placing kata workloads here before any host mutation.
+/// Unprivileged, Kubernetes API only. A label that is already absent is a no-op.
+async fn cleanup_stage_unlabel(config: &config::Config) -> Result<()> {
+    info!("cleanup (unlabel): removing node label");
+
+    // Only a definite "not set" answer short-circuits the stage. A label that
+    // is present, or a lookup that failed (state unknown), both take the
+    // removal branch.
+    if matches!(k8s::get_node_label(config, KATA_RUNTIME_LABEL).await, Ok(None)) {
+        info!(
+            "cleanup (unlabel): label {} already absent, skipping",
+            KATA_RUNTIME_LABEL
+        );
+    } else {
+        k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
+        info!("cleanup (unlabel): label removed");
+    }
+
+    Ok(())
+}
+
+/// Cleanup stage 2 (revert-cri): the privileged, node-disrupting cleanup stage.
+/// Uninstalls configured snapshotters (skipped for CRI-O), removes kata's CRI
+/// runtime configuration, then restarts the runtime and waits for readiness.
+/// Returns early when no CRI drop-in is found, avoiding a needless restart.
+/// NOTE(review): a retry after a failed restart presumably also skips; confirm.
+async fn cleanup_stage_revert_cri(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("cleanup (revert-cri): reverting CRI configuration");
+
+    if !cri_drop_in_present(config, runtime).await {
+        info!("cleanup (revert-cri): CRI drop-ins already absent, skipping");
+        return Ok(());
+    }
+
+    if runtime != "crio" {
+        if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() {
+            for snapshotter in snapshotters {
+                info!("cleanup (revert-cri): uninstalling snapshotter {snapshotter}");
+                artifacts::snapshotters::uninstall_snapshotter(snapshotter, config).await?;
+            }
+        }
+    }
+
+    runtime::cleanup_cri_runtime_config(config, runtime).await?;
+
+    info!("cleanup (revert-cri): restarting runtime");
+    runtime::restart_and_wait_for_ready(config, runtime).await?;
+    info!("cleanup (revert-cri): runtime restarted");
+
+    Ok(())
+}
+
+/// Cleanup stage 3 (remove-artifacts): hand off to `artifacts::remove_artifacts`
+/// to clear kata from the host. A missing install directory is a logged no-op.
+async fn cleanup_stage_remove_artifacts(config: &config::Config) -> Result<()> {
+    info!("cleanup (remove-artifacts): removing kata artifacts from host");
+
+    let install_dir = std::path::Path::new(&config.host_install_dir);
+    if install_dir.exists() {
+        artifacts::remove_artifacts(config).await?;
+        info!("cleanup (remove-artifacts): artifacts removed");
+    } else {
+        info!(
+            "cleanup (remove-artifacts): install dir {} already absent, skipping",
+            config.host_install_dir
+        );
+    }
+    Ok(())
+}
+
+/// Best-effort check for whether kata's CRI drop-in configuration is present on
+/// the host for this runtime, letting the staged cleanup skip a disruptive
+/// restart when there is nothing to revert. On any uncertainty (containerd paths
+/// unresolvable, or the file cannot be inspected) this returns `true` so the
+/// caller runs the revert; `try_exists` is used since `exists` hides I/O errors.
+async fn cri_drop_in_present(config: &config::Config, runtime: &str) -> bool {
+    if runtime == "crio" {
+        let drop_in = std::path::Path::new(&config.crio_drop_in_conf_file);
+        return drop_in.try_exists().unwrap_or(true);
+    }
+    match config.get_containerd_paths(runtime).await {
+        Ok(paths) => {
+            // /etc/containerd is mounted directly; other paths live under /host.
+            let resolved = if paths.drop_in_file.starts_with("/etc/containerd/") {
+                std::path::PathBuf::from(&paths.drop_in_file)
+            } else {
+                std::path::Path::new("/host").join(paths.drop_in_file.trim_start_matches('/'))
+            };
+            resolved.try_exists().unwrap_or(true)
+        }
+        Err(e) => {
+            log::warn!(
+                "cleanup (revert-cri): could not resolve containerd paths to check drop-in \
+                 presence ({e}); proceeding with revert"
+            );
+            true
+        }
+    }
+}
+
 async fn reset(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Resetting Kata Containers");
 
-    k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
+    k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
     runtime::lifecycle::restart_cri_runtime(config, runtime).await?;
     if matches!(runtime, "crio" | "containerd") {
         utils::host_systemctl(&["restart", "kubelet"])?;
@@ -566,3 +814,86 @@ async fn reset(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Kata Containers reset completed successfully");
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    //! Tests for CLI action wiring. The staged install/cleanup actions are the
+    //! entrypoints the JobSet workflow invokes per node, so we lock in their
+    //! exact subcommand names (a rename would silently break the chart) and the
+    //! mapping into the `Action` enum.
+
+    use super::*;
+    use clap::ValueEnum;
+    use rstest::rstest;
+
+    /// Every subcommand name, staged or not, parses into the expected `Action`
+    /// variant. Keep in sync with the `#[clap(name = ...)]` attributes above.
+    #[rstest]
+    #[case("install", Action::Install)]
+    #[case("cleanup", Action::Cleanup)]
+    #[case("reset", Action::Reset)]
+    #[case("install-stage-host-check", Action::InstallStageHostCheck)]
+    #[case("install-stage-artifacts", Action::InstallStageArtifacts)]
+    #[case("install-stage-cri", Action::InstallStageCri)]
+    #[case("install-stage-label", Action::InstallStageLabel)]
+    #[case("cleanup-stage-unlabel", Action::CleanupStageUnlabel)]
+    #[case("cleanup-stage-revert-cri", Action::CleanupStageRevertCri)]
+    #[case("cleanup-stage-remove-artifacts", Action::CleanupStageRemoveArtifacts)]
+    #[case("internal-post-install-wait", Action::InternalPostInstallWait)]
+    fn test_action_parses_from_arg(#[case] arg: &str, #[case] expected: Action) {
+        let args = Args::try_parse_from(["kata-deploy", arg])
+            .unwrap_or_else(|e| panic!("failed to parse action {arg:?}: {e}"));
+        assert_eq!(
+            std::mem::discriminant(&args.action),
+            std::mem::discriminant(&expected),
+            "arg {arg:?} parsed into the wrong Action variant",
+        );
+    }
+
+    /// Unknown names, including bare prefixes of real stages, must fail to parse.
+    #[rstest]
+    #[case("install-stage")]
+    #[case("cleanup-stage")]
+    #[case("install-stage-foo")]
+    #[case("bogus")]
+    fn test_unknown_action_is_rejected(#[case] arg: &str) {
+        assert!(
+            Args::try_parse_from(["kata-deploy", arg]).is_err(),
+            "expected action {arg:?} to be rejected",
+        );
+    }
+
+    /// The hidden internal waiter must stay hidden from `--help` so users never
+    /// invoke it directly, while still being parseable (asserted above).
+    #[test]
+    fn test_internal_action_is_hidden() {
+        let internal = Action::InternalPostInstallWait
+            .to_possible_value()
+            .expect("internal action should have a possible value");
+        assert!(
+            internal.is_hide_set(),
+            "internal-post-install-wait should be hidden from --help",
+        );
+    }
+
+    /// All non-internal staged actions remain visible in `--help` so operators
+    /// can discover and run individual stages.
+    #[rstest]
+    #[case(Action::InstallStageHostCheck)]
+    #[case(Action::InstallStageArtifacts)]
+    #[case(Action::InstallStageCri)]
+    #[case(Action::InstallStageLabel)]
+    #[case(Action::CleanupStageUnlabel)]
+    #[case(Action::CleanupStageRevertCri)]
+    #[case(Action::CleanupStageRemoveArtifacts)]
+    fn test_staged_actions_are_visible(#[case] action: Action) {
+        let value = action
+            .to_possible_value()
+            .expect("staged action should have a possible value");
+        assert!(
+            !value.is_hide_set(),
+            "staged action {:?} should be visible in --help",
+            value.get_name(),
+        );
+    }
+}

From 28fce44b702d04a334937126eb58fcd89085ff19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:06:26 +0200
Subject: [PATCH 4/9] kata-deploy: extract shared pod env/volumes into helm
 helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull the kata-deploy container's environment block and host
volume/volumeMount definitions out of the DaemonSet template into
reusable named templates in _helpers.tpl:

  - kata-deploy.commonEnv
  - kata-deploy.commonVolumeMounts
  - kata-deploy.commonVolumes

These are derived purely from chart values and are independent of the
deployment model, so they can be shared verbatim by upcoming per-node
install/cleanup Jobs without duplicating the (large) env wiring.

Pure refactor: the rendered DaemonSet is byte-for-byte identical to
before (verified via normalized `helm template` diff across default and
multiInstallSuffix/userDropIn/customRuntimes permutations).

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../kata-deploy/templates/_helpers.tpl        | 240 ++++++++++++++++++
 .../kata-deploy/templates/kata-deploy.yaml    | 215 +---------------
 2 files changed, 243 insertions(+), 212 deletions(-)

diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
index cd885522e0..dae7c0ca32 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
@@ -409,6 +409,246 @@ Get debug value from structured config
 {{- end -}}
 {{- end -}}
 
+{{/*
+Common environment variables for any pod that runs the kata-deploy binary
+(DaemonSet, staged JobSet install/cleanup Jobs, reconcile-created Jobs).
+
+These are all derived from chart values and are independent of the deployment
+model, so they are shared verbatim. HEALTH_PORT and the health probes are NOT
+included here: they only matter for the long-running install pod (DaemonSet),
+not the short-lived staged Jobs.
+
+Emitted at column 0; callers must indent with `nindent` to the right depth,
+e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`.
+*/}}
+{{- define "kata-deploy.commonEnv" -}}
+- name: NODE_NAME
+  valueFrom:
+    fieldRef:
+      fieldPath: spec.nodeName
+{{- if .Values.env.multiInstallSuffix }}
+- name: DAEMONSET_NAME
+  value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }}
+{{- else }}
+- name: DAEMONSET_NAME
+  value: {{ .Chart.Name | quote }}
+{{- end }}
+- name: DEBUG
+  value: {{ include "kata-deploy.getDebug" . | quote }}
+{{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $shimsAmd64 }}
+- name: SHIMS_X86_64
+  value: {{ $shimsAmd64 | quote }}
+{{- end }}
+{{- $shimsArm64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $shimsArm64 }}
+- name: SHIMS_AARCH64
+  value: {{ $shimsArm64 | quote }}
+{{- end }}
+{{- $shimsS390x := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $shimsS390x }}
+- name: SHIMS_S390X
+  value: {{ $shimsS390x | quote }}
+{{- end }}
+{{- $shimsPpc64le := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $shimsPpc64le }}
+- name: SHIMS_PPC64LE
+  value: {{ $shimsPpc64le | quote }}
+{{- end }}
+{{- $defaultShimAmd64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $defaultShimAmd64 }}
+- name: DEFAULT_SHIM_X86_64
+  value: {{ $defaultShimAmd64 | quote }}
+{{- end }}
+{{- $defaultShimArm64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $defaultShimArm64 }}
+- name: DEFAULT_SHIM_AARCH64
+  value: {{ $defaultShimArm64 | quote }}
+{{- end }}
+{{- $defaultShimS390x := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $defaultShimS390x }}
+- name: DEFAULT_SHIM_S390X
+  value: {{ $defaultShimS390x | quote }}
+{{- end }}
+{{- $defaultShimPpc64le := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $defaultShimPpc64le }}
+- name: DEFAULT_SHIM_PPC64LE
+  value: {{ $defaultShimPpc64le | quote }}
+{{- end }}
+{{- $allowedHypervisorAnnotationsAmd64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $allowedHypervisorAnnotationsAmd64 }}
+- name: ALLOWED_HYPERVISOR_ANNOTATIONS_X86_64
+  value: {{ $allowedHypervisorAnnotationsAmd64 | quote }}
+{{- end }}
+{{- $allowedHypervisorAnnotationsArm64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $allowedHypervisorAnnotationsArm64 }}
+- name: ALLOWED_HYPERVISOR_ANNOTATIONS_AARCH64
+  value: {{ $allowedHypervisorAnnotationsArm64 | quote }}
+{{- end }}
+{{- $allowedHypervisorAnnotationsS390x := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $allowedHypervisorAnnotationsS390x }}
+- name: ALLOWED_HYPERVISOR_ANNOTATIONS_S390X
+  value: {{ $allowedHypervisorAnnotationsS390x | quote }}
+{{- end }}
+{{- $allowedHypervisorAnnotationsPpc64le := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $allowedHypervisorAnnotationsPpc64le }}
+- name: ALLOWED_HYPERVISOR_ANNOTATIONS_PPC64LE
+  value: {{ $allowedHypervisorAnnotationsPpc64le | quote }}
+{{- end }}
+{{- $snapshotterHandlerMappingAmd64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $snapshotterHandlerMappingAmd64 }}
+- name: SNAPSHOTTER_HANDLER_MAPPING_X86_64
+  value: {{ $snapshotterHandlerMappingAmd64 | quote }}
+{{- end }}
+{{- $snapshotterHandlerMappingArm64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $snapshotterHandlerMappingArm64 }}
+- name: SNAPSHOTTER_HANDLER_MAPPING_AARCH64
+  value: {{ $snapshotterHandlerMappingArm64 | quote }}
+{{- end }}
+{{- $snapshotterHandlerMappingS390x := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $snapshotterHandlerMappingS390x }}
+- name: SNAPSHOTTER_HANDLER_MAPPING_S390X
+  value: {{ $snapshotterHandlerMappingS390x | quote }}
+{{- end }}
+{{- $snapshotterHandlerMappingPpc64le := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $snapshotterHandlerMappingPpc64le }}
+- name: SNAPSHOTTER_HANDLER_MAPPING_PPC64LE
+  value: {{ $snapshotterHandlerMappingPpc64le | quote }}
+{{- end }}
+{{- $agentHttpsProxy := include "kata-deploy.getAgentHttpsProxy" . | trim -}}
+{{- if $agentHttpsProxy }}
+- name: AGENT_HTTPS_PROXY
+  value: {{ $agentHttpsProxy | quote }}
+{{- end }}
+{{- $agentNoProxy := include "kata-deploy.getAgentNoProxy" . | trim -}}
+{{- if $agentNoProxy }}
+- name: AGENT_NO_PROXY
+  value: {{ $agentNoProxy | quote }}
+{{- end }}
+{{- $pullTypeMappingAmd64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $pullTypeMappingAmd64 }}
+- name: PULL_TYPE_MAPPING_X86_64
+  value: {{ $pullTypeMappingAmd64 | quote }}
+{{- end }}
+{{- $pullTypeMappingArm64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $pullTypeMappingArm64 }}
+- name: PULL_TYPE_MAPPING_AARCH64
+  value: {{ $pullTypeMappingArm64 | quote }}
+{{- end }}
+{{- $pullTypeMappingS390x := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $pullTypeMappingS390x }}
+- name: PULL_TYPE_MAPPING_S390X
+  value: {{ $pullTypeMappingS390x | quote }}
+{{- end }}
+{{- $pullTypeMappingPpc64le := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $pullTypeMappingPpc64le }}
+- name: PULL_TYPE_MAPPING_PPC64LE
+  value: {{ $pullTypeMappingPpc64le | quote }}
+{{- end }}
+- name: INSTALLATION_PREFIX
+  value: {{ .Values.env.installationPrefix | quote }}
+- name: MULTI_INSTALL_SUFFIX
+  value: {{ .Values.env.multiInstallSuffix | quote }}
+{{- $snapshotterSetup := include "kata-deploy.getSnapshotterSetup" . | trim -}}
+{{- if $snapshotterSetup }}
+- name: EXPERIMENTAL_SETUP_SNAPSHOTTER
+  value: {{ $snapshotterSetup | quote }}
+{{- end }}
+{{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}}
+{{- if $forceGuestPullAmd64 }}
+- name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64
+  value: {{ $forceGuestPullAmd64 | quote }}
+{{- end }}
+{{- $forceGuestPullArm64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "arm64") | trim -}}
+{{- if $forceGuestPullArm64 }}
+- name: EXPERIMENTAL_FORCE_GUEST_PULL_AARCH64
+  value: {{ $forceGuestPullArm64 | quote }}
+{{- end }}
+{{- $forceGuestPullS390x := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "s390x") | trim -}}
+{{- if $forceGuestPullS390x }}
+- name: EXPERIMENTAL_FORCE_GUEST_PULL_S390X
+  value: {{ $forceGuestPullS390x | quote }}
+{{- end }}
+{{- $forceGuestPullPpc64le := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "ppc64le") | trim -}}
+{{- if $forceGuestPullPpc64le }}
+- name: EXPERIMENTAL_FORCE_GUEST_PULL_PPC64LE
+  value: {{ $forceGuestPullPpc64le | quote }}
+{{- end }}
+{{- if .Values.containerd.configFileName | trim }}
+- name: CONTAINERD_CONFIG_FILE_NAME
+  value: {{ .Values.containerd.configFileName | trim | quote }}
+{{- end }}
+{{- if .Values.containerd.userDropIn | trim }}
+- name: CONTAINERD_USER_DROP_IN_SOURCE_FILE
+  value: "/custom-containerd-config/containerd-user-dropin.toml"
+{{- end }}
+{{- with .Values.env.hostOS }}
+- name: HOST_OS
+  value: {{ . | quote }}
+{{- end }}
+{{- if and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes }}
+- name: CUSTOM_RUNTIMES_ENABLED
+  value: "true"
+{{- end }}
+{{- end -}}
+
+{{/*
+Common volumeMounts for any pod that runs the kata-deploy binary against the
+host. Emitted at column 0; indent with `nindent` at the call site.
+*/}}
+{{- define "kata-deploy.commonVolumeMounts" -}}
+- name: crio-conf
+  mountPath: /etc/crio/
+- name: containerd-conf
+  mountPath: /etc/containerd/
+- name: host
+  mountPath: /host/
+{{- if .Values.containerd.userDropIn | trim }}
+- name: custom-containerd-config
+  mountPath: /custom-containerd-config/
+  readOnly: true
+{{- end }}
+{{- if or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . | trim) "true") }}
+- name: custom-configs
+  mountPath: /custom-configs/
+  readOnly: true
+{{- end }}
+{{- end -}}
+
+{{/*
+Common host/configMap volumes backing the mounts above. Emitted at column 0;
+indent with `nindent` at the call site.
+*/}}
+{{- define "kata-deploy.commonVolumes" -}}
+- name: crio-conf
+  hostPath:
+    path: /etc/crio/
+- name: containerd-conf
+  hostPath:
+    path: '{{- template "containerdConfPath" .Values }}'
+- name: host
+  hostPath:
+    path: /
+{{- if .Values.containerd.userDropIn | trim }}
+- name: custom-containerd-config
+  configMap:
+{{- if .Values.env.multiInstallSuffix }}
+    name: {{ .Chart.Name }}-containerd-user-dropin-{{ .Values.env.multiInstallSuffix }}
+{{- else }}
+    name: {{ .Chart.Name }}-containerd-user-dropin
+{{- end }}
+{{- end }}
+{{- if or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . | trim) "true") }}
+- name: custom-configs
+  configMap:
+{{- if .Values.env.multiInstallSuffix }}
+    name: {{ .Chart.Name }}-custom-configs-{{ .Values.env.multiInstallSuffix }}
+{{- else }}
+    name: {{ .Chart.Name }}-custom-configs
+{{- end }}
+{{- end }}
+{{- end -}}
+
 {{/*
 Get EXPERIMENTAL_FORCE_GUEST_PULL for a specific architecture from structured config
 Returns comma-separated list of shim names with forceGuestPull enabled
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
index e1a2614a64..ff02c34de6 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
@@ -18,7 +18,6 @@
 {{- end -}}
 {{- end -}}
 {{- end -}}
-{{- $hasCustomConfigs := or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . | trim) "true") -}}
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
@@ -153,174 +152,7 @@ spec:
         {{- end }}
         - install
         env:
-        - name: NODE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: spec.nodeName
-{{- if .Values.env.multiInstallSuffix }}
-        - name: DAEMONSET_NAME
-          value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }}
-{{- else }}
-        - name: DAEMONSET_NAME
-          value: {{ .Chart.Name | quote }}
-{{- end }}
-        - name: DEBUG
-          value: {{ include "kata-deploy.getDebug" . | quote }}
-        {{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $shimsAmd64 }}
-        - name: SHIMS_X86_64
-          value: {{ $shimsAmd64 | quote }}
-        {{- end }}
-        {{- $shimsArm64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $shimsArm64 }}
-        - name: SHIMS_AARCH64
-          value: {{ $shimsArm64 | quote }}
-        {{- end }}
-        {{- $shimsS390x := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $shimsS390x }}
-        - name: SHIMS_S390X
-          value: {{ $shimsS390x | quote }}
-        {{- end }}
-        {{- $shimsPpc64le := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $shimsPpc64le }}
-        - name: SHIMS_PPC64LE
-          value: {{ $shimsPpc64le | quote }}
-        {{- end }}
-        {{- $defaultShimAmd64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $defaultShimAmd64 }}
-        - name: DEFAULT_SHIM_X86_64
-          value: {{ $defaultShimAmd64 | quote }}
-        {{- end }}
-        {{- $defaultShimArm64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $defaultShimArm64 }}
-        - name: DEFAULT_SHIM_AARCH64
-          value: {{ $defaultShimArm64 | quote }}
-        {{- end }}
-        {{- $defaultShimS390x := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $defaultShimS390x }}
-        - name: DEFAULT_SHIM_S390X
-          value: {{ $defaultShimS390x | quote }}
-        {{- end }}
-        {{- $defaultShimPpc64le := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $defaultShimPpc64le }}
-        - name: DEFAULT_SHIM_PPC64LE
-          value: {{ $defaultShimPpc64le | quote }}
-        {{- end }}
-        {{- $allowedHypervisorAnnotationsAmd64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $allowedHypervisorAnnotationsAmd64 }}
-        - name: ALLOWED_HYPERVISOR_ANNOTATIONS_X86_64
-          value: {{ $allowedHypervisorAnnotationsAmd64 | quote }}
-        {{- end }}
-        {{- $allowedHypervisorAnnotationsArm64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $allowedHypervisorAnnotationsArm64 }}
-        - name: ALLOWED_HYPERVISOR_ANNOTATIONS_AARCH64
-          value: {{ $allowedHypervisorAnnotationsArm64 | quote }}
-        {{- end }}
-        {{- $allowedHypervisorAnnotationsS390x := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $allowedHypervisorAnnotationsS390x }}
-        - name: ALLOWED_HYPERVISOR_ANNOTATIONS_S390X
-          value: {{ $allowedHypervisorAnnotationsS390x | quote }}
-        {{- end }}
-        {{- $allowedHypervisorAnnotationsPpc64le := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $allowedHypervisorAnnotationsPpc64le }}
-        - name: ALLOWED_HYPERVISOR_ANNOTATIONS_PPC64LE
-          value: {{ $allowedHypervisorAnnotationsPpc64le | quote }}
-        {{- end }}
-        {{- $snapshotterHandlerMappingAmd64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $snapshotterHandlerMappingAmd64 }}
-        - name: SNAPSHOTTER_HANDLER_MAPPING_X86_64
-          value: {{ $snapshotterHandlerMappingAmd64 | quote }}
-        {{- end }}
-        {{- $snapshotterHandlerMappingArm64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $snapshotterHandlerMappingArm64 }}
-        - name: SNAPSHOTTER_HANDLER_MAPPING_AARCH64
-          value: {{ $snapshotterHandlerMappingArm64 | quote }}
-        {{- end }}
-        {{- $snapshotterHandlerMappingS390x := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $snapshotterHandlerMappingS390x }}
-        - name: SNAPSHOTTER_HANDLER_MAPPING_S390X
-          value: {{ $snapshotterHandlerMappingS390x | quote }}
-        {{- end }}
-        {{- $snapshotterHandlerMappingPpc64le := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $snapshotterHandlerMappingPpc64le }}
-        - name: SNAPSHOTTER_HANDLER_MAPPING_PPC64LE
-          value: {{ $snapshotterHandlerMappingPpc64le | quote }}
-        {{- end }}
-        {{- $agentHttpsProxy := include "kata-deploy.getAgentHttpsProxy" . | trim -}}
-        {{- if $agentHttpsProxy }}
-        - name: AGENT_HTTPS_PROXY
-          value: {{ $agentHttpsProxy | quote }}
-        {{- end }}
-        {{- $agentNoProxy := include "kata-deploy.getAgentNoProxy" . | trim -}}
-        {{- if $agentNoProxy }}
-        - name: AGENT_NO_PROXY
-          value: {{ $agentNoProxy | quote }}
-        {{- end }}
-        {{- $pullTypeMappingAmd64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $pullTypeMappingAmd64 }}
-        - name: PULL_TYPE_MAPPING_X86_64
-          value: {{ $pullTypeMappingAmd64 | quote }}
-        {{- end }}
-        {{- $pullTypeMappingArm64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $pullTypeMappingArm64 }}
-        - name: PULL_TYPE_MAPPING_AARCH64
-          value: {{ $pullTypeMappingArm64 | quote }}
-        {{- end }}
-        {{- $pullTypeMappingS390x := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $pullTypeMappingS390x }}
-        - name: PULL_TYPE_MAPPING_S390X
-          value: {{ $pullTypeMappingS390x | quote }}
-        {{- end }}
-        {{- $pullTypeMappingPpc64le := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $pullTypeMappingPpc64le }}
-        - name: PULL_TYPE_MAPPING_PPC64LE
-          value: {{ $pullTypeMappingPpc64le | quote }}
-        {{- end }}
-        - name: INSTALLATION_PREFIX
-          value: {{ .Values.env.installationPrefix | quote }}
-        - name: MULTI_INSTALL_SUFFIX
-          value: {{ .Values.env.multiInstallSuffix | quote }}
-        {{- $snapshotterSetup := include "kata-deploy.getSnapshotterSetup" . | trim -}}
-        {{- if $snapshotterSetup }}
-        - name: EXPERIMENTAL_SETUP_SNAPSHOTTER
-          value: {{ $snapshotterSetup | quote }}
-        {{- end }}
-        {{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}}
-        {{- if $forceGuestPullAmd64 }}
-        - name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64
-          value: {{ $forceGuestPullAmd64 | quote }}
-        {{- end }}
-        {{- $forceGuestPullArm64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "arm64") | trim -}}
-        {{- if $forceGuestPullArm64 }}
-        - name: EXPERIMENTAL_FORCE_GUEST_PULL_AARCH64
-          value: {{ $forceGuestPullArm64 | quote }}
-        {{- end }}
-        {{- $forceGuestPullS390x := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "s390x") | trim -}}
-        {{- if $forceGuestPullS390x }}
-        - name: EXPERIMENTAL_FORCE_GUEST_PULL_S390X
-          value: {{ $forceGuestPullS390x | quote }}
-        {{- end }}
-        {{- $forceGuestPullPpc64le := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "ppc64le") | trim -}}
-        {{- if $forceGuestPullPpc64le }}
-        - name: EXPERIMENTAL_FORCE_GUEST_PULL_PPC64LE
-          value: {{ $forceGuestPullPpc64le | quote }}
-        {{- end }}
-{{- if .Values.containerd.configFileName | trim }}
-        - name: CONTAINERD_CONFIG_FILE_NAME
-          value: {{ .Values.containerd.configFileName | trim | quote }}
-{{- end }}
-{{- if .Values.containerd.userDropIn | trim }}
-        - name: CONTAINERD_USER_DROP_IN_SOURCE_FILE
-          value: "/custom-containerd-config/containerd-user-dropin.toml"
-{{- end }}
-{{- with .Values.env.hostOS }}
-        - name: HOST_OS
-          value: {{ . | quote }}
-{{- end }}
-{{- if and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes }}
-        - name: CUSTOM_RUNTIMES_ENABLED
-          value: "true"
-{{- end }}
+{{- include "kata-deploy.commonEnv" . | nindent 8 }}
 {{- $healthDefaults := dict
     "port" 8090
     "startupProbe" (dict "enabled" true "initialDelaySeconds" 1 "periodSeconds" 10 "failureThreshold" 60 "timeoutSeconds" 3)
@@ -365,50 +197,9 @@ spec:
 {{- toYaml . | nindent 10 }}
 {{- end }}
         volumeMounts:
-        - name: crio-conf
-          mountPath: /etc/crio/
-        - name: containerd-conf
-          mountPath: /etc/containerd/
-        - name: host
-          mountPath: /host/
-{{- if .Values.containerd.userDropIn | trim }}
-        - name: custom-containerd-config
-          mountPath: /custom-containerd-config/
-          readOnly: true
-{{- end }}
-{{- if $hasCustomConfigs }}
-        - name: custom-configs
-          mountPath: /custom-configs/
-          readOnly: true
-{{- end }}
+{{- include "kata-deploy.commonVolumeMounts" . | nindent 8 }}
       volumes:
-      - name: crio-conf
-        hostPath:
-          path: /etc/crio/
-      - name: containerd-conf
-        hostPath:
-          path: '{{- template "containerdConfPath" .Values }}'
-      - name: host
-        hostPath:
-          path: /
-{{- if .Values.containerd.userDropIn | trim }}
-      - name: custom-containerd-config
-        configMap:
-{{- if .Values.env.multiInstallSuffix }}
-          name: {{ .Chart.Name }}-containerd-user-dropin-{{ .Values.env.multiInstallSuffix }}
-{{- else }}
-          name: {{ .Chart.Name }}-containerd-user-dropin
-{{- end }}
-{{- end }}
-{{- if $hasCustomConfigs }}
-      - name: custom-configs
-        configMap:
-{{- if .Values.env.multiInstallSuffix }}
-          name: {{ .Chart.Name }}-custom-configs-{{ .Values.env.multiInstallSuffix }}
-{{- else }}
-          name: {{ .Chart.Name }}-custom-configs
-{{- end }}
-{{- end }}
+{{- include "kata-deploy.commonVolumes" . | nindent 6 }}
 {{- with .Values.updateStrategy }}
   updateStrategy:
     {{- toYaml . | nindent 4 }}

From 54878fa373d845d616c17497086b93a8ea215411 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:06:55 +0200
Subject: [PATCH 5/9] kata-deploy: add job deployment mode driven by the
 job-dispatcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2 of the DaemonSet -> staged-Job migration: add an opt-in
`deploymentMode: job` that installs Kata via short-lived, per-node
install Jobs instead of the long-running DaemonSet. The DaemonSet remains
the default and is now gated behind `deploymentMode == daemonset`.

Rather than render one Job per node into the Helm release (which grows
the release secret O(nodes) and offers no rollout pacing), job mode ships
a single tiny post-install/post-upgrade hook Job that runs the
kata-deploy-job-dispatcher. The dispatcher enumerates the selected nodes
LIVE from the API server and stamps out one node-pinned install Job per
node from a constant-size ConfigMap of Job templates, keeping at most
`job.parallelism` in flight and refilling as they finish. This guarantees
per-node coverage with a paced rollout while the Helm release stays O(1)
regardless of fleet size. New nodes are picked up by re-running
`helm upgrade`; there is no always-on component.

Each per-node Job runs the staged install pipeline as ordered
initContainers and exits:

  host-check -> artifacts -> cri   (initContainers, run sequentially)
  label                            (main container)

The privilege split is explicit: the dispatcher pod is a pure
control-plane client (lists nodes, manages Jobs in its own namespace) and
runs fully unprivileged under a dedicated, least-privilege ServiceAccount
(kata-rbac.yaml); only the per-node Jobs it creates carry the privileged
kata-deploy host-mutation rights.

Node selection (templates/_helpers.tpl: nodeLabelSelector / perNodeJob):
  - job.nodes: explicit node-name list passed to the dispatcher; when empty,
  - job.nodeSelector (equality map) ANDed with
  - job.nodeSelectorExpressions (k8s label-selector requirements:
    In / NotIn / Exists / DoesNotExist),
compiled into a single label-selector string the dispatcher resolves
live. The default expressions target worker (non-control-plane) nodes, so
no custom node labeling is required; set the expressions to [] to target
all discovered nodes.

Reuses the commonEnv/commonVolume* helpers and adds the stageContainer,
serviceAccountName, dispatcherServiceAccountName, dispatcherImage and
perNodeJob helpers shared by the dispatcher and the staged Jobs. The
default (daemonset) render is unchanged.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../kata-deploy/lib/helm-deploy.bash          |  13 ++
 tests/gha-run-k8s-common.sh                   |  13 ++
 .../kata-deploy/templates/_helpers.tpl        | 175 ++++++++++++++++++
 .../templates/kata-deploy-install-job.yaml    | 113 +++++++++++
 .../templates/kata-deploy-job-templates.yaml  |  33 ++++
 .../kata-deploy/templates/kata-deploy.yaml    |   2 +
 .../kata-deploy/templates/kata-rbac.yaml      |  62 +++++++
 .../helm-chart/kata-deploy/values.yaml        | 103 +++++++++++
 ...kata-deploy-build-and-upload-helm-chart.sh |  13 ++
 9 files changed, 527 insertions(+)
 create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml
 create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml

diff --git a/tests/functional/kata-deploy/lib/helm-deploy.bash b/tests/functional/kata-deploy/lib/helm-deploy.bash
index 463bd1648b..73d15dae94 100644
--- a/tests/functional/kata-deploy/lib/helm-deploy.bash
+++ b/tests/functional/kata-deploy/lib/helm-deploy.bash
@@ -31,11 +31,24 @@ generate_base_values() {
 	local output_file="$1"
 	local extra_values_file="${2:-}"
 
+	local kata_deploy_image="${DOCKER_REGISTRY}/${DOCKER_REPO}"
+	local dispatcher_image
+	if [[ "${kata_deploy_image}" == *-ci ]]; then
+		dispatcher_image="${kata_deploy_image%-ci}-job-dispatcher-ci"
+	else
+		dispatcher_image="${kata_deploy_image}-job-dispatcher"
+	fi
+
 	cat > "${output_file}" <<EOF
 image:
   reference: ${DOCKER_REGISTRY}/${DOCKER_REPO}
   tag: ${DOCKER_TAG}
 
+job:
+  dispatcherImage:
+    reference: ${dispatcher_image}
+    tag: ${DOCKER_TAG}
+
 k8sDistribution: "${KUBERNETES}"
 debug: true
 
diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh
index 8fc5b68007..c23f014573 100644
--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -732,6 +732,19 @@ function helm_helper() {
 	fi
 	yq -i ".image.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"
 
+	# Derive the dispatcher image name from the main kata-deploy image,
+	# mirroring the -ci/non-ci logic used by the build/release scripts: the
+	# dispatcher lives at "<base>-job-dispatcher", with the "-ci" suffix (if
+	# any) kept at the very end (e.g. kata-deploy-ci -> kata-deploy-job-dispatcher-ci).
+	local dispatcher_reference
+	if [[ "${HELM_IMAGE_REFERENCE}" == *-ci ]]; then
+		dispatcher_reference="${HELM_IMAGE_REFERENCE%-ci}-job-dispatcher-ci"
+	else
+		dispatcher_reference="${HELM_IMAGE_REFERENCE}-job-dispatcher"
+	fi
+	yq -i ".job.dispatcherImage.reference = \"${dispatcher_reference}\"" "${values_yaml}"
+	yq -i ".job.dispatcherImage.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"
+
 	[[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}"
 
 	if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
index dae7c0ca32..457cb00ab6 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
@@ -391,6 +391,21 @@ reference:tag (tag defaults to Chart.AppVersion).
 {{- end -}}
 {{- end -}}
 
+{{/*
+Dispatcher image reference for the job-mode dispatcher (kata-deploy-job-dispatcher).
+Supports tag (reference:tag) and digest (reference@sha256:...) formats; tag
+defaults to Chart.AppVersion.
+*/}}
+{{- define "kata-deploy.dispatcherImage" -}}
+{{- $ref := .Values.job.dispatcherImage.reference -}}
+{{- $tag := default .Chart.AppVersion .Values.job.dispatcherImage.tag | toString -}}
+{{- if contains "@" $ref -}}
+{{- $ref -}}
+{{- else -}}
+{{- printf "%s:%s" $ref $tag -}}
+{{- end -}}
+{{- end -}}
+
 {{/*
 Get snapshotter setup list from structured config
 */}}
@@ -592,6 +607,166 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`.
 {{- end }}
 {{- end -}}
 
+{{/*
+Build a Kubernetes label-selector STRING (the form accepted by the apiserver
+and `kubectl --selector`) from an equality map plus a list of match-expression
+requirements. This is handed to `kata-deploy-job-dispatcher --node-selector`, which
+resolves the actual target nodes LIVE at run time (so node membership is never
+frozen into the Helm release).
+
+Arguments (dict):
+  eq    - equality label map           -> "k=v" (bool/number values are stringified)
+  exprs - list of {key, operator, values}:
+            Exists       -> "key"
+            DoesNotExist -> "!key"
+            In           -> "key in (v1,v2)"
+            NotIn        -> "key notin (v1,v2)"
+
+Returns the comma-joined selector string (possibly empty, meaning "all nodes").
+*/}}
+{{- define "kata-deploy.nodeLabelSelector" -}}
+{{- $parts := list -}}
+{{- range $k, $v := (.eq | default dict) -}}
+{{- $parts = append $parts (printf "%s=%v" $k $v) -}}
+{{- end -}}
+{{- range $expr := (.exprs | default list) -}}
+{{- $op := $expr.operator -}}
+{{- if eq $op "Exists" -}}
+{{- $parts = append $parts $expr.key -}}
+{{- else if eq $op "DoesNotExist" -}}
+{{- $parts = append $parts (printf "!%s" $expr.key) -}}
+{{- else if eq $op "In" -}}
+{{- $parts = append $parts (printf "%s in (%s)" $expr.key (join "," ($expr.values | default list))) -}}
+{{- else if eq $op "NotIn" -}}
+{{- $parts = append $parts (printf "%s notin (%s)" $expr.key (join "," ($expr.values | default list))) -}}
+{{- else -}}
+{{- fail (printf "nodeSelectorExpressions: unsupported operator %q for key %q (use In, NotIn, Exists, DoesNotExist)" $op $expr.key) -}}
+{{- end -}}
+{{- end -}}
+{{- join "," $parts -}}
+{{- end -}}
+
+{{/*
+Per-node staged Job manifest (deploymentMode: job), embedded verbatim into the
+job-templates ConfigMap. The dispatcher (kata-deploy-job-dispatcher) clones this once per
+target node, injecting metadata.name + spec.template.spec.nodeName, so the
+template itself carries NO node identity and NO Helm hook annotations.
+
+Arguments (dict):
+  root  - top-level context (.)
+  stage - "install" | "cleanup"
+
+install pipeline:  host-check -> artifacts -> cri (initContainers) ; label (main)
+cleanup pipeline:  unlabel -> revert-cri    (initContainers) ; remove-artifacts (main)
+
+Emitted at column 0 (a standalone Job document); embed with `indent` at the call
+site under a ConfigMap data key.
+*/}}
+{{- define "kata-deploy.perNodeJob" -}}
+{{- $root := .root -}}
+{{- $stage := .stage -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+    app.kubernetes.io/instance: {{ $root.Release.Name }}
+    kata-deploy/stage: {{ $stage }}
+spec:
+  backoffLimit: {{ $root.Values.job.backoffLimit }}
+  ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+        app.kubernetes.io/instance: {{ $root.Release.Name }}
+        kata-deploy/stage: {{ $stage }}
+    spec:
+{{- with $root.Values.imagePullSecrets }}
+      imagePullSecrets:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+      serviceAccountName: {{ include "kata-deploy.serviceAccountName" $root }}
+      restartPolicy: Never
+      hostPID: true
+{{- with $root.Values.tolerations }}
+      tolerations:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with $root.Values.priorityClassName }}
+      priorityClassName: {{ . | quote }}
+{{- end }}
+{{- if eq $stage "install" }}
+      initContainers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "host-check" "action" "install-stage-host-check" "privileged" true "mountHost" true) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "artifacts" "action" "install-stage-artifacts" "privileged" true "mountHost" true) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "cri" "action" "install-stage-cri" "privileged" true "mountHost" true) | nindent 8 }}
+      containers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "label" "action" "install-stage-label" "privileged" false "mountHost" false) | nindent 8 }}
+{{- else }}
+      initContainers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "unlabel" "action" "cleanup-stage-unlabel" "privileged" false "mountHost" false) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "revert-cri" "action" "cleanup-stage-revert-cri" "privileged" true "mountHost" true) | nindent 8 }}
+      containers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "remove-artifacts" "action" "cleanup-stage-remove-artifacts" "privileged" true "mountHost" true) | nindent 8 }}
+{{- end }}
+      volumes:
+{{- include "kata-deploy.commonVolumes" $root | nindent 8 }}
+{{- end -}}
+
+{{/*
+Service account name (honoring multiInstallSuffix), shared by all kata-deploy
+workloads (DaemonSet and staged Jobs).
+*/}}
+{{- define "kata-deploy.serviceAccountName" -}}
+{{- if .Values.env.multiInstallSuffix -}}
+{{ .Chart.Name }}-sa-{{ .Values.env.multiInstallSuffix }}
+{{- else -}}
+{{ .Chart.Name }}-sa
+{{- end -}}
+{{- end -}}
+
+{{/*
+ServiceAccount name for the job-mode dispatcher (kata-deploy-job-dispatcher). Separate from
+kata-deploy.serviceAccountName: the dispatcher is a pure API client (list nodes,
+manage Jobs) and must NOT carry the privileged kata-deploy host-mutation rights.
+*/}}
+{{- define "kata-deploy.dispatcherServiceAccountName" -}}
+{{- if .Values.env.multiInstallSuffix -}}
+{{ .Chart.Name }}-dispatcher-sa-{{ .Values.env.multiInstallSuffix }}
+{{- else -}}
+{{ .Chart.Name }}-dispatcher-sa
+{{- end -}}
+{{- end -}}
+
+{{/*
+Render a single staged-pipeline container that runs one kata-deploy stage action.
+Used by the per-node staged install/cleanup Jobs (deploymentMode: job).
+
+Arguments (dict):
+  root        - the top-level context (.)
+  name        - container name
+  action      - kata-deploy subcommand (e.g. install-stage-cri)
+  privileged  - bool, whether the container runs privileged (host nsenter/restart)
+  mountHost   - bool, whether to mount the host paths (crio/containerd/host)
+
+Emitted at column 0; indent with `nindent` at the call site.
+*/}}
+{{- define "kata-deploy.stageContainer" -}}
+- name: {{ .name }}
+  image: {{ include "kata-deploy.image" .root }}
+  imagePullPolicy: {{ .root.Values.imagePullPolicy }}
+  command: ["/usr/bin/kata-deploy", "{{ .action }}"]
+  env:
+{{- include "kata-deploy.commonEnv" .root | nindent 4 }}
+  securityContext:
+    privileged: {{ .privileged }}
+{{- if .mountHost }}
+  volumeMounts:
+{{- include "kata-deploy.commonVolumeMounts" .root | nindent 4 }}
+{{- end }}
+{{- end -}}
+
 {{/*
 Common volumeMounts for any pod that runs the kata-deploy binary against the
 host. Emitted at column 0; indent with `nindent` at the call site.
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml
new file mode 100644
index 0000000000..ff8e97f3fb
--- /dev/null
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml
@@ -0,0 +1,113 @@
+{{- /*
+Install dispatcher (deploymentMode: job).
+
+A single, tiny post-install/post-upgrade hook Job that runs the dispatcher
+(kata-deploy-job-dispatcher). The dispatcher enumerates the selected nodes LIVE, then
+creates one node-pinned install Job per node from the job-templates ConfigMap,
+keeping at most job.parallelism in flight and refilling as they finish. This
+guarantees one install per node (coverage) with a paced rollout, while the Helm
+release stays O(1) regardless of fleet size.
+
+Each per-node Job runs the staged pipeline as ordered initContainers and exits:
+
+  host-check -> artifacts -> cri   (initContainers, run sequentially)
+  label                            (main container)
+
+Helm waits only on THIS dispatcher Job (the verification hook runs at a higher
+weight, after it). before-hook-creation lets `helm upgrade` re-run the dispatcher,
+which re-enumerates nodes (idempotent stages skip already-installed nodes and
+pick up newly added ones).
+*/ -}}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $root := . }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}
+{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}
+{{- end }}
+{{- $sa := include "kata-deploy.dispatcherServiceAccountName" . }}
+{{- $dispatcherName := printf "%s-install-dispatcher" $base | trunc 63 | trimSuffix "-" }}
+{{- $nodes := .Values.job.nodes | default list }}
+{{- $selector := include "kata-deploy.nodeLabelSelector" (dict "eq" (.Values.job.nodeSelector | default dict) "exprs" (.Values.job.nodeSelectorExpressions | default list)) }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ $dispatcherName }}
+  namespace: {{ $root.Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+    app.kubernetes.io/instance: {{ $root.Release.Name }}
+    kata-deploy/dispatcher: install
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation
+spec:
+  # Per-node retries are budgeted on the per-node Jobs (job.backoffLimit); a dispatcher
+  # failure means "some node failed" and should surface, not be retried blindly.
+  backoffLimit: 0
+  ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+        app.kubernetes.io/instance: {{ $root.Release.Name }}
+        kata-deploy/dispatcher: install
+    spec:
+{{- with $root.Values.imagePullSecrets }}
+      imagePullSecrets:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+      serviceAccountName: {{ $sa }}
+      restartPolicy: Never
+      # The dispatcher never touches the host; it is a plain API client. Lock the
+      # pod down so a compromise cannot escalate beyond its (minimal) API rights.
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
+{{- with $root.Values.tolerations }}
+      tolerations:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with $root.Values.priorityClassName }}
+      priorityClassName: {{ . | quote }}
+{{- end }}
+      containers:
+        - name: dispatcher
+          image: {{ include "kata-deploy.dispatcherImage" $root }}
+          imagePullPolicy: {{ $root.Values.imagePullPolicy }}
+          securityContext:
+            privileged: false
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          command:
+            - /usr/bin/kata-deploy-job-dispatcher
+            - "--job-template=/etc/kata-job/install-job.yaml"
+            - "--name-prefix={{ $base }}-install"
+            - "--owner-job-name={{ $dispatcherName }}"
+            - "--parallelism={{ $root.Values.job.parallelism }}"
+{{- if $nodes }}
+            - "--nodes={{ join "," $nodes }}"
+{{- else if $selector }}
+            - "--node-selector={{ $selector }}"
+{{- end }}
+          env:
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          volumeMounts:
+            - name: job-templates
+              mountPath: /etc/kata-job
+              readOnly: true
+      volumes:
+        - name: job-templates
+          configMap:
+            name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }}
+{{- end }}
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml
new file mode 100644
index 0000000000..4d455f0763
--- /dev/null
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml
@@ -0,0 +1,33 @@
+{{- /*
+Per-node Job templates for deploymentMode: job.
+
+This ConfigMap holds the install and cleanup per-node Job manifests, rendered
+ONCE (constant size, independent of the number of nodes). The job-mode dispatcher
+(kata-deploy-job-dispatcher) mounts it, and for every selected node clones the relevant
+template, injects metadata.name + spec.template.spec.nodeName, and creates the
+Job. Keeping the rich pod spec (env/volumes/shim config) here means the Helm
+chart stays the single source of truth; the dispatcher only does fan-out.
+
+It is a normal (non-hook) resource: Helm creates it before the post-install
+dispatcher hook runs, and it still exists during the pre-delete cleanup hook
+(release resources are torn down only after pre-delete hooks complete).
+*/ -}}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}
+{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}
+{{- end }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+data:
+  install-job.yaml: |
+{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "install") | indent 4 }}
+  cleanup-job.yaml: |
+{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "cleanup") | indent 4 }}
+{{- end }}
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
index ff02c34de6..17ff5bd183 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
@@ -1,3 +1,4 @@
+{{- if eq (.Values.deploymentMode | default "daemonset") "daemonset" -}}
 {{- if index .Values "node-feature-discovery" "enabled" -}}
 {{- $existingNFDNamespace := include "kata-deploy.detectExistingNFD" . | trim -}}
 {{- if $existingNFDNamespace -}}
@@ -204,3 +205,4 @@ spec:
   updateStrategy:
     {{- toYaml . | nindent 4 }}
 {{- end}}
+{{- end -}}
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml
index 863b037c51..0f66e45a4a 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml
@@ -65,6 +65,68 @@ subjects:
   name: {{ .Chart.Name }}-sa
 {{- end }}
   namespace: {{ .Release.Namespace }}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+---
+# Dedicated, least-privilege identity for the job-mode dispatcher
+# (kata-deploy-job-dispatcher). It is a pure control-plane client: it lists nodes
+# (cluster-scoped) and manages per-node Jobs in the release namespace
+# (namespace-scoped). It deliberately does NOT get the privileged kata-deploy
+# host-mutation rights (node patch, runtimeclasses, NFD, etc.); those stay on
+# kata-deploy-sa, which only the per-node Jobs use.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+rules:
+# Enumerating nodes is inherently cluster-scoped.
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["list"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-noderb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+---
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+  namespace: {{ .Release.Namespace }}
+rules:
+# The dispatcher only ever creates/watches/GCs per-node Jobs in its own namespace.
+- apiGroups: ["batch"]
+  resources: ["jobs"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+---
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-rb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+{{- end }}
 ---
 # ServiceAccount and RBAC for the post-delete Job that removes the kept RBAC above.
 # Created as post-delete hooks with lower weight than the Job so they exist when the Job runs.
diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
index 7b0e9fa74a..120bfd5027 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
@@ -1,3 +1,106 @@
+# Deployment model for installing/cleaning up Kata on nodes.
+#   daemonset: (default) the long-running kata-deploy DaemonSet installs Kata on
+#              every matching node and reverts it on pod termination (uninstall).
+#   job:       no always-on component. A tiny dispatcher Job
+#              (kata-deploy-job-dispatcher) runs as a post-install/upgrade hook, enumerates
+#              the selected nodes LIVE, and creates one node-pinned install Job
+#              per node - paced to job.parallelism and guaranteeing one install
+#              per node. Each per-node Job runs the staged pipeline as ordered
+#              initContainers and exits. Uninstall works the same way via a
+#              pre-delete dispatcher (reverse pipeline).
+#
+# Why a dispatcher instead of rendering per-node Jobs in the chart: Helm stores
+# the whole rendered release in one Secret (capped at ~1 MiB) and runs hook resources
+# sequentially, and neither an Indexed Job nor a JobSet can guarantee one pod
+# per node once parallelism < node-count (the scheduler ignores completed pods
+# when balancing spread). The dispatcher keeps the release O(1), enumerates nodes
+# at run time, and paces a guaranteed-coverage rollout with built-in Jobs only.
+#
+# NOTE on "job" mode and new nodes:
+#   The dispatcher only runs on `helm install` / `helm upgrade` / `helm uninstall`.
+#   When you add nodes later, re-run `helm upgrade` so the dispatcher enumerates
+#   and installs the new nodes (the staged actions are idempotent, so already-
+#   installed nodes are skipped). This is intentional: it avoids an always-on
+#   privileged component on every node.
+deploymentMode: daemonset  # daemonset | job
+
+# Settings specific to deploymentMode: job
+job:
+  # Image for the dispatcher, which fans out per-node Jobs. It only talks to
+  # the Kubernetes API (lists nodes, creates/watches Jobs); it never touches the
+  # host. Supports reference:tag or reference@sha256:digest; tag defaults to the
+  # chart appVersion.
+  dispatcherImage:
+    reference: quay.io/kata-containers/kata-deploy-job-dispatcher
+    tag: ""
+  # Maximum number of nodes processed concurrently (the dispatcher keeps at most
+  # this many per-node Jobs in flight, refilling as they finish). Lower it to
+  # pace the rollout (e.g. limit how many CRI runtimes restart at once on a big
+  # fleet); raise it to install faster. Effectively capped at the node count.
+  parallelism: 100
+  # How to choose which nodes get a per-node INSTALL Job. Precedence:
+  #   1. job.nodes (explicit list of node names) - if non-empty, used verbatim
+  #      (passed to the dispatcher as --nodes).
+  #   2. otherwise a label selector built from job.nodeSelector (equality) ANDed
+  #      with job.nodeSelectorExpressions (In/NotIn/Exists/DoesNotExist) is
+  #      passed to the dispatcher, which resolves matching nodes LIVE at run time.
+  #   3. if both are empty, ALL nodes are targeted.
+  #
+  # DEFAULT: target worker (non-control-plane) nodes, so no custom labeling is
+  # required. Override these freely:
+  #   - Target nodes with a specific label:
+  #       job:
+  #         nodeSelector: { kata-containers: "enabled" }
+  #   - Target every node (including control-plane), e.g. single-node clusters/CI:
+  #       job:
+  #         nodeSelectorExpressions: []
+  #   - Richer expressions:
+  #       job:
+  #         nodeSelectorExpressions:
+  #           - { key: kubernetes.io/os, operator: In, values: ["linux"] }
+  #           - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist }
+  #   - Pin to explicit nodes:
+  #       job:
+  #         nodes: ["worker-1", "worker-2"]
+  nodes: []
+  # Equality label selector (ANDed with nodeSelectorExpressions). Ignored when
+  # job.nodes is set. Empty by default.
+  nodeSelector: {}
+  # Kubernetes-style label selector requirements (ANDed with nodeSelector).
+  # Each entry: { key, operator, values }. operator is one of:
+  #   In | NotIn (values required) | Exists | DoesNotExist (values must be empty).
+  # Default selects nodes that are NOT control-plane/master (i.e. worker nodes).
+  # Set to [] to disable role filtering and target all discovered nodes.
+  nodeSelectorExpressions:
+    - key: node-role.kubernetes.io/control-plane
+      operator: DoesNotExist
+    - key: node-role.kubernetes.io/master
+      operator: DoesNotExist
+  # Node selection for the UNINSTALL (pre-delete hook) dispatcher. Same precedence
+  # and semantics as install (cleanup.nodes, else cleanup.nodeSelector ANDed with
+  # cleanup.nodeSelectorExpressions, else all nodes).
+  #
+  # The cleanup dispatcher resolves nodes LIVE when it runs at `helm uninstall`
+  # (the dispatcher does the lookup), so - unlike a frozen Helm-rendered hook -
+  # the DEFAULT below can safely be "nodes carrying katacontainers.io/kata-runtime",
+  # i.e. exactly the nodes install actually labeled. Override to clean a
+  # different set, e.g.:
+  #   job:
+  #     cleanup:
+  #       nodes: ["worker-1"]
+  cleanup:
+    nodes: []
+    nodeSelector: {}
+    nodeSelectorExpressions:
+      - key: katacontainers.io/kata-runtime
+        operator: Exists
+  # How long finished per-node Jobs are retained before automatic garbage
+  # collection (seconds). Applies to both install and cleanup per-node Jobs.
+  ttlSecondsAfterFinished: 600
+  # Per-node retry budget: retries for a single node's Job before it is marked
+  # failed. One node failing never aborts the others.
+  backoffLimit: 3
+
 imagePullPolicy: Always
 
 imagePullSecrets: []
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh
index 337a29291b..2897bcdd46 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh
@@ -23,8 +23,21 @@ tmp="$(mktemp -d)"
 trap '[[ -n "${KEEP_TMPDIR}" ]] && echo "kept: ${tmp}" || rm -rf "${tmp}"' EXIT
 
 cp -r "${CHART_SRC}" "${tmp}/"
+
+# Job-mode dispatcher image. Its repo mirrors the kata-deploy repo with
+# "-job-dispatcher" inserted before any "-ci" suffix (so the "-ci" stays last):
+#   .../kata-deploy     -> .../kata-deploy-job-dispatcher
+#   .../kata-deploy-ci  -> .../kata-deploy-job-dispatcher-ci
+# It is built and pushed with the same tag by kata-deploy-build-and-upload-payload.sh.
+if [[ "${REGISTRY}" == *-ci ]]; then
+	JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY%-ci}-job-dispatcher-ci"}"
+else
+	JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY}-job-dispatcher"}"
+fi
+
 yq eval ".version = \"${CHART_VERSION}\" | .appVersion = \"${CHART_VERSION}\"" -i "${tmp}/kata-deploy/Chart.yaml"
 yq eval ".image.reference = \"${REGISTRY}\" | .image.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml"
+yq eval ".job.dispatcherImage.reference = \"${JOB_DISPATCHER_IMAGE_REFERENCE}\" | .job.dispatcherImage.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml"
 helm dependencies update "${tmp}/kata-deploy"
 helm package "${tmp}/kata-deploy" -d "${tmp}"
 helm push "${tmp}/kata-deploy-${CHART_VERSION}.tgz" "oci://${CHART_REGISTRY}"

From 3d732986d2f8eb783950292767576a88762ba3cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:07:15 +0200
Subject: [PATCH 6/9] kata-deploy: add per-node staged cleanup for job mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the uninstall counterpart to the install dispatcher for
deploymentMode: job. On `helm uninstall`, a single pre-delete hook Job
runs the kata-deploy-job-dispatcher, which enumerates the targeted nodes
live and fans out one node-pinned cleanup Job per node that runs the
install pipeline in reverse and exits:

  unlabel -> revert-cri   (initContainers, run sequentially)
  remove-artifacts        (main container)

Running as a pre-delete hook means the dispatcher ServiceAccount/RBAC and
the kata-deploy host-mutation RBAC still exist while the Jobs run, so the
unlabel stage retains node get/patch access. revert-cri and
remove-artifacts are host-only operations (privileged nsenter / host
mount) and need no extra cluster RBAC.

Ordering mirrors install in reverse: unlabel first so the scheduler stops
placing kata workloads here, then revert the CRI config + restart the
runtime, then remove the on-host artifacts. Each stage is idempotent and
skips when already undone, so partially-installed nodes and re-runs are
safe.

Uninstall node selection is deliberately SEPARATE from install (a
dedicated job.cleanup.* block) and defaults to every node carrying the
katacontainers.io/kata-runtime label (set by the install label stage)
rather than re-evaluating the install selector. Because the cleanup
dispatcher resolves nodes live when it runs, this stays robust to
install-time selector drift (relabeled nodes, etc.) while remaining fully
overridable via job.cleanup.nodes / job.cleanup.nodeSelector /
job.cleanup.nodeSelectorExpressions. The default (daemonset) mode is
unaffected.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../templates/kata-deploy-cleanup-job.yaml    | 112 ++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml

diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml
new file mode 100644
index 0000000000..31b3887cc0
--- /dev/null
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml
@@ -0,0 +1,112 @@
+{{- /*
+Cleanup dispatcher (deploymentMode: job, pre-delete hook).
+
+The mirror image of the install dispatcher: a single tiny pre-delete hook Job that
+runs the dispatcher (kata-deploy-job-dispatcher) to fan out one node-pinned cleanup Job
+per selected node, paced to job.parallelism. Each per-node Job runs the install
+pipeline in reverse and exits:
+
+  unlabel -> revert-cri   (initContainers, run sequentially)
+  remove-artifacts        (main container)
+
+Unlike the old per-node hook model, node selection here is resolved LIVE when the
+hook runs at `helm uninstall` (the dispatcher does the lookup), not frozen at
+install/upgrade time. That is why the default cleanup selector can be
+"nodes carrying the katacontainers.io/kata-runtime label" (i.e. exactly the
+nodes install actually labeled) - see values.yaml job.cleanup.
+
+This runs while the release's kept ServiceAccount/RBAC and the job-templates
+ConfigMap still exist; they are torn down only after pre-delete hooks complete.
+*/ -}}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $root := . }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}
+{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}
+{{- end }}
+{{- $sa := include "kata-deploy.dispatcherServiceAccountName" . }}
+{{- $dispatcherName := printf "%s-cleanup-dispatcher" $base | trunc 63 | trimSuffix "-" }}
+{{- $cleanup := .Values.job.cleanup | default dict }}
+{{- $cNodes := $cleanup.nodes | default list }}
+{{- $cSelector := include "kata-deploy.nodeLabelSelector" (dict "eq" ($cleanup.nodeSelector | default dict) "exprs" ($cleanup.nodeSelectorExpressions | default list)) }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ $dispatcherName }}
+  namespace: {{ $root.Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+    app.kubernetes.io/instance: {{ $root.Release.Name }}
+    kata-deploy/dispatcher: cleanup
+  annotations:
+    "helm.sh/hook": pre-delete
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  backoffLimit: 0
+  ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+        app.kubernetes.io/instance: {{ $root.Release.Name }}
+        kata-deploy/dispatcher: cleanup
+    spec:
+{{- with $root.Values.imagePullSecrets }}
+      imagePullSecrets:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+      serviceAccountName: {{ $sa }}
+      restartPolicy: Never
+      # The dispatcher never touches the host; it is a plain API client. Lock the
+      # pod down so a compromise cannot escalate beyond its (minimal) API rights.
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
+{{- with $root.Values.tolerations }}
+      tolerations:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with $root.Values.priorityClassName }}
+      priorityClassName: {{ . | quote }}
+{{- end }}
+      containers:
+        - name: dispatcher
+          image: {{ include "kata-deploy.dispatcherImage" $root }}
+          imagePullPolicy: {{ $root.Values.imagePullPolicy }}
+          securityContext:
+            privileged: false
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          command:
+            - /usr/bin/kata-deploy-job-dispatcher
+            - "--job-template=/etc/kata-job/cleanup-job.yaml"
+            - "--name-prefix={{ $base }}-cleanup"
+            - "--owner-job-name={{ $dispatcherName }}"
+            - "--parallelism={{ $root.Values.job.parallelism }}"
+{{- if $cNodes }}
+            - "--nodes={{ join "," $cNodes }}"
+{{- else if $cSelector }}
+            - "--node-selector={{ $cSelector }}"
+{{- end }}
+          env:
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          volumeMounts:
+            - name: job-templates
+              mountPath: /etc/kata-job
+              readOnly: true
+      volumes:
+        - name: job-templates
+          configMap:
+            name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }}
+{{- end }}

From c23fe1152995e0e0d8a1371bb21ec0acc776dad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:07:31 +0200
Subject: [PATCH 7/9] kata-deploy: make verification Job aware of job
 deployment mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The verification Job assumed the DaemonSet model: it waited for the
DaemonSet to exist, for its pods, and for `rollout status daemonset/...`,
then required every node in the cluster to be labeled. None of that holds
for deploymentMode: job, where install happens via the dispatcher and the
per-node Jobs it fans out, and only the targeted (worker) nodes get
labeled.

Make the hook mode-aware:
  - Hook weight: in job mode the install dispatcher runs as a
    post-install hook at weight 5, so verification now runs at weight 10
    (after it); daemonset mode keeps weight 0 (the DaemonSet is a normal
    resource).
  - Readiness wait: in job mode, wait for the install dispatcher Job to
    complete (with the same CRI-restart retry logic) instead of a
    DaemonSet rollout; the dispatcher only reports success once every
    per-node install Job (kata-deploy/stage=install) has succeeded.
  - Label check: in job mode, expect at least as many labeled nodes as
    there are per-node install Jobs (one per targeted node), rather than
    comparing the labeled count against all nodes in the cluster.
  - Grant the verification ClusterRole read access to batch/jobs (used by
    the job-mode waits; harmless in daemonset mode).

The daemonset code path is unchanged and the default render (no
verification.pod) is byte-for-byte identical.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../templates/verification-job.yaml           | 70 ++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml
index 21ed7b6155..4871169cfc 100644
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml
@@ -6,6 +6,10 @@ Verification Job - runs after kata-deploy installation to validate Kata is worki
 Only created when verification.pod is provided.
 */ -}}
 {{- if .Values.verification.pod }}
+{{- $isJob := eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}{{- end }}
+{{- $installDispatcher := printf "%s-install-dispatcher" $base | trunc 63 | trimSuffix "-" }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -27,7 +31,10 @@ metadata:
     app.kubernetes.io/component: verification
   annotations:
     "helm.sh/hook": post-install,post-upgrade
-    "helm.sh/hook-weight": "0"
+    # In job mode the install dispatcher Job is a post-install hook at weight 5;
+    # verification must run after it, so use a higher weight. In daemonset
+    # mode the DaemonSet is a normal resource (created before any hook), so 0 is fine.
+    "helm.sh/hook-weight": {{ if $isJob }}"10"{{ else }}"0"{{ end }}
     "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
 spec:
   backoffLimit: 3
@@ -57,6 +64,33 @@ spec:
               echo "Timeout: ${TIMEOUT}s"
               echo ""
 
+              {{- if $isJob }}
+              # job mode: there is no DaemonSet. Helm has already waited for the
+              # install dispatcher hook (this verification hook runs at a higher
+              # weight); re-check it defensively here. The dispatcher only reports
+              # success once every per-node install Job has succeeded.
+              DISPATCHER="{{ $installDispatcher }}"
+              INSTALL_TIMEOUT="{{ .Values.verification.daemonsetTimeout }}"
+              echo "Waiting for install dispatcher Job ${DISPATCHER} to complete (timeout ${INSTALL_TIMEOUT}s)..."
+              # kata-deploy restarts the CRI runtime during the cri stage, which can
+              # cause transient API server unavailability. Retry the wait to handle this.
+              wait_retries=5
+              wait_retry_delay=15
+              for wait_attempt in $(seq 1 ${wait_retries}); do
+                if kubectl wait --for=condition=complete "job/${DISPATCHER}" -n {{ .Release.Namespace }} --timeout="${INSTALL_TIMEOUT}s" 2>&1; then
+                  break
+                fi
+                if [[ ${wait_attempt} -eq ${wait_retries} ]]; then
+                  echo "ERROR: install dispatcher ${DISPATCHER} did not complete after ${wait_retries} attempts"
+                  kubectl get job "${DISPATCHER}" -n {{ .Release.Namespace }} || true
+                  kubectl logs -n {{ .Release.Namespace }} "job/${DISPATCHER}" --tail=50 || true
+                  kubectl get jobs -n {{ .Release.Namespace }} -l kata-deploy/stage=install || true
+                  exit 1
+                fi
+                echo "API server may be restarting (attempt ${wait_attempt}/${wait_retries}), retrying in ${wait_retry_delay}s..."
+                sleep ${wait_retry_delay}
+              done
+              {{- else }}
               # First, wait for kata-deploy DaemonSet to exist (it's created by Helm, not a hook)
               echo "Waiting for kata-deploy DaemonSet to be created..."
               {{- if .Values.env.multiInstallSuffix }}
@@ -128,6 +162,7 @@ spec:
                 echo "API server may be restarting (attempt ${rollout_attempt}/${rollout_retries}), retrying in ${rollout_retry_delay}s..."
                 sleep ${rollout_retry_delay}
               done
+              {{- end }}
 
               # Wait for nodes to be labeled with katacontainers.io/kata-runtime=true
               # This label is set by kata-deploy when installation is complete
@@ -137,6 +172,35 @@ spec:
               max_wait=60
               echo "Node label timeout: ${max_wait}s"
               elapsed=0
+              {{- if $isJob }}
+              # job mode: only the targeted nodes get labeled. The dispatcher
+              # created one per-node install Job per targeted node (label
+              # kata-deploy/stage=install); use that count as the expected
+              # coverage rather than comparing against all nodes.
+              expected_count=$(kubectl get jobs -n {{ .Release.Namespace }} -l kata-deploy/stage=install --no-headers 2>/dev/null | wc -l)
+              echo "Expected ${expected_count} node(s) to be labeled (one per per-node install Job)"
+              while true; do
+                labeled_nodes=$(kubectl get nodes -l katacontainers.io/kata-runtime=true --no-headers 2>/dev/null | wc -l)
+
+                if [[ ${expected_count} -gt 0 ]] && [[ ${labeled_nodes} -ge ${expected_count} ]]; then
+                  echo "All ${expected_count} targeted node(s) labeled with kata-runtime=true"
+                  kubectl get nodes -L katacontainers.io/kata-runtime || true
+                  break
+                fi
+
+                if [[ ${elapsed} -ge ${max_wait} ]]; then
+                  echo "ERROR: Timeout waiting for nodes to be labeled after ${max_wait}s"
+                  echo "Labeled nodes: ${labeled_nodes}/${expected_count}"
+                  echo "Node labels:"
+                  kubectl get nodes -L katacontainers.io/kata-runtime || true
+                  exit 1
+                fi
+
+                echo "Labeled nodes: ${labeled_nodes}/${expected_count} (${elapsed}s/${max_wait}s)"
+                sleep 5
+                elapsed=$((elapsed + 5))
+              done
+              {{- else }}
               while true; do
                 labeled_nodes=$(kubectl get nodes -l katacontainers.io/kata-runtime=true --no-headers 2>/dev/null | wc -l)
                 total_nodes=$(kubectl get nodes --no-headers 2>/dev/null | wc -l)
@@ -159,6 +223,7 @@ spec:
                 sleep 5
                 elapsed=$((elapsed + 5))
               done
+              {{- end }}
 
               # Give kubelet time to pick up the new runtime configuration after containerd restart
               echo ""
@@ -315,6 +380,9 @@ rules:
   - apiGroups: ["apps"]
     resources: ["daemonsets"]
     verbs: ["get", "list", "watch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding

From aebadb1ab2336d142716d8bae069b78c3a960196 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:07:57 +0200
Subject: [PATCH 8/9] docs: document kata-deploy job deployment mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document the new opt-in deploymentMode: job alongside the default
DaemonSet model in the maintained docs (not just the chart README):

  - helm-configuration.md: add a "Deployment Modes (DaemonSet vs Job)"
    section covering the dispatcher-driven staged install/cleanup
    pipelines, why a dispatcher is used instead of Helm-rendered per-node
    Jobs (O(1) release, guaranteed coverage, paced rollout, explicit
    privilege split), the "re-run helm upgrade to cover newly added
    nodes" model (no always-on reconcile component), and the
    node-selection precedence (job.nodes > job.nodeSelector +
    job.nodeSelectorExpressions) that defaults to worker nodes.
  - installation.md: note that the DaemonSet is the default but no longer
    the only model, linking to the section above.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 docs/helm-configuration.md | 173 +++++++++++++++++++++++++++++++++++++
 docs/installation.md       |   6 ++
 2 files changed, 179 insertions(+)

diff --git a/docs/helm-configuration.md b/docs/helm-configuration.md
index febeb7f433..aa83ecda2f 100644
--- a/docs/helm-configuration.md
+++ b/docs/helm-configuration.md
@@ -93,6 +93,179 @@ customRuntimes:
 
 Again, view the default [`values.yaml`](#parameters) file for more details.
 
+## Deployment Modes (DaemonSet vs Job)
+
+The chart can install Kata on nodes in one of two ways, selected with the
+top-level `deploymentMode` value:
+
+- **`daemonset`** (default): the long-running `kata-deploy` DaemonSet installs
+  Kata on every matching node and reverts it when the pod is terminated (i.e. on
+  uninstall). This is the historical behavior and is unchanged.
+- **`job`**: there is **no always-on component**. A tiny *dispatcher* Job (the
+  `kata-deploy-job-dispatcher` binary) runs as a `post-install`/`post-upgrade` hook,
+  enumerates the selected nodes **live** via the Kubernetes API, and creates one
+  node-pinned install `Job` per node. Each per-node Job runs the staged install
+  pipeline as ordered `initContainers` and then exits:
+
+  ```
+  host-check -> artifacts -> cri   (initContainers)  ->  label (main)
+  ```
+
+  On `helm uninstall`, a `pre-delete` dispatcher fans out per-node Jobs that run
+  the pipeline in reverse (`unlabel -> revert-cri -> remove-artifacts`). Unlike
+  the DaemonSet, **nothing keeps running on the node after installation
+  completes**, and the dispatcher itself only ever talks to the API server — it
+  never touches the host (so it ships as a separate, minimal image,
+  `job.dispatcherImage`).
+
+  The privilege split is explicit: the dispatcher pod runs **fully unprivileged**
+  (`runAsNonRoot`, all capabilities dropped, no privilege escalation, read-only
+  root filesystem, `RuntimeDefault` seccomp) under a **dedicated minimal
+  ServiceAccount** whose only rights are `nodes: list` (cluster-scoped) and
+  managing Jobs in the release namespace. All privileged, host-mutating work
+  stays in the per-node Jobs, which continue to use the `kata-deploy`
+  ServiceAccount.
+
+```yaml title="values.yaml"
+deploymentMode: job
+```
+
+### Why a dispatcher instead of Helm-rendered per-node Jobs
+
+Rendering one Job per node directly in the chart does not scale: Helm stores the
+whole rendered release in a single Secret (capped at ~1 MiB) and runs hook resources
+sequentially, so large fleets blow the size limit and/or take far too long. A
+single `Indexed Job` or a `JobSet` removes those limits but **cannot guarantee
+one pod per node** once `parallelism < node-count`: Kubernetes' topology-spread
+and affinity scheduling ignore *completed* pods, so as paced pods finish, later
+pods pile onto a subset of nodes and leave others uncovered.
+
+The dispatcher sidesteps both problems: the Helm release stays O(1) (just the
+dispatcher + a constant-size ConfigMap holding the per-node Job templates), node
+membership is resolved at run time, and the dispatcher itself paces the rollout
+(at most `job.parallelism` per-node Jobs in flight) while **guaranteeing one Job
+per node**. Per-node Jobs are garbage-collected via an `ownerReference` to the
+dispatcher and `job.ttlSecondsAfterFinished`.
+
+### Adding nodes in `job` mode
+
+The dispatcher only runs on `helm install` / `helm upgrade` / `helm uninstall`.
+There is **no dispatcher watching for new nodes**, so when you add nodes later,
+re-run `helm upgrade`; the dispatcher re-enumerates the cluster and installs the
+new nodes:
+
+```sh
+helm upgrade kata-deploy "${CHART}" --version "${VERSION}" --reuse-values
+```
+
+Each per-node stage is idempotent (it skips when already applied), so the
+upgrade only does real work on the newly added nodes.
+
+### Recovering from a failed or deleted dispatcher
+
+The dispatcher runs as a **blocking** `post-install`/`post-upgrade` hook Job with
+`restartPolicy: Never` and `backoffLimit: 0`, so if its pod is evicted, drained,
+or deleted mid-rollout the Job is marked *failed* and is **not** restarted
+automatically — `helm install`/`helm upgrade` surfaces the failure rather than
+leaving you silently half-installed.
+
+What survives the dispatcher dying:
+
+- **Per-node Jobs already created keep running.** They are independent,
+  `nodeName`-pinned Jobs, not children of the dispatcher pod, so installs that
+  were already dispatched run to completion and those nodes get labeled. Only
+  nodes still queued (never dispatched) are skipped, so at worst you get
+  *partial coverage* — never a half-mutated host, because each stage is
+  idempotent.
+- Those per-node Jobs carry a (non-controller) `ownerReference` to the dispatcher
+  Job, so they survive *pod* deletion but are garbage-collected once the
+  dispatcher **Job** itself is removed or its `job.ttlSecondsAfterFinished`
+  elapses. Keep that TTL comfortably larger than a single node's install so
+  in-flight Jobs are not reaped early.
+
+Recovery is the same one-liner as adding nodes — re-run `helm upgrade`:
+
+```sh
+helm upgrade kata-deploy "${CHART}" --version "${VERSION}" --reuse-values
+```
+
+The `before-hook-creation` delete policy first removes the stale dispatcher Job
+(cascading away any leftover per-node Jobs); the fresh dispatcher then
+re-enumerates nodes live, recreates the per-node Jobs (adopting any that still
+exist rather than duplicating them), and because every stage is idempotent the
+already-installed nodes are fast no-ops. Coverage converges on the re-run.
+
+### Choosing which nodes get a Job
+
+In `job` mode, node selection is configured under the `job` key, with the
+following precedence (highest first):
+
+1. `job.nodes`: an explicit list of node names, passed to the dispatcher verbatim.
+2. `job.nodeSelector` (an equality map) **ANDed with**
+   `job.nodeSelectorExpressions` (Kubernetes label-selector requirements using
+   the operators `In`, `NotIn`, `Exists`, `DoesNotExist`). These are compiled
+   into a single label-selector string that the dispatcher resolves live.
+3. If both are empty, **all** nodes are targeted.
+
+By **default the expressions target worker (non-control-plane) nodes**, so no
+custom node labeling is required (this differs from the DaemonSet `nodeSelector`
+examples above, which rely on you labeling nodes). Override as needed:
+
+```yaml title="values.yaml"
+# Target nodes carrying a specific label:
+job:
+  nodeSelector:
+    kata-containers: "enabled"
+
+# Target every node, including control-plane (e.g. single-node clusters / CI):
+job:
+  nodeSelectorExpressions: []
+
+# Richer expressions:
+job:
+  nodeSelectorExpressions:
+    - { key: kubernetes.io/os, operator: In, values: ["linux"] }
+    - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist }
+
+# Pin to explicit nodes:
+job:
+  nodes: ["worker-1", "worker-2"]
+```
+
+Use `job.parallelism` to pace the rollout — it caps how many per-node Jobs run
+concurrently (e.g. to limit how many CRI runtimes restart at once on a big
+fleet). It is effectively capped at the number of targeted nodes.
+
+### Choosing which nodes are cleaned up on uninstall
+
+Because the cleanup dispatcher resolves nodes **live when it runs** at
+`helm uninstall` (the dispatcher does the lookup, not Helm at render time), the
+node set is *not* frozen into the stored release. This means the **default
+cleanup selector can simply be "nodes carrying the
+`katacontainers.io/kata-runtime` label"** — i.e. exactly the nodes the install
+actually labeled, regardless of how the install selector has drifted since.
+
+Override it under `job.cleanup`, with the same precedence/semantics as install
+(`cleanup.nodes`, then `cleanup.nodeSelector` ANDed with
+`cleanup.nodeSelectorExpressions`, else all nodes):
+
+```yaml title="values.yaml"
+# Only uninstall from specific nodes:
+job:
+  cleanup:
+    nodes: ["worker-1"]
+
+# Use an explicit selector instead of the kata-runtime label default:
+job:
+  cleanup:
+    nodeSelectorExpressions:
+      - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist }
+```
+
+See the default [`values.yaml`](#parameters) for the remaining `job.*` options
+(e.g. `dispatcherImage`, `parallelism`, `ttlSecondsAfterFinished`,
+`backoffLimit`).
+
 ## Examples
 
 We provide a few examples that you can pass to helm via the `-f`/`--values` flag.
diff --git a/docs/installation.md b/docs/installation.md
index f3d21a3118..f54f752cc7 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -40,6 +40,12 @@ $ helm show values "${CHART}" --version "${VERSION}"
 This installs the `kata-deploy` DaemonSet and the default Kata `RuntimeClass`
 resources on your cluster.
 
+> **Note:** the DaemonSet is the default install model, but it is no longer the
+> only one. You can instead install Kata via short-lived, staged per-node Jobs
+> (no always-on component on the node) by setting `deploymentMode: job`. See
+> [Deployment Modes (DaemonSet vs Job)](helm-configuration.md#deployment-modes-daemonset-vs-job)
+> for details and node-selection options.
+
 To see what versions of the chart are available:
 
 ```sh

From aa274908019b20dffd880df62b526d1f2c2759fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 10 Jun 2026 15:00:18 +0200
Subject: [PATCH 9/9] kata-deploy: track distroless static base by tag, not
 digest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kata-deploy main image pinned its gcr.io/distroless/static-debian13
base by sha256 digest. distroless does not publish versioned tags, so a
pinned digest just goes stale with no clear upgrade path. Track the
rolling tag instead (guarded with a hadolint DL3007 ignore plus a comment
explaining why), matching the kata-deploy-job-dispatcher image base.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 tools/packaging/kata-deploy/Dockerfile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/packaging/kata-deploy/Dockerfile b/tools/packaging/kata-deploy/Dockerfile
index b5b80ff640..152c0f583d 100644
--- a/tools/packaging/kata-deploy/Dockerfile
+++ b/tools/packaging/kata-deploy/Dockerfile
@@ -111,7 +111,10 @@ RUN \
 	esac
 
 #### kata-deploy main image
-FROM gcr.io/distroless/static-debian13@sha256:972618ca78034aaddc55864342014a96b85108c607372f7cbd0dbd1361f1d841
+# distroless does not publish pinned/versioned tags - only rolling ones
+# (latest, nonroot, debug) - so :latest is the intended way to consume it.
+# hadolint ignore=DL3007
+FROM gcr.io/distroless/static-debian13:latest
 
 ARG DESTINATION=/opt/kata-artifacts