kata-deploy: add job deployment mode driven by the job-dispatcher

Phase 2 of the DaemonSet -> staged-Job migration: add an opt-in `deploymentMode: job` that installs Kata via short-lived, per-node install Jobs instead of the long-running DaemonSet. The DaemonSet remains the default and is now gated behind `deploymentMode == daemonset`.

Rather than render one Job per node into the Helm release (which grows the release secret O(nodes) and offers no rollout pacing), job mode ships a single tiny post-install/post-upgrade hook Job that runs the kata-deploy-job-dispatcher. The dispatcher enumerates the selected nodes LIVE from the API server and stamps out one node-pinned install Job per node from a constant-size ConfigMap of Job templates, keeping at most `job.parallelism` in flight and refilling as they finish. This guarantees per-node coverage with a paced rollout while the Helm release stays O(1) regardless of fleet size. New nodes are picked up by re-running `helm upgrade`; there is no always-on component.

Each per-node Job runs the staged install pipeline as ordered initContainers and exits:

    host-check -> artifacts -> cri   (initContainers, run sequentially)
    label                            (main container)

The privilege split is explicit: the dispatcher pod is a pure control-plane client (lists nodes, manages Jobs in its own namespace) and runs fully unprivileged under a dedicated, least-privilege ServiceAccount (kata-rbac.yaml); only the per-node Jobs it creates carry the privileged kata-deploy host-mutation rights.

Node selection (templates/_helpers.tpl: nodeLabelSelector / perNodeJob):
  - job.nodes: explicit node-name list passed to the dispatcher; when it is empty,
  - job.nodeSelector (equality map) ANDed with
  - job.nodeSelectorExpressions (k8s label-selector requirements: In / NotIn / Exists / DoesNotExist), compiled into a single label-selector string the dispatcher resolves live.

The default expressions target worker (non-control-plane) nodes, so no custom node labeling is required; set the expressions to [] to target all discovered nodes.

Reuses the commonEnv/commonVolume* helpers and adds the stageContainer, serviceAccountName, dispatcherServiceAccountName, dispatcherImage and perNodeJob helpers shared by the dispatcher and the staged Jobs.

The default (daemonset) render is unchanged.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
2026-07-01 06:28:11 +00:00 · 2026-06-03 22:06:55 +02:00
parent 28fce44b70
commit 54878fa373
9 changed files with 527 additions and 0 deletions
--- a/tests/functional/kata-deploy/lib/helm-deploy.bash
+++ b/tests/functional/kata-deploy/lib/helm-deploy.bash
@@ -31,11 +31,24 @@ generate_base_values() {
 	local output_file="$1"
 	local extra_values_file="${2:-}"

+	local kata_deploy_image="${DOCKER_REGISTRY}/${DOCKER_REPO}"
+	local dispatcher_image
+	if [[ "${kata_deploy_image}" == *-ci ]]; then
+		dispatcher_image="${kata_deploy_image%-ci}-job-dispatcher-ci"
+	else
+		dispatcher_image="${kata_deploy_image}-job-dispatcher"
+	fi
+
 	cat > "${output_file}" <<EOF
 image:
  reference: ${DOCKER_REGISTRY}/${DOCKER_REPO}
  tag: ${DOCKER_TAG}

+job:
+  dispatcherImage:
+    reference: ${dispatcher_image}
+    tag: ${DOCKER_TAG}
+
 k8sDistribution: "${KUBERNETES}"
 debug: true

--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -732,6 +732,19 @@ function helm_helper() {
 	fi
 	yq -i ".image.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"

+	# Derive the dispatcher image name from the main kata-deploy image,
+	# mirroring the -ci/non-ci logic used by the build/release scripts: the
+	# dispatcher lives at "<base>-job-dispatcher", with the "-ci" suffix (if
+	# any) kept at the very end (e.g. kata-deploy-ci -> kata-deploy-job-dispatcher-ci).
+	local dispatcher_reference
+	if [[ "${HELM_IMAGE_REFERENCE}" == *-ci ]]; then
+		dispatcher_reference="${HELM_IMAGE_REFERENCE%-ci}-job-dispatcher-ci"
+	else
+		dispatcher_reference="${HELM_IMAGE_REFERENCE}-job-dispatcher"
+	fi
+	yq -i ".job.dispatcherImage.reference = \"${dispatcher_reference}\"" "${values_yaml}"
+	yq -i ".job.dispatcherImage.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"
+
 	[[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}"

 	if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl
@@ -391,6 +391,21 @@ reference:tag (tag defaults to Chart.AppVersion).
 {{- end -}}
 {{- end -}}

+{{/*
+Image string for the job-mode dispatcher container (kata-deploy-job-dispatcher).
+A reference that already pins a digest (it contains "@", e.g.
+reference@sha256:...) is returned untouched; any other reference gets ":<tag>"
+appended, where the tag is job.dispatcherImage.tag or, when that is empty,
+Chart.AppVersion.
+*/}}
+{{- define "kata-deploy.dispatcherImage" -}}
+{{- $img := .Values.job.dispatcherImage -}}
+{{- $version := default .Chart.AppVersion $img.tag | toString -}}
+{{- if not (contains "@" $img.reference) -}}
+{{- printf "%s:%s" $img.reference $version -}}
+{{- else -}}
+{{- $img.reference -}}
+{{- end -}}
+{{- end -}}
+
 {{/*
 Get snapshotter setup list from structured config
 */}}
@@ -592,6 +607,166 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`.
 {{- end }}
 {{- end -}}

+{{/*
+Build a Kubernetes label-selector STRING (the form accepted by the apiserver
+and `kubectl --selector`) from an equality map plus a list of match-expression
+requirements. This is handed to `kata-deploy-job-dispatcher --node-selector`, which
+resolves the actual target nodes LIVE at run time (so node membership is never
+frozen into the Helm release).
+
+Arguments (dict):
+  eq    - equality label map           -> "k=v"
+          (values are stringified, so unquoted YAML scalars such as `true`
+          or `1` render as "k=true" / "k=1")
+  exprs - list of {key, operator, values}:
+            Exists       -> "key"
+            DoesNotExist -> "!key"
+            In           -> "key in (v1,v2)"
+            NotIn        -> "key notin (v1,v2)"
+
+Rendering fails (via `fail`) when an expression has no key, uses an operator
+other than the four above, or uses In/NotIn without any values: an empty
+"key in ()" does not mean "match nothing" and would silently target the wrong
+nodes.
+
+Returns the comma-joined selector string (possibly empty, meaning "all nodes").
+*/}}
+{{- define "kata-deploy.nodeLabelSelector" -}}
+{{- $parts := list -}}
+{{- range $k, $v := (.eq | default dict) -}}
+{{- /* toString: printf "%s" on a bool/number would emit "%!s(bool=true)" */ -}}
+{{- $parts = append $parts (printf "%s=%s" $k ($v | toString)) -}}
+{{- end -}}
+{{- range $expr := (.exprs | default list) -}}
+{{- /* Normalise missing fields to "" / empty list so the checks below give a clear error */ -}}
+{{- $key := $expr.key | default "" | toString -}}
+{{- $op := $expr.operator | default "" | toString -}}
+{{- $vals := $expr.values | default list -}}
+{{- if not $key -}}
+{{- fail (printf "nodeSelectorExpressions: entry with operator %q is missing its key" $op) -}}
+{{- end -}}
+{{- if and (or (eq $op "In") (eq $op "NotIn")) (not $vals) -}}
+{{- fail (printf "nodeSelectorExpressions: operator %q for key %q requires at least one value" $op $key) -}}
+{{- end -}}
+{{- if eq $op "Exists" -}}
+{{- $parts = append $parts $key -}}
+{{- else if eq $op "DoesNotExist" -}}
+{{- $parts = append $parts (printf "!%s" $key) -}}
+{{- else if eq $op "In" -}}
+{{- $parts = append $parts (printf "%s in (%s)" $key (join "," $vals)) -}}
+{{- else if eq $op "NotIn" -}}
+{{- $parts = append $parts (printf "%s notin (%s)" $key (join "," $vals)) -}}
+{{- else -}}
+{{- fail (printf "nodeSelectorExpressions: unsupported operator %q for key %q (use In, NotIn, Exists, DoesNotExist)" $op $key) -}}
+{{- end -}}
+{{- end -}}
+{{- join "," $parts -}}
+{{- end -}}
+
+{{/*
+Per-node staged Job manifest (deploymentMode: job), embedded verbatim into the
+job-templates ConfigMap. The dispatcher (kata-deploy-job-dispatcher) clones this once per
+target node, injecting metadata.name + spec.template.spec.nodeName, so the
+template itself carries NO node identity and NO Helm hook annotations
+(metadata holds labels only).
+
+Arguments (dict):
+  root  - top-level context (.)
+  stage - "install" | "cleanup"
+          NOTE: the template only tests for "install"; any other value renders
+          the cleanup pipeline.
+
+install pipeline:  host-check -> artifacts -> cri (initContainers) ; label (main)
+cleanup pipeline:  unlabel -> revert-cri    (initContainers) ; remove-artifacts (main)
+
+Pod-level settings rendered for both stages: hostPID: true, restartPolicy: Never,
+the kata-deploy.serviceAccountName ServiceAccount and the kata-deploy.commonVolumes
+volumes. imagePullSecrets, tolerations and priorityClassName are emitted only
+when set in values. backoffLimit and ttlSecondsAfterFinished come from
+job.backoffLimit / job.ttlSecondsAfterFinished. The label/unlabel containers are
+the only ones rendered unprivileged and without host mounts.
+
+Emitted at column 0 (a standalone Job document); embed with `indent` at the call
+site under a ConfigMap data key.
+*/}}
+{{- define "kata-deploy.perNodeJob" -}}
+{{- $root := .root -}}
+{{- $stage := .stage -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+    app.kubernetes.io/instance: {{ $root.Release.Name }}
+    kata-deploy/stage: {{ $stage }}
+spec:
+  backoffLimit: {{ $root.Values.job.backoffLimit }}
+  ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+        app.kubernetes.io/instance: {{ $root.Release.Name }}
+        kata-deploy/stage: {{ $stage }}
+    spec:
+{{- with $root.Values.imagePullSecrets }}
+      imagePullSecrets:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+      serviceAccountName: {{ include "kata-deploy.serviceAccountName" $root }}
+      restartPolicy: Never
+      hostPID: true
+{{- with $root.Values.tolerations }}
+      tolerations:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with $root.Values.priorityClassName }}
+      priorityClassName: {{ . | quote }}
+{{- end }}
+{{- if eq $stage "install" }}
+      initContainers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "host-check" "action" "install-stage-host-check" "privileged" true "mountHost" true) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "artifacts" "action" "install-stage-artifacts" "privileged" true "mountHost" true) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "cri" "action" "install-stage-cri" "privileged" true "mountHost" true) | nindent 8 }}
+      containers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "label" "action" "install-stage-label" "privileged" false "mountHost" false) | nindent 8 }}
+{{- else }}
+      initContainers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "unlabel" "action" "cleanup-stage-unlabel" "privileged" false "mountHost" false) | nindent 8 }}
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "revert-cri" "action" "cleanup-stage-revert-cri" "privileged" true "mountHost" true) | nindent 8 }}
+      containers:
+{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "remove-artifacts" "action" "cleanup-stage-remove-artifacts" "privileged" true "mountHost" true) | nindent 8 }}
+{{- end }}
+      volumes:
+{{- include "kata-deploy.commonVolumes" $root | nindent 8 }}
+{{- end -}}
+
+{{/*
+Name of the ServiceAccount meant to be shared by all kata-deploy workloads
+(DaemonSet and staged Jobs): "<chart>-sa", with "-<multiInstallSuffix>"
+appended when env.multiInstallSuffix is set.
+*/}}
+{{- define "kata-deploy.serviceAccountName" -}}
+{{- with .Values.env.multiInstallSuffix -}}
+{{ $.Chart.Name }}-sa-{{ . }}
+{{- else -}}
+{{ $.Chart.Name }}-sa
+{{- end -}}
+{{- end -}}
+
+{{/*
+Name of the ServiceAccount for the job-mode dispatcher (kata-deploy-job-dispatcher):
+"<chart>-dispatcher-sa", with "-<multiInstallSuffix>" appended when
+env.multiInstallSuffix is set. Kept distinct from kata-deploy.serviceAccountName
+because the dispatcher is a pure API client (list nodes, manage Jobs) and must
+NOT carry the privileged kata-deploy host-mutation rights.
+*/}}
+{{- define "kata-deploy.dispatcherServiceAccountName" -}}
+{{- with .Values.env.multiInstallSuffix -}}
+{{ $.Chart.Name }}-dispatcher-sa-{{ . }}
+{{- else -}}
+{{ $.Chart.Name }}-dispatcher-sa
+{{- end -}}
+{{- end -}}
+
+{{/*
+Render a single staged-pipeline container that runs one kata-deploy stage action.
+Used by the per-node staged install/cleanup Jobs (deploymentMode: job).
+
+Arguments (dict):
+  root        - the top-level context (.)
+  name        - container name
+  action      - kata-deploy subcommand (e.g. install-stage-cri); becomes the
+                single argument after /usr/bin/kata-deploy in `command`
+  privileged  - bool, whether the container runs privileged (host nsenter/restart);
+                this is the only securityContext field the helper sets
+  mountHost   - bool, whether to mount the host paths (crio/containerd/host);
+                when false no volumeMounts key is rendered at all
+
+Every container uses the main kata-deploy image (kata-deploy.image), the chart's
+imagePullPolicy, and the full kata-deploy.commonEnv environment regardless of
+the privileged/mountHost flags.
+
+Emitted at column 0; indent with `nindent` at the call site.
+*/}}
+{{- define "kata-deploy.stageContainer" -}}
+- name: {{ .name }}
+  image: {{ include "kata-deploy.image" .root }}
+  imagePullPolicy: {{ .root.Values.imagePullPolicy }}
+  command: ["/usr/bin/kata-deploy", "{{ .action }}"]
+  env:
+{{- include "kata-deploy.commonEnv" .root | nindent 4 }}
+  securityContext:
+    privileged: {{ .privileged }}
+{{- if .mountHost }}
+  volumeMounts:
+{{- include "kata-deploy.commonVolumeMounts" .root | nindent 4 }}
+{{- end }}
+{{- end -}}
+
 {{/*
 Common volumeMounts for any pod that runs the kata-deploy binary against the
 host. Emitted at column 0; indent with `nindent` at the call site.
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml
@@ -0,0 +1,113 @@
+{{- /*
+Install dispatcher (deploymentMode: job).
+
+A single, tiny post-install/post-upgrade hook Job that runs the dispatcher
+(kata-deploy-job-dispatcher). The dispatcher enumerates the selected nodes LIVE, then
+creates one node-pinned install Job per node from the job-templates ConfigMap,
+keeping at most job.parallelism in flight and refilling as they finish. This
+guarantees one install per node (coverage) with a paced rollout, while the Helm
+release stays O(1) regardless of fleet size.
+
+Each per-node Job runs the staged pipeline as ordered initContainers and exits:
+
+  host-check -> artifacts -> cri   (initContainers, run sequentially)
+  label                            (main container)
+
+Helm waits only on THIS dispatcher Job (the verification hook runs at a higher
+weight, after it). before-hook-creation lets `helm upgrade` re-run the dispatcher,
+which re-enumerates nodes (idempotent stages skip already-installed nodes and
+pick up newly added ones).
+
+Naming: $base is "<chart>" or "<chart>-<multiInstallSuffix>". The dispatcher Job
+name and the ConfigMap name derived from it are truncated to 63 characters; the
+--name-prefix handed to the dispatcher is passed untruncated.
+*/ -}}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $root := . }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}
+{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}
+{{- end }}
+{{- $sa := include "kata-deploy.dispatcherServiceAccountName" . }}
+{{- $dispatcherName := printf "%s-install-dispatcher" $base | trunc 63 | trimSuffix "-" }}
+{{- $nodes := .Values.job.nodes | default list }}
+{{- $selector := include "kata-deploy.nodeLabelSelector" (dict "eq" (.Values.job.nodeSelector | default dict) "exprs" (.Values.job.nodeSelectorExpressions | default list)) }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ $dispatcherName }}
+  namespace: {{ $root.Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+    app.kubernetes.io/instance: {{ $root.Release.Name }}
+    kata-deploy/dispatcher: install
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": before-hook-creation
+spec:
+  # The dispatcher does per-node retries (job.backoffLimit) itself; a dispatcher
+  # failure means "some node failed" and should surface, not be retried blindly.
+  backoffLimit: 0
+  ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "kata-deploy.name" $root }}
+        app.kubernetes.io/instance: {{ $root.Release.Name }}
+        kata-deploy/dispatcher: install
+    spec:
+{{- with $root.Values.imagePullSecrets }}
+      imagePullSecrets:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+      serviceAccountName: {{ $sa }}
+      restartPolicy: Never
+      # The dispatcher never touches the host; it is a plain API client. Lock the
+      # pod down so a compromise cannot escalate beyond its (minimal) API rights.
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65532
+        runAsGroup: 65532
+        seccompProfile:
+          type: RuntimeDefault
+{{- with $root.Values.tolerations }}
+      tolerations:
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with $root.Values.priorityClassName }}
+      priorityClassName: {{ . | quote }}
+{{- end }}
+      containers:
+        - name: dispatcher
+          image: {{ include "kata-deploy.dispatcherImage" $root }}
+          imagePullPolicy: {{ $root.Values.imagePullPolicy }}
+          securityContext:
+            privileged: false
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          command:
+            - /usr/bin/kata-deploy-job-dispatcher
+            - "--job-template=/etc/kata-job/install-job.yaml"
+            - "--name-prefix={{ $base }}-install"
+            - "--owner-job-name={{ $dispatcherName }}"
+            - "--parallelism={{ $root.Values.job.parallelism }}"
+            # Node targeting: an explicit job.nodes list wins; otherwise the label
+            # selector compiled from job.nodeSelector/job.nodeSelectorExpressions
+            # is passed. With neither, no targeting flag is rendered (documented
+            # in values.yaml as "ALL nodes are targeted").
+{{- if $nodes }}
+            - "--nodes={{ join "," $nodes }}"
+{{- else if $selector }}
+            - "--node-selector={{ $selector }}"
+{{- end }}
+          env:
+            # Downward API: the namespace this pod runs in.
+            # NOTE(review): presumably where the dispatcher creates the per-node
+            # Jobs - confirm against the dispatcher source.
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          volumeMounts:
+            # Provides /etc/kata-job/install-job.yaml referenced by --job-template.
+            - name: job-templates
+              mountPath: /etc/kata-job
+              readOnly: true
+      volumes:
+        - name: job-templates
+          configMap:
+            name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }}
+{{- end }}
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml
@@ -0,0 +1,33 @@
+{{- /*
+Per-node Job templates for deploymentMode: job.
+
+This ConfigMap holds the install and cleanup per-node Job manifests, rendered
+ONCE (constant size, independent of the number of nodes). The job-mode dispatcher
+(kata-deploy-job-dispatcher) mounts it, and for every selected node clones the relevant
+template, injects metadata.name + spec.template.spec.nodeName, and creates the
+Job. Keeping the rich pod spec (env/volumes/shim config) here means the Helm
+chart stays the single source of truth; the dispatcher only does fan-out.
+
+It is a normal (non-hook) resource: Helm creates it before the post-install
+dispatcher hook runs, and it still exists during the pre-delete cleanup hook
+(release resources are torn down only after pre-delete hooks complete).
+
+Coupling with the dispatcher Job (kata-deploy-install-job.yaml):
+  - the data keys are file names: the install dispatcher mounts this ConfigMap
+    at /etc/kata-job and is started with
+    --job-template=/etc/kata-job/install-job.yaml, so renaming a key requires
+    changing that flag too;
+  - the ConfigMap name ("<base>-job-templates", truncated to 63 characters) is
+    computed with the same expression there and must stay in sync.
+
+Each rendered Job is indented by 4 so it sits inside the `|` block scalar of
+its key.
+*/ -}}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+{{- $base := .Chart.Name }}
+{{- if .Values.env.multiInstallSuffix }}
+{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}
+{{- end }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "kata-deploy.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+data:
+  install-job.yaml: |
+{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "install") | indent 4 }}
+  cleanup-job.yaml: |
+{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "cleanup") | indent 4 }}
+{{- end }}
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml
@@ -1,3 +1,4 @@
+{{- if eq (.Values.deploymentMode | default "daemonset") "daemonset" -}}
 {{- if index .Values "node-feature-discovery" "enabled" -}}
 {{- $existingNFDNamespace := include "kata-deploy.detectExistingNFD" . | trim -}}
 {{- if $existingNFDNamespace -}}
@@ -204,3 +205,4 @@ spec:
  updateStrategy:
    {{- toYaml . | nindent 4 }}
 {{- end}}
+{{- end -}}
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml
@@ -65,6 +65,68 @@ subjects:
  name: {{ .Chart.Name }}-sa
 {{- end }}
  namespace: {{ .Release.Namespace }}
+{{- if eq (.Values.deploymentMode | default "daemonset") "job" }}
+---
+# Dedicated, least-privilege identity for the job-mode dispatcher
+# (kata-deploy-job-dispatcher). It is a pure control-plane client: it lists nodes
+# (cluster-scoped) and manages per-node Jobs in the release namespace
+# (namespace-scoped). It deliberately does NOT get the privileged kata-deploy
+# host-mutation rights (node patch, runtimeclasses, NFD, etc.); those stay on
+# kata-deploy-sa, which only the per-node Jobs use.
+#
+# All dispatcher RBAC objects below are rendered only in deploymentMode "job",
+# and their names get "-<env.multiInstallSuffix>" appended when that value is set.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+rules:
+# Enumerating nodes is inherently cluster-scoped.
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["list"]
+---
+# Grants the dispatcher ServiceAccount the cluster-wide node "list" right above.
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-noderb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+---
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+  namespace: {{ .Release.Namespace }}
+rules:
+# The dispatcher only ever creates/watches/GCs per-node Jobs in its own namespace.
+- apiGroups: ["batch"]
+  resources: ["jobs"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+---
+# Grants the dispatcher ServiceAccount the namespaced Job rights above
+# (release namespace only).
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ .Chart.Name }}-dispatcher-rb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "kata-deploy.dispatcherServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+{{- end }}
 ---
 # ServiceAccount and RBAC for the post-delete Job that removes the kept RBAC above.
 # Created as post-delete hooks with lower weight than the Job so they exist when the Job runs.
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml
@@ -1,3 +1,106 @@
+# Deployment model for installing/cleaning up Kata on nodes.
+#   daemonset: (default) the long-running kata-deploy DaemonSet installs Kata on
+#              every matching node and reverts it on pod termination (uninstall).
+#   job:       no always-on component. A tiny hook Job running the dispatcher
+#              (kata-deploy-job-dispatcher) runs post-install/post-upgrade, enumerates
+#              the selected nodes LIVE, and creates one node-pinned install Job
+#              per node - paced to job.parallelism and guaranteeing one install
+#              per node. Each per-node Job runs the staged pipeline as ordered
+#              initContainers and exits. Uninstall works the same way via a
+#              pre-delete dispatcher (reverse pipeline).
+#
+# Why a dispatcher instead of rendering per-node Jobs in the chart: Helm stores
+# the whole rendered release in one ~1 MiB Secret and runs hook resources
+# sequentially, and neither an Indexed Job nor a JobSet can guarantee one pod
+# per node once parallelism < node-count (the scheduler ignores completed pods
+# when balancing spread). The dispatcher keeps the release O(1), enumerates nodes
+# at run time, and paces a guaranteed-coverage rollout with built-in Jobs only.
+#
+# NOTE on "job" mode and new nodes:
+#   The dispatcher only runs on `helm install` / `helm upgrade` / `helm uninstall`.
+#   When you add nodes later, re-run `helm upgrade` so the dispatcher enumerates
+#   and installs the new nodes (the staged actions are idempotent, so already-
+#   installed nodes are skipped). This is intentional: it avoids an always-on
+#   privileged component on every node.
+deploymentMode: daemonset  # daemonset | job
+
+# Settings specific to deploymentMode: job
+job:
+  # Dispatcher image: the dispatcher that fans out per-node Jobs. It only talks to
+  # the Kubernetes API (lists nodes, creates/watches Jobs); it never touches the
+  # host. Supports reference:tag or reference@sha256:digest; tag defaults to the
+  # chart appVersion.
+  dispatcherImage:
+    reference: quay.io/kata-containers/kata-deploy-job-dispatcher
+    tag: ""
+  # Maximum number of nodes processed concurrently (the dispatcher keeps at most
+  # this many per-node Jobs in flight, refilling as they finish). Lower it to
+  # pace the rollout (e.g. limit how many CRI runtimes restart at once on a big
+  # fleet); raise it to install faster. Effectively capped at the node count.
+  parallelism: 100
+  # How to choose which nodes get a per-node INSTALL Job. Precedence:
+  #   1. job.nodes (explicit list of node names) - if non-empty, used verbatim
+  #      (passed to the dispatcher as --nodes).
+  #   2. otherwise a label selector built from job.nodeSelector (equality) ANDed
+  #      with job.nodeSelectorExpressions (In/NotIn/Exists/DoesNotExist) is
+  #      passed to the dispatcher, which resolves matching nodes LIVE at run time.
+  #   3. if job.nodes is empty and the resulting selector is empty too (no
+  #      nodeSelector and no nodeSelectorExpressions), ALL nodes are targeted.
+  #
+  # DEFAULT: target worker (non-control-plane) nodes, so no custom labeling is
+  # required. Override these freely:
+  #   - Target nodes with a specific label:
+  #       job:
+  #         nodeSelector: { kata-containers: "enabled" }
+  #   - Target every node (including control-plane), e.g. single-node clusters/CI:
+  #       job:
+  #         nodeSelectorExpressions: []
+  #   - Richer expressions:
+  #       job:
+  #         nodeSelectorExpressions:
+  #           - { key: kubernetes.io/os, operator: In, values: ["linux"] }
+  #           - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist }
+  #   - Pin to explicit nodes:
+  #       job:
+  #         nodes: ["worker-1", "worker-2"]
+  nodes: []
+  # Equality label selector (ANDed with nodeSelectorExpressions). Ignored when
+  # job.nodes is set. Empty by default.
+  nodeSelector: {}
+  # Kubernetes-style label selector requirements (ANDed with nodeSelector).
+  # Each entry: { key, operator, values }. operator is one of:
+  #   In | NotIn (values required) | Exists | DoesNotExist (values must be empty).
+  # Default selects nodes that are NOT control-plane/master (i.e. worker nodes).
+  # Set to [] to disable role filtering and target all discovered nodes.
+  nodeSelectorExpressions:
+    - key: node-role.kubernetes.io/control-plane
+      operator: DoesNotExist
+    - key: node-role.kubernetes.io/master
+      operator: DoesNotExist
+  # Node selection for the UNINSTALL (pre-delete hook) dispatcher. Same precedence
+  # and semantics as install (cleanup.nodes, else cleanup.nodeSelector ANDed with
+  # cleanup.nodeSelectorExpressions, else all nodes).
+  #
+  # The cleanup dispatcher resolves nodes LIVE when it runs at `helm uninstall`
+  # (the dispatcher does the lookup), so - unlike a frozen Helm-rendered hook -
+  # the DEFAULT below can safely be "nodes carrying katacontainers.io/kata-runtime",
+  # i.e. exactly the nodes install actually labeled. Override to clean a
+  # different set, e.g.:
+  #   job:
+  #     cleanup:
+  #       nodes: ["worker-1"]
+  cleanup:
+    nodes: []
+    nodeSelector: {}
+    nodeSelectorExpressions:
+      - key: katacontainers.io/kata-runtime
+        operator: Exists
+  # How long finished per-node Jobs are retained before automatic garbage
+  # collection (seconds). Applies to both install and cleanup per-node Jobs.
+  ttlSecondsAfterFinished: 600
+  # Per-node retry budget: retries for a single node's Job before it is marked
+  # failed. One node failing never aborts the others.
+  backoffLimit: 3
+
 imagePullPolicy: Always

 imagePullSecrets: []
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh
@@ -23,8 +23,21 @@ tmp="$(mktemp -d)"
 trap '[[ -n "${KEEP_TMPDIR}" ]] && echo "kept: ${tmp}" || rm -rf "${tmp}"' EXIT

 cp -r "${CHART_SRC}" "${tmp}/"
+
+# Job-mode dispatcher image. Its repo mirrors the kata-deploy repo with
+# "-job-dispatcher" inserted before any "-ci" suffix (so the "-ci" stays last):
+#   .../kata-deploy     -> .../kata-deploy-job-dispatcher
+#   .../kata-deploy-ci  -> .../kata-deploy-job-dispatcher-ci
+# It is built and pushed with the same tag by kata-deploy-build-and-upload-payload.sh.
+if [[ "${REGISTRY}" == *-ci ]]; then
+	JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY%-ci}-job-dispatcher-ci"}"
+else
+	JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY}-job-dispatcher"}"
+fi
+
 yq eval ".version = \"${CHART_VERSION}\" | .appVersion = \"${CHART_VERSION}\"" -i "${tmp}/kata-deploy/Chart.yaml"
 yq eval ".image.reference = \"${REGISTRY}\" | .image.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml"
+yq eval ".job.dispatcherImage.reference = \"${JOB_DISPATCHER_IMAGE_REFERENCE}\" | .job.dispatcherImage.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml"
 helm dependencies update "${tmp}/kata-deploy"
 helm package "${tmp}/kata-deploy" -d "${tmp}"
 helm push "${tmp}/kata-deploy-${CHART_VERSION}.tgz" "oci://${CHART_REGISTRY}"