Merge pull request #13222 from fidencio/topic/nvidia-switch-to-kata-deploy-jobs

kata-deploy: nvidia: Default to the Job-based deployment mode
2026-07-01 14:38:33 +00:00 · 2026-06-22 12:55:10 +02:00
parent a87d71763e fefc0b75ab
commit f1ebefcdfb
3 changed files with 90 additions and 39 deletions
--- a/docs/helm-configuration.md
+++ b/docs/helm-configuration.md
@@ -299,7 +299,9 @@ Includes:

 ### [`try-kata-nvidia-gpu.values.yaml`](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml)

-This file enables only the NVIDIA GPU-enabled shims:
+This file enables only the NVIDIA GPU-enabled shims and installs them using the
+[`job` deployment mode](#deployment-modes-daemonset-vs-job) (no always-on
+DaemonSet on the node):

 ```sh
 helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -746,6 +746,19 @@ function helm_helper() {
 	yq -i ".job.dispatcherImage.reference = \"${dispatcher_reference}\"" "${values_yaml}"
 	yq -i ".job.dispatcherImage.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"

+	# Resolve the deployment mode coming from the (base) values file so the
+	# post-install wait below knows whether to expect a DaemonSet or per-node Jobs.
+	local deployment_mode
+	deployment_mode="$(yq -r '.deploymentMode // "daemonset"' "${values_yaml}")"
+
+	# In "job" mode, the dispatcher's default node selector targets only worker
+	# (non-control-plane) nodes. Our CI clusters are typically single-node, where
+	# the only node carries the control-plane label, so clear the role filter to
+	# target every discovered node (matching the documented single-node/CI setup).
+	if [[ "${deployment_mode}" == "job" ]]; then
+		yq -i ".job.nodeSelectorExpressions = []" "${values_yaml}"
+	fi
+
 	[[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}"

 	if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then
@@ -1105,50 +1118,81 @@ VERIFICATION_POD_EOF
 		return 1
 	fi

-	# helm --wait is ineffective for single-node clusters with maxUnavailable=1
-	# (the DaemonSet is considered ready with 0 ready pods). First wait until at
-	# least one kata-deploy pod exists, then wait on the pod readiness condition
-	# instead — the readiness probe (/readyz) returns 200 only after install
-	# completes (artifacts extracted, CRI restarted, node labeled).
-	local pod_label_name="kata-deploy"
-	local multi_install_suffix=""
-	multi_install_suffix="$(yq -r '.env.multiInstallSuffix // ""' "${values_yaml}")"
-	if [[ -n "${multi_install_suffix}" ]]; then
-		pod_label_name="${pod_label_name}-${multi_install_suffix}"
-	fi
+	if [[ "${deployment_mode}" == "job" ]]; then
+		# In "job" mode there is no always-on DaemonSet: the dispatcher runs as a
+		# blocking post-install hook and fans out one per-node install Job, so by
+		# the time `helm upgrade --install` returns the install pipeline has run.
+		# The final stage labels the node, so wait until at least one node carries
+		# the kata-runtime label as the "install complete" signal.
+		echo "deploymentMode=job: waiting for per-node install Jobs to label the node(s)"
+		local label_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
+		while true; do
+			if [[ -n "$(kubectl get nodes -l katacontainers.io/kata-runtime=true -o name 2>/dev/null)" ]]; then
+				break
+			fi
+			if (( SECONDS >= label_wait_deadline )); then
+				echo "ERROR: Timed out waiting for kata-deploy install Jobs to label any node"
+				echo "::group::kata-deploy job-mode status (no node labeled)"
+				kubectl -n kube-system get jobs -l app.kubernetes.io/name=kata-deploy -o wide || true
+				kubectl -n kube-system get pods -l app.kubernetes.io/name=kata-deploy -o wide || true
+				kubectl -n kube-system describe jobs -l app.kubernetes.io/name=kata-deploy || true
+				kubectl -n kube-system logs -l app.kubernetes.io/name=kata-deploy --all-containers --tail=-1 --timestamps 2>/dev/null || true
+				echo "::endgroup::"
+				return 1
+			fi
+			sleep 5
+		done

-	local pod_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
-	while true; do
-		if [[ -n "$(kubectl -n kube-system get pod -l "name=${pod_label_name}" -o name 2>/dev/null)" ]]; then
-			break
+		echo "::group::kata-deploy job-mode logs (current)"
+		kubectl_retry -n kube-system get jobs -l app.kubernetes.io/name=kata-deploy -o wide || true
+		kubectl_retry -n kube-system logs -l app.kubernetes.io/name=kata-deploy --all-containers --tail=-1 --timestamps 2>/dev/null || true
+		echo "::endgroup::"
+	else
+		# helm --wait is ineffective for single-node clusters with maxUnavailable=1
+		# (the DaemonSet is considered ready with 0 ready pods). First wait until at
+		# least one kata-deploy pod exists, then wait on the pod readiness condition
+		# instead — the readiness probe (/readyz) returns 200 only after install
+		# completes (artifacts extracted, CRI restarted, node labeled).
+		local pod_label_name="kata-deploy"
+		local multi_install_suffix=""
+		multi_install_suffix="$(yq -r '.env.multiInstallSuffix // ""' "${values_yaml}")"
+		if [[ -n "${multi_install_suffix}" ]]; then
+			pod_label_name="${pod_label_name}-${multi_install_suffix}"
 		fi
-		if (( SECONDS >= pod_wait_deadline )); then
-			echo "ERROR: Timed out waiting for kata-deploy pod to be created"
-			echo "::group::kata-deploy daemonset status (no pod created)"
-			kubectl -n kube-system get ds -l "name=${pod_label_name}" -o wide || true
-			kubectl -n kube-system describe ds -l "name=${pod_label_name}" || true
+
+		local pod_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
+		while true; do
+			if [[ -n "$(kubectl -n kube-system get pod -l "name=${pod_label_name}" -o name 2>/dev/null)" ]]; then
+				break
+			fi
+			if (( SECONDS >= pod_wait_deadline )); then
+				echo "ERROR: Timed out waiting for kata-deploy pod to be created"
+				echo "::group::kata-deploy daemonset status (no pod created)"
+				kubectl -n kube-system get ds -l "name=${pod_label_name}" -o wide || true
+				kubectl -n kube-system describe ds -l "name=${pod_label_name}" || true
+				echo "::endgroup::"
+				return 1
+			fi
+			sleep 1
+		done
+		if ! kubectl -n kube-system wait pod -l "name=${pod_label_name}" --for=condition=Ready --timeout="${KATA_DEPLOY_WAIT_TIMEOUT}s"; then
+			echo "::group::kata-deploy pod describe (install timed out)"
+			kubectl -n kube-system describe pod -l "name=${pod_label_name}" || true
+			echo "::endgroup::"
+			echo "::group::kata-deploy logs (install timed out)"
+			kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
+			kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps 2>/dev/null || true
 			echo "::endgroup::"
 			return 1
 		fi
-		sleep 1
-	done
-	if ! kubectl -n kube-system wait pod -l "name=${pod_label_name}" --for=condition=Ready --timeout="${KATA_DEPLOY_WAIT_TIMEOUT}s"; then
-		echo "::group::kata-deploy pod describe (install timed out)"
-		kubectl -n kube-system describe pod -l "name=${pod_label_name}" || true
-		echo "::endgroup::"
-		echo "::group::kata-deploy logs (install timed out)"
-		kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
-		kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps 2>/dev/null || true
-		echo "::endgroup::"
-		return 1
-	fi

-	echo "::group::kata-deploy logs (current)"
-	kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps || true
-	echo "::endgroup::"
-	echo "::group::kata-deploy logs (previous)"
-	kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
-	echo "::endgroup::"
+		echo "::group::kata-deploy logs (current)"
+		kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps || true
+		echo "::endgroup::"
+		echo "::group::kata-deploy logs (previous)"
+		kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
+		echo "::endgroup::"
+	fi

 	echo "::group::Runtime classes"
 	kubectl_retry get runtimeclass
--- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml
+++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml
@@ -8,6 +8,11 @@

 debug: false

+# Install Kata via short-lived per-node Jobs instead of an always-on DaemonSet.
+# A tiny dispatcher Job fans out one install Job per selected node and exits, so
+# nothing keeps running on the node once the install completes.
+deploymentMode: job
+
 snapshotter:
  setup: ["nydus"]