kata-deploy: nvidia: Default to the Job-based deployment mode

Switch the NVIDIA GPU example values file to install Kata via the
Job-based deployment mode (deploymentMode: job) instead of the
always-on, privileged DaemonSet, so that nothing keeps running on the
node once the install completes.

To exercise this in our CI, make the helm_helper function aware of the
deployment mode coming from the (base) values file:

  - In "job" mode, clear job.nodeSelectorExpressions so the dispatcher
    targets every discovered node.  The default selector excludes
    control-plane/master nodes, and our CI clusters are typically
    single-node, where the only node carries the control-plane label, so
    the default would leave the dispatcher with no node to install on.

  - There is no always-on DaemonSet to wait on in "job" mode.  The
    dispatcher runs as a blocking post-install hook and the final
    per-node stage labels the node, so wait until at least one node
    carries the katacontainers.io/kata-runtime label as the
    "install complete" signal (dumping Job/pod logs on timeout).

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Fabiano Fidêncio
2026-06-12 22:55:11 +02:00
parent 6005f8a499
commit fefc0b75ab
3 changed files with 90 additions and 39 deletions

View File

@@ -299,7 +299,9 @@ Includes:
### [`try-kata-nvidia-gpu.values.yaml`](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml)
This file enables only the NVIDIA GPU-enabled shims:
This file enables only the NVIDIA GPU-enabled shims and installs them using the
[`job` deployment mode](#deployment-modes-daemonset-vs-job) (no always-on
DaemonSet on the node):
```sh
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \

View File

@@ -745,6 +745,19 @@ function helm_helper() {
yq -i ".job.dispatcherImage.reference = \"${dispatcher_reference}\"" "${values_yaml}"
yq -i ".job.dispatcherImage.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"
# Resolve the deployment mode coming from the (base) values file so the
# post-install wait below knows whether to expect a DaemonSet or per-node Jobs.
local deployment_mode
deployment_mode="$(yq -r '.deploymentMode // "daemonset"' "${values_yaml}")"
# In "job" mode, the dispatcher's default node selector targets only worker
# (non-control-plane) nodes. Our CI clusters are typically single-node, where
# the only node carries the control-plane label, so clear the role filter to
# target every discovered node (matching the documented single-node/CI setup).
if [[ "${deployment_mode}" == "job" ]]; then
yq -i ".job.nodeSelectorExpressions = []" "${values_yaml}"
fi
[[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}"
if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then
@@ -1084,50 +1097,81 @@ VERIFICATION_POD_EOF
return 1
fi
# helm --wait is ineffective for single-node clusters with maxUnavailable=1
# (the DaemonSet is considered ready with 0 ready pods). First wait until at
# least one kata-deploy pod exists, then wait on the pod readiness condition
# instead — the readiness probe (/readyz) returns 200 only after install
# completes (artifacts extracted, CRI restarted, node labeled).
local pod_label_name="kata-deploy"
local multi_install_suffix=""
multi_install_suffix="$(yq -r '.env.multiInstallSuffix // ""' "${values_yaml}")"
if [[ -n "${multi_install_suffix}" ]]; then
pod_label_name="${pod_label_name}-${multi_install_suffix}"
fi
if [[ "${deployment_mode}" == "job" ]]; then
# In "job" mode there is no always-on DaemonSet: the dispatcher runs as a
# blocking post-install hook and fans out one install Job per node, so by
# the time `helm upgrade --install` returns, the install pipeline has run.
# The final stage labels the node, so poll until at least one node carries
# the katacontainers.io/kata-runtime=true label as the "install complete" signal.
echo "deploymentMode=job: waiting for per-node install Jobs to label the node(s)"
local label_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
while true; do
if [[ -n "$(kubectl get nodes -l katacontainers.io/kata-runtime=true -o name 2>/dev/null)" ]]; then
break
fi
if (( SECONDS >= label_wait_deadline )); then
echo "ERROR: Timed out waiting for kata-deploy install Jobs to label any node"
echo "::group::kata-deploy job-mode status (no node labeled)"
kubectl -n kube-system get jobs -l app.kubernetes.io/name=kata-deploy -o wide || true
kubectl -n kube-system get pods -l app.kubernetes.io/name=kata-deploy -o wide || true
kubectl -n kube-system describe jobs -l app.kubernetes.io/name=kata-deploy || true
kubectl -n kube-system logs -l app.kubernetes.io/name=kata-deploy --all-containers --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
return 1
fi
sleep 5
done
local pod_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
while true; do
if [[ -n "$(kubectl -n kube-system get pod -l "name=${pod_label_name}" -o name 2>/dev/null)" ]]; then
break
echo "::group::kata-deploy job-mode logs (current)"
kubectl_retry -n kube-system get jobs -l app.kubernetes.io/name=kata-deploy -o wide || true
kubectl_retry -n kube-system logs -l app.kubernetes.io/name=kata-deploy --all-containers --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
else
# helm --wait is ineffective for single-node clusters with maxUnavailable=1
# (the DaemonSet is considered ready with 0 ready pods). First wait until at
# least one kata-deploy pod exists, then wait on the pod readiness condition
# instead — the readiness probe (/readyz) returns 200 only after install
# completes (artifacts extracted, CRI restarted, node labeled).
local pod_label_name="kata-deploy"
local multi_install_suffix=""
multi_install_suffix="$(yq -r '.env.multiInstallSuffix // ""' "${values_yaml}")"
if [[ -n "${multi_install_suffix}" ]]; then
pod_label_name="${pod_label_name}-${multi_install_suffix}"
fi
if (( SECONDS >= pod_wait_deadline )); then
echo "ERROR: Timed out waiting for kata-deploy pod to be created"
echo "::group::kata-deploy daemonset status (no pod created)"
kubectl -n kube-system get ds -l "name=${pod_label_name}" -o wide || true
kubectl -n kube-system describe ds -l "name=${pod_label_name}" || true
local pod_wait_deadline=$((SECONDS + KATA_DEPLOY_WAIT_TIMEOUT))
while true; do
if [[ -n "$(kubectl -n kube-system get pod -l "name=${pod_label_name}" -o name 2>/dev/null)" ]]; then
break
fi
if (( SECONDS >= pod_wait_deadline )); then
echo "ERROR: Timed out waiting for kata-deploy pod to be created"
echo "::group::kata-deploy daemonset status (no pod created)"
kubectl -n kube-system get ds -l "name=${pod_label_name}" -o wide || true
kubectl -n kube-system describe ds -l "name=${pod_label_name}" || true
echo "::endgroup::"
return 1
fi
sleep 1
done
if ! kubectl -n kube-system wait pod -l "name=${pod_label_name}" --for=condition=Ready --timeout="${KATA_DEPLOY_WAIT_TIMEOUT}s"; then
echo "::group::kata-deploy pod describe (install timed out)"
kubectl -n kube-system describe pod -l "name=${pod_label_name}" || true
echo "::endgroup::"
echo "::group::kata-deploy logs (install timed out)"
kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
return 1
fi
sleep 1
done
if ! kubectl -n kube-system wait pod -l "name=${pod_label_name}" --for=condition=Ready --timeout="${KATA_DEPLOY_WAIT_TIMEOUT}s"; then
echo "::group::kata-deploy pod describe (install timed out)"
kubectl -n kube-system describe pod -l "name=${pod_label_name}" || true
echo "::endgroup::"
echo "::group::kata-deploy logs (install timed out)"
kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
kubectl -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
return 1
fi
echo "::group::kata-deploy logs (current)"
kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps || true
echo "::endgroup::"
echo "::group::kata-deploy logs (previous)"
kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
echo "::group::kata-deploy logs (current)"
kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --tail=-1 --timestamps || true
echo "::endgroup::"
echo "::group::kata-deploy logs (previous)"
kubectl_retry -n kube-system logs -l "name=${pod_label_name}" --all-containers --previous --tail=-1 --timestamps 2>/dev/null || true
echo "::endgroup::"
fi
echo "::group::Runtime classes"
kubectl_retry get runtimeclass

View File

@@ -8,6 +8,11 @@
debug: false
# Install Kata via short-lived per-node Jobs instead of an always-on DaemonSet.
# A tiny dispatcher Job fans out one install Job per selected node and exits, so
# nothing keeps running on the node once the install completes.
deploymentMode: job
snapshotter:
setup: ["nydus"]