tests: add kata-deploy lifecycle tests for restart resilience and cleanup

Add functional tests that cover two previously untested kata-deploy behaviors: 1. Restart resilience (regression test for #12761): deploys a long-running kata pod, triggers a kata-deploy DaemonSet restart via rollout restart, and verifies the kata pod survives with the same UID and zero additional container restarts. 2. Artifact cleanup: after helm uninstall, verifies that RuntimeClasses are removed, the kata-runtime node label is cleared, /opt/kata is gone from the host filesystem, and containerd remains healthy. 3. Artifact presence: after install, verifies /opt/kata and the shim binary exist on the host, RuntimeClasses are created, and the node is labeled. Host filesystem checks use a short-lived privileged pod with a hostPath mount to inspect the node directly. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
2026-04-04 19:16:12 +00:00 · 2026-04-01 11:28:26 +02:00
parent b4b62417ed
commit 2131147360
2 changed files with 214 additions and 0 deletions
--- a/tests/functional/kata-deploy/kata-deploy-lifecycle.bats
+++ b/tests/functional/kata-deploy/kata-deploy-lifecycle.bats
@@ -0,0 +1,213 @@
+#!/usr/bin/env bats
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Kata Deploy Lifecycle Tests
+#
+# Validates kata-deploy behavior during DaemonSet restarts and uninstalls:
+#
+# 1. Artifacts present: After install, kata artifacts exist on the host,
+#    RuntimeClasses are created, and the node is labeled.
+#
+# 2. Restart resilience: Running kata pods must survive a kata-deploy
+#    DaemonSet restart without crashing. (Regression test for #12761)
+#
+# 3. Artifact cleanup: After helm uninstall, kata artifacts must be
+#    fully removed from the host and containerd must remain healthy.
+#
+# Required environment variables:
+#   DOCKER_REGISTRY - Container registry for kata-deploy image
+#   DOCKER_REPO     - Repository name for kata-deploy image
+#   DOCKER_TAG      - Image tag to test
+#   KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.)
+#   KUBERNETES      - K8s distribution (microk8s, k3s, rke2, etc.)
+
+load "${BATS_TEST_DIRNAME}/../../common.bash"
+repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
+load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
+
+source "${BATS_TEST_DIRNAME}/lib/helm-deploy.bash"
+
+LIFECYCLE_POD_NAME="kata-lifecycle-test"
+
+# Run a command on the host node's filesystem using a short-lived privileged pod.
+# The host root is mounted at /host inside the pod.
+# Usage: run_on_host "test -d /host/opt/kata && echo YES || echo NO"
+run_on_host() {
+	local cmd="$1"
+	local node_name
+	node_name=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1)
+	local pod_name="host-exec-${RANDOM}"
+
+	kubectl run "${pod_name}" \
+		--image=quay.io/kata-containers/alpine-bash-curl:latest \
+		--restart=Never --rm -i \
+		--overrides="{
+			\"spec\": {
+				\"nodeName\": \"${node_name}\",
+				\"activeDeadlineSeconds\": 300,
+				\"tolerations\": [{\"operator\": \"Exists\"}],
+				\"containers\": [{
+					\"name\": \"exec\",
+					\"image\": \"quay.io/kata-containers/alpine-bash-curl:latest\",
+					\"imagePullPolicy\": \"IfNotPresent\",
+					\"command\": [\"sh\", \"-c\", \"${cmd}\"],
+					\"securityContext\": {\"privileged\": true},
+					\"volumeMounts\": [{\"name\": \"host\", \"mountPath\": \"/host\", \"readOnly\": true}]
+				}],
+				\"volumes\": [{\"name\": \"host\", \"hostPath\": {\"path\": \"/\"}}]
+			}
+		}"
+}
+
+setup_file() {
+	ensure_helm
+
+	echo "# Image: ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" >&3
+	echo "# Hypervisor: ${KATA_HYPERVISOR}" >&3
+	echo "# K8s distribution: ${KUBERNETES}" >&3
+	echo "# Deploying kata-deploy..." >&3
+	deploy_kata
+	echo "# kata-deploy deployed successfully" >&3
+}
+
+@test "Kata artifacts are present on host after install" {
+	echo "# Checking kata artifacts on host..." >&3
+
+	run run_on_host "test -d /host/opt/kata && echo PRESENT || echo MISSING"
+	echo "# /opt/kata directory: ${output}" >&3
+	[[ "${output}" == *"PRESENT"* ]]
+
+	run run_on_host "test -f /host/opt/kata/bin/containerd-shim-kata-v2 && echo FOUND || (test -f /host/opt/kata/runtime-rs/bin/containerd-shim-kata-v2 && echo FOUND || echo MISSING)"
+	echo "# containerd-shim-kata-v2: ${output}" >&3
+	[[ "${output}" == *"FOUND"* ]]
+
+	# RuntimeClasses must exist (filter out AKS-managed ones)
+	local rc_count
+	rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true)
+	echo "# Kata RuntimeClasses: ${rc_count}" >&3
+	[[ ${rc_count} -gt 0 ]]
+
+	# Node must have the kata-runtime label
+	local label
+	label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}')
+	echo "# Node label katacontainers.io/kata-runtime: ${label}" >&3
+	[[ "${label}" == "true" ]]
+}
+
+@test "DaemonSet restart does not crash running kata pods" {
+	# Create a long-running kata pod
+	cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${LIFECYCLE_POD_NAME}
+spec:
+  runtimeClassName: kata-${KATA_HYPERVISOR}
+  restartPolicy: Always
+  nodeSelector:
+    katacontainers.io/kata-runtime: "true"
+  containers:
+    - name: test
+      image: quay.io/kata-containers/alpine-bash-curl:latest
+      imagePullPolicy: IfNotPresent
+      command: ["sleep", "infinity"]
+EOF
+
+	echo "# Waiting for kata pod to be running..." >&3
+	kubectl wait --for=condition=Ready "pod/${LIFECYCLE_POD_NAME}" --timeout=120s
+
+	# Record pod identity before the DaemonSet restart
+	local pod_uid_before
+	pod_uid_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}')
+	local restart_count_before
+	restart_count_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}')
+	echo "# Pod UID before: ${pod_uid_before}, restarts: ${restart_count_before}" >&3
+
+	# Trigger a DaemonSet restart — this simulates what happens when a user
+	# changes a label, updates a config value, or does a rolling update.
+	echo "# Triggering kata-deploy DaemonSet restart..." >&3
+	kubectl -n "${HELM_NAMESPACE}" rollout restart daemonset/kata-deploy
+
+	echo "# Waiting for DaemonSet rollout to complete..." >&3
+	kubectl -n "${HELM_NAMESPACE}" rollout status daemonset/kata-deploy --timeout=300s
+
+	# On k3s/rke2 the new kata-deploy pod restarts the k3s service as
+	# part of install, which causes a brief API server outage. Wait for
+	# the node to become ready before querying pod status.
+	kubectl wait nodes --timeout=120s --all --for condition=Ready=True
+	echo "# Node is ready after DaemonSet rollout" >&3
+
+	# The kata pod must still be Running with the same UID and no extra restarts.
+	# Retry kubectl through any residual API unavailability.
+	local pod_phase=""
+	local retries=0
+	while [[ ${retries} -lt 30 ]]; do
+		pod_phase=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null) && break
+		retries=$((retries + 1))
+		sleep 2
+	done
+	echo "# Pod phase after restart: ${pod_phase}" >&3
+	[[ "${pod_phase}" == "Running" ]]
+
+	local pod_uid_after
+	pod_uid_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}')
+	echo "# Pod UID after: ${pod_uid_after}" >&3
+	[[ "${pod_uid_before}" == "${pod_uid_after}" ]]
+
+	local restart_count_after
+	restart_count_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}')
+	echo "# Restart count after: ${restart_count_after}" >&3
+	[[ "${restart_count_before}" == "${restart_count_after}" ]]
+
+	echo "# SUCCESS: Kata pod survived DaemonSet restart without crashing" >&3
+}
+
+@test "Artifacts are fully cleaned up after uninstall" {
+	echo "# Uninstalling kata-deploy..." >&3
+	uninstall_kata
+	echo "# Uninstall complete, verifying cleanup..." >&3
+
+	# Wait for node to recover — containerd restart during cleanup may
+	# cause brief unavailability (especially on k3s/rke2).
+	kubectl wait nodes --timeout=120s --all --for condition=Ready=True
+
+	# RuntimeClasses must be gone (filter out AKS-managed ones)
+	local rc_count
+	rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true)
+	echo "# Kata RuntimeClasses remaining: ${rc_count}" >&3
+	[[ ${rc_count} -eq 0 ]]
+
+	# Node label must be removed
+	local label
+	label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}' 2>/dev/null || echo "")
+	echo "# Node label after uninstall: '${label}'" >&3
+	[[ -z "${label}" ]]
+
+	# Kata artifacts must be removed from the host filesystem
+	echo "# Checking host filesystem for leftover artifacts..." >&3
+	run run_on_host "test -d /host/opt/kata && echo EXISTS || echo REMOVED"
+	echo "# /opt/kata: ${output}" >&3
+	[[ "${output}" == *"REMOVED"* ]]
+
+	# Containerd must still be healthy and reporting a valid version
+	local container_runtime_version
+	container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion)
+	echo "# Container runtime version: ${container_runtime_version}" >&3
+	[[ "${container_runtime_version}" != *"Unknown"* ]]
+
+	echo "# SUCCESS: All kata artifacts cleaned up, containerd healthy" >&3
+}
+
+teardown() {
+	if [[ "${BATS_TEST_NAME}" == *"restart"* ]]; then
+		kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true
+	fi
+}
+
+teardown_file() {
+	kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true
+	uninstall_kata 2>/dev/null || true
+}
--- a/tests/functional/kata-deploy/run-kata-deploy-tests.sh
+++ b/tests/functional/kata-deploy/run-kata-deploy-tests.sh
@@ -20,6 +20,7 @@ else
 	KATA_DEPLOY_TEST_UNION=( \
 		"kata-deploy.bats" \
 		"kata-deploy-custom-runtimes.bats" \
+		"kata-deploy-lifecycle.bats" \
 	)
 fi