From 213114736014b6c1f234cc1ca2109336e1a42c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 1 Apr 2026 11:28:26 +0200 Subject: [PATCH] tests: add kata-deploy lifecycle tests for restart resilience and cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add functional tests that cover two previously untested kata-deploy behaviors: 1. Restart resilience (regression test for #12761): deploys a long-running kata pod, triggers a kata-deploy DaemonSet restart via rollout restart, and verifies the kata pod survives with the same UID and zero additional container restarts. 2. Artifact cleanup: after helm uninstall, verifies that RuntimeClasses are removed, the kata-runtime node label is cleared, /opt/kata is gone from the host filesystem, and containerd remains healthy. 3. Artifact presence: after install, verifies /opt/kata and the shim binary exist on the host, RuntimeClasses are created, and the node is labeled. Host filesystem checks use a short-lived privileged pod with a hostPath mount to inspect the node directly. Signed-off-by: Fabiano Fidêncio --- .../kata-deploy/kata-deploy-lifecycle.bats | 213 ++++++++++++++++++ .../kata-deploy/run-kata-deploy-tests.sh | 1 + 2 files changed, 214 insertions(+) create mode 100644 tests/functional/kata-deploy/kata-deploy-lifecycle.bats diff --git a/tests/functional/kata-deploy/kata-deploy-lifecycle.bats b/tests/functional/kata-deploy/kata-deploy-lifecycle.bats new file mode 100644 index 0000000000..1c883b2c4a --- /dev/null +++ b/tests/functional/kata-deploy/kata-deploy-lifecycle.bats @@ -0,0 +1,213 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Kata Deploy Lifecycle Tests +# +# Validates kata-deploy behavior during DaemonSet restarts and uninstalls: +# +# 1. Artifacts present: After install, kata artifacts exist on the host, +# RuntimeClasses are created, and the node is labeled. 
+# +# 2. Restart resilience: Running kata pods must survive a kata-deploy +# DaemonSet restart without crashing. (Regression test for #12761) +# +# 3. Artifact cleanup: After helm uninstall, kata artifacts must be +# fully removed from the host and containerd must remain healthy. +# +# Required environment variables: +# DOCKER_REGISTRY - Container registry for kata-deploy image +# DOCKER_REPO - Repository name for kata-deploy image +# DOCKER_TAG - Image tag to test +# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.) +# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.) + +load "${BATS_TEST_DIRNAME}/../../common.bash" +repo_root_dir="${BATS_TEST_DIRNAME}/../../../" +load "${repo_root_dir}/tests/gha-run-k8s-common.sh" + +source "${BATS_TEST_DIRNAME}/lib/helm-deploy.bash" + +LIFECYCLE_POD_NAME="kata-lifecycle-test" + +# Run a command on the host node's filesystem using a short-lived privileged pod. +# The host root is mounted at /host inside the pod. +# Usage: run_on_host "test -d /host/opt/kata && echo YES || echo NO" +run_on_host() { + local cmd="$1" + local node_name + node_name=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1) + local pod_name="host-exec-${RANDOM}" + + kubectl run "${pod_name}" \ + --image=quay.io/kata-containers/alpine-bash-curl:latest \ + --restart=Never --rm -i \ + --overrides="{ + \"spec\": { + \"nodeName\": \"${node_name}\", + \"activeDeadlineSeconds\": 300, + \"tolerations\": [{\"operator\": \"Exists\"}], + \"containers\": [{ + \"name\": \"exec\", + \"image\": \"quay.io/kata-containers/alpine-bash-curl:latest\", + \"imagePullPolicy\": \"IfNotPresent\", + \"command\": [\"sh\", \"-c\", \"${cmd}\"], + \"securityContext\": {\"privileged\": true}, + \"volumeMounts\": [{\"name\": \"host\", \"mountPath\": \"/host\", \"readOnly\": true}] + }], + \"volumes\": [{\"name\": \"host\", \"hostPath\": {\"path\": \"/\"}}] + } + }" +} + +setup_file() { + ensure_helm + + echo "# Image: 
${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" >&3
    echo "# Hypervisor: ${KATA_HYPERVISOR}" >&3
    echo "# K8s distribution: ${KUBERNETES}" >&3
    echo "# Deploying kata-deploy..." >&3
    deploy_kata
    echo "# kata-deploy deployed successfully" >&3
}

@test "Kata artifacts are present on host after install" {
    echo "# Checking kata artifacts on host..." >&3

    run run_on_host "test -d /host/opt/kata && echo PRESENT || echo MISSING"
    echo "# /opt/kata directory: ${output}" >&3
    [[ "${output}" == *"PRESENT"* ]]

    run run_on_host "test -f /host/opt/kata/bin/containerd-shim-kata-v2 && echo FOUND || (test -f /host/opt/kata/runtime-rs/bin/containerd-shim-kata-v2 && echo FOUND || echo MISSING)"
    echo "# containerd-shim-kata-v2: ${output}" >&3
    [[ "${output}" == *"FOUND"* ]]

    # RuntimeClasses must exist (filter out AKS-managed ones)
    local rc_count
    rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true)
    echo "# Kata RuntimeClasses: ${rc_count}" >&3
    [[ ${rc_count} -gt 0 ]]

    # Node must have the kata-runtime label
    local label
    label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}')
    echo "# Node label katacontainers.io/kata-runtime: ${label}" >&3
    [[ "${label}" == "true" ]]
}

@test "DaemonSet restart does not crash running kata pods" {
    # Create a long-running kata pod
    # NOTE(review): this heredoc was garbled in transit ("cat <&3"); the pod
    # spec below is reconstructed from the commit message and surrounding
    # remnants — verify it matches the original author's intent.
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${LIFECYCLE_POD_NAME}
spec:
  runtimeClassName: kata-${KATA_HYPERVISOR}
  containers:
  - name: test
    image: quay.io/kata-containers/alpine-bash-curl:latest
    command: ["sleep", "infinity"]
EOF

    echo "# Waiting for pod/${LIFECYCLE_POD_NAME} to become Ready..." >&3
    kubectl wait --for=condition=Ready "pod/${LIFECYCLE_POD_NAME}" --timeout=120s

    # Record pod identity before the DaemonSet restart
    local pod_uid_before
    pod_uid_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}')
    local restart_count_before
    restart_count_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}')
    echo "# Pod UID before: ${pod_uid_before}, restarts: ${restart_count_before}" >&3

    # Trigger a DaemonSet restart — this simulates what happens when a user
# changes a label, updates a config value, or does a rolling update. + echo "# Triggering kata-deploy DaemonSet restart..." >&3 + kubectl -n "${HELM_NAMESPACE}" rollout restart daemonset/kata-deploy + + echo "# Waiting for DaemonSet rollout to complete..." >&3 + kubectl -n "${HELM_NAMESPACE}" rollout status daemonset/kata-deploy --timeout=300s + + # On k3s/rke2 the new kata-deploy pod restarts the k3s service as + # part of install, which causes a brief API server outage. Wait for + # the node to become ready before querying pod status. + kubectl wait nodes --timeout=120s --all --for condition=Ready=True + echo "# Node is ready after DaemonSet rollout" >&3 + + # The kata pod must still be Running with the same UID and no extra restarts. + # Retry kubectl through any residual API unavailability. + local pod_phase="" + local retries=0 + while [[ ${retries} -lt 30 ]]; do + pod_phase=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null) && break + retries=$((retries + 1)) + sleep 2 + done + echo "# Pod phase after restart: ${pod_phase}" >&3 + [[ "${pod_phase}" == "Running" ]] + + local pod_uid_after + pod_uid_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}') + echo "# Pod UID after: ${pod_uid_after}" >&3 + [[ "${pod_uid_before}" == "${pod_uid_after}" ]] + + local restart_count_after + restart_count_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}') + echo "# Restart count after: ${restart_count_after}" >&3 + [[ "${restart_count_before}" == "${restart_count_after}" ]] + + echo "# SUCCESS: Kata pod survived DaemonSet restart without crashing" >&3 +} + +@test "Artifacts are fully cleaned up after uninstall" { + echo "# Uninstalling kata-deploy..." >&3 + uninstall_kata + echo "# Uninstall complete, verifying cleanup..." >&3 + + # Wait for node to recover — containerd restart during cleanup may + # cause brief unavailability (especially on k3s/rke2). 
+ kubectl wait nodes --timeout=120s --all --for condition=Ready=True + + # RuntimeClasses must be gone (filter out AKS-managed ones) + local rc_count + rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true) + echo "# Kata RuntimeClasses remaining: ${rc_count}" >&3 + [[ ${rc_count} -eq 0 ]] + + # Node label must be removed + local label + label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}' 2>/dev/null || echo "") + echo "# Node label after uninstall: '${label}'" >&3 + [[ -z "${label}" ]] + + # Kata artifacts must be removed from the host filesystem + echo "# Checking host filesystem for leftover artifacts..." >&3 + run run_on_host "test -d /host/opt/kata && echo EXISTS || echo REMOVED" + echo "# /opt/kata: ${output}" >&3 + [[ "${output}" == *"REMOVED"* ]] + + # Containerd must still be healthy and reporting a valid version + local container_runtime_version + container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion) + echo "# Container runtime version: ${container_runtime_version}" >&3 + [[ "${container_runtime_version}" != *"Unknown"* ]] + + echo "# SUCCESS: All kata artifacts cleaned up, containerd healthy" >&3 +} + +teardown() { + if [[ "${BATS_TEST_NAME}" == *"restart"* ]]; then + kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true + fi +} + +teardown_file() { + kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true + uninstall_kata 2>/dev/null || true +} diff --git a/tests/functional/kata-deploy/run-kata-deploy-tests.sh b/tests/functional/kata-deploy/run-kata-deploy-tests.sh index 33beb14c4f..eff06efaa2 100644 --- a/tests/functional/kata-deploy/run-kata-deploy-tests.sh +++ b/tests/functional/kata-deploy/run-kata-deploy-tests.sh @@ -20,6 +20,7 @@ else KATA_DEPLOY_TEST_UNION=( \ 
"kata-deploy.bats" \ "kata-deploy-custom-runtimes.bats" \ + "kata-deploy-lifecycle.bats" \ ) fi