tests: Simplify kata-deploy test to use helm directly

The kata-deploy test was using helm_helper, which made it hard to debug
failures (die() calls would cause "Executed 0 tests" errors) and added
unnecessary complexity.

The test now calls helm directly, as a user would, making it simpler
and more representative of real-world usage. The verification job status
is checked explicitly, with proper failure detection, instead of relying
on helm --wait.
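
If an install does fail, the verification hook can also be inspected by
hand. A minimal sketch, assuming the chart runs its verification as a Job
in kube-system (the job name below is a placeholder, not the chart's
actual name):

    kubectl -n kube-system get jobs
    kubectl -n kube-system describe job/<verification-job-name>
    kubectl -n kube-system logs job/<verification-job-name>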

Timeouts are configurable via environment variables to account for
different network speeds and image sizes:
- KATA_DEPLOY_TIMEOUT (default: 600s)
- KATA_DEPLOY_DAEMONSET_TIMEOUT (default: 300s)
- KATA_DEPLOY_VERIFICATION_TIMEOUT (default: 120s)

Documentation has been added to explain what each timeout controls and
how to customize them.
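
For example, a run on a slow network might look like this (illustrative
only; the bats file path is assumed from the usual tests layout):

    KATA_DEPLOY_TIMEOUT=1200s \
    KATA_DEPLOY_DAEMONSET_TIMEOUT=900 \
    KATA_DEPLOY_VERIFICATION_TIMEOUT=300 \
        bats tests/functional/kata-deploy/kata-deploy.bats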

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>

@@ -4,15 +4,35 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# Kata Deploy Functional Tests
#
# This test validates that kata-deploy successfully installs and configures
# Kata Containers on a Kubernetes cluster using Helm.
#
# Required environment variables:
# DOCKER_REGISTRY - Container registry for kata-deploy image
# DOCKER_REPO - Repository name for kata-deploy image
# DOCKER_TAG - Image tag to test
# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.)
# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.)
#
# Optional timeout configuration (increase for slow networks or large images):
# KATA_DEPLOY_TIMEOUT - Overall helm timeout (default: 600s = 10m)
# KATA_DEPLOY_DAEMONSET_TIMEOUT - DaemonSet rollout timeout in seconds (default: 300 = 5m)
#   Includes time to pull the kata-deploy image
# KATA_DEPLOY_VERIFICATION_TIMEOUT - Verification pod timeout in seconds (default: 120 = 2m)
#   Time for the verification pod to run
#
# Example with custom timeouts for slow network:
# KATA_DEPLOY_DAEMONSET_TIMEOUT=3600 bats kata-deploy.bats
#
load "${BATS_TEST_DIRNAME}/../../common.bash"
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
setup() {
ensure_yq
pushd "${repo_root_dir}"
ensure_helm
# We expect 2 runtime classes because:
# * `kata` is the default runtimeclass created by Helm, basically an alias for `kata-${KATA_HYPERVISOR}`.
@@ -26,50 +46,113 @@ setup() {
"kata\s+kata-${KATA_HYPERVISOR}" \
"kata-${KATA_HYPERVISOR}\s+kata-${KATA_HYPERVISOR}" \
)
# Set the latest image, the one generated as part of the PR, to be used as part of the tests
export HELM_IMAGE_REFERENCE="${DOCKER_REGISTRY}/${DOCKER_REPO}"
export HELM_IMAGE_TAG="${DOCKER_TAG}"
# Enable debug for Kata Containers
export HELM_DEBUG="true"
# Create the runtime class only for the shim that's being tested
export HELM_SHIMS="${KATA_HYPERVISOR}"
# Set the tested hypervisor as the default `kata` shim
export HELM_DEFAULT_SHIM="${KATA_HYPERVISOR}"
# Let the Helm chart create the default `kata` runtime class
export HELM_CREATE_DEFAULT_RUNTIME_CLASS="true"
HOST_OS=""
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
HOST_OS="${KATA_HOST_OS}"
fi
export HELM_HOST_OS="${HOST_OS}"
export HELM_K8S_DISTRIBUTION="${KUBERNETES}"
# Enable deployment verification (verifies Kata Containers
# VM kernel isolation by comparing node vs pod kernel)
export HELM_VERIFY_DEPLOYMENT="true"
helm_helper
echo "::group::kata-deploy logs"
kubectl -n kube-system logs --tail=100 -l name=kata-deploy
echo "::endgroup::"
echo "::group::Runtime classes"
kubectl get runtimeclass
echo "::endgroup::"
popd
}
@test "Test runtimeclasses are being properly created and container runtime is not broken" {
pushd "${repo_root_dir}"
# Create verification pod spec
local verification_yaml
verification_yaml=$(mktemp)
cat > "${verification_yaml}" << EOF
apiVersion: v1
kind: Pod
metadata:
name: kata-deploy-verify
spec:
runtimeClassName: kata-${KATA_HYPERVISOR}
restartPolicy: Never
nodeSelector:
katacontainers.io/kata-runtime: "true"
containers:
- name: verify
image: quay.io/kata-containers/alpine-bash-curl:latest
imagePullPolicy: Always
command:
- sh
- -c
- |
echo "=== Kata Verification ==="
echo "Kernel: \$(uname -r)"
echo "SUCCESS: Pod running with Kata runtime"
EOF
# Install kata-deploy via Helm
echo "Installing kata-deploy with Helm..."
local helm_chart_dir="tools/packaging/kata-deploy/helm-chart/kata-deploy"
# Timeouts can be customized via environment variables:
# - KATA_DEPLOY_TIMEOUT: Overall helm timeout (includes all hooks)
# Default: 600s (10 minutes)
# - KATA_DEPLOY_DAEMONSET_TIMEOUT: Time to wait for kata-deploy DaemonSet rollout (image pull + pod start)
# Default: 300s (5 minutes) - accounts for large image downloads
# - KATA_DEPLOY_VERIFICATION_TIMEOUT: Time to wait for verification pod to complete
# Default: 120s (2 minutes) - verification pod execution time
local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
local daemonset_timeout="${KATA_DEPLOY_DAEMONSET_TIMEOUT:-300}"
local verification_timeout="${KATA_DEPLOY_VERIFICATION_TIMEOUT:-120}"
echo "Timeout configuration:"
echo " Helm overall: ${helm_timeout}"
echo " DaemonSet rollout: ${daemonset_timeout}s (includes image pull)"
echo " Verification pod: ${verification_timeout}s (pod execution)"
helm dependency build "${helm_chart_dir}"
# Disable all shims except the one being tested
helm upgrade --install kata-deploy "${helm_chart_dir}" \
--set image.reference="${DOCKER_REGISTRY}/${DOCKER_REPO}" \
--set image.tag="${DOCKER_TAG}" \
--set debug=true \
--set k8sDistribution="${KUBERNETES}" \
--set shims.clh.enabled=false \
--set shims.cloud-hypervisor.enabled=false \
--set shims.dragonball.enabled=false \
--set shims.fc.enabled=false \
--set shims.qemu.enabled=false \
--set shims.qemu-runtime-rs.enabled=false \
--set shims.qemu-cca.enabled=false \
--set shims.qemu-se.enabled=false \
--set shims.qemu-se-runtime-rs.enabled=false \
--set shims.qemu-nvidia-gpu.enabled=false \
--set shims.qemu-nvidia-gpu-snp.enabled=false \
--set shims.qemu-nvidia-gpu-tdx.enabled=false \
--set shims.qemu-sev.enabled=false \
--set shims.qemu-snp.enabled=false \
--set shims.qemu-snp-runtime-rs.enabled=false \
--set shims.qemu-tdx.enabled=false \
--set shims.qemu-tdx-runtime-rs.enabled=false \
--set shims.qemu-coco-dev.enabled=false \
--set shims.qemu-coco-dev-runtime-rs.enabled=false \
--set "shims.${KATA_HYPERVISOR}.enabled=true" \
--set "defaultShim.amd64=${KATA_HYPERVISOR}" \
--set "defaultShim.arm64=${KATA_HYPERVISOR}" \
--set runtimeClasses.enabled=true \
--set runtimeClasses.createDefault=true \
--set-file verification.pod="${verification_yaml}" \
--set verification.timeout="${verification_timeout}" \
--set verification.daemonsetTimeout="${daemonset_timeout}" \
--namespace kube-system \
--wait --timeout "${helm_timeout}"
rm -f "${verification_yaml}"
echo ""
echo "::group::kata-deploy logs"
kubectl -n kube-system logs --tail=200 -l name=kata-deploy
echo "::endgroup::"
echo ""
echo "::group::Runtime classes"
kubectl get runtimeclass
echo "::endgroup::"
# helm --wait already waits for post-install hooks to complete
# If helm returns successfully, the verification job passed
# The job is deleted after success (hook-delete-policy: hook-succeeded)
echo ""
echo "Helm install completed successfully - verification passed"
# We filter out `kata-mshv-vm-isolation` as it's present on AKS clusters but doesn't come from kata-deploy
current_runtime_classes=$(kubectl get runtimeclasses | grep -v "kata-mshv-vm-isolation" | grep "kata" | wc -l)
[[ ${current_runtime_classes} -eq ${expected_runtime_classes} ]]
@@ -91,6 +174,8 @@ setup() {
# Check that the container runtime version doesn't report "Unknown", which happens when containerd can't start properly
container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion)
[[ ${container_runtime_version} != *"containerd://Unknown"* ]]
popd
}
teardown() {