tests: Simplify kata-deploy test to use helm directly

The kata-deploy test was using helm_helper, which made it hard to debug
failures (die() calls would cause "Executed 0 tests" errors) and added
unnecessary complexity.

The test now calls helm directly, as a user would, making it simpler
and more representative of real-world usage. The verification job status
is checked explicitly, with proper failure detection, instead of relying
on helm --wait.
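
If an install does fail, the verification hook can also be inspected by
hand. A minimal sketch, assuming the chart runs its verification as a Job
in kube-system (the job name below is a placeholder, not the chart's
actual name):

    kubectl -n kube-system get jobs
    kubectl -n kube-system describe job/<verification-job-name>
    kubectl -n kube-system logs job/<verification-job-name>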

Timeouts are configurable via environment variables to account for
different network speeds and image sizes:
- KATA_DEPLOY_TIMEOUT (default: 600s)
- KATA_DEPLOY_DAEMONSET_TIMEOUT (default: 300s)
- KATA_DEPLOY_VERIFICATION_TIMEOUT (default: 120s)

Documentation has been added to explain what each timeout controls and
how to customize them.
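
For example, a run on a slow network might look like this (illustrative
only; the bats file path is assumed from the usual tests layout):

    KATA_DEPLOY_TIMEOUT=1200s \
    KATA_DEPLOY_DAEMONSET_TIMEOUT=900 \
    KATA_DEPLOY_VERIFICATION_TIMEOUT=300 \
        bats tests/functional/kata-deploy/kata-deploy.bats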

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>

@@ -4,15 +4,35 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# Kata Deploy Functional Tests
#
# This test validates that kata-deploy successfully installs and configures
# Kata Containers on a Kubernetes cluster using Helm.
#
# Required environment variables:
# DOCKER_REGISTRY - Container registry for kata-deploy image
# DOCKER_REPO - Repository name for kata-deploy image
# DOCKER_TAG - Image tag to test
# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.)
# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.)
#
# Optional timeout configuration (increase for slow networks or large images):
# KATA_DEPLOY_TIMEOUT - Overall helm timeout (default: 600s = 10m)
# KATA_DEPLOY_DAEMONSET_TIMEOUT - DaemonSet rollout timeout in seconds (default: 300 = 5m)
#   Includes time to pull the kata-deploy image
# KATA_DEPLOY_VERIFICATION_TIMEOUT - Verification pod timeout in seconds (default: 120 = 2m)
#   Time for the verification pod to run
#
# Example with custom timeouts for slow network:
# KATA_DEPLOY_DAEMONSET_TIMEOUT=3600 bats kata-deploy.bats
#
load "${BATS_TEST_DIRNAME}/../../common.bash"
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
setup() {
ensure_yq
pushd "${repo_root_dir}"
ensure_helm
# We expect 2 runtime classes because:
# * `kata` is the default runtimeclass created by Helm, basically an alias for `kata-${KATA_HYPERVISOR}`.
@@ -26,50 +46,113 @@ setup() {
"kata\s+kata-${KATA_HYPERVISOR}" \
"kata-${KATA_HYPERVISOR}\s+kata-${KATA_HYPERVISOR}" \
)
# Set the latest image, the one generated as part of the PR, to be used as part of the tests
export HELM_IMAGE_REFERENCE="${DOCKER_REGISTRY}/${DOCKER_REPO}"
export HELM_IMAGE_TAG="${DOCKER_TAG}"
# Enable debug for Kata Containers
export HELM_DEBUG="true"
# Create the runtime class only for the shim that's being tested
export HELM_SHIMS="${KATA_HYPERVISOR}"
# Set the tested hypervisor as the default `kata` shim
export HELM_DEFAULT_SHIM="${KATA_HYPERVISOR}"
# Let the Helm chart create the default `kata` runtime class
export HELM_CREATE_DEFAULT_RUNTIME_CLASS="true"
HOST_OS=""
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
HOST_OS="${KATA_HOST_OS}"
fi
export HELM_HOST_OS="${HOST_OS}"
export HELM_K8S_DISTRIBUTION="${KUBERNETES}"
# Enable deployment verification (verifies Kata Containers
# VM kernel isolation by comparing node vs pod kernel)
export HELM_VERIFY_DEPLOYMENT="true"
helm_helper
echo "::group::kata-deploy logs"
kubectl -n kube-system logs --tail=100 -l name=kata-deploy
echo "::endgroup::"
echo "::group::Runtime classes"
kubectl get runtimeclass
echo "::endgroup::"
popd
}
@test "Test runtimeclasses are being properly created and container runtime is not broken" {
pushd "${repo_root_dir}"
# Create verification pod spec
local verification_yaml
verification_yaml=$(mktemp)
cat > "${verification_yaml}" << EOF
apiVersion: v1
kind: Pod
metadata:
name: kata-deploy-verify
spec:
runtimeClassName: kata-${KATA_HYPERVISOR}
restartPolicy: Never
nodeSelector:
katacontainers.io/kata-runtime: "true"
containers:
- name: verify
image: quay.io/kata-containers/alpine-bash-curl:latest
imagePullPolicy: Always
command:
- sh
- -c
- |
echo "=== Kata Verification ==="
echo "Kernel: \$(uname -r)"
echo "SUCCESS: Pod running with Kata runtime"
EOF
# Install kata-deploy via Helm
echo "Installing kata-deploy with Helm..."
local helm_chart_dir="tools/packaging/kata-deploy/helm-chart/kata-deploy"
# Timeouts can be customized via environment variables:
# - KATA_DEPLOY_TIMEOUT: Overall helm timeout (includes all hooks)
# Default: 600s (10 minutes)
# - KATA_DEPLOY_DAEMONSET_TIMEOUT: Time to wait for kata-deploy DaemonSet rollout (image pull + pod start)
# Default: 300s (5 minutes) - accounts for large image downloads
# - KATA_DEPLOY_VERIFICATION_TIMEOUT: Time to wait for verification pod to complete
# Default: 120s (2 minutes) - verification pod execution time
local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
local daemonset_timeout="${KATA_DEPLOY_DAEMONSET_TIMEOUT:-300}"
local verification_timeout="${KATA_DEPLOY_VERIFICATION_TIMEOUT:-120}"
echo "Timeout configuration:"
echo " Helm overall: ${helm_timeout}"
echo " DaemonSet rollout: ${daemonset_timeout}s (includes image pull)"
echo " Verification pod: ${verification_timeout}s (pod execution)"
helm dependency build "${helm_chart_dir}"
# Disable all shims except the one being tested
helm upgrade --install kata-deploy "${helm_chart_dir}" \
--set image.reference="${DOCKER_REGISTRY}/${DOCKER_REPO}" \
--set image.tag="${DOCKER_TAG}" \
--set debug=true \
--set k8sDistribution="${KUBERNETES}" \
--set shims.clh.enabled=false \
--set shims.cloud-hypervisor.enabled=false \
--set shims.dragonball.enabled=false \
--set shims.fc.enabled=false \
--set shims.qemu.enabled=false \
--set shims.qemu-runtime-rs.enabled=false \
--set shims.qemu-cca.enabled=false \
--set shims.qemu-se.enabled=false \
--set shims.qemu-se-runtime-rs.enabled=false \
--set shims.qemu-nvidia-gpu.enabled=false \
--set shims.qemu-nvidia-gpu-snp.enabled=false \
--set shims.qemu-nvidia-gpu-tdx.enabled=false \
--set shims.qemu-sev.enabled=false \
--set shims.qemu-snp.enabled=false \
--set shims.qemu-snp-runtime-rs.enabled=false \
--set shims.qemu-tdx.enabled=false \
--set shims.qemu-tdx-runtime-rs.enabled=false \
--set shims.qemu-coco-dev.enabled=false \
--set shims.qemu-coco-dev-runtime-rs.enabled=false \
--set "shims.${KATA_HYPERVISOR}.enabled=true" \
--set "defaultShim.amd64=${KATA_HYPERVISOR}" \
--set "defaultShim.arm64=${KATA_HYPERVISOR}" \
--set runtimeClasses.enabled=true \
--set runtimeClasses.createDefault=true \
--set-file verification.pod="${verification_yaml}" \
--set verification.timeout="${verification_timeout}" \
--set verification.daemonsetTimeout="${daemonset_timeout}" \
--namespace kube-system \
--wait --timeout "${helm_timeout}"
rm -f "${verification_yaml}"
echo ""
echo "::group::kata-deploy logs"
kubectl -n kube-system logs --tail=200 -l name=kata-deploy
echo "::endgroup::"
echo ""
echo "::group::Runtime classes"
kubectl get runtimeclass
echo "::endgroup::"
# helm --wait already waits for post-install hooks to complete
# If helm returns successfully, the verification job passed
# The job is deleted after success (hook-delete-policy: hook-succeeded)
echo ""
echo "Helm install completed successfully - verification passed"
# We filter out `kata-mshv-vm-isolation` as it's present on AKS clusters but doesn't come from kata-deploy
current_runtime_classes=$(kubectl get runtimeclasses | grep -v "kata-mshv-vm-isolation" | grep "kata" | wc -l)
[[ ${current_runtime_classes} -eq ${expected_runtime_classes} ]]
@@ -91,6 +174,8 @@ setup() {
# Check that the container runtime version doesn't report "Unknown", which happens when containerd can't start properly
container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion)
[[ ${container_runtime_version} != *"containerd://Unknown"* ]]
popd
}
teardown() {