tests: add kata-monitor helm chart k8s test

Add a single-job k8s test that installs the kata-deploy helm chart with monitor.enabled=true, pointed at the per-PR kata-monitor image built earlier in the same run, and exercises both the rollout and the user-visible behaviour:

* the kata-monitor DaemonSet rolls out and the pod stays up without container restarts;
* a real kata-runtime probe pod is scheduled, then /metrics and /sandboxes are scraped through the apiserver pod-proxy to prove kata-monitor sees the sandbox (non-zero running-shim count plus at least one per-sandbox kata_shim_* metric);
* after the probe pod is deleted, /metrics drops back to a zero running-shim count.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: OpenAI Codex <codex@openai.com>
2026-07-01 22:50:54 +00:00 · 2026-06-03 12:58:31 +02:00
parent 285d5daa23
commit 92a9691470
5 changed files with 398 additions and 0 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -305,6 +305,20 @@ jobs:
      target-branch: ${{ inputs.target-branch }}
      kata-monitor-image: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci:${{ inputs.tag }}-amd64

+  run-kata-monitor-k8s-tests:
+    if: ${{ inputs.skip-test != 'yes' }}
+    needs: [publish-kata-deploy-payload-amd64, publish-kata-monitor-image-amd64]  # waits for both publish jobs
+    uses: ./.github/workflows/run-kata-monitor-k8s-tests.yaml
+    with:
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
+      pr-number: ${{ inputs.pr-number }}
+      commit-hash: ${{ inputs.commit-hash }}
+      target-branch: ${{ inputs.target-branch }}
+      kata-monitor-image-reference: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci  # repository only, no tag
+      kata-monitor-image-tag: ${{ inputs.tag }}-amd64  # tag is passed separately from the reference above
+
  run-k8s-tests-on-aks:
    if: ${{ inputs.skip-test != 'yes' }}
    needs: publish-kata-deploy-payload-amd64
--- a/.github/workflows/run-kata-monitor-k8s-tests.yaml
+++ b/.github/workflows/run-kata-monitor-k8s-tests.yaml
@@ -0,0 +1,110 @@
+name: CI | Run kata-monitor k8s tests
+on:
+  workflow_call:
+    inputs:
+      registry:
+        description: Registry of the kata-deploy image under test.
+        required: true
+        type: string
+      repo:
+        description: Repository of the kata-deploy image under test.
+        required: true
+        type: string
+      tag:
+        description: Tag of the kata-deploy image under test.
+        required: true
+        type: string
+      pr-number:  # exported to the job environment as GH_PR_NUMBER
+        required: true
+        type: string
+      commit-hash:  # ref checked out by the job's checkout step
+        required: false
+        type: string
+      target-branch:  # handed to the rebase step as TARGET_BRANCH
+        required: false
+        type: string
+        default: ""
+      kata-monitor-image-reference:
+        description: Registry/repo of the kata-monitor image to exercise.
+        required: true
+        type: string
+      kata-monitor-image-tag:
+        description: Tag of the kata-monitor image to exercise.
+        required: true
+        type: string
+
+concurrency:  # one group per workflow and PR number (or ref when there is no PR)
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-kata-monitor-k8s
+  cancel-in-progress: true  # a newer run in the same group cancels the one in flight
+
+permissions: {}  # empty mapping: no token permissions are granted at workflow level
+
+jobs:
+  run-kata-monitor-k8s-tests:
+    name: run-kata-monitor-k8s-tests
+    strategy:
+      fail-fast: false
+      matrix:
+        vmm:
+          - qemu
+        k8s:
+          - vanilla
+    concurrency:
+      group: ${{ github.workflow }}-run-kata-monitor-k8s-tests-${{ github.event.pull_request.number || github.ref }}-${{ toJSON(matrix) }}
+      cancel-in-progress: true
+    runs-on: ubuntu-24.04
+    env:
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      KATA_HYPERVISOR: ${{ matrix.vmm }}
+      KUBERNETES: ${{ matrix.k8s }}
+      CONTAINER_ENGINE: containerd
+      CONTAINER_ENGINE_VERSION: active
+      KATA_MONITOR_IMAGE_REFERENCE: ${{ inputs.kata-monitor-image-reference }}
+      KATA_MONITOR_IMAGE_TAG: ${{ inputs.kata-monitor-image-tag }}
+      GH_TOKEN: ${{ github.token }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0  # fetch the full history rather than a shallow clone
+          persist-credentials: false
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Remove unnecessary directories to free up space
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          sudo rm -rf /usr/lib/jvm
+          sudo rm -rf /usr/share/swift
+          sudo rm -rf /usr/local/share/powershell
+          sudo rm -rf /usr/local/julia*
+          sudo rm -rf /opt/az
+          sudo rm -rf /usr/local/share/chromium
+          sudo rm -rf /opt/microsoft
+          sudo rm -rf /opt/google
+          sudo rm -rf /usr/lib/firefox
+
+      - name: Deploy ${{ matrix.k8s }}
+        run: bash tests/functional/kata-deploy/gha-run.sh deploy-k8s
+
+      - name: Install `bats`
+        run: bash tests/functional/kata-deploy/gha-run.sh install-bats
+
+      - name: Run tests
+        run: bash tests/functional/kata-monitor/gha-run.sh run-helm-tests
+
+      - name: Report tests
+        if: always()  # report even when the test step above failed
+        run: bash tests/functional/kata-monitor/gha-run.sh report-helm-tests
--- a/tests/functional/kata-monitor/gha-run.sh
+++ b/tests/functional/kata-monitor/gha-run.sh
@@ -63,12 +63,27 @@ function run() {
 	bash "${kata_monitor_dir}/kata-monitor-tests.sh"
 }

+function run_helm_tests() {
+	info "Running kata-monitor helm-chart tests"
+	# Run from the suite directory: the script below is invoked by a relative path.
+	# shellcheck disable=SC2154
+	pushd "${kata_monitor_dir}"
+	bash run-kata-monitor-helm-tests.sh
+	popd
+}
+
+function report_helm_tests() {  # hands the kata-monitor test directory to report_bats_tests
+	report_bats_tests "${kata_monitor_dir}"
+}
+
 function main() {
 	action="${1:-}"
 	case "${action}" in
 		install-dependencies) install_dependencies ;;
 		install-kata) install_kata ;;
 		run) run ;;
+		run-helm-tests) run_helm_tests ;;  # runs run-kata-monitor-helm-tests.sh
+		report-helm-tests) report_helm_tests ;;  # reports the bats results of that run
 		*) >&2 die "Invalid argument" ;;
 	esac
 }
--- a/tests/functional/kata-monitor/kata-monitor.bats
+++ b/tests/functional/kata-monitor/kata-monitor.bats
@@ -0,0 +1,233 @@
+#!/usr/bin/env bats
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# kata-monitor helm chart functional test.
+#
+# Validates that the optional kata-monitor DaemonSet shipped by the
+# kata-deploy helm chart actually rolls out, exercises the per-PR
+# kata-monitor image, and exposes per-sandbox Prometheus metrics for a
+# live kata pod.
+#
+# Required environment variables (mirroring kata-deploy.bats):
+#   DOCKER_REGISTRY                - Registry for kata-deploy image
+#   DOCKER_REPO                    - Repository name for kata-deploy image
+#   DOCKER_TAG                     - kata-deploy image tag to test
+#   KATA_HYPERVISOR                - Hypervisor to test (qemu, ...)
+#   KUBERNETES                     - K8s distribution (vanilla, k3s, k0s, ...)
+#   KATA_MONITOR_IMAGE_REFERENCE   - Registry/repo for the kata-monitor image
+#   KATA_MONITOR_IMAGE_TAG         - kata-monitor image tag to test
+#
+
+load "${BATS_TEST_DIRNAME}/../../common.bash"
+repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
+load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
+
+# Reuse the helm install/uninstall helpers maintained alongside the
+# kata-deploy bats tests.
+source "${BATS_TEST_DIRNAME}/../kata-deploy/lib/helm-deploy.bash"
+
+# Cache update polling interval is short on the monitor side; 30s is
+# generous enough to absorb both the cache refresh and any apiserver
+# proxy latency without dragging the test runtime up.
+KATA_MONITOR_CACHE_TIMEOUT_S="${KATA_MONITOR_CACHE_TIMEOUT_S:-30}"
+
+# Pod name used to give kata-monitor a real sandbox to surface metrics
+# about. Kept fixed so teardown can clean it up unconditionally.
+KATA_MONITOR_PROBE_POD="kata-monitor-probe"
+
+setup() {
+	ensure_helm
+	# Abort with an explicit message when either image variable is unset or empty.
+	: "${KATA_MONITOR_IMAGE_REFERENCE:?KATA_MONITOR_IMAGE_REFERENCE must be set}"
+	: "${KATA_MONITOR_IMAGE_TAG:?KATA_MONITOR_IMAGE_TAG must be set}"
+}
+
+# Fetch an HTTP endpoint (e.g. /metrics or /sandboxes) from the first listed
+# kata-monitor pod through the apiserver pod-proxy on port 8090. The body goes
+# to stdout; kubectl errors surface on stderr together with a non-zero status.
+kata_monitor_get() {
+	local endpoint="$1"
+	local monitor_pod
+
+	monitor_pod="$(kubectl -n "${HELM_NAMESPACE}" get pods \
+		--selector app.kubernetes.io/name=kata-monitor \
+		--output jsonpath='{.items[0].metadata.name}')"
+	if [[ -z "${monitor_pod}" ]]; then echo "no kata-monitor pod found" >&2; return 1; fi
+
+	kubectl -n "${HELM_NAMESPACE}" get --raw \
+		"/api/v1/namespaces/${HELM_NAMESPACE}/pods/${monitor_pod}:8090/proxy${endpoint}"
+}
+
+# Poll kata-monitor's /metrics, sleeping one second between attempts, until
+# the bash function named by `predicate` (fed the body on stdin) succeeds.
+# Gives up once ${KATA_MONITOR_CACHE_TIMEOUT_S} seconds have elapsed.
+wait_for_metrics() {
+	local predicate="$1" metrics
+	local give_up_at=$((SECONDS + KATA_MONITOR_CACHE_TIMEOUT_S))
+
+	until (( SECONDS >= give_up_at )); do
+		if metrics="$(kata_monitor_get /metrics 2>/dev/null)"; then
+			if printf '%s' "${metrics}" | "${predicate}"; then
+				return 0
+			fi
+		fi
+		sleep 1
+	done
+
+	echo "Timed out waiting for kata-monitor /metrics predicate '${predicate}'" >&2
+	echo "Last response was:" >&2
+	printf '%s\n' "${metrics:-<none>}" | head -n 50 >&2
+	return 1
+}
+
+predicate_has_running_shim() {
+	# Succeeds when stdin has a `kata_monitor_running_shim_count <N>` line with N >= 1.
+	grep -E '^kata_monitor_running_shim_count [1-9][0-9]* *$' >/dev/null
+}
+
+predicate_no_running_shim() {  # succeeds when stdin reports a running-shim count of exactly 0
+	grep -E '^kata_monitor_running_shim_count 0 *$' >/dev/null
+}
+
+predicate_has_shim_metric() {
+	# Succeeds when stdin has a kata_shim_* sample whose label set carries a
+	# sandbox_id made of hex digits or dashes, i.e. a per-sandbox metric line.
+	grep -E '^kata_shim_[a-z_]+\{[^}]*sandbox_id="[0-9a-f-]+' >/dev/null
+}
+
+@test "kata-monitor helm chart rolls out and exposes per-sandbox metrics" {
+	pushd "${repo_root_dir}"
+
+	local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
+	local rollout_timeout="${KATA_MONITOR_ROLLOUT_TIMEOUT:-300s}"
+
+	echo "Installing kata-deploy with monitor.enabled=true ..."
+	echo "  kata-monitor image: ${KATA_MONITOR_IMAGE_REFERENCE}:${KATA_MONITOR_IMAGE_TAG}"
+
+	HELM_TIMEOUT="${helm_timeout}" deploy_kata "" \
+		--set monitor.enabled=true \
+		--set "monitor.image.reference=${KATA_MONITOR_IMAGE_REFERENCE}" \
+		--set "monitor.image.tag=${KATA_MONITOR_IMAGE_TAG}"
+
+	echo ""
+	echo "::group::kata-monitor DaemonSet rollout"
+	kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
+		--timeout="${rollout_timeout}"
+	echo "::endgroup::"
+
+	kubectl -n "${HELM_NAMESPACE}" wait pod \
+		-l app.kubernetes.io/name=kata-monitor \
+		--for=condition=Ready --timeout="${rollout_timeout}"
+
+	# Enabling monitor.enabled=true in the same chart deploys kata-monitor
+	# alongside kata-deploy, and kata-deploy reconfigures and restarts
+	# containerd as part of installing kata. That bounce drops the first
+	# kata-monitor instance's containerd connection and costs it a one-off
+	# restart, which is expected and not a regression. Now that kata-deploy
+	# is Ready (so containerd is settled), restart the DaemonSet and assert
+	# the fresh pods stay up without restarts — a genuine crash loop (e.g.
+	# the recent glibc/musl mismatch) still fails here because it would
+	# never reach Ready or would keep restarting.
+	kubectl -n "${HELM_NAMESPACE}" rollout restart ds/kata-monitor
+	kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
+		--timeout="${rollout_timeout}"
+
+	echo ""
+	echo "::group::kata-monitor pods"
+	kubectl -n "${HELM_NAMESPACE}" get pods -l app.kubernetes.io/name=kata-monitor -o wide
+	echo "::endgroup::"
+
+	kubectl -n "${HELM_NAMESPACE}" wait pod \
+		-l app.kubernetes.io/name=kata-monitor \
+		--for=condition=Ready --timeout="${rollout_timeout}"
+
+	# Every kata-monitor pod must report a restartCount of 0 for its first
+	# container; empty lines in the jsonpath output are skipped.
+	local restarts
+	restarts="$(kubectl -n "${HELM_NAMESPACE}" get pods \
+		-l app.kubernetes.io/name=kata-monitor \
+		-o jsonpath='{range .items[*]}{.status.containerStatuses[0].restartCount}{"\n"}{end}')"
+	while IFS= read -r r; do
+		[[ -z "${r}" ]] && continue
+		[[ "${r}" -eq 0 ]] || {
+			echo "kata-monitor pod restarted ${r} time(s) after containerd settled; failing" >&2
+			return 1
+		}
+	done <<< "${restarts}"
+
+	# Give kata-monitor something real to surface metrics about: a kata
+	# pod that just sleeps. Reuses the same image as kata-deploy.bats's
+	# verification pod for cache-warmth on the runner.
+	#
+	# The manifest is fed to kubectl on stdin instead of being staged in a
+	# temporary file, so a failed apply cannot leave a stray file behind.
+	echo ""
+	echo "Creating kata probe pod ..."
+	kubectl apply -f - <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${KATA_MONITOR_PROBE_POD}
+spec:
+  runtimeClassName: kata-${KATA_HYPERVISOR}
+  restartPolicy: Never
+  nodeSelector:
+    katacontainers.io/kata-runtime: "true"
+  containers:
+    - name: probe
+      image: quay.io/kata-containers/alpine-bash-curl:latest
+      imagePullPolicy: IfNotPresent
+      command: ["sh", "-c", "sleep 600"]
+EOF
+
+	kubectl wait "pod/${KATA_MONITOR_PROBE_POD}" \
+		--for=condition=Ready --timeout=300s
+
+	echo ""
+	echo "::group::Probe pod status"
+	kubectl get "pod/${KATA_MONITOR_PROBE_POD}" -o wide
+	echo "::endgroup::"
+
+	# Now the per-sandbox assertions. Wait for the monitor's cache to
+	# pick the probe up, then prove a shim metric actually lands in the
+	# scrape body.
+	wait_for_metrics predicate_has_running_shim
+	wait_for_metrics predicate_has_shim_metric
+	echo "kata-monitor /metrics surfaced the probe sandbox"
+
+	# /sandboxes is the second public endpoint of kata-monitor; confirm
+	# it lists at least one sandbox while the probe pod is alive.
+	local sandboxes
+	sandboxes="$(kata_monitor_get /sandboxes)"
+	echo "::group::/sandboxes response"
+	printf '%s\n' "${sandboxes}"
+	echo "::endgroup::"
+	[[ -n "${sandboxes//[[:space:]]/}" ]] || {
+		echo "/sandboxes returned empty body" >&2
+		return 1
+	}
+
+	# Tear the probe pod down and prove kata-monitor's cache flushes
+	# the sandbox out — mirrors is_sandbox_missing_iterate in the
+	# host-level test.
+	echo ""
+	echo "Deleting probe pod and asserting cache invalidates ..."
+	kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --wait=true --timeout=60s
+
+	wait_for_metrics predicate_no_running_shim
+	echo "kata-monitor /metrics dropped the probe sandbox after deletion"
+
+	popd
+}
+
+teardown() {
+	# Best-effort cleanup: the @test deletes the probe pod itself on the
+	# happy path, but a failure between create and delete would leave it
+	# behind. Errors are discarded so the uninstall below is still reached.
+	kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --ignore-not-found --wait=false 2>/dev/null || true
+
+	uninstall_kata
+}
--- a/tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
+++ b/tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set -e
+set -o pipefail
+
+kata_monitor_dir=$(dirname "$(readlink -f "$0")")
+# shellcheck source=/dev/null
+source "${kata_monitor_dir}/../../common.bash"
+
+# Setting to "yes" enables fail fast, stopping execution at the first failed test.
+export BATS_TEST_FAIL_FAST="${BATS_TEST_FAIL_FAST:-no}"
+
+# KATA_MONITOR_HELM_TEST_UNION may carry a whitespace-separated list of bats
+# files; split it so that each file becomes its own array element.
+if [[ -n "${KATA_MONITOR_HELM_TEST_UNION:-}" ]]; then
+	read -r -a KATA_MONITOR_HELM_TEST_UNION <<< "${KATA_MONITOR_HELM_TEST_UNION}"
+else
+	KATA_MONITOR_HELM_TEST_UNION=("kata-monitor.bats")
+fi
+
+run_bats_tests "${kata_monitor_dir}" KATA_MONITOR_HELM_TEST_UNION