diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 51a036731c..ca3aeb8e12 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -305,6 +305,20 @@ jobs:
       target-branch: ${{ inputs.target-branch }}
       kata-monitor-image: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci:${{ inputs.tag }}-amd64
 
+  run-kata-monitor-k8s-tests:
+    if: ${{ inputs.skip-test != 'yes' }}
+    needs: [publish-kata-deploy-payload-amd64, publish-kata-monitor-image-amd64]
+    uses: ./.github/workflows/run-kata-monitor-k8s-tests.yaml
+    with:
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
+      pr-number: ${{ inputs.pr-number }}
+      commit-hash: ${{ inputs.commit-hash }}
+      target-branch: ${{ inputs.target-branch }}
+      kata-monitor-image-reference: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci
+      kata-monitor-image-tag: ${{ inputs.tag }}-amd64
+
   run-k8s-tests-on-aks:
     if: ${{ inputs.skip-test != 'yes' }}
     needs: publish-kata-deploy-payload-amd64
diff --git a/.github/workflows/run-kata-monitor-k8s-tests.yaml b/.github/workflows/run-kata-monitor-k8s-tests.yaml
new file mode 100644
index 0000000000..8a62d0e24b
--- /dev/null
+++ b/.github/workflows/run-kata-monitor-k8s-tests.yaml
@@ -0,0 +1,110 @@
+name: CI | Run kata-monitor k8s tests
+on:
+  workflow_call:
+    inputs:
+      registry:
+        description: Registry of the kata-deploy image under test.
+        required: true
+        type: string
+      repo:
+        description: Repository of the kata-deploy image under test.
+        required: true
+        type: string
+      tag:
+        description: Tag of the kata-deploy image under test.
+        required: true
+        type: string
+      pr-number:
+        required: true
+        type: string
+      commit-hash:
+        required: false
+        type: string
+      target-branch:
+        required: false
+        type: string
+        default: ""
+      kata-monitor-image-reference:
+        description: Registry/repo of the kata-monitor image to exercise.
+        required: true
+        type: string
+      kata-monitor-image-tag:
+        description: Tag of the kata-monitor image to exercise.
+        required: true
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-kata-monitor-k8s
+  cancel-in-progress: true
+
+permissions: {}
+
+jobs:
+  run-kata-monitor-k8s-tests:
+    name: run-kata-monitor-k8s-tests
+    strategy:
+      fail-fast: false
+      matrix:
+        vmm:
+          - qemu
+        k8s:
+          - vanilla
+    concurrency:
+      group: ${{ github.workflow }}-run-kata-monitor-k8s-tests-${{ github.event.pull_request.number || github.ref }}-${{ toJSON(matrix) }}
+      cancel-in-progress: true
+    runs-on: ubuntu-24.04
+    env:
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      KATA_HYPERVISOR: ${{ matrix.vmm }}
+      KUBERNETES: ${{ matrix.k8s }}
+      CONTAINER_ENGINE: containerd
+      CONTAINER_ENGINE_VERSION: active
+      KATA_MONITOR_IMAGE_REFERENCE: ${{ inputs.kata-monitor-image-reference }}
+      KATA_MONITOR_IMAGE_TAG: ${{ inputs.kata-monitor-image-tag }}
+      GH_TOKEN: ${{ github.token }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Remove unnecessary directories to free up space
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          sudo rm -rf /usr/lib/jvm
+          sudo rm -rf /usr/share/swift
+          sudo rm -rf /usr/local/share/powershell
+          sudo rm -rf /usr/local/julia*
+          sudo rm -rf /opt/az
+          sudo rm -rf /usr/local/share/chromium
+          sudo rm -rf /opt/microsoft
+          sudo rm -rf /opt/google
+          sudo rm -rf /usr/lib/firefox
+
+      - name: Deploy ${{ matrix.k8s }}
+        run: bash tests/functional/kata-deploy/gha-run.sh deploy-k8s
+
+      - name: Install `bats`
+        run: bash tests/functional/kata-deploy/gha-run.sh install-bats
+
+      - name: Run tests
+        run: bash tests/functional/kata-monitor/gha-run.sh run-helm-tests
+
+      - name: Report tests
+        if: always()
+        run: bash tests/functional/kata-monitor/gha-run.sh report-helm-tests
diff --git a/tests/functional/kata-monitor/gha-run.sh b/tests/functional/kata-monitor/gha-run.sh
index b1f697c9d5..ca0757bb65 100755
--- a/tests/functional/kata-monitor/gha-run.sh
+++ b/tests/functional/kata-monitor/gha-run.sh
@@ -63,12 +63,27 @@ function run() {
 	bash "${kata_monitor_dir}/kata-monitor-tests.sh"
 }
 
+function run_helm_tests() {
+	# shellcheck disable=SC2154
+	info "Running kata-monitor helm-chart tests"
+
+	pushd "${kata_monitor_dir}"
+	bash run-kata-monitor-helm-tests.sh
+	popd
+}
+
+function report_helm_tests() {
+	report_bats_tests "${kata_monitor_dir}"
+}
+
 function main() {
 	action="${1:-}"
 	case "${action}" in
 		install-dependencies) install_dependencies ;;
 		install-kata) install_kata ;;
 		run) run ;;
+		run-helm-tests) run_helm_tests ;;
+		report-helm-tests) report_helm_tests ;;
 		*) >&2 die "Invalid argument" ;;
 	esac
 }
diff --git a/tests/functional/kata-monitor/kata-monitor.bats b/tests/functional/kata-monitor/kata-monitor.bats
new file mode 100644
index 0000000000..e2ad9ed4f7
--- /dev/null
+++ b/tests/functional/kata-monitor/kata-monitor.bats
@@ -0,0 +1,233 @@
+#!/usr/bin/env bats
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# kata-monitor helm chart functional test.
+#
+# Validates that the optional kata-monitor DaemonSet shipped by the
+# kata-deploy helm chart actually rolls out, exercises the per-PR
+# kata-monitor image, and exposes per-sandbox Prometheus metrics for a
+# live kata pod.
+#
+# Required environment variables (mirroring kata-deploy.bats):
+#   DOCKER_REGISTRY              - Registry for kata-deploy image
+#   DOCKER_REPO                  - Repository name for kata-deploy image
+#   DOCKER_TAG                   - kata-deploy image tag to test
+#   KATA_HYPERVISOR              - Hypervisor to test (qemu, ...)
+#   KUBERNETES                   - K8s distribution (vanilla, k3s, k0s, ...)
+#   KATA_MONITOR_IMAGE_REFERENCE - Registry/repo for the kata-monitor image
+#   KATA_MONITOR_IMAGE_TAG       - kata-monitor image tag to test
+#
+
+load "${BATS_TEST_DIRNAME}/../../common.bash"
+repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
+load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
+
+# Reuse the helm install/uninstall helpers maintained alongside the
+# kata-deploy bats tests.
+source "${BATS_TEST_DIRNAME}/../kata-deploy/lib/helm-deploy.bash"
+
+# Cache update polling interval is short on the monitor side; 30s is
+# generous enough to absorb both the cache refresh and any apiserver
+# proxy latency without dragging the test runtime up.
+KATA_MONITOR_CACHE_TIMEOUT_S="${KATA_MONITOR_CACHE_TIMEOUT_S:-30}"
+
+# Pod name used to give kata-monitor a real sandbox to surface metrics
+# about. Kept fixed so teardown can clean it up unconditionally.
+KATA_MONITOR_PROBE_POD="kata-monitor-probe"
+
+setup() {
+    ensure_helm
+
+    : "${KATA_MONITOR_IMAGE_REFERENCE:?KATA_MONITOR_IMAGE_REFERENCE must be set}"
+    : "${KATA_MONITOR_IMAGE_TAG:?KATA_MONITOR_IMAGE_TAG must be set}"
+}
+
+# Hit `path` (e.g. /metrics or /sandboxes) on one of the kata-monitor
+# pods via the apiserver pod-proxy. Stdout is the response body; stderr
+# / non-zero exit propagate kubectl failures.
+kata_monitor_get() {
+    local path="$1"
+    local pod
+
+    pod="$(kubectl -n "${HELM_NAMESPACE}" get pods \
+        -l app.kubernetes.io/name=kata-monitor \
+        -o jsonpath='{.items[0].metadata.name}')"
+    [[ -n "${pod}" ]] || { echo "no kata-monitor pod found" >&2; return 1; }
+
+    kubectl -n "${HELM_NAMESPACE}" get --raw \
+        "/api/v1/namespaces/${HELM_NAMESPACE}/pods/${pod}:8090/proxy${path}"
+}
+
+# Block until `predicate` returns 0 against the fresh /metrics output,
+# or until ${KATA_MONITOR_CACHE_TIMEOUT_S} elapses. The predicate is a
+# bash function name receiving the metrics body on stdin.
+wait_for_metrics() {
+    local predicate="$1"
+    local body
+    local deadline=$((SECONDS + KATA_MONITOR_CACHE_TIMEOUT_S))
+
+    while (( SECONDS < deadline )); do
+        if body="$(kata_monitor_get /metrics 2>/dev/null)" \
+            && printf '%s' "${body}" | "${predicate}"; then
+            return 0
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for kata-monitor /metrics predicate '${predicate}'" >&2
+    echo "Last response was:" >&2
+    printf '%s\n' "${body:-}" | head -n 50 >&2
+    return 1
+}
+
+predicate_has_running_shim() {
+    # Match `kata_monitor_running_shim_count N` where N >= 1.
+    grep -E '^kata_monitor_running_shim_count [1-9][0-9]* *$' >/dev/null
+}
+
+predicate_no_running_shim() {
+    grep -E '^kata_monitor_running_shim_count 0 *$' >/dev/null
+}
+
+predicate_has_shim_metric() {
+    # Any kata_shim_* metric line with a non-empty sandbox_id label is
+    # enough to prove the per-sandbox scrape path works end-to-end.
+    grep -E '^kata_shim_[a-z_]+\{[^}]*sandbox_id="[0-9a-f-]+' >/dev/null
+}
+
+@test "kata-monitor helm chart rolls out and exposes per-sandbox metrics" {
+    pushd "${repo_root_dir}"
+
+    local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
+    local rollout_timeout="${KATA_MONITOR_ROLLOUT_TIMEOUT:-300s}"
+
+    echo "Installing kata-deploy with monitor.enabled=true ..."
+    echo " kata-monitor image: ${KATA_MONITOR_IMAGE_REFERENCE}:${KATA_MONITOR_IMAGE_TAG}"
+
+    HELM_TIMEOUT="${helm_timeout}" deploy_kata "" \
+        --set monitor.enabled=true \
+        --set "monitor.image.reference=${KATA_MONITOR_IMAGE_REFERENCE}" \
+        --set "monitor.image.tag=${KATA_MONITOR_IMAGE_TAG}"
+
+    echo ""
+    echo "::group::kata-monitor DaemonSet rollout"
+    kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
+        --timeout="${rollout_timeout}"
+    echo "::endgroup::"
+
+    kubectl -n "${HELM_NAMESPACE}" wait pod \
+        -l app.kubernetes.io/name=kata-monitor \
+        --for=condition=Ready --timeout="${rollout_timeout}"
+
+    # Enabling monitor.enabled=true in the same chart deploys kata-monitor
+    # alongside kata-deploy, and kata-deploy reconfigures and restarts
+    # containerd as part of installing kata. That bounce drops the first
+    # kata-monitor instance's containerd connection and costs it a one-off
+    # restart, which is expected and not a regression. Now that kata-deploy
+    # is Ready (so containerd is settled), restart the DaemonSet and assert
+    # the fresh pods stay up without restarts — a genuine crash loop (e.g.
+    # the recent glibc/musl mismatch) still fails here because it would
+    # never reach Ready or would keep restarting.
+ kubectl -n "${HELM_NAMESPACE}" rollout restart ds/kata-monitor + kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \ + --timeout="${rollout_timeout}" + + echo "" + echo "::group::kata-monitor pods" + kubectl -n "${HELM_NAMESPACE}" get pods -l app.kubernetes.io/name=kata-monitor -o wide + echo "::endgroup::" + + kubectl -n "${HELM_NAMESPACE}" wait pod \ + -l app.kubernetes.io/name=kata-monitor \ + --for=condition=Ready --timeout="${rollout_timeout}" + + local restarts + restarts="$(kubectl -n "${HELM_NAMESPACE}" get pods \ + -l app.kubernetes.io/name=kata-monitor \ + -o jsonpath='{range .items[*]}{.status.containerStatuses[0].restartCount}{"\n"}{end}')" + while IFS= read -r r; do + [[ -z "${r}" ]] && continue + [[ "${r}" -eq 0 ]] || { + echo "kata-monitor pod restarted ${r} time(s) after containerd settled; failing" + return 1 + } + done <<< "${restarts}" + + # Give kata-monitor something real to surface metrics about: a kata + # pod that just sleeps. Reuses the same image as kata-deploy.bats's + # verification pod for cache-warmth on the runner. + local probe_yaml + probe_yaml=$(mktemp) + cat > "${probe_yaml}" <&2 + return 1 + } + + # Tear the probe pod down and prove kata-monitor's cache flushes + # the sandbox out — mirrors is_sandbox_missing_iterate in the + # host-level test. + echo "" + echo "Deleting probe pod and asserting cache invalidates ..." + kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --wait=true --timeout=60s + + wait_for_metrics predicate_no_running_shim + echo "kata-monitor /metrics dropped the probe sandbox after deletion" + + popd +} + +teardown() { + # Best-effort cleanup — the @test deletes the probe pod on the + # happy path, but a failure between create and delete would leave + # it behind. 
+    kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --ignore-not-found --wait=false 2>/dev/null || true
+
+    uninstall_kata
+}
diff --git a/tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh b/tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
new file mode 100644
index 0000000000..b0159839d9
--- /dev/null
+++ b/tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set -e
+set -o pipefail
+
+kata_monitor_dir=$(dirname "$(readlink -f "$0")")
+# shellcheck source=/dev/null
+source "${kata_monitor_dir}/../../common.bash"
+
+# Setting to "yes" enables fail fast, stopping execution at the first failed test.
+export BATS_TEST_FAIL_FAST="${BATS_TEST_FAIL_FAST:-no}"
+
+if [[ -n "${KATA_MONITOR_HELM_TEST_UNION:-}" ]]; then
+	KATA_MONITOR_HELM_TEST_UNION=("${KATA_MONITOR_HELM_TEST_UNION}")
+else
+	KATA_MONITOR_HELM_TEST_UNION=( \
+		"kata-monitor.bats" \
+	)
+fi
+
+run_bats_tests "${kata_monitor_dir}" KATA_MONITOR_HELM_TEST_UNION