tests: add kata-monitor helm chart k8s test

Add a single-job k8s test that installs the kata-deploy helm chart
with monitor.enabled=true, pointed at the per-PR kata-monitor image
built earlier in the same run, and exercises both the rollout and the
user-visible behaviour:

  * the kata-monitor DaemonSet rolls out and the pod stays up without
    container restarts;
  * a real kata-runtime probe pod is scheduled, then /metrics and
    /sandboxes are scraped through the apiserver pod-proxy to prove
    kata-monitor sees the sandbox (non-zero running-shim count plus at
    least one per-sandbox kata_shim_* metric);
  * after the probe pod is deleted, /metrics drops back to a zero
    running-shim count.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: OpenAI Codex <codex@openai.com>
This commit is contained in:
Fabiano Fidêncio
2026-06-03 12:58:31 +02:00
committed by Fabiano Fidêncio
parent 285d5daa23
commit 92a9691470
5 changed files with 398 additions and 0 deletions

View File

@@ -305,6 +305,20 @@ jobs:
target-branch: ${{ inputs.target-branch }}
kata-monitor-image: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci:${{ inputs.tag }}-amd64
# Calls the reusable kata-monitor k8s test workflow. It is skipped when
# inputs.skip-test is 'yes' and only starts after both jobs listed in
# `needs` have completed.
run-kata-monitor-k8s-tests:
if: ${{ inputs.skip-test != 'yes' }}
needs: [publish-kata-deploy-payload-amd64, publish-kata-monitor-image-amd64]
uses: ./.github/workflows/run-kata-monitor-k8s-tests.yaml
with:
# registry/repo/tag identify the kata-deploy image under test.
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
pr-number: ${{ inputs.pr-number }}
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
# The kata-monitor image is passed as a separate reference and tag.
kata-monitor-image-reference: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci
kata-monitor-image-tag: ${{ inputs.tag }}-amd64
run-k8s-tests-on-aks:
if: ${{ inputs.skip-test != 'yes' }}
needs: publish-kata-deploy-payload-amd64

View File

@@ -0,0 +1,110 @@
# Reusable workflow: its only trigger is workflow_call, so it runs when
# another workflow invokes it with the inputs declared below.
name: CI | Run kata-monitor k8s tests
on:
workflow_call:
inputs:
registry:
description: Registry of the kata-deploy image under test.
required: true
type: string
repo:
description: Repository of the kata-deploy image under test.
required: true
type: string
tag:
description: Tag of the kata-deploy image under test.
required: true
type: string
# Exposed to the job as GH_PR_NUMBER.
pr-number:
required: true
type: string
# Used as the `ref` of the job's checkout step.
commit-hash:
required: false
type: string
# Passed to the rebase step as TARGET_BRANCH; defaults to an empty string.
target-branch:
required: false
type: string
default: ""
kata-monitor-image-reference:
description: Registry/repo of the kata-monitor image to exercise.
required: true
type: string
kata-monitor-image-tag:
description: Tag of the kata-monitor image to exercise.
required: true
type: string
# Workflow-level concurrency group keyed on the workflow name and the PR
# number (falling back to the git ref); a newer run in the same group
# cancels the one in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-kata-monitor-k8s
cancel-in-progress: true
# Empty permissions map at workflow level.
permissions: {}
jobs:
# Single job: deploy a cluster, install bats, then run and report the
# kata-monitor helm-chart tests.
run-kata-monitor-k8s-tests:
name: run-kata-monitor-k8s-tests
strategy:
fail-fast: false
# The matrix currently has a single combination: qemu on vanilla k8s.
matrix:
vmm:
- qemu
k8s:
- vanilla
# Job-level concurrency group; includes the serialized matrix entry so each
# matrix combination gets its own group.
concurrency:
group: ${{ github.workflow }}-run-kata-monitor-k8s-tests-${{ github.event.pull_request.number || github.ref }}-${{ toJSON(matrix) }}
cancel-in-progress: true
runs-on: ubuntu-24.04
# Environment shared by every step; the workflow inputs and matrix values
# are handed to the test scripts through these variables.
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBERNETES: ${{ matrix.k8s }}
CONTAINER_ENGINE: containerd
CONTAINER_ENGINE_VERSION: active
KATA_MONITOR_IMAGE_REFERENCE: ${{ inputs.kata-monitor-image-reference }}
KATA_MONITOR_IMAGE_TAG: ${{ inputs.kata-monitor-image-tag }}
GH_TOKEN: ${{ github.token }}
steps:
# Check out the commit under test with full history (fetch-depth: 0) and
# without persisting credentials in the local git config.
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Remove unnecessary directories to free up space
run: |
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/julia*
sudo rm -rf /opt/az
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/firefox
# Cluster deployment and bats installation reuse the kata-deploy gha-run.sh
# entry point; the test and report steps use the kata-monitor one.
- name: Deploy ${{ matrix.k8s }}
run: bash tests/functional/kata-deploy/gha-run.sh deploy-k8s
- name: Install `bats`
run: bash tests/functional/kata-deploy/gha-run.sh install-bats
- name: Run tests
run: bash tests/functional/kata-monitor/gha-run.sh run-helm-tests
# Reporting runs even when an earlier step failed.
- name: Report tests
if: always()
run: bash tests/functional/kata-monitor/gha-run.sh report-helm-tests

View File

@@ -63,12 +63,27 @@ function run() {
bash "${kata_monitor_dir}/kata-monitor-tests.sh"
}
# Run the kata-monitor helm-chart bats suite from the kata-monitor test
# directory.
#
# Globals:
#   kata_monitor_dir (read) - directory holding run-kata-monitor-helm-tests.sh
# Returns:
#   The exit status of run-kata-monitor-helm-tests.sh, or 1 when the test
#   directory cannot be entered (or left again).
function run_helm_tests() {
    info "Running kata-monitor helm-chart tests"

    # Bail out if the directory cannot be entered: otherwise the suite would
    # be looked up (and possibly run) relative to the caller's directory.
    # kata_monitor_dir is assigned outside this function.
    # shellcheck disable=SC2154
    pushd "${kata_monitor_dir}" || return 1

    # Remember the suite's status so that popd cannot mask a failure.
    local rc=0
    bash run-kata-monitor-helm-tests.sh || rc=$?

    popd || return 1
    return "${rc}"
}
# Report the helm-chart test results by handing the kata-monitor test
# directory to report_bats_tests.
#
# Globals:
#   kata_monitor_dir (read)
report_helm_tests() {
    local results_dir="${kata_monitor_dir}"
    report_bats_tests "${results_dir}"
}
# Entry point: dispatch the first command-line argument to the matching
# action. An unknown or missing action is reported through die, with its
# output redirected to stderr.
#
# Arguments:
#   $1 - action name (install-dependencies, install-kata, run,
#        run-helm-tests, report-helm-tests)
main() {
    action="${1:-}"

    case "${action}" in
        install-dependencies)
            install_dependencies
            ;;
        install-kata)
            install_kata
            ;;
        run)
            run
            ;;
        run-helm-tests)
            run_helm_tests
            ;;
        report-helm-tests)
            report_helm_tests
            ;;
        *)
            die "Invalid argument" >&2
            ;;
    esac
}

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env bats
#
# Copyright (c) 2026 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# kata-monitor helm chart functional test.
#
# Validates that the optional kata-monitor DaemonSet shipped by the
# kata-deploy helm chart actually rolls out, exercises the per-PR
# kata-monitor image, and exposes per-sandbox Prometheus metrics for a
# live kata pod.
#
# Required environment variables (mirroring kata-deploy.bats):
# DOCKER_REGISTRY - Registry for kata-deploy image
# DOCKER_REPO - Repository name for kata-deploy image
# DOCKER_TAG - kata-deploy image tag to test
# KATA_HYPERVISOR - Hypervisor to test (qemu, ...)
# KUBERNETES - K8s distribution (vanilla, k3s, k0s, ...)
# KATA_MONITOR_IMAGE_REFERENCE - Registry/repo for the kata-monitor image
# KATA_MONITOR_IMAGE_TAG - kata-monitor image tag to test
#
# Shared helpers are loaded relative to this test file's directory.
load "${BATS_TEST_DIRNAME}/../../common.bash"
# Repository root: three levels above this test file's directory.
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
# Reuse the helm install/uninstall helpers maintained alongside the
# kata-deploy bats tests.
source "${BATS_TEST_DIRNAME}/../kata-deploy/lib/helm-deploy.bash"
# Cache update polling interval is short on the monitor side; 30s is
# generous enough to absorb both the cache refresh and any apiserver
# proxy latency without dragging the test runtime up.
# Overridable from the environment; wait_for_metrics uses it as its
# per-call deadline in seconds.
KATA_MONITOR_CACHE_TIMEOUT_S="${KATA_MONITOR_CACHE_TIMEOUT_S:-30}"
# Pod name used to give kata-monitor a real sandbox to surface metrics
# about. Kept fixed so teardown can clean it up unconditionally.
KATA_MONITOR_PROBE_POD="kata-monitor-probe"
# bats setup hook: call ensure_helm, then verify that the kata-monitor image
# coordinates were provided. `${VAR:?msg}` aborts with `msg` on the first
# variable that is unset or empty.
setup() {
    ensure_helm
    : "${KATA_MONITOR_IMAGE_REFERENCE:?KATA_MONITOR_IMAGE_REFERENCE must be set}" \
      "${KATA_MONITOR_IMAGE_TAG:?KATA_MONITOR_IMAGE_TAG must be set}"
}
# Fetch an HTTP endpoint (e.g. /metrics or /sandboxes) from a kata-monitor
# pod through the apiserver pod-proxy, targeting port 8090 of the first pod
# returned for the app.kubernetes.io/name=kata-monitor label.
#
# Globals:   HELM_NAMESPACE (read)
# Arguments: $1 - endpoint path, including the leading slash
# Outputs:   response body on stdout; diagnostics on stderr
# Returns:   1 when no pod name could be resolved, otherwise the status of
#            the proxied `kubectl get --raw` call
kata_monitor_get() {
    local endpoint="$1"
    local monitor_pod

    monitor_pod="$(kubectl -n "${HELM_NAMESPACE}" get pods \
        -l app.kubernetes.io/name=kata-monitor \
        -o jsonpath='{.items[0].metadata.name}')"
    if [[ -z "${monitor_pod}" ]]; then
        echo "no kata-monitor pod found" >&2
        return 1
    fi

    local proxy_url="/api/v1/namespaces/${HELM_NAMESPACE}/pods/${monitor_pod}:8090/proxy${endpoint}"
    kubectl -n "${HELM_NAMESPACE}" get --raw "${proxy_url}"
}
# Poll kata-monitor's /metrics endpoint once per second until the given
# check accepts the response, giving up after KATA_MONITOR_CACHE_TIMEOUT_S
# seconds.
#
# Globals:   KATA_MONITOR_CACHE_TIMEOUT_S (read)
# Arguments: $1 - name of a bash function that receives the metrics body on
#                 stdin and returns 0 when satisfied
# Outputs:   on timeout, a message plus up to 50 lines of the last response
#            on stderr
# Returns:   0 as soon as the check passes, 1 on timeout
wait_for_metrics() {
    local check_fn="$1"
    local metrics
    local give_up_at=$((SECONDS + KATA_MONITOR_CACHE_TIMEOUT_S))

    until (( SECONDS >= give_up_at )); do
        # Scrape errors are expected while the monitor settles, so stderr of
        # the fetch is dropped and a failed fetch simply triggers a retry.
        if metrics="$(kata_monitor_get /metrics 2>/dev/null)"; then
            if printf '%s' "${metrics}" | "${check_fn}"; then
                return 0
            fi
        fi
        sleep 1
    done

    {
        echo "Timed out waiting for kata-monitor /metrics predicate '${check_fn}'"
        echo "Last response was:"
        printf '%s\n' "${metrics:-<none>}" | head -50
    } >&2
    return 1
}
# Succeeds when stdin contains a `kata_monitor_running_shim_count <N>` line
# with N >= 1 (optionally followed by spaces).
predicate_has_running_shim() {
    local -r positive_count_re='^kata_monitor_running_shim_count [1-9][0-9]* *$'
    # Output goes to /dev/null so grep consumes all of stdin.
    grep -E -- "${positive_count_re}" > /dev/null
}
# Succeeds when stdin contains a `kata_monitor_running_shim_count 0` line
# (optionally followed by spaces).
predicate_no_running_shim() {
    local -r zero_count_re='^kata_monitor_running_shim_count 0 *$'
    # Output goes to /dev/null so grep consumes all of stdin.
    grep -E -- "${zero_count_re}" > /dev/null
}
# Succeeds when stdin contains at least one kata_shim_* metric line carrying
# a non-empty sandbox_id label; one such line is enough to show the
# per-sandbox scrape path works end-to-end.
predicate_has_shim_metric() {
    local -r shim_metric_re='^kata_shim_[a-z_]+\{[^}]*sandbox_id="[0-9a-f-]+'
    # Output goes to /dev/null so grep consumes all of stdin.
    grep -E -- "${shim_metric_re}" > /dev/null
}
# End-to-end check of the kata-monitor DaemonSet shipped by the kata-deploy
# chart: install with monitor.enabled=true, verify the DaemonSet pods come up
# and stay at zero restarts after a deliberate restart, create a kata probe
# pod, assert /metrics and /sandboxes reflect it, then delete the pod and
# assert the running-shim count returns to zero.
@test "kata-monitor helm chart rolls out and exposes per-sandbox metrics" {
# Run from the repository root; undone by the popd at the end of the test.
pushd "${repo_root_dir}"
# Both timeouts can be overridden from the environment.
local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
local rollout_timeout="${KATA_MONITOR_ROLLOUT_TIMEOUT:-300s}"
echo "Installing kata-deploy with monitor.enabled=true ..."
echo " kata-monitor image: ${KATA_MONITOR_IMAGE_REFERENCE}:${KATA_MONITOR_IMAGE_TAG}"
HELM_TIMEOUT="${helm_timeout}" deploy_kata "" \
--set monitor.enabled=true \
--set "monitor.image.reference=${KATA_MONITOR_IMAGE_REFERENCE}" \
--set "monitor.image.tag=${KATA_MONITOR_IMAGE_TAG}"
echo ""
echo "::group::kata-monitor DaemonSet rollout"
kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
--timeout="${rollout_timeout}"
echo "::endgroup::"
kubectl -n "${HELM_NAMESPACE}" wait pod \
-l app.kubernetes.io/name=kata-monitor \
--for=condition=Ready --timeout="${rollout_timeout}"
# Enabling monitor.enabled=true in the same chart deploys kata-monitor
# alongside kata-deploy, and kata-deploy reconfigures and restarts
# containerd as part of installing kata. That bounce drops the first
# kata-monitor instance's containerd connection and costs it a one-off
# restart, which is expected and not a regression. Now that kata-deploy
# is Ready (so containerd is settled), restart the DaemonSet and assert
# the fresh pods stay up without restarts — a genuine crash loop (e.g.
# the recent glibc/musl mismatch) still fails here because it would
# never reach Ready or would keep restarting.
kubectl -n "${HELM_NAMESPACE}" rollout restart ds/kata-monitor
kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
--timeout="${rollout_timeout}"
echo ""
echo "::group::kata-monitor pods"
kubectl -n "${HELM_NAMESPACE}" get pods -l app.kubernetes.io/name=kata-monitor -o wide
echo "::endgroup::"
kubectl -n "${HELM_NAMESPACE}" wait pod \
-l app.kubernetes.io/name=kata-monitor \
--for=condition=Ready --timeout="${rollout_timeout}"
# Collect one restartCount per line (first container of each kata-monitor
# pod) and fail on the first non-zero value; blank lines are skipped.
local restarts
restarts="$(kubectl -n "${HELM_NAMESPACE}" get pods \
-l app.kubernetes.io/name=kata-monitor \
-o jsonpath='{range .items[*]}{.status.containerStatuses[0].restartCount}{"\n"}{end}')"
while IFS= read -r r; do
[[ -z "${r}" ]] && continue
[[ "${r}" -eq 0 ]] || {
echo "kata-monitor pod restarted ${r} time(s) after containerd settled; failing"
return 1
}
done <<< "${restarts}"
# Give kata-monitor something real to surface metrics about: a kata
# pod that just sleeps. Reuses the same image as kata-deploy.bats's
# verification pod for cache-warmth on the runner.
local probe_yaml
probe_yaml=$(mktemp)
# The heredoc delimiter is unquoted, so ${KATA_MONITOR_PROBE_POD} and
# ${KATA_HYPERVISOR} are expanded into the manifest.
cat > "${probe_yaml}" <<EOF
apiVersion: v1
kind: Pod
metadata:
name: ${KATA_MONITOR_PROBE_POD}
spec:
runtimeClassName: kata-${KATA_HYPERVISOR}
restartPolicy: Never
nodeSelector:
katacontainers.io/kata-runtime: "true"
containers:
- name: probe
image: quay.io/kata-containers/alpine-bash-curl:latest
imagePullPolicy: IfNotPresent
command: ["sh", "-c", "sleep 600"]
EOF
echo ""
echo "Creating kata probe pod ..."
kubectl apply -f "${probe_yaml}"
# The temporary manifest is removed right after it has been applied.
rm -f "${probe_yaml}"
kubectl wait "pod/${KATA_MONITOR_PROBE_POD}" \
--for=condition=Ready --timeout=300s
echo ""
echo "::group::Probe pod status"
kubectl get "pod/${KATA_MONITOR_PROBE_POD}" -o wide
echo "::endgroup::"
# Now the per-sandbox assertions. Wait for the monitor's cache to
# pick the probe up, then prove a shim metric actually lands in the
# scrape body.
wait_for_metrics predicate_has_running_shim
wait_for_metrics predicate_has_shim_metric
echo "kata-monitor /metrics surfaced the probe sandbox"
# /sandboxes is the second public endpoint of kata-monitor; confirm
# it lists at least one sandbox while the probe pod is alive.
local sandboxes
sandboxes="$(kata_monitor_get /sandboxes)"
echo "::group::/sandboxes response"
printf '%s\n' "${sandboxes}"
echo "::endgroup::"
# Only checks that the body is non-empty once all whitespace is removed.
[[ -n "${sandboxes//[[:space:]]/}" ]] || {
echo "/sandboxes returned empty body" >&2
return 1
}
# Tear the probe pod down and prove kata-monitor's cache flushes
# the sandbox out — mirrors is_sandbox_missing_iterate in the
# host-level test.
echo ""
echo "Deleting probe pod and asserting cache invalidates ..."
kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --wait=true --timeout=60s
wait_for_metrics predicate_no_running_shim
echo "kata-monitor /metrics dropped the probe sandbox after deletion"
popd
}
# bats teardown hook: remove the probe pod if it is still around, then
# uninstall kata via uninstall_kata.
teardown() {
    # The test deletes the probe pod itself when it passes; this covers a
    # failure between creation and deletion. Errors are deliberately
    # ignored (stderr dropped, status forced to success) and the call does
    # not wait for the deletion to finish.
    local -a delete_probe=(
        delete "pod/${KATA_MONITOR_PROBE_POD}"
        --ignore-not-found
        --wait=false
    )
    kubectl "${delete_probe[@]}" 2>/dev/null || true

    uninstall_kata
}

View File

@@ -0,0 +1,26 @@
#!/bin/bash
#
# Copyright (c) 2026 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
set -e
set -o pipefail

# Directory containing this script; the bats files are resolved relative to it.
kata_monitor_dir=$(dirname "$(readlink -f "$0")")
# shellcheck source=/dev/null
source "${kata_monitor_dir}/../../common.bash"

# Setting to "yes" enables fail fast, stopping execution at the first failed test.
export BATS_TEST_FAIL_FAST="${BATS_TEST_FAIL_FAST:-no}"

# KATA_MONITOR_HELM_TEST_UNION may be preset in the environment to select the
# bats files to run. It is treated as a whitespace-separated list and split
# into one array element per file, so an override naming several files does
# not collapse into a single bogus entry. Without an override the default
# suite is used.
if [[ -n "${KATA_MONITOR_HELM_TEST_UNION:-}" ]]; then
    # Newlines are folded into spaces first because `read` stops at the
    # first newline.
    read -r -a KATA_MONITOR_HELM_TEST_UNION \
        <<< "${KATA_MONITOR_HELM_TEST_UNION//$'\n'/ }"
else
    KATA_MONITOR_HELM_TEST_UNION=(
        "kata-monitor.bats"
    )
fi

# The array is passed by name, not by value.
run_bats_tests "${kata_monitor_dir}" KATA_MONITOR_HELM_TEST_UNION