mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 22:50:54 +00:00
tests: add kata-monitor helm chart k8s test
Add a single-job k8s test that installs the kata-deploy helm chart
with monitor.enabled=true, pointed at the per-PR kata-monitor image
built earlier in the same run, and exercises both the rollout and the
user-visible behaviour:
* the kata-monitor DaemonSet rolls out and the pod stays up without
container restarts;
* a real kata-runtime probe pod is scheduled, then /metrics and
/sandboxes are scraped through the apiserver pod-proxy to prove
kata-monitor sees the sandbox (non-zero running-shim count plus at
least one per-sandbox kata_shim_* metric);
* after the probe pod is deleted, /metrics drops back to a zero
running-shim count.
Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: OpenAI Codex <codex@openai.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
285d5daa23
commit
92a9691470
14
.github/workflows/ci.yaml
vendored
14
.github/workflows/ci.yaml
vendored
@@ -305,6 +305,20 @@ jobs:
|
||||
target-branch: ${{ inputs.target-branch }}
|
||||
kata-monitor-image: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci:${{ inputs.tag }}-amd64
|
||||
|
||||
# Runs the kata-monitor helm-chart k8s test once both the kata-deploy payload
# and the kata-monitor image for this run have been published.
run-kata-monitor-k8s-tests:
  if: ${{ inputs.skip-test != 'yes' }}
  needs:
    - publish-kata-deploy-payload-amd64
    - publish-kata-monitor-image-amd64
  uses: ./.github/workflows/run-kata-monitor-k8s-tests.yaml
  with:
    # kata-deploy image under test.
    registry: ghcr.io
    repo: ${{ github.repository_owner }}/kata-deploy-ci
    tag: ${{ inputs.tag }}-amd64
    # Source revision to check out and rebase.
    pr-number: ${{ inputs.pr-number }}
    commit-hash: ${{ inputs.commit-hash }}
    target-branch: ${{ inputs.target-branch }}
    # kata-monitor image to exercise, passed as reference and tag separately.
    kata-monitor-image-reference: ghcr.io/${{ github.repository_owner }}/kata-monitor-ci
    kata-monitor-image-tag: ${{ inputs.tag }}-amd64
|
||||
|
||||
run-k8s-tests-on-aks:
|
||||
if: ${{ inputs.skip-test != 'yes' }}
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
|
||||
110
.github/workflows/run-kata-monitor-k8s-tests.yaml
vendored
Normal file
110
.github/workflows/run-kata-monitor-k8s-tests.yaml
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
name: CI | Run kata-monitor k8s tests
# Reusable workflow: callers supply the kata-deploy image coordinates, the
# source revision to test, and the kata-monitor image to exercise.
on:
  workflow_call:
    inputs:
      registry:
        description: Registry of the kata-deploy image under test.
        required: true
        type: string
      repo:
        description: Repository of the kata-deploy image under test.
        required: true
        type: string
      tag:
        description: Tag of the kata-deploy image under test.
        required: true
        type: string
      pr-number:
        description: Pull request number, exported to the job as GH_PR_NUMBER.
        required: true
        type: string
      commit-hash:
        description: Commit to check out for the test run.
        required: false
        type: string
      target-branch:
        description: Target branch passed to the rebase step as TARGET_BRANCH.
        required: false
        type: string
        default: ""
      kata-monitor-image-reference:
        description: Registry/repo of the kata-monitor image to exercise.
        required: true
        type: string
      kata-monitor-image-tag:
        description: Tag of the kata-monitor image to exercise.
        required: true
        type: string
|
||||
|
||||
# One in-flight run per workflow and pull request (or ref when there is no
# pull request); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-kata-monitor-k8s
  cancel-in-progress: true

# No token permissions are granted at workflow level.
permissions: {}
|
||||
|
||||
jobs:
  # Single-entry matrix (qemu on vanilla k8s): deploys a cluster on the runner,
  # then runs and reports the kata-monitor helm-chart bats test.
  run-kata-monitor-k8s-tests:
    name: run-kata-monitor-k8s-tests
    strategy:
      fail-fast: false
      matrix:
        vmm:
          - qemu
        k8s:
          - vanilla
    concurrency:
      group: ${{ github.workflow }}-run-kata-monitor-k8s-tests-${{ github.event.pull_request.number || github.ref }}-${{ toJSON(matrix) }}
      cancel-in-progress: true
    runs-on: ubuntu-24.04
    env:
      DOCKER_REGISTRY: ${{ inputs.registry }}
      DOCKER_REPO: ${{ inputs.repo }}
      DOCKER_TAG: ${{ inputs.tag }}
      GH_PR_NUMBER: ${{ inputs.pr-number }}
      KATA_HYPERVISOR: ${{ matrix.vmm }}
      KUBERNETES: ${{ matrix.k8s }}
      CONTAINER_ENGINE: containerd
      CONTAINER_ENGINE_VERSION: active
      KATA_MONITOR_IMAGE_REFERENCE: ${{ inputs.kata-monitor-image-reference }}
      KATA_MONITOR_IMAGE_TAG: ${{ inputs.kata-monitor-image-tag }}
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          ref: ${{ inputs.commit-hash }}
          fetch-depth: 0
          persist-credentials: false

      - name: Rebase atop of the latest target branch
        run: |
          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
        env:
          TARGET_BRANCH: ${{ inputs.target-branch }}

      - name: Remove unnecessary directories to free up space
        run: |
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          sudo rm -rf /usr/lib/jvm
          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/julia*
          sudo rm -rf /opt/az
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /opt/microsoft
          sudo rm -rf /opt/google
          sudo rm -rf /usr/lib/firefox

      - name: Deploy ${{ matrix.k8s }}
        run: bash tests/functional/kata-deploy/gha-run.sh deploy-k8s

      - name: Install `bats`
        run: bash tests/functional/kata-deploy/gha-run.sh install-bats

      - name: Run tests
        # Bound the step: a hung helm or kubectl call would otherwise keep the
        # runner busy until GitHub's default 360-minute job timeout.
        timeout-minutes: 60
        run: bash tests/functional/kata-monitor/gha-run.sh run-helm-tests

      - name: Report tests
        # Report even when the test step failed or timed out.
        if: always()
        run: bash tests/functional/kata-monitor/gha-run.sh report-helm-tests
|
||||
@@ -63,12 +63,27 @@ function run() {
|
||||
bash "${kata_monitor_dir}/kata-monitor-tests.sh"
|
||||
}
|
||||
|
||||
# Run the kata-monitor helm-chart bats suite from the kata-monitor test
# directory by invoking run-kata-monitor-helm-tests.sh there.
function run_helm_tests() {
    info "Running kata-monitor helm-chart tests"

    # kata_monitor_dir is not assigned inside this function. The directive
    # must sit directly in front of the command that expands it to take effect.
    # shellcheck disable=SC2154
    pushd "${kata_monitor_dir}"
    bash run-kata-monitor-helm-tests.sh
    popd
}
|
||||
|
||||
# Hand the kata-monitor test directory to report_bats_tests.
function report_helm_tests() {
    local -r tests_dir="${kata_monitor_dir}"

    report_bats_tests "${tests_dir}"
}
|
||||
|
||||
# Dispatch the first command-line argument to the matching action.
# Anything else, including no argument at all, ends in die "Invalid argument".
function main() {
    action="${1:-}"

    case "${action}" in
        install-dependencies)
            install_dependencies
            ;;
        install-kata)
            install_kata
            ;;
        run)
            run
            ;;
        run-helm-tests)
            run_helm_tests
            ;;
        report-helm-tests)
            report_helm_tests
            ;;
        *)
            >&2 die "Invalid argument"
            ;;
    esac
}
|
||||
|
||||
233
tests/functional/kata-monitor/kata-monitor.bats
Normal file
233
tests/functional/kata-monitor/kata-monitor.bats
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env bats
|
||||
#
|
||||
# Copyright (c) 2026 NVIDIA Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# kata-monitor helm chart functional test.
|
||||
#
|
||||
# Validates that the optional kata-monitor DaemonSet shipped by the
|
||||
# kata-deploy helm chart actually rolls out, exercises the per-PR
|
||||
# kata-monitor image, and exposes per-sandbox Prometheus metrics for a
|
||||
# live kata pod.
|
||||
#
|
||||
# Required environment variables (mirroring kata-deploy.bats):
|
||||
# DOCKER_REGISTRY - Registry for kata-deploy image
|
||||
# DOCKER_REPO - Repository name for kata-deploy image
|
||||
# DOCKER_TAG - kata-deploy image tag to test
|
||||
# KATA_HYPERVISOR - Hypervisor to test (qemu, ...)
|
||||
# KUBERNETES - K8s distribution (vanilla, k3s, k0s, ...)
|
||||
# KATA_MONITOR_IMAGE_REFERENCE - Registry/repo for the kata-monitor image
|
||||
# KATA_MONITOR_IMAGE_TAG - kata-monitor image tag to test
|
||||
#
|
||||
|
||||
load "${BATS_TEST_DIRNAME}/../../common.bash"
|
||||
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
|
||||
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
|
||||
|
||||
# Reuse the helm install/uninstall helpers maintained alongside the
|
||||
# kata-deploy bats tests.
|
||||
source "${BATS_TEST_DIRNAME}/../kata-deploy/lib/helm-deploy.bash"
|
||||
|
||||
# Seconds wait_for_metrics keeps polling /metrics before giving up. The
# default of 30 can be overridden from the environment; it is meant to cover
# the monitor's cache refresh plus apiserver proxy latency.
: "${KATA_MONITOR_CACHE_TIMEOUT_S:=30}"

# Fixed name of the kata pod created to give kata-monitor a real sandbox to
# report on; being fixed lets teardown delete it unconditionally.
KATA_MONITOR_PROBE_POD="kata-monitor-probe"
|
||||
|
||||
# Per-test setup: calls ensure_helm, then refuses to run without the
# kata-monitor image coordinates.
setup() {
    ensure_helm

    # A single no-op command checks both variables in order; `:?` aborts with
    # the given message on the first one that is unset or empty.
    : "${KATA_MONITOR_IMAGE_REFERENCE:?KATA_MONITOR_IMAGE_REFERENCE must be set}" \
      "${KATA_MONITOR_IMAGE_TAG:?KATA_MONITOR_IMAGE_TAG must be set}"
}
|
||||
|
||||
# Fetch `path` (for example /metrics or /sandboxes) from port 8090 of the
# first kata-monitor pod listed in ${HELM_NAMESPACE}, going through the
# apiserver pod-proxy.
#
# Arguments:
#   $1 - URL path appended to the proxy endpoint, including the leading slash.
# Output:
#   The response body on stdout.
# Returns:
#   1 if no kata-monitor pod name could be resolved; otherwise the exit status
#   of the `kubectl get --raw` call.
kata_monitor_get() {
    local path="$1"
    local monitor_pod
    local proxy_url

    monitor_pod="$(kubectl -n "${HELM_NAMESPACE}" get pods \
        -l app.kubernetes.io/name=kata-monitor \
        -o jsonpath='{.items[0].metadata.name}')"
    if [[ -z "${monitor_pod}" ]]; then
        echo "no kata-monitor pod found" >&2
        return 1
    fi

    proxy_url="/api/v1/namespaces/${HELM_NAMESPACE}/pods/${monitor_pod}:8090/proxy${path}"
    kubectl -n "${HELM_NAMESPACE}" get --raw "${proxy_url}"
}
|
||||
|
||||
# Poll kata-monitor's /metrics once per second until `predicate` accepts the
# response or ${KATA_MONITOR_CACHE_TIMEOUT_S} seconds have passed.
#
# Arguments:
#   $1 - name of a bash function that reads the metrics body on stdin and
#        returns 0 when the expected state is visible.
# Returns:
#   0 as soon as the predicate succeeds; 1 on timeout, after printing the
#   first 50 lines of the last response to stderr.
wait_for_metrics() {
    local predicate="$1"
    local scrape
    local stop_at=$((SECONDS + KATA_MONITOR_CACHE_TIMEOUT_S))

    until (( SECONDS >= stop_at )); do
        # A failed scrape is discarded quietly and retried on the next tick.
        if scrape="$(kata_monitor_get /metrics 2>/dev/null)"; then
            if printf '%s' "${scrape}" | "${predicate}"; then
                return 0
            fi
        fi
        sleep 1
    done

    {
        echo "Timed out waiting for kata-monitor /metrics predicate '${predicate}'"
        echo "Last response was:"
        printf '%s\n' "${scrape:-<none>}" | head -50
    } >&2
    return 1
}
|
||||
|
||||
# Succeeds when the metrics text on stdin contains a
# `kata_monitor_running_shim_count <N>` line with N >= 1.
predicate_has_running_shim() {
    local -r shim_count_re='^kata_monitor_running_shim_count [1-9][0-9]* *$'

    grep -E -e "${shim_count_re}" >/dev/null
}
|
||||
|
||||
# Succeeds when the metrics text on stdin contains a
# `kata_monitor_running_shim_count 0` line.
predicate_no_running_shim() {
    local -r zero_count_re='^kata_monitor_running_shim_count 0 *$'

    grep -E -e "${zero_count_re}" >/dev/null
}
|
||||
|
||||
# Succeeds when the metrics text on stdin contains at least one kata_shim_*
# sample whose label set carries a non-empty sandbox_id. One such line is
# taken as proof that the per-sandbox scrape path works end-to-end.
predicate_has_shim_metric() {
    local -r shim_sample_re='^kata_shim_[a-z_]+\{[^}]*sandbox_id="[0-9a-f-]+'

    grep -E -e "${shim_sample_re}" >/dev/null
}
|
||||
|
||||
@test "kata-monitor helm chart rolls out and exposes per-sandbox metrics" {
|
||||
pushd "${repo_root_dir}"
|
||||
|
||||
local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
|
||||
local rollout_timeout="${KATA_MONITOR_ROLLOUT_TIMEOUT:-300s}"
|
||||
|
||||
echo "Installing kata-deploy with monitor.enabled=true ..."
|
||||
echo " kata-monitor image: ${KATA_MONITOR_IMAGE_REFERENCE}:${KATA_MONITOR_IMAGE_TAG}"
|
||||
|
||||
HELM_TIMEOUT="${helm_timeout}" deploy_kata "" \
|
||||
--set monitor.enabled=true \
|
||||
--set "monitor.image.reference=${KATA_MONITOR_IMAGE_REFERENCE}" \
|
||||
--set "monitor.image.tag=${KATA_MONITOR_IMAGE_TAG}"
|
||||
|
||||
echo ""
|
||||
echo "::group::kata-monitor DaemonSet rollout"
|
||||
kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
|
||||
--timeout="${rollout_timeout}"
|
||||
echo "::endgroup::"
|
||||
|
||||
kubectl -n "${HELM_NAMESPACE}" wait pod \
|
||||
-l app.kubernetes.io/name=kata-monitor \
|
||||
--for=condition=Ready --timeout="${rollout_timeout}"
|
||||
|
||||
# Enabling monitor.enabled=true in the same chart deploys kata-monitor
|
||||
# alongside kata-deploy, and kata-deploy reconfigures and restarts
|
||||
# containerd as part of installing kata. That bounce drops the first
|
||||
# kata-monitor instance's containerd connection and costs it a one-off
|
||||
# restart, which is expected and not a regression. Now that kata-deploy
|
||||
# is Ready (so containerd is settled), restart the DaemonSet and assert
|
||||
# the fresh pods stay up without restarts — a genuine crash loop (e.g.
|
||||
# the recent glibc/musl mismatch) still fails here because it would
|
||||
# never reach Ready or would keep restarting.
|
||||
kubectl -n "${HELM_NAMESPACE}" rollout restart ds/kata-monitor
|
||||
kubectl -n "${HELM_NAMESPACE}" rollout status ds/kata-monitor \
|
||||
--timeout="${rollout_timeout}"
|
||||
|
||||
echo ""
|
||||
echo "::group::kata-monitor pods"
|
||||
kubectl -n "${HELM_NAMESPACE}" get pods -l app.kubernetes.io/name=kata-monitor -o wide
|
||||
echo "::endgroup::"
|
||||
|
||||
kubectl -n "${HELM_NAMESPACE}" wait pod \
|
||||
-l app.kubernetes.io/name=kata-monitor \
|
||||
--for=condition=Ready --timeout="${rollout_timeout}"
|
||||
|
||||
local restarts
|
||||
restarts="$(kubectl -n "${HELM_NAMESPACE}" get pods \
|
||||
-l app.kubernetes.io/name=kata-monitor \
|
||||
-o jsonpath='{range .items[*]}{.status.containerStatuses[0].restartCount}{"\n"}{end}')"
|
||||
while IFS= read -r r; do
|
||||
[[ -z "${r}" ]] && continue
|
||||
[[ "${r}" -eq 0 ]] || {
|
||||
echo "kata-monitor pod restarted ${r} time(s) after containerd settled; failing"
|
||||
return 1
|
||||
}
|
||||
done <<< "${restarts}"
|
||||
|
||||
# Give kata-monitor something real to surface metrics about: a kata
|
||||
# pod that just sleeps. Reuses the same image as kata-deploy.bats's
|
||||
# verification pod for cache-warmth on the runner.
|
||||
local probe_yaml
|
||||
probe_yaml=$(mktemp)
|
||||
cat > "${probe_yaml}" <<EOF
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: ${KATA_MONITOR_PROBE_POD}
|
||||
spec:
|
||||
runtimeClassName: kata-${KATA_HYPERVISOR}
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
katacontainers.io/kata-runtime: "true"
|
||||
containers:
|
||||
- name: probe
|
||||
image: quay.io/kata-containers/alpine-bash-curl:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["sh", "-c", "sleep 600"]
|
||||
EOF
|
||||
|
||||
echo ""
|
||||
echo "Creating kata probe pod ..."
|
||||
kubectl apply -f "${probe_yaml}"
|
||||
rm -f "${probe_yaml}"
|
||||
|
||||
kubectl wait "pod/${KATA_MONITOR_PROBE_POD}" \
|
||||
--for=condition=Ready --timeout=300s
|
||||
|
||||
echo ""
|
||||
echo "::group::Probe pod status"
|
||||
kubectl get "pod/${KATA_MONITOR_PROBE_POD}" -o wide
|
||||
echo "::endgroup::"
|
||||
|
||||
# Now the per-sandbox assertions. Wait for the monitor's cache to
|
||||
# pick the probe up, then prove a shim metric actually lands in the
|
||||
# scrape body.
|
||||
wait_for_metrics predicate_has_running_shim
|
||||
wait_for_metrics predicate_has_shim_metric
|
||||
echo "kata-monitor /metrics surfaced the probe sandbox"
|
||||
|
||||
# /sandboxes is the second public endpoint of kata-monitor; confirm
|
||||
# it lists at least one sandbox while the probe pod is alive.
|
||||
local sandboxes
|
||||
sandboxes="$(kata_monitor_get /sandboxes)"
|
||||
echo "::group::/sandboxes response"
|
||||
printf '%s\n' "${sandboxes}"
|
||||
echo "::endgroup::"
|
||||
[[ -n "${sandboxes//[[:space:]]/}" ]] || {
|
||||
echo "/sandboxes returned empty body" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
# Tear the probe pod down and prove kata-monitor's cache flushes
|
||||
# the sandbox out — mirrors is_sandbox_missing_iterate in the
|
||||
# host-level test.
|
||||
echo ""
|
||||
echo "Deleting probe pod and asserting cache invalidates ..."
|
||||
kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" --wait=true --timeout=60s
|
||||
|
||||
wait_for_metrics predicate_no_running_shim
|
||||
echo "kata-monitor /metrics dropped the probe sandbox after deletion"
|
||||
|
||||
popd
|
||||
}
|
||||
|
||||
# Per-test cleanup: remove the probe pod if it is still around, then
# uninstall the chart.
teardown() {
    # The test deletes the probe pod itself when it passes; this covers a
    # failure between creation and deletion. Errors are ignored and the call
    # does not wait for the deletion to finish.
    kubectl delete "pod/${KATA_MONITOR_PROBE_POD}" \
        --ignore-not-found --wait=false 2>/dev/null || :

    uninstall_kata
}
|
||||
26
tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
Normal file
26
tests/functional/kata-monitor/run-kata-monitor-helm-tests.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2026 NVIDIA Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
set -e
set -o pipefail

# Directory holding this script; used to locate common.bash and passed to
# run_bats_tests below.
kata_monitor_dir=$(dirname "$(readlink -f "$0")")
# shellcheck source=/dev/null
source "${kata_monitor_dir}/../../common.bash"

# Setting to "yes" enables fail fast, stopping execution at the first failed test.
export BATS_TEST_FAIL_FAST="${BATS_TEST_FAIL_FAST:-no}"

if [[ -n "${KATA_MONITOR_HELM_TEST_UNION:-}" ]]; then
    # Split the override on whitespace so it can name several bats files.
    # A quoted ("${VAR}") would turn the whole string into one array element.
    read -r -a KATA_MONITOR_HELM_TEST_UNION <<< "${KATA_MONITOR_HELM_TEST_UNION}"
else
    KATA_MONITOR_HELM_TEST_UNION=(
        "kata-monitor.bats"
    )
fi

# The array is passed by name.
run_bats_tests "${kata_monitor_dir}" KATA_MONITOR_HELM_TEST_UNION
|
||||
Reference in New Issue
Block a user