Merge pull request #10954 from kata-containers/topic/metrics-kata-deploy

Rework and fix metrics issues
Zvonko Kaiser 2025-03-04 20:22:53 -05:00 committed by GitHub
commit 4bb0eb4590
12 changed files with 105 additions and 70 deletions

View File

@@ -288,8 +288,11 @@ jobs:
     needs: build-kata-static-tarball-amd64
     uses: ./.github/workflows/run-metrics.yaml
     with:
-      tarball-suffix: -${{ inputs.tag }}
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
       commit-hash: ${{ inputs.commit-hash }}
+      pr-number: ${{ inputs.pr-number }}
       target-branch: ${{ inputs.target-branch }}

   run-basic-amd64-tests:

View File

@@ -2,8 +2,17 @@ name: CI | Run test metrics
 on:
   workflow_call:
     inputs:
-      tarball-suffix:
-        required: false
+      registry:
+        required: true
+        type: string
+      repo:
+        required: true
+        type: string
+      tag:
+        required: true
+        type: string
+      pr-number:
+        required: true
         type: string
       commit-hash:
         required: false
@@ -14,34 +23,7 @@ on:
         default: ""

 jobs:
-  setup-kata:
-    name: Kata Setup
-    runs-on: metrics
-    env:
-      GOPATH: ${{ github.workspace }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.commit-hash }}
-          fetch-depth: 0
-
-      - name: Rebase atop of the latest target branch
-        run: |
-          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
-        env:
-          TARGET_BRANCH: ${{ inputs.target-branch }}
-
-      - name: get-kata-tarball
-        uses: actions/download-artifact@v4
-        with:
-          name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
-          path: kata-artifacts
-
-      - name: Install kata
-        run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
-
   run-metrics:
-    needs: setup-kata
     strategy:
       # We can set this to true whenever we're 100% sure that
       # the all the tests are not flaky, otherwise we'll fail
@@ -54,34 +36,78 @@ jobs:
     env:
       GOPATH: ${{ github.workspace }}
       KATA_HYPERVISOR: ${{ matrix.vmm }}
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      K8S_TEST_HOST_TYPE: "baremetal"
+      USING_NFD: "false"
+      KUBERNETES: kubeadm
     steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Deploy Kata
+        timeout-minutes: 10
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
+
+      - name: Install check metrics
+        run: bash tests/metrics/gha-run.sh install-checkmetrics
+
       - name: enabling the hypervisor
         run: bash tests/metrics/gha-run.sh enabling-hypervisor

       - name: run launch times test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-launchtimes

       - name: run memory foot print test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage

       - name: run memory usage inside container test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container

       - name: run blogbench test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-blogbench

       - name: run tensorflow test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-tensorflow

       - name: run fio test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-fio

       - name: run iperf test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-iperf

       - name: run latency test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-latency

+      - name: check metrics
+        run: bash tests/metrics/gha-run.sh check-metrics
+
       - name: make metrics tarball ${{ matrix.vmm }}
         run: bash tests/metrics/gha-run.sh make-tarball-results
@@ -92,3 +118,8 @@ jobs:
           path: results-${{ matrix.vmm }}.tar.gz
           retention-days: 1
           if-no-files-found: error
+
+      - name: Delete kata-deploy
+        timeout-minutes: 10
+        if: always()
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm
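The reworked workflow no longer consumes a static tarball artifact; it points kata-deploy at a kata-deploy-ci image and provisions the metrics machine through the kubeadm entry points. A rough, hypothetical sketch of what the new environment variables amount to (the image-reference composition and the example values are assumptions, not stated by this diff; the two gha-run.sh calls are the ones the workflow uses above):

# Hypothetical illustration only: the values are examples, and the
# ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG} composition is an assumption
# about how the deploy scripts build the kata-deploy image reference.
export DOCKER_REGISTRY="ghcr.io"
export DOCKER_REPO="kata-containers/kata-deploy-ci"   # "<owner>/kata-deploy-ci" in the workflow
export DOCKER_TAG="<tag>-amd64"                       # placeholder, filled from inputs.tag
export KUBERNETES="kubeadm"
export K8S_TEST_HOST_TYPE="baremetal"

echo "kata-deploy image: ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}"

# Deploy Kata on the kubeadm cluster, then tear it down again, mirroring the
# "Deploy Kata" and "Delete kata-deploy" steps of the workflow.
bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm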

View File

@@ -298,23 +298,6 @@ function clean_env_ctr()
     fi
 }

-# Kills running shim and hypervisor components
-function kill_kata_components() {
-    local ATTEMPTS=2
-    local TIMEOUT="30s"
-    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
-
-    sudo systemctl stop containerd
-    # iterate over the list of kata components and stop them
-    for (( i=1; i<=ATTEMPTS; i++ )); do
-        for PID_NAME in "${PID_NAMES[@]}"; do
-            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
-        done
-        sleep 1
-    done
-    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
-}
-
 # Restarts a systemd service while ensuring the start-limit-burst is set to 0.
 # Outputs warnings to stdio if something has gone wrong.
 #

View File

@@ -433,8 +433,8 @@ function cleanup() {
         return
     fi
     # In case of canceling workflow manually, 'run_kubernetes_tests.sh' continues running and triggers new tests,
     # resulting in the CI being in an unexpected state. So we need kill all running test scripts before cleaning up the node.
     # See issue https://github.com/kata-containers/kata-containers/issues/9980
     delete_test_runners || true

     # Switch back to the default namespace and delete the tests one
@@ -594,6 +594,7 @@ function main() {
         collect-artifacts) collect_artifacts ;;
         cleanup) cleanup ;;
         cleanup-kcli) cleanup "kcli" ;;
+        cleanup-kubeadm) cleanup "kubeadm" ;;
         cleanup-sev) cleanup "sev" ;;
         cleanup-snp) cleanup "snp" ;;
         cleanup-tdx) cleanup "tdx" ;;

View File

@@ -18,7 +18,7 @@ checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
 checktype = "mean"
 midval = 0.39
 minpercent = 40.0
-maxpercent = 30.0
+maxpercent = 50.0

 [[metric]]
 name = "memory-footprint"
@@ -121,7 +121,7 @@ description = "measure sequential write throughput using fio"
 checkvar = "[.\"fio\".\"Results sequential\"] | .[] | .[] | .write.bw | select( . != null )"
 checktype = "mean"
 midval = 307948
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -199,7 +199,7 @@ description = "measure container parallel bandwidth using iperf3"
 checkvar = ".\"network-iperf3\".Results | .[] | .parallel.Result"
 checktype = "mean"
 midval = 57516472021.90
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -211,6 +211,6 @@ description = "iperf"
 # within (inclusive)
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
-midval = 0.04
+midval = 0.02
 minpercent = 70.0
 maxpercent = 60.0
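For context, checkmetrics appears to gate each metric's mean against a band derived from midval, minpercent and maxpercent, so the widened boot-times band can be worked out as below. This is a hedged sketch: the [midval*(1 - minpercent/100), midval*(1 + maxpercent/100)] interpretation is an assumption about checkmetrics, not something this diff states.

# Assumed gate semantics (not confirmed by this diff): the measured mean must
# fall within [midval*(1 - minpercent/100), midval*(1 + maxpercent/100)].
midval=0.39
minpercent=40.0
maxpercent=50.0   # was 30.0 before this change

lower=$(echo "${midval} * (1 - ${minpercent} / 100)" | bc -l)   # 0.234
upper=$(echo "${midval} * (1 + ${maxpercent} / 100)" | bc -l)   # 0.585 (0.507 with the old 30.0)

echo "boot-times to-workload mean must lie in [${lower}, ${upper}]"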

View File

@@ -212,5 +212,5 @@ description = "iperf"
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
 midval = 0.040
-minpercent = 60.0
+minpercent = 80.0
 maxpercent = 60.0

View File

@@ -54,9 +54,14 @@ function make_tarball_results() {
 }

 function run_test_launchtimes() {
-    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    repetitions=20
+    if [[ ${KATA_HYPERVISOR} == "qemu" ]]; then
+        # The qemu workload seems to fail before it can run ~5-7 repetitions of the workload
+        repetitions=3
+    fi

-    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
+    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n "${repetitions}"
 }

 function run_test_memory_usage() {
@@ -114,14 +119,12 @@ function run_test_latency() {
     info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"

     bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-    check_metrics
 }

 function main() {
     action="${1:-}"
     case "${action}" in
-        install-kata) install_kata && install_checkmetrics ;;
+        install-checkmetrics) install_checkmetrics ;;
         enabling-hypervisor) enabling_hypervisor ;;
         make-tarball-results) make_tarball_results ;;
         run-test-launchtimes) run_test_launchtimes ;;
@@ -132,7 +135,8 @@ function main() {
         run-test-fio) run_test_fio ;;
         run-test-iperf) run_test_iperf ;;
         run-test-latency) run_test_latency ;;
-        *) >&2 die "Invalid argument" ;;
+        check-metrics) check_metrics;;
+        *) >&2 die "Invalid argument: ${action}" ;;
     esac
 }
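Taken together, the metrics job now drives everything through gha-run.sh entry points instead of a dedicated setup job. A rough local reproduction of the workflow's sequence, using only commands that appear in this diff (the KATA_HYPERVISOR value is just an example):

# Mirrors the workflow steps above; run from a kata-containers checkout on a
# node that already has Kata deployed (e.g. via deploy-kata-kubeadm).
export KATA_HYPERVISOR="qemu"   # example value; the workflow sets it from matrix.vmm

bash tests/metrics/gha-run.sh install-checkmetrics
bash tests/metrics/gha-run.sh enabling-hypervisor
bash tests/metrics/gha-run.sh run-test-launchtimes
bash tests/metrics/gha-run.sh run-test-latency
bash tests/metrics/gha-run.sh check-metrics          # single gate once all tests have run
bash tests/metrics/gha-run.sh make-tarball-results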

View File

@@ -224,6 +224,23 @@ function kill_processes_before_start()
     kill_kata_components
 }

+# Kills running shim and hypervisor components
+function kill_kata_components() {
+    local ATTEMPTS=2
+    local TIMEOUT="300s"
+    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
+
+    sudo systemctl stop containerd
+    # iterate over the list of kata components and stop them
+    for (( i=1; i<=ATTEMPTS; i++ )); do
+        for PID_NAME in "${PID_NAMES[@]}"; do
+            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
+        done
+        sleep 1
+    done
+    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
+}
+
 # Generate a random name - generally used when creating containers, but can
 # be used for any other appropriate purpose
 function random_name()

View File

@@ -179,7 +179,7 @@ function iperf3_start_deployment() {
     # Check no processes are left behind
     check_processes

-    wait_time=20
+    wait_time=180
     sleep_time=2

     # Create deployment

View File

@@ -19,7 +19,7 @@ spec:
         app: iperf3-client
     spec:
       tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
           operator: Exists
           effect: NoSchedule
       containers:

View File

@@ -25,12 +25,10 @@ spec:
           - weight: 1
             preference:
               matchExpressions:
-                - key: kubernetes.io/role
-                  operator: In
-                  values:
-                    - master
+                - key: node-role.kubernetes.io/control-plane
+                  operator: Exists
       tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
           operator: Exists
           effect: NoSchedule
       containers:
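These scheduling hints still targeted the legacy master node role; kubeadm-provisioned clusters label and taint control-plane nodes with node-role.kubernetes.io/control-plane (the master taint was dropped in newer Kubernetes releases), so the old toleration and affinity no longer matched anything. A quick, illustrative way to check what a node actually advertises (not part of the commit):

# List node labels and taints to confirm the control-plane key is the one in use.
kubectl get nodes --show-labels
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}'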

View File

@@ -33,12 +33,10 @@ function main() {
     cmds=("bc" "jq")
     check_cmds "${cmds[@]}"

-    init_env
-
     # Check no processes are left behind
     check_processes

-    wait_time=20
+    wait_time=180
     sleep_time=2

     # Create server
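The wait_time bump (20 s to 180 s) matters because these values typically bound a readiness-polling loop for the test workload, and a kata-deploy-provisioned bare-metal node can take noticeably longer to pull images and start pods. A generic sketch of that pattern, under the assumption that the repo's own wait helper behaves similarly (the deployment name below is illustrative, not taken from this diff):

wait_time=180   # maximum seconds to wait for the workload to become ready
sleep_time=2    # seconds between checks
deployment="latency-server"   # illustrative name only

elapsed=0
until kubectl rollout status "deployment/${deployment}" --timeout=1s >/dev/null 2>&1; do
    if (( elapsed >= wait_time )); then
        echo "ERROR: ${deployment} not ready after ${wait_time}s" >&2
        exit 1
    fi
    sleep "${sleep_time}"
    (( elapsed += sleep_time ))
done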