Merge pull request #10954 from kata-containers/topic/metrics-kata-deploy

Rework and fix metrics issues
2025-07-19 09:51:29 +00:00 · 2025-03-04 20:22:53 -05:00 · 2025-03-04 20:22:53 -05:00 · 4bb0eb4590
commit 4bb0eb4590
parent edf6af2a43 c69509be1c
12 changed files with 105 additions and 70 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -288,8 +288,11 @@ jobs:
    needs: build-kata-static-tarball-amd64
    uses: ./.github/workflows/run-metrics.yaml
    with:
-      tarball-suffix: -${{ inputs.tag }}
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
      commit-hash: ${{ inputs.commit-hash }}
+      pr-number: ${{ inputs.pr-number }}
      target-branch: ${{ inputs.target-branch }}

  run-basic-amd64-tests:
--- a/.github/workflows/run-metrics.yaml
+++ b/.github/workflows/run-metrics.yaml
@ -2,8 +2,17 @@ name: CI | Run test metrics
 on:
  workflow_call:
    inputs:
-      tarball-suffix:
-        required: false
+      registry:
+        required: true
+        type: string
+      repo:
+        required: true
+        type: string
+      tag:
+        required: true
+        type: string
+      pr-number:
+        required: true
        type: string
      commit-hash:
        required: false
@ -14,34 +23,7 @@ on:
        default: ""

 jobs:
-  setup-kata:
-    name: Kata Setup
-    runs-on: metrics
-    env:
-      GOPATH: ${{ github.workspace }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.commit-hash }}
-          fetch-depth: 0
-
-      - name: Rebase atop of the latest target branch
-        run: |
-          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
-        env:
-          TARGET_BRANCH: ${{ inputs.target-branch }}
-
-      - name: get-kata-tarball
-        uses: actions/download-artifact@v4
-        with:
-          name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
-          path: kata-artifacts
-
-      - name: Install kata
-        run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
-
  run-metrics:
-    needs: setup-kata
    strategy:
      # We can set this to true whenever we're 100% sure that
      # the all the tests are not flaky, otherwise we'll fail
@ -54,34 +36,78 @@ jobs:
    env:
      GOPATH: ${{ github.workspace }}
      KATA_HYPERVISOR: ${{ matrix.vmm }}
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      K8S_TEST_HOST_TYPE: "baremetal"
+      USING_NFD: "false"
+      KUBERNETES: kubeadm
    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Deploy Kata
+        timeout-minutes: 10
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
+
+      - name: Install check metrics
+        run: bash tests/metrics/gha-run.sh install-checkmetrics
+
      - name: enabling the hypervisor
        run: bash tests/metrics/gha-run.sh enabling-hypervisor

      - name: run launch times test
+        timeout-minutes: 15
+        continue-on-error: true
        run: bash tests/metrics/gha-run.sh run-test-launchtimes

      - name: run memory foot print test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-memory-usage

      - name: run memory usage inside container test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container

      - name: run blogbench test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-blogbench

      - name: run tensorflow test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-tensorflow

      - name: run fio test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-fio

      - name: run iperf test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-iperf

      - name: run latency test
+        timeout-minutes: 15
+        continue-on-error: true
        run:  bash tests/metrics/gha-run.sh run-test-latency

+      - name: check metrics
+        run:  bash tests/metrics/gha-run.sh check-metrics
+
      - name: make metrics tarball ${{ matrix.vmm }}
        run: bash tests/metrics/gha-run.sh make-tarball-results

@ -92,3 +118,8 @@ jobs:
          path: results-${{ matrix.vmm }}.tar.gz
          retention-days: 1
          if-no-files-found: error
+
+      - name: Delete kata-deploy
+        timeout-minutes: 10
+        if: always()
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm
--- a/tests/common.bash
+++ b/tests/common.bash
@ -298,23 +298,6 @@ function clean_env_ctr()
 	fi
 }

-# Kills running shim and hypervisor components
-function kill_kata_components() {
-	local ATTEMPTS=2
-	local TIMEOUT="30s"
-	local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
-
-	sudo systemctl stop containerd
-	# iterate over the list of kata components and stop them
-	for (( i=1; i<=ATTEMPTS; i++ )); do
-		for PID_NAME in "${PID_NAMES[@]}"; do
-			[[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
-		done
-		sleep 1
-	done
-	sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
-}
-
 # Restarts a systemd service while ensuring the start-limit-burst is set to 0.
 # Outputs warnings to stdio if something has gone wrong.
 #
--- a/tests/integration/kubernetes/gha-run.sh
+++ b/tests/integration/kubernetes/gha-run.sh
@ -594,6 +594,7 @@ function main() {
 		collect-artifacts) collect_artifacts ;;
 		cleanup) cleanup ;;
 		cleanup-kcli) cleanup "kcli" ;;
+		cleanup-kubeadm) cleanup "kubeadm" ;;
 		cleanup-sev) cleanup "sev" ;;
 		cleanup-snp) cleanup "snp" ;;
 		cleanup-tdx) cleanup "tdx" ;;
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
@ -18,7 +18,7 @@ checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
 checktype = "mean"
 midval = 0.39
 minpercent = 40.0
-maxpercent = 30.0
+maxpercent = 50.0

 [[metric]]
 name = "memory-footprint"
@ -121,7 +121,7 @@ description = "measure sequential write throughput using fio"
 checkvar = "[.\"fio\".\"Results sequential\"] | .[] | .[] | .write.bw | select( . != null )"
 checktype = "mean"
 midval = 307948
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@ -199,7 +199,7 @@ description = "measure container parallel bandwidth using iperf3"
 checkvar = ".\"network-iperf3\".Results | .[] | .parallel.Result"
 checktype = "mean"
 midval = 57516472021.90
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@ -211,6 +211,6 @@ description = "iperf"
 # within (inclusive)
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
-midval = 0.04
+midval = 0.02
 minpercent = 70.0
 maxpercent = 60.0
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
@ -212,5 +212,5 @@ description = "iperf"
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
 midval = 0.040
-minpercent = 60.0
+minpercent = 80.0
 maxpercent = 60.0
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@ -54,9 +54,14 @@ function make_tarball_results() {
 }

 function run_test_launchtimes() {
-	info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+	repetitions=20
+	if [[ ${KATA_HYPERVISOR} == "qemu" ]]; then
+		# With qemu the launch-time workload tends to fail after roughly 5-7 repetitions, so keep the count well below that
+		repetitions=3
+	fi

-	bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
+	info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+	bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n "${repetitions}"
 }

 function run_test_memory_usage() {
@ -114,14 +119,12 @@ function run_test_latency() {
 	info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"

 	bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-	check_metrics
 }

 function main() {
 	action="${1:-}"
 	case "${action}" in
-		install-kata) install_kata && install_checkmetrics ;;
+		install-checkmetrics) install_checkmetrics ;;
 		enabling-hypervisor) enabling_hypervisor ;;
 		make-tarball-results) make_tarball_results ;;
 		run-test-launchtimes) run_test_launchtimes ;;
@ -132,7 +135,8 @@ function main() {
 		run-test-fio) run_test_fio ;;
 		run-test-iperf) run_test_iperf ;;
 		run-test-latency) run_test_latency ;;
-		*) >&2 die "Invalid argument" ;;
+		check-metrics) check_metrics;;
+		*) >&2 die "Invalid argument: ${action}" ;;
 	esac
 }

--- a/tests/metrics/lib/common.bash
+++ b/tests/metrics/lib/common.bash
@ -224,6 +224,23 @@ function kill_processes_before_start()
 	kill_kata_components
 }

+# Kills running shim and hypervisor components
+function kill_kata_components() {
+	local ATTEMPTS=2
+	local TIMEOUT="300s"
+	local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
+
+	sudo systemctl stop containerd
+	# iterate over the list of kata components and stop them
+	for (( i=1; i<=ATTEMPTS; i++ )); do
+		for PID_NAME in "${PID_NAMES[@]}"; do
+			[[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
+		done
+		sleep 1
+	done
+	sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
+}
+
 # Generate a random name - generally used when creating containers, but can
 # be used for any other appropriate purpose
 function random_name()
--- a/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
+++ b/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
@ -179,7 +179,7 @@ function iperf3_start_deployment() {
 	# Check no processes are left behind
 	check_processes

-	wait_time=20
+	wait_time=180
 	sleep_time=2

 	# Create deployment
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
@ -19,7 +19,7 @@ spec:
        app: iperf3-client
    spec:
      tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
@ -25,12 +25,10 @@ spec:
          - weight: 1
            preference:
              matchExpressions:
-              - key: kubernetes.io/role
-                operator: In
-                values:
-                - master
+              - key: node-role.kubernetes.io/control-plane
+                operator: Exists
      tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
--- a/tests/metrics/network/latency_kubernetes/latency-network.sh
+++ b/tests/metrics/network/latency_kubernetes/latency-network.sh
@ -33,12 +33,10 @@ function main() {
 	cmds=("bc" "jq")
 	check_cmds "${cmds[@]}"

-	init_env
-
 	# Check no processes are left behind
 	check_processes

-	wait_time=20
+	wait_time=180
 	sleep_time=2

 	# Create server