Merge pull request #10954 from kata-containers/topic/metrics-kata-deploy

Rework and fix metrics issues
2025-09-17 23:07:55 +00:00 · 2025-03-04 20:22:53 -05:00
parent edf6af2a43 c69509be1c
commit 4bb0eb4590
12 changed files with 105 additions and 70 deletions
--- a/tests/common.bash
+++ b/tests/common.bash
@@ -298,23 +298,6 @@ function clean_env_ctr()
 	fi
 }

-# Kills running shim and hypervisor components
-function kill_kata_components() {
-	local ATTEMPTS=2
-	local TIMEOUT="30s"
-	local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
-
-	sudo systemctl stop containerd
-	# iterate over the list of kata components and stop them
-	for (( i=1; i<=ATTEMPTS; i++ )); do
-		for PID_NAME in "${PID_NAMES[@]}"; do
-			[[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
-		done
-		sleep 1
-	done
-	sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
-}
-
 # Restarts a systemd service while ensuring the start-limit-burst is set to 0.
 # Outputs warnings to stdio if something has gone wrong.
 #
--- a/tests/integration/kubernetes/gha-run.sh
+++ b/tests/integration/kubernetes/gha-run.sh
@@ -433,8 +433,8 @@ function cleanup() {
 		return
 	fi

-	# In case of canceling workflow manually, 'run_kubernetes_tests.sh' continues running and triggers new tests, 
-	# resulting in the CI being in an unexpected state. So we need kill all running test scripts before cleaning up the node. 
+	# If the workflow is canceled manually, 'run_kubernetes_tests.sh' keeps running and triggers new tests,
+	# leaving the CI in an unexpected state. So we need to kill all running test scripts before cleaning up the node.
 	# See issue https://github.com/kata-containers/kata-containers/issues/9980
 	delete_test_runners	|| true
 	# Switch back to the default namespace and delete the tests one
@@ -594,6 +594,7 @@ function main() {
 		collect-artifacts) collect_artifacts ;;
 		cleanup) cleanup ;;
 		cleanup-kcli) cleanup "kcli" ;;
+		cleanup-kubeadm) cleanup "kubeadm" ;;
 		cleanup-sev) cleanup "sev" ;;
 		cleanup-snp) cleanup "snp" ;;
 		cleanup-tdx) cleanup "tdx" ;;
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
@@ -18,7 +18,7 @@ checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
 checktype = "mean"
 midval = 0.39
 minpercent = 40.0
-maxpercent = 30.0
+maxpercent = 50.0

 [[metric]]
 name = "memory-footprint"
@@ -121,7 +121,7 @@ description = "measure sequential write throughput using fio"
 checkvar = "[.\"fio\".\"Results sequential\"] | .[] | .[] | .write.bw | select( . != null )"
 checktype = "mean"
 midval = 307948
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -199,7 +199,7 @@ description = "measure container parallel bandwidth using iperf3"
 checkvar = ".\"network-iperf3\".Results | .[] | .parallel.Result"
 checktype = "mean"
 midval = 57516472021.90
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -211,6 +211,6 @@ description = "iperf"
 # within (inclusive)
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
-midval = 0.04
+midval = 0.02
 minpercent = 70.0
 maxpercent = 60.0
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
@@ -212,5 +212,5 @@ description = "iperf"
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
 midval = 0.040
-minpercent = 60.0
+minpercent = 80.0
 maxpercent = 60.0
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@@ -54,9 +54,14 @@ function make_tarball_results() {
 }

 function run_test_launchtimes() {
-	info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+	repetitions=20
+	if [[ ${KATA_HYPERVISOR} == "qemu" ]]; then
+		# The qemu workload seems to fail before it can complete ~5-7 repetitions, so use a lower count
+		repetitions=3
+	fi

-	bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
+	info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+	bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n "${repetitions}"
 }

 function run_test_memory_usage() {
@@ -114,14 +119,12 @@ function run_test_latency() {
 	info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"

 	bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-	check_metrics
 }

 function main() {
 	action="${1:-}"
 	case "${action}" in
-		install-kata) install_kata && install_checkmetrics ;;
+		install-checkmetrics) install_checkmetrics ;;
 		enabling-hypervisor) enabling_hypervisor ;;
 		make-tarball-results) make_tarball_results ;;
 		run-test-launchtimes) run_test_launchtimes ;;
@@ -132,7 +135,8 @@ function main() {
 		run-test-fio) run_test_fio ;;
 		run-test-iperf) run_test_iperf ;;
 		run-test-latency) run_test_latency ;;
-		*) >&2 die "Invalid argument" ;;
+		check-metrics) check_metrics;;
+		*) >&2 die "Invalid argument: ${action}" ;;
 	esac
 }

--- a/tests/metrics/lib/common.bash
+++ b/tests/metrics/lib/common.bash
@@ -224,6 +224,23 @@ function kill_processes_before_start()
 	kill_kata_components
 }

+# Stops containerd, kills any running shim and hypervisor components, then starts containerd again
+function kill_kata_components() {
+	local ATTEMPTS=2
+	local TIMEOUT="300s"
+	local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
+
+	sudo systemctl stop containerd
+	# iterate over the list of kata components and stop them
+	for (( i=1; i<=ATTEMPTS; i++ )); do
+		for PID_NAME in "${PID_NAMES[@]}"; do
+			[[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
+		done
+		sleep 1
+	done
+	sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
+}
+
 # Generate a random name - generally used when creating containers, but can
 # be used for any other appropriate purpose
 function random_name()
--- a/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
+++ b/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
@@ -179,7 +179,7 @@ function iperf3_start_deployment() {
 	# Check no processes are left behind
 	check_processes

-	wait_time=20
+	wait_time=180
 	sleep_time=2

 	# Create deployment
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
@@ -19,7 +19,7 @@ spec:
        app: iperf3-client
    spec:
      tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
@@ -25,12 +25,10 @@ spec:
          - weight: 1
            preference:
              matchExpressions:
-              - key: kubernetes.io/role
-                operator: In
-                values:
-                - master
+              - key: node-role.kubernetes.io/control-plane
+                operator: Exists
      tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
--- a/tests/metrics/network/latency_kubernetes/latency-network.sh
+++ b/tests/metrics/network/latency_kubernetes/latency-network.sh
@@ -33,12 +33,10 @@ function main() {
 	cmds=("bc" "jq")
 	check_cmds "${cmds[@]}"

-	init_env
-
 	# Check no processes are left behind
 	check_processes

-	wait_time=20
+	wait_time=180
 	sleep_time=2

 	# Create server