From 3fab7944a34c727f6ad1d24e93cc30927b18b4c2 Mon Sep 17 00:00:00 2001
From: stevenhorsman <steven@uk.ibm.com>
Date: Thu, 27 Feb 2025 16:57:34 +0000
Subject: [PATCH] workflows: Improve metrics jobs

- As the metrics tests are largely independent,
allow subsequent tests to run even if previous
ones failed. The results might not be perfect if
clean-up is required, but we can work on that later.
- Move the test results check out of the latency test,
where its placement seems arbitrary, and into its own
job step
- Add timeouts to steps that might fail/hang if there
are containerd/K8s issues

Signed-off-by: stevenhorsman <steven@uk.ibm.com>
---
 .github/workflows/run-metrics.yaml | 20 ++++++++++++++++++++
 tests/metrics/gha-run.sh           |  3 +--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml
index d5dbb32b0a..a6074ba32a 100644
--- a/.github/workflows/run-metrics.yaml
+++ b/.github/workflows/run-metrics.yaml
@@ -66,29 +66,48 @@ jobs:
         run: bash tests/metrics/gha-run.sh enabling-hypervisor
 
       - name: run launch times test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-launchtimes
 
       - name: run memory foot print test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-memory-usage
 
       - name: run memory usage inside container test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
 
       - name: run blogbench test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-blogbench
 
       - name: run tensorflow test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-tensorflow
 
       - name: run fio test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-fio
 
       - name: run iperf test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-iperf
 
       - name: run latency test
+        timeout-minutes: 15
+        continue-on-error: true
         run:  bash tests/metrics/gha-run.sh run-test-latency
 
+      - name: check metrics
+        run:  bash tests/metrics/gha-run.sh check-metrics
+
       - name: make metrics tarball ${{ matrix.vmm }}
         run: bash tests/metrics/gha-run.sh make-tarball-results
 
@@ -101,5 +120,6 @@ jobs:
           if-no-files-found: error
 
       - name: Delete kata-deploy
+        timeout-minutes: 10
         if: always()
         run: bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm
diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh
index 7fb55df89e..a02f1e45fa 100755
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@@ -114,8 +114,6 @@ function run_test_latency() {
 	info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"
 
 	bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-	check_metrics
 }
 
 function main() {
@@ -132,6 +130,7 @@ function main() {
 		run-test-fio) run_test_fio ;;
 		run-test-iperf) run_test_iperf ;;
 		run-test-latency) run_test_latency ;;
+		check-metrics) check_metrics;;
 		*) >&2 die "Invalid argument: ${action}" ;;
 	esac
 }