diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index cd6f15259b..ed7a1eecfb 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -288,8 +288,11 @@ jobs:
     needs: build-kata-static-tarball-amd64
     uses: ./.github/workflows/run-metrics.yaml
     with:
-      tarball-suffix: -${{ inputs.tag }}
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
       commit-hash: ${{ inputs.commit-hash }}
+      pr-number: ${{ inputs.pr-number }}
       target-branch: ${{ inputs.target-branch }}
 
   run-basic-amd64-tests:
diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml
index be39105e78..a6074ba32a 100644
--- a/.github/workflows/run-metrics.yaml
+++ b/.github/workflows/run-metrics.yaml
@@ -2,8 +2,17 @@ name: CI | Run test metrics
 on:
   workflow_call:
     inputs:
-      tarball-suffix:
-        required: false
+      registry:
+        required: true
+        type: string
+      repo:
+        required: true
+        type: string
+      tag:
+        required: true
+        type: string
+      pr-number:
+        required: true
         type: string
       commit-hash:
         required: false
@@ -14,34 +23,7 @@ on:
       default: ""
 
 jobs:
-  setup-kata:
-    name: Kata Setup
-    runs-on: metrics
-    env:
-      GOPATH: ${{ github.workspace }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.commit-hash }}
-          fetch-depth: 0
-
-      - name: Rebase atop of the latest target branch
-        run: |
-          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
-        env:
-          TARGET_BRANCH: ${{ inputs.target-branch }}
-
-      - name: get-kata-tarball
-        uses: actions/download-artifact@v4
-        with:
-          name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
-          path: kata-artifacts
-
-      - name: Install kata
-        run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
-
   run-metrics:
-    needs: setup-kata
     strategy:
       # We can set this to true whenever we're 100% sure that
       # the all the tests are not flaky, otherwise we'll fail
@@ -54,34 +36,78 @@ jobs:
     env:
       GOPATH: ${{ github.workspace }}
       KATA_HYPERVISOR: ${{ matrix.vmm }}
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      K8S_TEST_HOST_TYPE: "baremetal"
+      USING_NFD: "false"
+      KUBERNETES: kubeadm
     steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Deploy Kata
+        timeout-minutes: 10
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
+
+      - name: Install check metrics
+        run: bash tests/metrics/gha-run.sh install-checkmetrics
+
       - name: enabling the hypervisor
         run: bash tests/metrics/gha-run.sh enabling-hypervisor
 
       - name: run launch times test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-launchtimes
 
       - name: run memory foot print test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage
 
       - name: run memory usage inside container test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
 
       - name: run blogbench test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-blogbench
 
       - name: run tensorflow test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-tensorflow
 
       - name: run fio test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-fio
 
       - name: run iperf test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-iperf
 
       - name: run latency test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-latency
 
+      - name: check metrics
+        run: bash tests/metrics/gha-run.sh check-metrics
+
       - name: make metrics tarball ${{ matrix.vmm }}
         run: bash tests/metrics/gha-run.sh make-tarball-results
 
@@ -92,3 +118,8 @@ jobs:
           path: results-${{ matrix.vmm }}.tar.gz
           retention-days: 1
           if-no-files-found: error
+
+      - name: Delete kata-deploy
+        timeout-minutes: 10
+        if: always()
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm
diff --git a/tests/common.bash b/tests/common.bash
index 1a7597ea82..546370fc48 100644
--- a/tests/common.bash
+++ b/tests/common.bash
@@ -298,23 +298,6 @@ function clean_env_ctr()
     fi
 }
 
-# Kills running shim and hypervisor components
-function kill_kata_components() {
-    local ATTEMPTS=2
-    local TIMEOUT="30s"
-    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
-
-    sudo systemctl stop containerd
-    # iterate over the list of kata components and stop them
-    for (( i=1; i<=ATTEMPTS; i++ )); do
-        for PID_NAME in "${PID_NAMES[@]}"; do
-            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
-        done
-        sleep 1
-    done
-    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
-}
-
 # Restarts a systemd service while ensuring the start-limit-burst is set to 0.
 # Outputs warnings to stdio if something has gone wrong.
 #
diff --git a/tests/integration/kubernetes/gha-run.sh b/tests/integration/kubernetes/gha-run.sh
index f5a8084408..9b7e7e45b6 100755
--- a/tests/integration/kubernetes/gha-run.sh
+++ b/tests/integration/kubernetes/gha-run.sh
@@ -433,8 +433,8 @@ function cleanup() {
         return
     fi
 
-    # In case of canceling workflow manually, 'run_kubernetes_tests.sh' continues running and triggers new tests,
-    # resulting in the CI being in an unexpected state. So we need kill all running test scripts before cleaning up the node.
+    # In case of canceling the workflow manually, 'run_kubernetes_tests.sh' continues running and triggers new tests,
+    # resulting in the CI being in an unexpected state. So we need to kill all running test scripts before cleaning up the node.
     # See issue https://github.com/kata-containers/kata-containers/issues/9980
     delete_test_runners || true
     # Switch back to the default namespace and delete the tests one
@@ -594,6 +594,7 @@ function main() {
         collect-artifacts) collect_artifacts ;;
         cleanup) cleanup ;;
         cleanup-kcli) cleanup "kcli" ;;
+        cleanup-kubeadm) cleanup "kubeadm" ;;
         cleanup-sev) cleanup "sev" ;;
         cleanup-snp) cleanup "snp" ;;
         cleanup-tdx) cleanup "tdx" ;;
diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
index d23d935a33..5c9f126dd7 100644
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
@@ -18,7 +18,7 @@ checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
 checktype = "mean"
 midval = 0.39
 minpercent = 40.0
-maxpercent = 30.0
+maxpercent = 50.0
 
 [[metric]]
 name = "memory-footprint"
@@ -121,7 +121,7 @@ description = "measure sequential write throughput using fio"
 checkvar = "[.\"fio\".\"Results sequential\"] | .[] | .[] | .write.bw | select( . != null )"
 checktype = "mean"
 midval = 307948
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0
 
 [[metric]]
@@ -199,7 +199,7 @@ description = "measure container parallel bandwidth using iperf3"
 checkvar = ".\"network-iperf3\".Results | .[] | .parallel.Result"
 checktype = "mean"
 midval = 57516472021.90
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0
 
 [[metric]]
@@ -211,6 +211,6 @@ description = "iperf"
 # within (inclusive)
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
-midval = 0.04
+midval = 0.02
 minpercent = 70.0
 maxpercent = 60.0
diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
index 94bc4ee2e3..a328e21296 100644
--- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
@@ -212,5 +212,5 @@ description = "iperf"
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
 midval = 0.040
-minpercent = 60.0
+minpercent = 80.0
 maxpercent = 60.0
diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh
index 4edf79f028..9ca6046491 100755
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@@ -54,9 +54,14 @@ function make_tarball_results() {
 }
 
 function run_test_launchtimes() {
-    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    repetitions=20
+    if [[ ${KATA_HYPERVISOR} == "qemu" ]]; then
+        # The qemu workload seems to fail before it can run ~5-7 repetitions of the workload
+        repetitions=3
+    fi
 
-    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
+    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n "${repetitions}"
 }
 
 function run_test_memory_usage() {
@@ -114,14 +119,12 @@ function run_test_latency() {
     info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"
 
     bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-    check_metrics
 }
 
 function main() {
     action="${1:-}"
     case "${action}" in
-        install-kata) install_kata && install_checkmetrics ;;
+        install-checkmetrics) install_checkmetrics ;;
         enabling-hypervisor) enabling_hypervisor ;;
         make-tarball-results) make_tarball_results ;;
         run-test-launchtimes) run_test_launchtimes ;;
@@ -132,7 +135,8 @@ function main() {
         run-test-fio) run_test_fio ;;
         run-test-iperf) run_test_iperf ;;
         run-test-latency) run_test_latency ;;
-        *) >&2 die "Invalid argument" ;;
+        check-metrics) check_metrics ;;
+        *) >&2 die "Invalid argument: ${action}" ;;
     esac
 }
 
diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash
index f94a1978a5..fcd166cc67 100755
--- a/tests/metrics/lib/common.bash
+++ b/tests/metrics/lib/common.bash
@@ -224,6 +224,23 @@ function kill_processes_before_start() {
     kill_kata_components
 }
 
+# Kills running shim and hypervisor components
+function kill_kata_components() {
+    local ATTEMPTS=2
+    local TIMEOUT="300s"
+    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
+
+    sudo systemctl stop containerd
+    # iterate over the list of kata components and stop them
+    for (( i=1; i<=ATTEMPTS; i++ )); do
+        for PID_NAME in "${PID_NAMES[@]}"; do
+            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
+        done
+        sleep 1
+    done
+    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
+}
+
 # Generate a random name - generally used when creating containers, but can
 # be used for any other appropriate purpose
 function random_name()
diff --git a/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh b/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
index 03a2d6f353..696be2f035 100755
--- a/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
+++ b/tests/metrics/network/iperf3_kubernetes/k8s-network-metrics-iperf3.sh
@@ -179,7 +179,7 @@ function iperf3_start_deployment() {
     # Check no processes are left behind
     check_processes
 
-    wait_time=20
+    wait_time=180
     sleep_time=2
 
     # Create deployment
diff --git a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
index f0a02bbe7c..0fb4ba15fd 100644
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-daemonset.yaml
@@ -19,7 +19,7 @@ spec:
         app: iperf3-client
     spec:
       tolerations:
-      - key: node-role.kubernetes.io/master
+      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
      containers:
diff --git a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
index 6be5754910..95fcca0b66 100644
--- a/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
+++ b/tests/metrics/network/iperf3_kubernetes/runtimeclass_workloads/iperf3-deployment.yaml
@@ -25,12 +25,10 @@ spec:
         - weight: 1
           preference:
             matchExpressions:
-            - key: kubernetes.io/role
-              operator: In
-              values:
-              - master
+            - key: node-role.kubernetes.io/control-plane
+              operator: Exists
       tolerations:
-      - key: node-role.kubernetes.io/master
+      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
      containers:
diff --git a/tests/metrics/network/latency_kubernetes/latency-network.sh b/tests/metrics/network/latency_kubernetes/latency-network.sh
index 1d8b773eb8..03158adbb7 100755
--- a/tests/metrics/network/latency_kubernetes/latency-network.sh
+++ b/tests/metrics/network/latency_kubernetes/latency-network.sh
@@ -33,12 +33,10 @@ function main() {
     cmds=("bc" "jq")
     check_cmds "${cmds[@]}"
 
-    init_env
-
     # Check no processes are left behind
     check_processes
 
-    wait_time=20
+    wait_time=180
     sleep_time=2
 
     # Create server