Merge pull request #10954 from kata-containers/topic/metrics-kata-deploy

Rework and fix metrics issues
Zvonko Kaiser 2025-03-04 20:22:53 -05:00 committed by GitHub
commit 4bb0eb4590
12 changed files with 105 additions and 70 deletions

View File

@@ -288,8 +288,11 @@ jobs:
     needs: build-kata-static-tarball-amd64
     uses: ./.github/workflows/run-metrics.yaml
     with:
-      tarball-suffix: -${{ inputs.tag }}
+      registry: ghcr.io
+      repo: ${{ github.repository_owner }}/kata-deploy-ci
+      tag: ${{ inputs.tag }}-amd64
       commit-hash: ${{ inputs.commit-hash }}
+      pr-number: ${{ inputs.pr-number }}
       target-branch: ${{ inputs.target-branch }}

   run-basic-amd64-tests:

View File

@@ -2,8 +2,17 @@ name: CI | Run test metrics
 on:
   workflow_call:
     inputs:
-      tarball-suffix:
-        required: false
+      registry:
+        required: true
+        type: string
+      repo:
+        required: true
+        type: string
+      tag:
+        required: true
+        type: string
+      pr-number:
+        required: true
         type: string
       commit-hash:
         required: false
@@ -14,34 +23,7 @@ on:
         default: ""

 jobs:
-  setup-kata:
-    name: Kata Setup
-    runs-on: metrics
-    env:
-      GOPATH: ${{ github.workspace }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.commit-hash }}
-          fetch-depth: 0
-
-      - name: Rebase atop of the latest target branch
-        run: |
-          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
-        env:
-          TARGET_BRANCH: ${{ inputs.target-branch }}
-
-      - name: get-kata-tarball
-        uses: actions/download-artifact@v4
-        with:
-          name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
-          path: kata-artifacts
-
-      - name: Install kata
-        run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
-
   run-metrics:
-    needs: setup-kata
     strategy:
       # We can set this to true whenever we're 100% sure that
       # the all the tests are not flaky, otherwise we'll fail
@@ -54,34 +36,78 @@ jobs:
     env:
       GOPATH: ${{ github.workspace }}
       KATA_HYPERVISOR: ${{ matrix.vmm }}
+      DOCKER_REGISTRY: ${{ inputs.registry }}
+      DOCKER_REPO: ${{ inputs.repo }}
+      DOCKER_TAG: ${{ inputs.tag }}
+      GH_PR_NUMBER: ${{ inputs.pr-number }}
+      K8S_TEST_HOST_TYPE: "baremetal"
+      USING_NFD: "false"
+      KUBERNETES: kubeadm
     steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.commit-hash }}
+          fetch-depth: 0
+
+      - name: Rebase atop of the latest target branch
+        run: |
+          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
+        env:
+          TARGET_BRANCH: ${{ inputs.target-branch }}
+
+      - name: Deploy Kata
+        timeout-minutes: 10
+        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
+
+      - name: Install check metrics
+        run: bash tests/metrics/gha-run.sh install-checkmetrics
+
       - name: enabling the hypervisor
         run: bash tests/metrics/gha-run.sh enabling-hypervisor

       - name: run launch times test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-launchtimes

       - name: run memory foot print test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage

       - name: run memory usage inside container test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container

       - name: run blogbench test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-blogbench

       - name: run tensorflow test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-tensorflow

       - name: run fio test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-fio

       - name: run iperf test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-iperf

       - name: run latency test
+        timeout-minutes: 15
+        continue-on-error: true
         run: bash tests/metrics/gha-run.sh run-test-latency

+      - name: check metrics
+        run: bash tests/metrics/gha-run.sh check-metrics
+
       - name: make metrics tarball ${{ matrix.vmm }}
         run: bash tests/metrics/gha-run.sh make-tarball-results
@@ -92,3 +118,8 @@ jobs:
           path: results-${{ matrix.vmm }}.tar.gz
           retention-days: 1
           if-no-files-found: error
+
+      - name: Delete kata-deploy
+        timeout-minutes: 10
+        if: always()
+        run: bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm
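The reworked workflow no longer consumes a static tarball artifact; it points kata-deploy at a kata-deploy-ci image and provisions the metrics machine through the kubeadm entry points. A rough, hypothetical sketch of what the new environment variables amount to (the image-reference composition and the example values are assumptions, not stated by this diff; the two gha-run.sh calls are the ones the workflow uses above):

# Hypothetical illustration only: the values are examples, and the
# ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG} composition is an assumption
# about how the deploy scripts build the kata-deploy image reference.
export DOCKER_REGISTRY="ghcr.io"
export DOCKER_REPO="kata-containers/kata-deploy-ci"   # "<owner>/kata-deploy-ci" in the workflow
export DOCKER_TAG="<tag>-amd64"                       # placeholder, filled from inputs.tag
export KUBERNETES="kubeadm"
export K8S_TEST_HOST_TYPE="baremetal"

echo "kata-deploy image: ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}"

# Deploy Kata on the kubeadm cluster, then tear it down again, mirroring the
# "Deploy Kata" and "Delete kata-deploy" steps of the workflow.
bash tests/integration/kubernetes/gha-run.sh deploy-kata-kubeadm
bash tests/integration/kubernetes/gha-run.sh cleanup-kubeadm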

View File

@@ -298,23 +298,6 @@ function clean_env_ctr()
     fi
 }

-# Kills running shim and hypervisor components
-function kill_kata_components() {
-    local ATTEMPTS=2
-    local TIMEOUT="30s"
-    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
-
-    sudo systemctl stop containerd
-    # iterate over the list of kata components and stop them
-    for (( i=1; i<=ATTEMPTS; i++ )); do
-        for PID_NAME in "${PID_NAMES[@]}"; do
-            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
-        done
-        sleep 1
-    done
-    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
-}
-
 # Restarts a systemd service while ensuring the start-limit-burst is set to 0.
 # Outputs warnings to stdio if something has gone wrong.
 #

View File

@@ -433,8 +433,8 @@ function cleanup() {
         return
     fi
     # In case of canceling workflow manually, 'run_kubernetes_tests.sh' continues running and triggers new tests,
     # resulting in the CI being in an unexpected state. So we need kill all running test scripts before cleaning up the node.
     # See issue https://github.com/kata-containers/kata-containers/issues/9980
     delete_test_runners || true

     # Switch back to the default namespace and delete the tests one
@@ -594,6 +594,7 @@ function main() {
         collect-artifacts) collect_artifacts ;;
         cleanup) cleanup ;;
         cleanup-kcli) cleanup "kcli" ;;
+        cleanup-kubeadm) cleanup "kubeadm" ;;
         cleanup-sev) cleanup "sev" ;;
         cleanup-snp) cleanup "snp" ;;
         cleanup-tdx) cleanup "tdx" ;;

View File

@@ -18,7 +18,7 @@ checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
 checktype = "mean"
 midval = 0.39
 minpercent = 40.0
-maxpercent = 30.0
+maxpercent = 50.0

 [[metric]]
 name = "memory-footprint"
@@ -121,7 +121,7 @@ description = "measure sequential write throughput using fio"
 checkvar = "[.\"fio\".\"Results sequential\"] | .[] | .[] | .write.bw | select( . != null )"
 checktype = "mean"
 midval = 307948
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -199,7 +199,7 @@ description = "measure container parallel bandwidth using iperf3"
 checkvar = ".\"network-iperf3\".Results | .[] | .parallel.Result"
 checktype = "mean"
 midval = 57516472021.90
-minpercent = 20.0
+minpercent = 40.0
 maxpercent = 20.0

 [[metric]]
@@ -211,6 +211,6 @@ description = "iperf"
 # within (inclusive)
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
-midval = 0.04
+midval = 0.02
 minpercent = 70.0
 maxpercent = 60.0
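For context, checkmetrics appears to gate each metric's mean against a band derived from midval, minpercent and maxpercent, so the widened boot-times band can be worked out as below. This is a hedged sketch: the [midval*(1 - minpercent/100), midval*(1 + maxpercent/100)] interpretation is an assumption about checkmetrics, not something this diff states.

# Assumed gate semantics (not confirmed by this diff): the measured mean must
# fall within [midval*(1 - minpercent/100), midval*(1 + maxpercent/100)].
midval=0.39
minpercent=40.0
maxpercent=50.0   # was 30.0 before this change

lower=$(echo "${midval} * (1 - ${minpercent} / 100)" | bc -l)   # 0.234
upper=$(echo "${midval} * (1 + ${maxpercent} / 100)" | bc -l)   # 0.585 (0.507 with the old 30.0)

echo "boot-times to-workload mean must lie in [${lower}, ${upper}]"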

View File

@@ -212,5 +212,5 @@ description = "iperf"
 checkvar = ".\"network-iperf3\".Results | .[] | .jitter.Result"
 checktype = "mean"
 midval = 0.040
-minpercent = 60.0
+minpercent = 80.0
 maxpercent = 60.0

View File

@@ -54,9 +54,14 @@ function make_tarball_results() {
 }

 function run_test_launchtimes() {
-    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    repetitions=20
+    if [[ ${KATA_HYPERVISOR} == "qemu" ]]; then
+        # The qemu workload seems to fail before it can run ~5-7 repetitions of the workload
+        repetitions=3
+    fi

-    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
+    info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
+    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n "${repetitions}"
 }

 function run_test_memory_usage() {
@@ -114,14 +119,12 @@ function run_test_latency() {
     info "Running Latency test using ${KATA_HYPERVISOR} hypervisor"

     bash tests/metrics/network/latency_kubernetes/latency-network.sh
-
-    check_metrics
 }

 function main() {
     action="${1:-}"
     case "${action}" in
-        install-kata) install_kata && install_checkmetrics ;;
+        install-checkmetrics) install_checkmetrics ;;
         enabling-hypervisor) enabling_hypervisor ;;
         make-tarball-results) make_tarball_results ;;
         run-test-launchtimes) run_test_launchtimes ;;
@@ -132,7 +135,8 @@ function main() {
         run-test-fio) run_test_fio ;;
         run-test-iperf) run_test_iperf ;;
         run-test-latency) run_test_latency ;;
-        *) >&2 die "Invalid argument" ;;
+        check-metrics) check_metrics;;
+        *) >&2 die "Invalid argument: ${action}" ;;
     esac
 }
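Taken together, the metrics job now drives everything through gha-run.sh entry points instead of a dedicated setup job. A rough local reproduction of the workflow's sequence, using only commands that appear in this diff (the KATA_HYPERVISOR value is just an example):

# Mirrors the workflow steps above; run from a kata-containers checkout on a
# node that already has Kata deployed (e.g. via deploy-kata-kubeadm).
export KATA_HYPERVISOR="qemu"   # example value; the workflow sets it from matrix.vmm

bash tests/metrics/gha-run.sh install-checkmetrics
bash tests/metrics/gha-run.sh enabling-hypervisor
bash tests/metrics/gha-run.sh run-test-launchtimes
bash tests/metrics/gha-run.sh run-test-latency
bash tests/metrics/gha-run.sh check-metrics          # single gate once all tests have run
bash tests/metrics/gha-run.sh make-tarball-results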

View File

@@ -224,6 +224,23 @@ function kill_processes_before_start()
     kill_kata_components
 }

+# Kills running shim and hypervisor components
+function kill_kata_components() {
+    local ATTEMPTS=2
+    local TIMEOUT="300s"
+    local PID_NAMES=( "containerd-shim-kata-v2" "qemu-system-x86_64" "qemu-system-x86_64-tdx-experimental" "cloud-hypervisor" )
+
+    sudo systemctl stop containerd
+    # iterate over the list of kata components and stop them
+    for (( i=1; i<=ATTEMPTS; i++ )); do
+        for PID_NAME in "${PID_NAMES[@]}"; do
+            [[ ! -z "$(pidof ${PID_NAME})" ]] && sudo killall -w -s SIGKILL "${PID_NAME}" >/dev/null 2>&1 || true
+        done
+        sleep 1
+    done
+    sudo timeout -s SIGKILL "${TIMEOUT}" systemctl start containerd
+}
+
 # Generate a random name - generally used when creating containers, but can
 # be used for any other appropriate purpose
 function random_name()

View File

@@ -179,7 +179,7 @@ function iperf3_start_deployment() {
     # Check no processes are left behind
     check_processes

-    wait_time=20
+    wait_time=180
     sleep_time=2

     # Create deployment

View File

@@ -19,7 +19,7 @@ spec:
         app: iperf3-client
     spec:
       tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
           operator: Exists
           effect: NoSchedule
       containers:

View File

@@ -25,12 +25,10 @@ spec:
           - weight: 1
             preference:
               matchExpressions:
-                - key: kubernetes.io/role
-                  operator: In
-                  values:
-                    - master
+                - key: node-role.kubernetes.io/control-plane
+                  operator: Exists
       tolerations:
-        - key: node-role.kubernetes.io/master
+        - key: node-role.kubernetes.io/control-plane
           operator: Exists
           effect: NoSchedule
       containers:
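These scheduling hints still targeted the legacy master node role; kubeadm-provisioned clusters label and taint control-plane nodes with node-role.kubernetes.io/control-plane (the master taint was dropped in newer Kubernetes releases), so the old toleration and affinity no longer matched anything. A quick, illustrative way to check what a node actually advertises (not part of the commit):

# List node labels and taints to confirm the control-plane key is the one in use.
kubectl get nodes --show-labels
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}'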

View File

@@ -33,12 +33,10 @@ function main() {
     cmds=("bc" "jq")
     check_cmds "${cmds[@]}"

-    init_env
-
     # Check no processes are left behind
     check_processes

-    wait_time=20
+    wait_time=180
     sleep_time=2

     # Create server
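The wait_time bump (20 s to 180 s) matters because these values typically bound a readiness-polling loop for the test workload, and a kata-deploy-provisioned bare-metal node can take noticeably longer to pull images and start pods. A generic sketch of that pattern, under the assumption that the repo's own wait helper behaves similarly (the deployment name below is illustrative, not taken from this diff):

wait_time=180   # maximum seconds to wait for the workload to become ready
sleep_time=2    # seconds between checks
deployment="latency-server"   # illustrative name only

elapsed=0
until kubectl rollout status "deployment/${deployment}" --timeout=1s >/dev/null 2>&1; do
    if (( elapsed >= wait_time )); then
        echo "ERROR: ${deployment} not ready after ${wait_time}s" >&2
        exit 1
    fi
    sleep "${sleep_time}"
    (( elapsed += sleep_time ))
done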