diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh
index cf0b95d49a..cdff6c766b 100755
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@@ -11,6 +11,7 @@ set -o pipefail
 kata_tarball_dir=${2:-kata-artifacts}
 metrics_dir="$(dirname "$(readlink -f "$0")")"
+source "${metrics_dir}/../common.bash"
 
 create_symbolic_links() {
 	hypervisor="${1:-qemu}"
@@ -18,39 +19,76 @@ create_symbolic_links() {
 	local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml"
 
 	if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then
-		exit 2
+		die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name."
 	fi
 
 	sudo ln -sf "${source_configuration_file}" "${link_configuration_file}"
 }
 
+# Overwrites the containerd config with one that sets Kata as the default runtime
+overwrite_containerd_config() {
+	containerd_config="/etc/containerd/config.toml"
+	sudo rm "${containerd_config}"
+	sudo tee "${containerd_config}" << EOF
+version = 2
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+  SystemdCgroup = true
+
+[plugins]
+  [plugins."io.containerd.grpc.v1.cri"]
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      default_runtime_name = "kata"
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata]
+          runtime_type = "io.containerd.kata.v2"
+EOF
+}
+
 install_kata() {
-	local katadir="/opt/kata"
-	local destdir="/"
+	local kata_tarball="kata-static.tar.xz"
+	declare -r katadir="/opt/kata"
+	declare -r destdir="/"
+	declare -r local_bin_dir="/usr/local/bin/"
 
 	# Removing previous kata installation
 	sudo rm -rf "${katadir}"
 
 	pushd ${kata_tarball_dir}
-	for c in kata-static-*.tar.xz; do
-		echo "untarring tarball "${c}" into ${destdir}"
-		sudo tar -xvf "${c}" -C "${destdir}"
-	done
+	sudo tar -xvf "${kata_tarball}" -C "${destdir}"
 	popd
+
+	# Create symbolic links to the kata components
+	for b in "${katadir}"/bin/*; do
+		sudo ln -sf "${b}" "${local_bin_dir}/$(basename "${b}")"
+	done
+
+	check_containerd_config_for_kata
+	restart_containerd_service
+}
+
+check_containerd_config_for_kata() {
+	# Check that containerd is configured to use Kata as the default runtime
+	declare -r line1="default_runtime_name = \"kata\""
+	declare -r line2="runtime_type = \"io.containerd.kata.v2\""
+	declare -r num_lines_containerd=2
+	declare -r containerd_path="/etc/containerd/config.toml"
+	local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path})
+
+	if [ $count_matches = $num_lines_containerd ]; then
+		info "containerd ok"
+	else
+		info "Overwriting containerd configuration with a valid one"
+		overwrite_containerd_config
+	fi
 }
 
 function run_test_launchtimes() {
 	hypervisor="${1}"
 
-	echo "Running launchtimes tests: "
+	info "Running Launch Time test using ${hypervisor} hypervisor"
 
 	create_symbolic_links "${hypervisor}"
-
-	if [ "${hypervisor}" = 'qemu' ]; then
-		echo "qemu"
-	elif [ "${hypervisor}" = 'clh' ]; then
-		echo "clh"
-	fi
+	bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
 }
 
 function main() {
@@ -59,7 +97,7 @@ function main() {
 		install-kata) install_kata ;;
 		run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;;
 		run-test-launchtimes-clh) run_test_launchtimes "clh" ;;
-		*) >&2 echo "Invalid argument"; exit 2 ;;
+		*) >&2 die "Invalid argument" ;;
 	esac
 }
 
diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash
index 30c5c1321d..ad7b2a5c41 100755
--- a/tests/metrics/lib/common.bash
+++ b/tests/metrics/lib/common.bash
@@ -8,7 +8,7 @@ THIS_FILE=$(readlink -f ${BASH_SOURCE[0]})
 LIB_DIR=${THIS_FILE%/*}
 RESULT_DIR="${LIB_DIR}/../results"
 
-source ${LIB_DIR}/../../lib/common.bash
+source ${LIB_DIR}/../../common.bash
 source ${LIB_DIR}/json.bash
 source /etc/os-release || source /usr/lib/os-release
 
diff --git a/tests/metrics/time/launch_times.sh b/tests/metrics/time/launch_times.sh
new file mode 100755
index 0000000000..0458b979ed
--- /dev/null
+++ b/tests/metrics/time/launch_times.sh
@@ -0,0 +1,482 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Description of the test:
+# This test takes a number of time measurements through the complete
+# launch/shutdown cycle of a single container.
+# From those measurements it derives a number of time measures, such as:
+#  - time to payload execution
+#  - time to get to VM kernel
+#  - time in VM kernel boot
+#  - time to quit
+#  - total time (from launch to finished)
+#
+# Note, the image used for this test must support the full 'date' command
+# syntax - the date from busybox for instance *does not* support this, so
+# will not work with this test.
+#
+# Note, this test launches a single container at a time, that quits - thus,
+# this test measures times for the 'first container' only. This test does
+# not look for any scalability slowdowns as the number of running containers
+# increases, for instance - that is handled in other tests.
+
+set -e
+
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+# Calculating the kernel time from dmesg stamps only really works for VM
+# based runtimes - we dynamically enable it if we find we are using a known
+# VM runtime
+CALCULATE_KERNEL=
+
+REQUIRED_CMDS=("bc" "awk")
+
+# Set the total number of decimal digits after the decimal point
+# used to represent the calculation results
+CALC_SCALE=4
+
+# The date command format we use to ensure we capture the ns timings
+# Note the no-0-padding - 0 padding the results breaks bc in some cases
+DATECMD="date -u +%-s:%-N"
+
+# The modern Linux RNG is extremely fast at generating entropy on demand
+# and does not need to have as large a store of entropy anymore, as the value
+# of 256 was found to work well with common cryptographic algorithms
+entropy_level="256"
+
+# Tracks the number of iterations performed
+num_iters=0
+
+# Maximum number of repetitions allowed for failed runs
+MAX_REPETITIONS=3
+
+# The individual results are stored in arrays
+declare -a total_result_ds
+declare -a to_workload_ds
+declare -a in_kernel_ds
+declare -a to_kernel_ds
+declare -a to_quit_ds
+
+# data_is_valid value 1 represents not valid
+# data_is_valid value 0 represents valid
+data_is_valid=0
+
+check_entropy_level() {
+	retries="10"
+	for i in $(seq 1 "$retries"); do
+		if [ $(cat /proc/sys/kernel/random/entropy_avail) -ge ${entropy_level} ]; then
+			break;
+		fi
+		sleep 1
+	done
+	if [ $(cat /proc/sys/kernel/random/entropy_avail) -lt ${entropy_level} ]; then
+		die "Not enough entropy level to run this test"
+	fi
+}
+
+# convert a 'seconds:nanoseconds' string into nanoseconds
+sn_to_ns() {
+	# !!: Remove leading 0's, otherwise the number would be interpreted as octal
+	s=$(echo ${1%:*} | sed 's/^0*//g')
+	ns=$(echo ${1##*:} | sed 's/^0*//g')
+	# use shell magic to strip out the 's' and 'ns' fields and print
+	# them as a 0-padded ns string...
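+	# e.g. sn_to_ns "2:5000" prints "2000005000" (2s plus 5000ns, expressed in ns)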
+	printf "%d%09d" ${s} ${ns}
+}
+
+# convert 'nanoseconds' (since epoch) into a 'float' seconds
+ns_to_s() {
+	printf "%.0${CALC_SCALE}f" $(bc <<< "scale=$CALC_SCALE; $1 / 1000000000")
+}
+
+run_workload() {
+	# L_CALC_SCALE provides a generous number of decimal digits after the
+	# decimal point for 'bc' when estimating the kernel period
+	L_CALC_SCALE=13
+	start_time=$($DATECMD)
+
+	# Check entropy level of the host
+	check_entropy_level
+
+	# Run the image and command and capture the results into an array...
+	declare workload_result
+	readarray -n 0 workload_result < <(sudo -E "${CTR_EXE}" run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test bash -c "$DATECMD $DMESGCMD")
+
+	end_time=$($DATECMD)
+
+	# Delay this calculation until after we have run - we do not want
+	# to measure it in the results
+	start_time=$(sn_to_ns $start_time)
+	end_time=$(sn_to_ns $end_time)
+
+	# Extract the 'date' info from the first line of the log
+	# This script assumes the VM clock is in sync with the host clock...
+	workload_time=${workload_result[0]}
+	workload_time=$(echo $workload_time | tr -d '\r')
+	workload_time=$(sn_to_ns $workload_time)
+
+	# How long did the whole launch/quit take
+	total_period=$((end_time-start_time))
+	# How long did it take to get to the workload
+	workload_period=$((workload_time-start_time))
+	# How long did it take to quit
+	shutdown_period=$((end_time-workload_time))
+
+	if [ -n "$CALCULATE_KERNEL" ]; then
+		# Grab the last kernel dmesg time
+		# In our case, we need to find the last real kernel line before
+		# the systemd lines begin. The last
+		# 'Freeing unused kernel' line is a reasonable
+		# 'last in kernel line' to look for.
+		# We make a presumption here that as we are in a cold-boot VM
+		# kernel, the first dmesg is at '0 seconds', so the timestamp
+		# of that last line is the length of time in the kernel.
+		kernel_last_line=$( (fgrep "Freeing unused kernel" <<- EOF
+			${workload_result[@]}
+			EOF
+		) | tail -1 )
+
+		if [ -z "$kernel_last_line" ]; then
+			echo "No kernel last line"
+			for l in "${workload_result[@]}"; do
+				echo ">: [$l]"
+			done
+			die "No kernel last line"
+		fi
+
+		kernel_period=$(echo $kernel_last_line | awk '{print $2}' | tr -d "]")
+
+		# And we can then work out how much time it took to get to the kernel
+		to_kernel_period=$(printf "%f" $(bc <<<"scale=$L_CALC_SCALE; $(ns_to_s $workload_period) - $kernel_period"))
+	else
+		kernel_period="0.0"
+		to_kernel_period="0.0"
+	fi
+
+	total_result="$(ns_to_s $total_period)"
+	to_workload="$(ns_to_s $workload_period)"
+	in_kernel=$kernel_period
+	to_kernel=$to_kernel_period
+	to_quit=$(ns_to_s $shutdown_period)
+
+	tr_is_neg=$(echo $total_result'<='0.0 | bc -l)
+	tw_is_neg=$(echo $to_workload'<='0.0 | bc -l)
+	ik_is_neg=$(echo $in_kernel'<='0.0 | bc -l)
+	tk_is_neg=$(echo $to_kernel'<='0.0 | bc -l)
+	tq_is_neg=$(echo $to_quit'<='0.0 | bc -l)
+
+	data_is_valid=0
+	if [ $tr_is_neg -eq 1 ] || [ $tw_is_neg -eq 1 ] || [ $ik_is_neg -eq 1 ] || [ $tk_is_neg -eq 1 ] || [ $tq_is_neg -eq 1 ]; then
+		data_is_valid=1
+	else
+		# Insert results individually
+		total_result_ds+=($total_result)
+		to_workload_ds+=($to_workload)
+		in_kernel_ds+=($in_kernel)
+		to_kernel_ds+=($to_kernel)
+		to_quit_ds+=($to_quit)
+	fi
+
+	((num_iters+=1))
+
+	# If we are doing an (optional) scaling test, then we launch a permanent container
+	# between each of our 'test' containers. The aim being to see if our launch times
+	# are linear with the number of running containers or not
+	if [ -n "$SCALING" ]; then
+		sudo -E "${CTR_EXE}" run --runtime=${CTR_RUNTIME} -d ${IMAGE} test bash -c "tail -f /dev/null"
+	fi
+}
+
+# Writes a JSON with the measurement
+# results of each execution
+write_individual_results() {
+	for i in "${!total_result_ds[@]}"; do
+		local json="$(cat << EOF
+	{
+		"total": {
+			"Result": ${total_result_ds[i]},
+			"Units": "s"
+		},
+		"to-workload": {
+			"Result": ${to_workload_ds[i]},
+			"Units": "s"
+		},
+		"in-kernel": {
+			"Result": ${in_kernel_ds[i]},
+			"Units": "s"
+		},
+		"to-kernel": {
+			"Result": ${to_kernel_ds[i]},
+			"Units": "s"
+		},
+		"to-quit": {
+			"Result": ${to_quit_ds[i]},
+			"Units": "s"
+		}
+	}
+EOF
+)"
+		metrics_json_add_array_element "$json"
+	done
+}
+
+init() {
+	TEST_ARGS="image=${IMAGE} runtime=${CTR_RUNTIME} units=seconds"
+
+	# We set the generic name here, but we save the different time results separately,
+	# and append the actual detail to the name at the time of saving...
+	TEST_NAME="boot times"
+
+	# If we are scaling, note that in the name
+	[ -n "$SCALING" ] && TEST_NAME="${TEST_NAME} scaling"
+
+	echo "Executing test: ${TEST_NAME} ${TEST_ARGS}"
+	check_cmds "${REQUIRED_CMDS[@]}"
+
+	# For non-VM runtimes, we don't use the output of dmesg, and
+	# we have seen it cause some test instabilities, so do not invoke
+	# it if not needed.
+	if [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then
+		DMESGCMD=""
+	else
+		CALCULATE_KERNEL=1
+		DMESGCMD="; dmesg"
+	fi
+
+	# Start from a fairly clean environment
+	init_env
+	check_images "$IMAGE"
+}
+
+# Computes the average of the data
+calc_avg_array() {
+	data=("$@")
+	avg=0
+	LSCALE=6
+	size="${#data[@]}"
+
+	[ -z "$data" ] && die "List of results was not passed to the calc_avg_array() function when trying to calculate the average result."
+	[ $size -eq 0 ] && die "Division by zero: The number of items is 0 when trying to calculate the average result."
+
+	sum=$(IFS='+'; echo "scale=4; ${data[*]}" | bc)
+	avg=$(echo "scale=$LSCALE; $sum / $size" | bc)
+	printf "%.0${CALC_SCALE}f" $avg
+}
+
+# Computes the standard deviation of the data
+calc_sd_array() {
+	data=("$@")
+	sum_sqr_n=0
+	size=${#data[@]}
+
+	# LSCALE is the scale used for the intermediate calculations
+	# CALC_SCALE is the scale used for the result
+	LSCALE=13
+	CALC_SCALE=6
+
+	[ -z "$data" ] && die "List of results was not passed to the calc_sd_array() function when trying to calculate the standard deviation result."
+	[ $size -eq 0 ] && die "Division by zero: The number of items is 0 when trying to calculate the standard deviation result."
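+
+	# The steps below use the textbook shortcut formula for the (population)
+	# variance, var = (sum(x^2) - (sum(x))^2 / n) / n, evaluated with 'bc' at
+	# LSCALE decimal digits; the standard deviation is then sqrt(var).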
+ + + # [1] sum data + sum_data=$(IFS='+'; echo "scale=$LSCALE; ${data[*]}" | bc) + + # [2] square the sum of data + pow_2_sum_data=$(echo "scale=$LSCALE; $sum_data ^ 2" | bc) + + # [3] divide the square of data by the num of items + div_sqr_n=$(echo "scale=$LSCALE; $pow_2_sum_data / $size" | bc) + + # [4] Sum of the sqr of each item + for i in "${data[@]}"; do + sqr_n=$(echo "scale=$LSCALE; $i ^ 2" | bc) + sum_sqr_n=$(echo "scale=$LSCALE; $sqr_n + $sum_sqr_n" | bc) + done + + # substract [4] from [3] + subs=$(echo "scale=$LSCALE; $sum_sqr_n - $div_sqr_n" | bc) + + # get variance + var=$(echo "scale=$LSCALE; $subs / $size" | bc) + + # get standard deviation + sd=$(echo "scale=$LSCALE; sqrt($var)" | bc) + + # if sd is zero, limit the decimal scale to 1 digit + sd_is_zero=$(echo $sd'=='0.0 | bc -l) + [ $sd_is_zero -eq 1 ] && CALC_SCALE=1 + + printf "%.0${CALC_SCALE}f" $sd +} + +# Computes the Coefficient of variation. +# The result is given as percentage. +calc_cov_array() { + sd=$1 + mean=$2 + + # LSCALE used for consider more decimals digits than usual in cov estimation. + # CALC_SCALE is the scale used to return the result. + LSCALE=13 + CALC_SCALE=6 + + mean_is_zero=$(echo $mean'=='0.0 | bc -l) + + [ -z "$sd" ] && die "Standard deviation was not passed to the calc_cov_array() function when trying to calculate the CoV result." + [ -z "$mean" ] && die "Mean was not passed to the calc_cov_array() function when trying to calculate the CoV result." + [ $mean_is_zero -eq 1 ] && die "Division by zero: Mean value passed is 0 when trying to get CoV result." + + cov=$(echo "scale=$LSCALE; $sd / $mean" | bc) + cov=$(echo "scale=$LSCALE; $cov * 100" | bc) + + # if cov is zero, limit the decimal scale to 1 digit + cov_is_zero=$(echo $cov'=='0.0 | bc -l) + [ $cov_is_zero -eq 1 ] && CALC_SCALE=1 + + printf "%.0${CALC_SCALE}f" $cov +} + +# Writes a JSON with the statistics results +# for each launch time metric +write_stats_results() { + size="${#total_result_ds[@]}" + avg_total_result=$(calc_avg_array "${total_result_ds[@]}") + avg_to_workload=$(calc_avg_array "${to_workload_ds[@]}") + avg_in_kernel=$(calc_avg_array "${in_kernel_ds[@]}") + avg_to_kernel=$(calc_avg_array "${to_kernel_ds[@]}") + avg_to_quit=$(calc_avg_array "${to_quit_ds[@]}") + + sd_total_result=$(calc_sd_array "${total_result_ds[@]}") + sd_to_workload=$(calc_sd_array "${to_workload_ds[@]}") + sd_in_kernel=$(calc_sd_array "${in_kernel_ds[@]}") + sd_to_kernel=$(calc_sd_array "${to_kernel_ds[@]}") + sd_to_quit=$(calc_sd_array "${to_quit_ds[@]}") + + cov_total_result=$(calc_cov_array ${sd_total_result} ${avg_total_result}) + cov_to_workload=$(calc_cov_array ${sd_to_workload} ${avg_to_workload}) + cov_in_kernel=$(calc_cov_array ${sd_in_kernel} ${avg_in_kernel}) + cov_to_kernel=$(calc_cov_array ${sd_to_kernel} ${avg_to_kernel}) + cov_to_quit=$(calc_cov_array ${sd_to_quit} ${avg_to_quit}) + + local json="$(cat << EOF + { + "size": $size, + "total": { + "avg": $avg_total_result, + "sd": $sd_total_result, + "cov": $cov_total_result + }, + "to-workload": { + "avg": $avg_to_workload, + "sd": $sd_to_workload, + "cov": $cov_to_workload + }, + "in-kernel": { + "avg": $avg_in_kernel, + "sd": $sd_in_kernel, + "cov": $cov_in_kernel + }, + "to-kernel_avg": { + "avg": $avg_to_kernel, + "sd": $sd_to_kernel, + "cov": $cov_to_kernel + }, + "to-quit": { + "avg": $avg_to_quit, + "sd": $sd_to_quit, + "cov": $cov_to_quit + } + } +EOF +)" + metrics_json_add_array_element "$json" +} + +help() { + usage=$(cat << EOF +Usage: $0 [-h] [options] + Description: + This 
+    stages of a boot/run/rm cycle
+  Options:
+    -h,         Help
+    -i <name>,  Image name (mandatory)
+    -n <n>,     Number of containers to run (mandatory)
+    -s,         Enable scaling (keep containers running)
+EOF
+)
+	echo "$usage"
+}
+
+main() {
+	local OPTIND
+	while getopts "dhi:n:s" opt; do
+		case ${opt} in
+		h)
+			help
+			exit 0
+			;;
+		i)
+			IMAGE="${OPTARG}"
+			;;
+		n)
+			TIMES="${OPTARG}"
+			;;
+		s)
+			SCALING=true
+			;;
+		?)
+			# parse failure
+			help
+			die "Failed to parse arguments"
+			;;
+		esac
+	done
+	shift $((OPTIND-1))
+
+	[ -z "$IMAGE" ] && help && die "Mandatory IMAGE name not supplied"
+	[ -z "$TIMES" ] && help && die "Mandatory number of containers not supplied"
+	# Although this is mandatory, the 'lib/common.bash' environment can set
+	# it, so we may not fail if it is not set on the command line...
+	[ -z "$RUNTIME" ] && help && die "Mandatory runtime argument not supplied"
+
+	init
+	j=0
+	max_reps=$MAX_REPETITIONS
+
+	while [ $j -lt $TIMES ]; do
+
+		echo " run $num_iters"
+		run_workload
+
+		if [ $data_is_valid -eq 0 ]; then
+			j=$(( j + 1 ))
+			# if the result is valid then reset 'max_reps' to its initial value
+			max_reps=$MAX_REPETITIONS
+			continue
+		fi
+
+		echo "Skipping run due to invalid result"
+		# use a plain assignment rather than ((max_reps-=1)) so that reaching
+		# zero does not trip 'set -e'
+		max_reps=$((max_reps - 1))
+
+		if [ $max_reps -lt 0 ]; then
+			die "Max. num of repetitions reached for run: $j"
+		fi
+	done
+
+	metrics_json_init
+	metrics_json_start_array
+	write_stats_results
+	metrics_json_end_array "Statistics"
+	metrics_json_start_array
+	write_individual_results
+	metrics_json_end_array "Results"
+	metrics_json_save
+	clean_env_ctr
+}
+
+main "$@"
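
For reference, a rough sketch of how the new script might be driven by hand, mirroring the call that run_test_launchtimes() makes above. The image name and iteration count are simply the values gha-run.sh uses, not requirements of the script itself, and the runtime can be picked up from tests/metrics/lib/common.bash if it is not set on the command line:

    # 20 measured boot/run/rm cycles of the Ubuntu image
    bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20

    # Same, but also leave a long-lived container running between measured launches,
    # to check whether launch times grow with the number of running containers
    bash tests/metrics/time/launch_times.sh -s -i public.ecr.aws/ubuntu/ubuntu:latest -n 20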