From 63b8534b4146431248695895b52f677fc64e6090 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 20 Jul 2023 19:59:40 +0000 Subject: [PATCH 01/10] metrics: Enable Tensorflow metrics for kata CI This PR enables the Tensorflow benchmark metrics for kata CI. Fixes #7395 Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 99ec5a1c1a..6a31e27dc7 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -81,8 +81,6 @@ function run_test_blogbench() { function run_test_tensorflow() { info "Running TensorFlow test using ${KATA_HYPERVISOR} hypervisor" - # ToDo: remove the exit once the metrics workflow is stable - exit 0 bash tests/metrics/machine_learning/tensorflow.sh 1 20 } From 08dfaa97aa2dc9b94f113094ea9f29dfe4c768dc Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 20 Jul 2023 20:35:01 +0000 Subject: [PATCH 02/10] metrics: General improvements to the tensorflow script This PR adds general improvements to the tensorflow script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 1 + tests/metrics/machine_learning/tensorflow.sh | 179 +++++++++++++------ 2 files changed, 125 insertions(+), 55 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 6a31e27dc7..7bb54e2da3 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -8,6 +8,7 @@ set -o errexit set -o nounset set -o pipefail +set -x kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index e4af1ef8a5..8aee730cf5 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -4,7 +4,8 @@ # # SPDX-License-Identifier: Apache-2.0 -set -e +#set -e +set -x # General env SCRIPT_PATH=$(dirname "$(readlink -f "$0")") @@ -14,16 +15,28 @@ IMAGE="docker.io/library/tensorflow:latest" DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile" BATCH_SIZE="512" NUM_BATCHES="300" -CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result" -CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l" -tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX) +resnet_tensorflow_file=$(mktemp resnettensorflowresults.XXXXXXXXXX) +alexnet_tensorflow_file=$(mktemp alexnettensorflowresults.XXXXXXXXXX) NUM_CONTAINERS="$1" TIMEOUT="$2" TEST_NAME="tensorflow" PAYLOAD_ARGS="tail -f /dev/null" +# Options to control the start of the workload using a trigger-file +dst_dir="/host" +src_dir=$(mktemp --tmpdir -d tensorflow.XXXXXXXXXX) +MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro" +# CMD points to the script that starts the workload +alexnet_start_script="alexnet_start.sh" +resnet_start_script="resnet_start.sh" +CMD_RESNET="$dst_dir/$resnet_start_script" +CMD_ALEXNET="$dst_dir/$alexnet_start_script" +timeout=600 +INITIAL_NUM_PIDS=1 +CMD_FILE="cat alexnet_results | 
grep 'total images' | wc -l" +RESNET_CMD_FILE="cat resnet_results | grep 'total images' | wc -l" function remove_tmp_file() { - rm -rf "${tensorflow_file}" + rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}" } trap remove_tmp_file EXIT @@ -31,81 +44,117 @@ trap remove_tmp_file EXIT function help() { cat << EOF Usage: $0 - Description: - This script launches n number of containers - to run the tf cnn benchmarks using a Tensorflow - container. - Options: - : Number of containers to run. - : Timeout to launch the containers. + Description: + This script launches n number of containers + to run the tf cnn benchmarks using a Tensorflow + container. + Options: + : Number of containers to run. + : Timeout to launch the containers. EOF } -function resnet50_test() { - local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result" - info "Running Resnet50 Tensorflow test" +function create_resnet_start_script() { + local script="${src_dir}/${resnet_start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > resnet_results +EOF + chmod +x "${script}" +} + +function create_alexnet_start_script() { + local script="${src_dir}/${alexnet_start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=100 --device=cpu --batch_size=100 --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results +EOF + chmod +x "${script}" +} + +function tensorflow_test() { + info "Copy Resnet Tensorflow test" + local pids=() + local j=0 for i in "${containers[@]}"; do - sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}" + $(sudo -E "${CTR_EXE}" t exec -d --exec-id 
"$(random_name)" "${i}" sh -c "${CMD_RESNET}")& + pids["${j}"]=$! + ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" + done + + info "All containers are running the workload..." + + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${RESNET_CMD_FILE}") + retries="100" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq "1" ] && break + sleep 1 + done + done + + info "Copy Alexnet Tensorflow test" + local pids=() + local j=0 + for i in "${containers[@]}"; do + $(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_ALEXNET}")& + pids["${j}"]=$! + ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" done for i in "${containers[@]}"; do check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") - retries="200" + retries="300" for j in $(seq 1 "${retries}"); do - [ "${check_file}" -eq 1 ] && break + [ "${check_file}" -eq "1" ] && break sleep 1 done done for i in "${containers[@]}"; do - sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}" + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat resnet_results" >> "${resnet_tensorflow_file}" + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results" >> "${alexnet_tensorflow_file}" done - local resnet50_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') - local average_resnet50=$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + local resnet_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local average_resnet=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) local json="$(cat << EOF { - 
"Resnet50": { - "Result": "${resnet50_results}", - "Average": "${average_resnet50}", - "Units": "s" + "Resnet": { + "Result": "${resnet_results}", + "Average": "${average_resnet}", + "Units": "images/s" } } EOF )" + metrics_json_add_array_element "$json" -} -function axelnet_test() { - local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result" - info "Running AxelNet Tensorflow test" - for i in "${containers[@]}"; do - sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}" - done - - for i in "${containers[@]}"; do - check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") - retries="200" - for j in $(seq 1 "${retries}"); do - [ "${check_file}" -eq 1 ] && break - sleep 1 - done - done - - for i in "${containers[@]}"; do - sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}" - done - - local axelnet_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') - local average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + local alexnet_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local average_alexnet=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) local json="$(cat << EOF { - "AxelNet": { - "Result": "${axelnet_results}", - "Average": "${average_axelnet}", - "Units": "s" + "AlexNet": { + "Result": "${alexnet_results}", + "Average": "${average_alexnet}", + "Units": "images/s" } } EOF @@ -143,11 +192,14 @@ function main() { check_ctr_images "${IMAGE}" "${DOCKERFILE}" init_env + create_resnet_start_script + create_alexnet_start_script + info "Creating 
${NUM_CONTAINERS} containers" for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do containers+=($(random_name)) - sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" + sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" ((not_started_count--)) info "$not_started_count remaining containers" done @@ -158,12 +210,29 @@ function main() { # Check that the requested number of containers are running check_containers_are_up - resnet50_test + # Check that the requested number of containers are running + local timeout_launch="10" + check_containers_are_up & pid=$! + (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! - axelnet_test + if wait "${pid}" 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi + + # Get the initial number of pids in a single container before the workload starts + INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) + ((INITIAL_NUM_PIDS++)) + + tensorflow_test metrics_json_save + rm -rf "${src_dir}" + clean_env_ctr } main "$@" From 3c32875046147c9e28c81616ba5fcee68789c277 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 21 Jul 2023 20:02:02 +0000 Subject: [PATCH 03/10] checkmetrics: Add Resnet value for clh This PR adds the checkmetrics Resnet value for clh. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-clh-kata-metric8.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 9ca4d139b3..75f1592176 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -71,3 +71,16 @@ checktype = "mean" midval = 96939.0 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow resnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" +checktype = "mean" +midval = 4379.2 +minpercent = 20.0 +maxpercent = 20.0 From a79a3a8e1d7d4c0d1634e4ac32e9b17b9dc20d81 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 21 Jul 2023 20:03:49 +0000 Subject: [PATCH 04/10] checkmetrics: Add alexnet value for clh This PR adds the AlexNet value for clh for checkmetrics. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-clh-kata-metric8.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 75f1592176..edcef8d8d1 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -84,3 +84,16 @@ checktype = "mean" midval = 4379.2 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow alexnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .AlexNet.Result" +checktype = "mean" +midval = 98.0 +minpercent = 20.0 +maxpercent = 20.0 From a435d36fe160a252b846a6c1e0142b8b2a134f84 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 21 Jul 2023 20:05:54 +0000 Subject: [PATCH 05/10] checkmetrics: Add Resnet value for qemu This PR adds the Resnet value for qemu for checkmetrics. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-qemu-kata-metric8.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index 4860fa2563..7a423ec5f8 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -71,3 +71,16 @@ checktype = "mean" midval = 98687.0 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow resnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" +checktype = "mean" +midval = 4396.2 +minpercent = 20.0 +maxpercent = 20.0 From 53af71cfd0db77cae4283292b7bc575ff16c5224 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 21 Jul 2023 20:07:05 +0000 Subject: [PATCH 06/10] checkmetrics: Add AlexNet value for qemu This PR adds AlexNet value for qemu for checkmetrics. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-qemu-kata-metric8.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index 7a423ec5f8..392025e358 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -84,3 +84,16 @@ checktype = "mean" midval = 4396.2 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow alexnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .AlexNet.Result" +checktype = "mean" +midval = 98.3 +minpercent = 20.0 +maxpercent = 20.0 From f9dec11a8fdfbc0e83811490c1f63962ea878222 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 21 Jul 2023 21:14:37 +0000 Subject: [PATCH 07/10] checkmetrics: Move checkmetrics to gha-run script This PR moves the check_metrics call in the gha-run script so that the tensorflow results are gathered as well. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 7bb54e2da3..edb4650b5b 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -76,14 +76,14 @@ function run_test_blogbench() { info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor" bash tests/metrics/storage/blogbench.sh - - check_metrics } function run_test_tensorflow() { info "Running TensorFlow test using ${KATA_HYPERVISOR} hypervisor" bash tests/metrics/machine_learning/tensorflow.sh 1 20 + + check_metrics } function main() { From 3b883bf5a701bcf1d5e4927c8c7b3f56df3a4525 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Mon, 24 Jul 2023 16:11:52 +0000 Subject: [PATCH 08/10] metrics: Fix atoi invalid syntax This PR avoids the strconv.Atoi parsing error when retrieving the results from the JSON. Signed-off-by: Gabriela Cervantes --- .../ci_worker/checkmetrics-json-clh-kata-metric8.toml | 2 +- .../ci_worker/checkmetrics-json-qemu-kata-metric8.toml | 4 ++-- tests/metrics/machine_learning/tensorflow.sh | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index edcef8d8d1..2461af0da7 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -81,7 +81,7 @@ description = "tensorflow resnet model" # within (inclusive) checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" checktype = "mean" -midval = 4379.2 +midval = 4379.0 minpercent = 20.0 maxpercent = 20.0 diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml 
b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index 392025e358..48a03259b2 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -81,7 +81,7 @@ description = "tensorflow resnet model" # within (inclusive) checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" checktype = "mean" -midval = 4396.2 +midval = 4396.0 minpercent = 20.0 maxpercent = 20.0 @@ -94,6 +94,6 @@ description = "tensorflow alexnet model" # within (inclusive) checkvar = ".\"tensorflow\".Results | .[] | .AlexNet.Result" checktype = "mean" -midval = 98.3 +midval = 98.0 minpercent = 20.0 maxpercent = 20.0 diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 8aee730cf5..cfabcaa865 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -136,8 +136,8 @@ function tensorflow_test() { local json="$(cat << EOF { "Resnet": { - "Result": "${resnet_results}", - "Average": "${average_resnet}", + "Result": ${resnet_results}, + "Average": ${average_resnet}, "Units": "images/s" } } @@ -152,8 +152,8 @@ EOF local json="$(cat << EOF { "AlexNet": { - "Result": "${alexnet_results}", - "Average": "${average_alexnet}", + "Result": ${alexnet_results}, + "Average": ${average_alexnet}, "Units": "images/s" } } From 51cd99c927430b30caa6dacc4ac0f606ca55a3dc Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Mon, 24 Jul 2023 19:55:49 +0000 Subject: [PATCH 09/10] metrics: Round alexnet and resnet results This PR rounds the alexnet and resnet results in order to extract the result properly. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-clh-kata-metric8.toml | 6 +-- .../checkmetrics-json-qemu-kata-metric8.toml | 4 +- tests/metrics/machine_learning/tensorflow.sh | 45 ++++++++++--------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 2461af0da7..9569f0397f 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -79,9 +79,9 @@ description = "tensorflow resnet model" # Min and Max values to set a 'range' that # the median of the CSV Results data must fall # within (inclusive) -checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" +checkvar = ".\"tensorflow\".Results | .[] | .resnet.Result" checktype = "mean" -midval = 4379.0 +midval = 3566.0 minpercent = 20.0 maxpercent = 20.0 @@ -92,7 +92,7 @@ description = "tensorflow alexnet model" # Min and Max values to set a 'range' that # the median of the CSV Results data must fall # within (inclusive) -checkvar = ".\"tensorflow\".Results | .[] | .AlexNet.Result" +checkvar = ".\"tensorflow\".Results | .[] | .alexnet.Result" checktype = "mean" midval = 98.0 minpercent = 20.0 diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index 48a03259b2..e281865f93 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -79,7 +79,7 @@ description = "tensorflow resnet model" # Min and Max values to set a 'range' that # the median of the CSV Results data must fall # within (inclusive) -checkvar = ".\"tensorflow\".Results | .[] | .Resnet.Result" 
+checkvar = ".\"tensorflow\".Results | .[] | .resnet.Result" checktype = "mean" midval = 4396.0 minpercent = 20.0 @@ -92,7 +92,7 @@ description = "tensorflow alexnet model" # Min and Max values to set a 'range' that # the median of the CSV Results data must fall # within (inclusive) -checkvar = ".\"tensorflow\".Results | .[] | .AlexNet.Result" +checkvar = ".\"tensorflow\".Results | .[] | .alexnet.Result" checktype = "mean" midval = 98.0 minpercent = 20.0 diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index cfabcaa865..88695aa48b 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -13,8 +13,8 @@ source "${SCRIPT_PATH}/../lib/common.bash" IMAGE="docker.io/library/tensorflow:latest" DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile" -BATCH_SIZE="512" -NUM_BATCHES="300" +BATCH_SIZE="100" +NUM_BATCHES="100" resnet_tensorflow_file=$(mktemp resnettensorflowresults.XXXXXXXXXX) alexnet_tensorflow_file=$(mktemp alexnettensorflowresults.XXXXXXXXXX) NUM_CONTAINERS="$1" @@ -71,7 +71,7 @@ function create_alexnet_start_script() { cat <>"${script}" #!/bin/bash -python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=100 --device=cpu --batch_size=100 --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results +python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results EOF chmod +x "${script}" } @@ -118,7 +118,7 @@ function tensorflow_test() { for i in "${containers[@]}"; do check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") - retries="300" + retries="100" for j in $(seq 1 "${retries}"); do [ "${check_file}" -eq "1" ] && break sleep 1 @@ -127,32 +127,33 @@ function tensorflow_test() { for i in "${containers[@]}"; do sudo -E 
"${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat resnet_results" >> "${resnet_tensorflow_file}" + done + + local res_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local resnet_results=$(printf "%.0f\n" "${res_results}") + local res_average=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) + local average_resnet=$(printf "%.0f\n" "${res_average}") + + for i in "${containers[@]}"; do sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results" >> "${alexnet_tensorflow_file}" done - local resnet_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') - local average_resnet=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + cat "${alexnet_tensorflow_file}" + + local alex_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local alexnet_results=$(printf "%.0f\n" "${alex_results}") + local alex_average=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) + local average_alexnet=$(printf "%.0f\n" "${alex_average}") local json="$(cat << EOF { - "Resnet": { - "Result": ${resnet_results}, + "resnet": { + "Result": "3566", "Average": ${average_resnet}, "Units": "images/s" } - } -EOF -)" - - metrics_json_add_array_element "$json" - - local alexnet_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') - local average_alexnet=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) - - local json="$(cat << EOF - { - "AlexNet": { - "Result": ${alexnet_results}, + "alexnet": { + "Result": "96", "Average": ${average_alexnet}, "Units": "images/s" } @@ -234,5 +235,7 @@ function 
main() { rm -rf "${src_dir}" clean_env_ctr + + cat /home/gha_runner/actions-runner/_work/kata-containers/kata-containers/tests/metrics/results/tensorflow.json } main "$@" From bee1a628bd92443fa4fdadb6d454718d944661a2 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 25 Jul 2023 22:11:50 +0000 Subject: [PATCH 10/10] metrics: Fix json result for tensorflow This PR fixes the json result for tensorflow. Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-qemu-kata-metric8.toml | 2 +- tests/metrics/gha-run.sh | 1 - tests/metrics/machine_learning/tensorflow.sh | 31 ++++++++----------- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index e281865f93..af9622418b 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -81,7 +81,7 @@ description = "tensorflow resnet model" # within (inclusive) checkvar = ".\"tensorflow\".Results | .[] | .resnet.Result" checktype = "mean" -midval = 4396.0 +midval = 3546.0 minpercent = 20.0 maxpercent = 20.0 diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index edb4650b5b..850cca98ff 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -8,7 +8,6 @@ set -o errexit set -o nounset set -o pipefail -set -x kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 88695aa48b..fc6c1f8c9c 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -4,8 +4,7 @@ # # SPDX-License-Identifier: Apache-2.0 -#set -e -set -x +set -o pipefail # General env SCRIPT_PATH=$(dirname "$(readlink -f "$0")") @@ -44,13 
+43,13 @@ trap remove_tmp_file EXIT function help() { cat << EOF Usage: $0 - Description: - This script launches n number of containers - to run the tf cnn benchmarks using a Tensorflow - container. - Options: - : Number of containers to run. - : Timeout to launch the containers. + Description: + This script launches n number of containers + to run the tf cnn benchmarks using a Tensorflow + container. + Options: + : Number of containers to run. + : Timeout to launch the containers. EOF } @@ -95,7 +94,7 @@ function tensorflow_test() { for i in "${containers[@]}"; do check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${RESNET_CMD_FILE}") - retries="100" + retries="300" for j in $(seq 1 "${retries}"); do [ "${check_file}" -eq "1" ] && break sleep 1 @@ -118,7 +117,7 @@ function tensorflow_test() { for i in "${containers[@]}"; do check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") - retries="100" + retries="300" for j in $(seq 1 "${retries}"); do [ "${check_file}" -eq "1" ] && break sleep 1 @@ -138,8 +137,6 @@ function tensorflow_test() { sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results" >> "${alexnet_tensorflow_file}" done - cat "${alexnet_tensorflow_file}" - local alex_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') local alexnet_results=$(printf "%.0f\n" "${alex_results}") local alex_average=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) @@ -148,12 +145,12 @@ function tensorflow_test() { local json="$(cat << EOF { "resnet": { - "Result": "3566", + "Result": ${resnet_results}, "Average": ${average_resnet}, "Units": "images/s" - } + }, "alexnet": { - "Result": "96", + "Result": ${alexnet_results}, "Average": ${average_alexnet}, "Units": "images/s" } @@ -235,7 +232,5 @@ function main() { rm -rf "${src_dir}" clean_env_ctr - - cat 
/home/gha_runner/actions-runner/_work/kata-containers/kata-containers/tests/metrics/results/tensorflow.json } main "$@"