From 9879709aae781440dc2a4624fe0a501208e6aee7 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:27:11 +0000 Subject: [PATCH 01/10] metrics: Add common functions to the common script This PR adds the collect results function to the common metrics script. Fixes #7617 Signed-off-by: Gabriela Cervantes --- tests/metrics/lib/common.bash | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 4111d67d40..14dabda38f 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -361,3 +361,25 @@ function wait_ksm_settle() done info "Timed out after ${1}s waiting for KSM to settle" } + +function collect_results() { + WORKLOAD=${1} + [[ -z ${WORKLOAD} ]] && die "Container workload is missing" + + local tasks_running=("${containers[@]}") + local retries=100 + + while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do + for i in "${!tasks_running[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh > + + # if the current task is done, remove the corresponding container from the active list + [ "${check_file}" -eq "1" ] && unset 'tasks_running[i]' + done + ((retries--)) + sleep 3 + echo -n "." + done + echo -e "\n" +} + From 286de046af2ad7397a71fdd5cbce4da993b837fd Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:31:23 +0000 Subject: [PATCH 02/10] metrics: Remove collect results function definition This PR removes the collect results function from tensorflow script as it is going to be referenced in the common metrics script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/machine_learning/tensorflow.sh | 21 -------------------- 1 file changed, 21 deletions(-) diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 3473b451ce..7f11319272 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -100,27 +100,6 @@ function launch_workload() { done } -function collect_results() { - WORKLOAD=${1} - [[ -z ${WORKLOAD} ]] && die "Container workload is missing" - - local tasks_running=("${containers[@]}") - local retries=${MAX_RETRIES} - - while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do - for i in "${!tasks_running[@]}"; do - check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh -c "${WORKLOAD}") - - # if the current task is done, remove the corresponding container from the active list - [ "${check_file}" -eq "1" ] && unset 'tasks_running[i]' - done - ((retries--)) - sleep 3 - echo -n "." - done - echo -e "\n" -} - function tensorflow_test() { # Resnet section info "Running TF-Resnet test" From d3e57cf4548a72fcdb3130122b6900728eebdbb4 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:34:30 +0000 Subject: [PATCH 03/10] metrics: Use collect_results function in tensorflow mobilenet test This PR uses the collect results function defined in common for the tensorflow mobilenet test. 
Signed-off-by: Gabriela Cervantes --- .../tensorflow_mobilenet_v1_bfloat16_fp32.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh index d6c1a312a8..e48f75a5a8 100755 --- a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh +++ b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh @@ -87,14 +87,7 @@ function mobilenet_v1_bfloat16_fp32_test() { touch "${host_trigger_file}" info "All containers are running the workload..." - for i in "${containers[@]}"; do - check_file=$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") - retries="30" - for j in $(seq 1 "${retries}"); do - [ "${check_file}" = 1 ] && break - sleep 1 - done - done + collect_results "${CMD_FILE}" for i in "${containers[@]}"; do sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULTS}" >> "${tensorflow_file}" From 1c84680d8c97339b8f18f701d00ad4efcdfe8650 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:39:24 +0000 Subject: [PATCH 04/10] metrics: Add check containers are up in common script This PR adds check containers are up in common script for kata metrics. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/lib/common.bash | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 14dabda38f..56da11417c 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -383,3 +383,17 @@ function collect_results() { echo -e "\n" } +function check_containers_are_up() { + local NUM_CONTAINERS="$1" + [[ -z ${NUM_CONTAINERS} ]] && die "Number of containers is missing" + + local TIMEOUT=60 + local containers_launched=0 + for i in $(seq "${TIMEOUT}") ; do + info "Verify that the containers are running" + containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" + [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break + sleep 1 + [ "${i}" == "${TIMEOUT}" ] && return 1 + done +} From 9d57a1fab4505be931500b8348765a600b7d0630 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:42:09 +0000 Subject: [PATCH 05/10] metrics: Use check containers are up in tensorflow script This PR uses the check containers are up from the common script in the tensorflow script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/machine_learning/tensorflow.sh | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 7f11319272..5a2d7cb1e4 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -150,17 +150,6 @@ EOF metrics_json_end_array "Results" } -function check_containers_are_up() { - local containers_launched=0 - for i in $(seq "${TIMEOUT}") ; do - info "Verify that the containers are running" - containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" - [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break - sleep 1 - [ "${i}" == "${TIMEOUT}" ] && return 1 - done -} - function main() { # Verify enough arguments if [ "$#" -lt 2 ]; then @@ -195,11 +184,11 @@ function main() { metrics_json_start_array # Check that the requested number of containers are running - check_containers_are_up + check_containers_are_up "${NUM_CONTAINERS}" # Check that the requested number of containers are running local timeout_launch="10" - check_containers_are_up & pid=$! + check_containers_are_up "${NUM_CONTAINERS}" & pid=$! (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! if wait "${pid}" 2>/dev/null; then From 918c783084d7b775789f8a725892dd88feffccf0 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 17:44:36 +0000 Subject: [PATCH 06/10] metrics: Add check containers are up in tensorflow mobilenet script This PR adds the check containers are up in the common script in the tensorflow mobilenet script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/lib/common.bash | 10 +++++----- .../tensorflow_mobilenet_v1_bfloat16_fp32.sh | 15 ++------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 56da11417c..a715ea035d 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -363,18 +363,18 @@ function wait_ksm_settle() } function collect_results() { - WORKLOAD=${1} - [[ -z ${WORKLOAD} ]] && die "Container workload is missing" + local WORKLOAD="$1" + [[ -z "${WORKLOAD}" ]] && die "Container workload is missing" local tasks_running=("${containers[@]}") local retries=100 while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do for i in "${!tasks_running[@]}"; do - check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh > + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh -c "${WORKLOAD}") # if the current task is done, remove the corresponding container from the active list - [ "${check_file}" -eq "1" ] && unset 'tasks_running[i]' + [ "${check_file}" = 1 ] && unset 'tasks_running[i]' done ((retries--)) sleep 3 @@ -385,7 +385,7 @@ function collect_results() { function check_containers_are_up() { local NUM_CONTAINERS="$1" - [[ -z ${NUM_CONTAINERS} ]] && die "Number of containers is missing" + [[ -z "${NUM_CONTAINERS}" ]] && die "Number of containers is missing" local TIMEOUT=60 local containers_launched=0 diff --git a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh index e48f75a5a8..aaacc86a4d 100755 --- a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh +++ b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh @@ -109,17 +109,6 @@ EOF metrics_json_end_array "Results" } -function check_containers_are_up() { - local 
containers_launched=0 - for i in $(seq "${TIMEOUT}") ; do - info "Verify that the containers are running" - containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" - [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break - sleep 1 - [ "${i}" == "${TIMEOUT}" ] && return 1 - done -} - function main() { # Verify enough arguments if [ $# != 2 ]; then @@ -153,11 +142,11 @@ function main() { metrics_json_start_array # Check that the requested number of containers are running - check_containers_are_up + check_containers_are_up "${NUM_CONTAINERS}" # Check that the requested number of containers are running local timeout_launch="10" - check_containers_are_up & pid=$! + check_containers_are_up "${NUM_CONTAINERS}" & pid=$! (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! if wait "${pid}" 2>/dev/null; then From 833cf7a684658fcb20ddcf5e80bcb75aa92d03c5 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 20:12:22 +0000 Subject: [PATCH 07/10] metrics: Add check containers are running function This PR adds the check containers are running function to the common metrics script. Signed-off-by: Gabriela Cervantes --- tests/metrics/lib/common.bash | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index a715ea035d..327379a517 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -397,3 +397,21 @@ function check_containers_are_up() { [ "${i}" == "${TIMEOUT}" ] && return 1 done } + +function check_containers_are_running() { + local NUM_CONTAINERS="$1" + [[ -z "${NUM_CONTAINERS}" ]] && die "Number of containers is missing" + + # Check that the requested number of containers are running + local timeout_launch="10" + check_containers_are_up "${NUM_CONTAINERS}" & pid=$! + (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! 
+ + if wait "${pid}" 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi +} From f700f9b0ba02e9dba90091d1c18f0eebef08ed98 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 20:13:37 +0000 Subject: [PATCH 08/10] metrics: Remove unused variable in tensorflow script This PR removes an unused variable in the tensorflow script. Signed-off-by: Gabriela Cervantes --- tests/metrics/machine_learning/tensorflow.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 5a2d7cb1e4..803ee882ef 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -35,7 +35,6 @@ ALEXNET_FILE="alexnet_results" ALEXNET_CHECK_FILE_CMD="cat /${ALEXNET_FILE} | grep 'total images' | wc -l" RESNET_FILE="resnet_results" RESNET_CHECK_FILE_CMD="cat /${RESNET_FILE} | grep 'total images' | wc -l" -MAX_RETRIES=300 function remove_tmp_file() { rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}" From 36337ee146de060e31b948ee339985381930857b Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 20:15:18 +0000 Subject: [PATCH 09/10] metrics: Use check containers are running in tensorflow script This PR uses the check containers are running function from the common script in the tensorflow script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/machine_learning/tensorflow.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh index 803ee882ef..7b33f96546 100755 --- a/tests/metrics/machine_learning/tensorflow.sh +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -186,17 +186,7 @@ function main() { check_containers_are_up "${NUM_CONTAINERS}" # Check that the requested number of containers are running - local timeout_launch="10" - check_containers_are_up "${NUM_CONTAINERS}" & pid=$! - (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! - - if wait "${pid}" 2>/dev/null; then - pkill -HUP -P "${pid_tout}" - wait "${pid_tout}" - else - warn "Time out exceeded" - return 1 - fi + check_containers_are_running "${NUM_CONTAINERS}" # Get the initial number of pids in a single container before the workload starts INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) From fdcd52ff78ff1e043a6b774d89534e2a1c5f01ed Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 10 Aug 2023 20:17:20 +0000 Subject: [PATCH 10/10] metrics: Add check containers are running in tensorflow mobilenet This PR adds check containers are running in tensorflow mobilenet that is being defined in common script. 
Signed-off-by: Gabriela Cervantes --- .../tensorflow_mobilenet_v1_bfloat16_fp32.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh index aaacc86a4d..e451216fa6 100755 --- a/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh +++ b/tests/metrics/machine_learning/tensorflow_mobilenet_v1_bfloat16_fp32.sh @@ -145,17 +145,7 @@ function main() { check_containers_are_up "${NUM_CONTAINERS}" # Check that the requested number of containers are running - local timeout_launch="10" - check_containers_are_up "${NUM_CONTAINERS}" & pid=$! - (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! - - if wait "${pid}" 2>/dev/null; then - pkill -HUP -P "${pid_tout}" - wait "${pid_tout}" - else - warn "Time out exceeded" - return 1 - fi + check_containers_are_running "${NUM_CONTAINERS}" # Get the initial number of pids in a single container before the workload starts INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2)