mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-08-31 16:36:38 +00:00
Merge pull request #7618 from GabyCT/topic/addfunctionscommon
metrics: Add common functions to the common script
This commit is contained in:
@@ -361,3 +361,57 @@ function wait_ksm_settle()
|
||||
done
|
||||
info "Timed out after ${1}s waiting for KSM to settle"
|
||||
}
|
||||
|
||||
# Poll every container in the global `containers` array until each one
# reports that its workload finished, or until the retry budget is spent.
#
# Globals:   containers (read) - names of the containers to poll
#            CTR_EXE (read)    - container CLI used for `t exec`
# Arguments: $1 - shell command run inside each container via `sh -c`;
#                 the container counts as done when it prints exactly "1"
#            $2 - optional number of polling rounds (default: 100)
# Outputs:   one "." per polling round on stdout, then a blank line
# Returns:   0; a warning is logged if containers were still pending
function collect_results() {
	local WORKLOAD="$1"
	local retries="${2:-100}"
	[[ -z "${WORKLOAD}" ]] && die "Container workload is missing"

	local tasks_running=("${containers[@]}")
	# Keep the loop state out of the caller's scope.
	local i check_file

	while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do
		for i in "${!tasks_running[@]}"; do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh -c "${WORKLOAD}")

			# if the current task is done, remove the corresponding container from the active list
			if [ "${check_file}" = 1 ]; then
				unset 'tasks_running[i]'
			fi
		done
		retries=$((retries - 1))

		# Only pause when another polling round is actually needed.
		if [ "${#tasks_running[@]}" -gt 0 ]; then
			sleep 3
		fi
		echo -n "."
	done
	echo -e "\n"

	# Running out of retries used to be silent; make it visible in the log.
	if [ "${#tasks_running[@]}" -gt 0 ]; then
		warn "Gave up waiting for ${#tasks_running[@]} container(s): ${tasks_running[*]}"
	fi
	return 0
}
|
||||
|
||||
# Poll the task list once per second until the number of RUNNING entries
# equals the requested number of containers.
#
# Globals:   CTR_EXE (read) - container CLI used for `t list`
# Arguments: $1 - expected number of running containers
#            $2 - optional number of one-second polls (default: 60)
# Returns:   0 as soon as the counts match, 1 when the polls are used up
function check_containers_are_up() {
	local NUM_CONTAINERS="$1"
	local TIMEOUT="${2:-60}"
	[[ -z "${NUM_CONTAINERS}" ]] && die "Number of containers is missing"

	local containers_launched=0
	local i
	for ((i = 1; i <= TIMEOUT; i++)); do
		info "Verify that the containers are running"
		# grep -c exits 1 when it counts zero matches; "|| true" keeps that
		# normal "nothing is up yet" case from aborting an errexit caller.
		containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
		[ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && return 0
		sleep 1
	done
	return 1
}
|
||||
|
||||
# Run check_containers_are_up in the background under a watchdog and report
# whether the requested number of containers came up in time.
#
# Arguments: $1 - expected number of running containers
#            $2 - optional watchdog delay passed to sleep (default: 10)
# Returns:   0 when check_containers_are_up succeeded, 1 otherwise
#            (a warning is logged in that case)
function check_containers_are_running() {
	local NUM_CONTAINERS="$1"
	local timeout_launch="${2:-10}"
	[[ -z "${NUM_CONTAINERS}" ]] && die "Number of containers is missing"

	# Keep the job bookkeeping out of the caller's scope.
	local pid pid_tout rc=0

	# Check that the requested number of containers are running
	check_containers_are_up "${NUM_CONTAINERS}" &
	pid=$!

	# Watchdog: HUP the checker once the delay expires. The trap lets us
	# cancel it with a single signal and makes it take its own sleep along,
	# so nothing lingers after this function returns.
	(
		sleep_pid=""
		trap 'kill "${sleep_pid}" 2>/dev/null; exit 0' HUP TERM
		sleep "${timeout_launch}" &
		sleep_pid=$!
		wait "${sleep_pid}" && kill -HUP "${pid}"
	) 2>/dev/null &
	pid_tout=$!

	wait "${pid}" 2>/dev/null || rc=$?

	# Stop the watchdog on every path. Its exit status is deliberately
	# ignored so that a successful check is reported as success.
	kill -HUP "${pid_tout}" 2>/dev/null || true
	wait "${pid_tout}" 2>/dev/null || true

	if [ "${rc}" -ne 0 ]; then
		warn "Time out exceeded"
		return 1
	fi
	return 0
}
|
||||
|
@@ -35,7 +35,6 @@ ALEXNET_FILE="alexnet_results"
|
||||
ALEXNET_CHECK_FILE_CMD="cat /${ALEXNET_FILE} | grep 'total images' | wc -l"
|
||||
RESNET_FILE="resnet_results"
|
||||
RESNET_CHECK_FILE_CMD="cat /${RESNET_FILE} | grep 'total images' | wc -l"
|
||||
MAX_RETRIES=300
|
||||
|
||||
function remove_tmp_file() {
|
||||
rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}"
|
||||
@@ -100,27 +99,6 @@ function launch_workload() {
|
||||
done
|
||||
}
|
||||
|
||||
# Poll every container in the global `containers` array until each one
# reports that its workload finished, or until the retry budget is spent.
#
# Globals:   containers (read)  - names of the containers to poll
#            CTR_EXE (read)     - container CLI used for `t exec`
#            MAX_RETRIES (read) - polling rounds; falls back to 300 if unset
# Arguments: $1 - shell command run inside each container via `sh -c`;
#                 the container counts as done when it prints exactly "1"
# Outputs:   one "." per polling round on stdout, then a blank line
function collect_results() {
	local WORKLOAD="${1}"
	[[ -z "${WORKLOAD}" ]] && die "Container workload is missing"

	local tasks_running=("${containers[@]}")
	# An unset MAX_RETRIES would otherwise break the numeric test below.
	local retries="${MAX_RETRIES:-300}"
	local i check_file

	while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do
		for i in "${!tasks_running[@]}"; do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh -c "${WORKLOAD}")

			# if the current task is done, remove the corresponding container from the active list
			# (string comparison: the exec output may be empty or non-numeric)
			if [ "${check_file}" = "1" ]; then
				unset 'tasks_running[i]'
			fi
		done
		retries=$((retries - 1))
		sleep 3
		echo -n "."
	done
	echo -e "\n"
}
|
||||
|
||||
function tensorflow_test() {
|
||||
# Resnet section
|
||||
info "Running TF-Resnet test"
|
||||
@@ -171,17 +149,6 @@ EOF
|
||||
metrics_json_end_array "Results"
|
||||
}
|
||||
|
||||
# Poll the task list once per second until the number of RUNNING entries
# equals the expected number of containers.
#
# Globals:   CTR_EXE (read)        - container CLI used for `t list`
#            NUM_CONTAINERS (read) - expected count when $1 is not given
#            TIMEOUT (read)        - number of polls; falls back to 60
# Arguments: $1 - optional expected number of running containers
# Returns:   0 as soon as the counts match, 1 when the polls are used up
function check_containers_are_up() {
	local expected="${1:-${NUM_CONTAINERS:-}}"
	# An unset TIMEOUT used to skip the loop entirely and report success.
	local limit="${TIMEOUT:-60}"
	[[ -z "${expected}" ]] && die "Number of containers is missing"

	local containers_launched=0
	local i
	for ((i = 1; i <= limit; i++)); do
		info "Verify that the containers are running"
		# grep -c exits 1 when it counts zero matches; "|| true" keeps that
		# normal "nothing is up yet" case from aborting an errexit caller.
		containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
		[ "${containers_launched}" -eq "${expected}" ] && return 0
		sleep 1
	done
	return 1
}
|
||||
|
||||
function main() {
|
||||
# Verify enough arguments
|
||||
if [ "$#" -lt 2 ]; then
|
||||
@@ -216,20 +183,10 @@ function main() {
|
||||
metrics_json_start_array
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
check_containers_are_up
|
||||
check_containers_are_up "${NUM_CONTAINERS}"
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
local timeout_launch="10"
|
||||
check_containers_are_up & pid=$!
|
||||
(sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$!
|
||||
|
||||
if wait "${pid}" 2>/dev/null; then
|
||||
pkill -HUP -P "${pid_tout}"
|
||||
wait "${pid_tout}"
|
||||
else
|
||||
warn "Time out exceeded"
|
||||
return 1
|
||||
fi
|
||||
check_containers_are_running "${NUM_CONTAINERS}"
|
||||
|
||||
# Get the initial number of pids in a single container before the workload starts
|
||||
INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2)
|
||||
|
@@ -87,14 +87,7 @@ function mobilenet_v1_bfloat16_fp32_test() {
|
||||
touch "${host_trigger_file}"
|
||||
info "All containers are running the workload..."
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
check_file=$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
|
||||
retries="30"
|
||||
for j in $(seq 1 "${retries}"); do
|
||||
[ "${check_file}" = 1 ] && break
|
||||
sleep 1
|
||||
done
|
||||
done
|
||||
collect_results "${CMD_FILE}"
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULTS}" >> "${tensorflow_file}"
|
||||
@@ -116,17 +109,6 @@ EOF
|
||||
metrics_json_end_array "Results"
|
||||
}
|
||||
|
||||
# Poll the task list once per second until the number of RUNNING entries
# equals the expected number of containers.
#
# Globals:   CTR_EXE (read)        - container CLI used for `t list`
#            NUM_CONTAINERS (read) - expected count when $1 is not given
#            TIMEOUT (read)        - number of polls; falls back to 60
# Arguments: $1 - optional expected number of running containers
# Returns:   0 as soon as the counts match, 1 when the polls are used up
function check_containers_are_up() {
	local expected="${1:-${NUM_CONTAINERS:-}}"
	# An unset TIMEOUT used to skip the loop entirely and report success.
	local limit="${TIMEOUT:-60}"
	[[ -z "${expected}" ]] && die "Number of containers is missing"

	local containers_launched=0
	local i
	for ((i = 1; i <= limit; i++)); do
		info "Verify that the containers are running"
		# grep -c exits 1 when it counts zero matches; "|| true" keeps that
		# normal "nothing is up yet" case from aborting an errexit caller.
		containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
		[ "${containers_launched}" -eq "${expected}" ] && return 0
		sleep 1
	done
	return 1
}
|
||||
|
||||
function main() {
|
||||
# Verify enough arguments
|
||||
if [ $# != 2 ]; then
|
||||
@@ -160,20 +142,10 @@ function main() {
|
||||
metrics_json_start_array
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
check_containers_are_up
|
||||
check_containers_are_up "${NUM_CONTAINERS}"
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
local timeout_launch="10"
|
||||
check_containers_are_up & pid=$!
|
||||
(sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$!
|
||||
|
||||
if wait "${pid}" 2>/dev/null; then
|
||||
pkill -HUP -P "${pid_tout}"
|
||||
wait "${pid_tout}"
|
||||
else
|
||||
warn "Time out exceeded"
|
||||
return 1
|
||||
fi
|
||||
check_containers_are_running "${NUM_CONTAINERS}"
|
||||
|
||||
# Get the initial number of pids in a single container before the workload starts
|
||||
INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2)
|
||||
|
Reference in New Issue
Block a user