metrics: General improvements to the tensorflow script

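This PR improves the TensorFlow metrics script: the ResNet and AlexNet workloads are now started from scripts generated on the host and bind-mounted read-only into each container, their results are collected in separate files, the misspelled "AxelNet" label is corrected to "AlexNet", the reported units change from "s" to "images/s", and the check that containers are up is now bounded by a timeout.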

Signed-off-by: Gabriela Cervantes <gabriela.cervantes.tellez@intel.com>
This commit is contained in:
Gabriela Cervantes 2023-07-20 20:35:01 +00:00
parent 63b8534b41
commit 08dfaa97aa
2 changed files with 125 additions and 55 deletions

View File

@ -8,6 +8,7 @@
set -o errexit set -o errexit
set -o nounset set -o nounset
set -o pipefail set -o pipefail
set -x
kata_tarball_dir="${2:-kata-artifacts}" kata_tarball_dir="${2:-kata-artifacts}"
metrics_dir="$(dirname "$(readlink -f "$0")")" metrics_dir="$(dirname "$(readlink -f "$0")")"

View File

@ -4,7 +4,8 @@
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
set -e #set -e
set -x
# General env # General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")") SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
@ -14,16 +15,28 @@ IMAGE="docker.io/library/tensorflow:latest"
DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile" DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile"
BATCH_SIZE="512" BATCH_SIZE="512"
NUM_BATCHES="300" NUM_BATCHES="300"
CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result" resnet_tensorflow_file=$(mktemp resnettensorflowresults.XXXXXXXXXX)
CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l" alexnet_tensorflow_file=$(mktemp alexnettensorflowresults.XXXXXXXXXX)
tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX)
NUM_CONTAINERS="$1" NUM_CONTAINERS="$1"
TIMEOUT="$2" TIMEOUT="$2"
TEST_NAME="tensorflow" TEST_NAME="tensorflow"
PAYLOAD_ARGS="tail -f /dev/null" PAYLOAD_ARGS="tail -f /dev/null"
# Options to bind-mount (read-only) the host directory that holds the workload start scripts
dst_dir="/host"
src_dir=$(mktemp --tmpdir -d tensorflow.XXXXXXXXXX)
MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro"
# CMD points to the script that starts the workload
alexnet_start_script="alexnet_start.sh"
resnet_start_script="resnet_start.sh"
CMD_RESNET="$dst_dir/$resnet_start_script"
CMD_ALEXNET="$dst_dir/$alexnet_start_script"
timeout=600
INITIAL_NUM_PIDS=1
CMD_FILE="cat alexnet_results | grep 'total images' | wc -l"
RESNET_CMD_FILE="cat resnet_results | grep 'total images' | wc -l"
function remove_tmp_file() { function remove_tmp_file() {
rm -rf "${tensorflow_file}" rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}"
} }
trap remove_tmp_file EXIT trap remove_tmp_file EXIT
@ -31,81 +44,117 @@ trap remove_tmp_file EXIT
function help() { function help() {
cat << EOF cat << EOF
Usage: $0 <count> <timeout> Usage: $0 <count> <timeout>
Description: Description:
This script launches n number of containers This script launches n number of containers
to run the tf cnn benchmarks using a Tensorflow to run the tf cnn benchmarks using a Tensorflow
container. container.
Options: Options:
<count> : Number of containers to run. <count> : Number of containers to run.
<timeout> : Timeout to launch the containers. <timeout> : Timeout to launch the containers.
EOF EOF
} }
function resnet50_test() { function create_resnet_start_script() {
local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result" local script="${src_dir}/${resnet_start_script}"
info "Running Resnet50 Tensorflow test" rm -rf "${script}"
cat <<EOF >>"${script}"
#!/bin/bash
python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > resnet_results
EOF
chmod +x "${script}"
}
function create_alexnet_start_script() {
local script="${src_dir}/${alexnet_start_script}"
rm -rf "${script}"
cat <<EOF >>"${script}"
#!/bin/bash
python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=100 --device=cpu --batch_size=100 --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results
EOF
chmod +x "${script}"
}
function tensorflow_test() {
info "Copy Resnet Tensorflow test"
local pids=()
local j=0
for i in "${containers[@]}"; do for i in "${containers[@]}"; do
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}" $(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESNET}")&
pids["${j}"]=$!
((j++))
done
# wait for all pids
for pid in ${pids[*]}; do
wait "${pid}"
done
info "All containers are running the workload..."
for i in "${containers[@]}"; do
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${RESNET_CMD_FILE}")
retries="100"
for j in $(seq 1 "${retries}"); do
[ "${check_file}" -eq "1" ] && break
sleep 1
done
done
info "Copy Alexnet Tensorflow test"
local pids=()
local j=0
for i in "${containers[@]}"; do
$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_ALEXNET}")&
pids["${j}"]=$!
((j++))
done
# wait for all pids
for pid in ${pids[*]}; do
wait "${pid}"
done done
for i in "${containers[@]}"; do for i in "${containers[@]}"; do
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
retries="200" retries="300"
for j in $(seq 1 "${retries}"); do for j in $(seq 1 "${retries}"); do
[ "${check_file}" -eq 1 ] && break [ "${check_file}" -eq "1" ] && break
sleep 1 sleep 1
done done
done done
for i in "${containers[@]}"; do for i in "${containers[@]}"; do
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}" sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat resnet_results" >> "${resnet_tensorflow_file}"
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results" >> "${alexnet_tensorflow_file}"
done done
local resnet50_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') local resnet_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
local average_resnet50=$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) local average_resnet=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
local json="$(cat << EOF local json="$(cat << EOF
{ {
"Resnet50": { "Resnet": {
"Result": "${resnet50_results}", "Result": "${resnet_results}",
"Average": "${average_resnet50}", "Average": "${average_resnet}",
"Units": "s" "Units": "images/s"
} }
} }
EOF EOF
)" )"
metrics_json_add_array_element "$json" metrics_json_add_array_element "$json"
}
function axelnet_test() { local alexnet_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result" local average_alexnet=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
info "Running AxelNet Tensorflow test"
for i in "${containers[@]}"; do
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
done
for i in "${containers[@]}"; do
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
retries="200"
for j in $(seq 1 "${retries}"); do
[ "${check_file}" -eq 1 ] && break
sleep 1
done
done
for i in "${containers[@]}"; do
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
done
local axelnet_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
local average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
local json="$(cat << EOF local json="$(cat << EOF
{ {
"AxelNet": { "AlexNet": {
"Result": "${axelnet_results}", "Result": "${alexnet_results}",
"Average": "${average_axelnet}", "Average": "${average_alexnet}",
"Units": "s" "Units": "images/s"
} }
} }
EOF EOF
@ -143,11 +192,14 @@ function main() {
check_ctr_images "${IMAGE}" "${DOCKERFILE}" check_ctr_images "${IMAGE}" "${DOCKERFILE}"
init_env init_env
create_resnet_start_script
create_alexnet_start_script
info "Creating ${NUM_CONTAINERS} containers" info "Creating ${NUM_CONTAINERS} containers"
for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
containers+=($(random_name)) containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
((not_started_count--)) ((not_started_count--))
info "$not_started_count remaining containers" info "$not_started_count remaining containers"
done done
@ -158,12 +210,29 @@ function main() {
# Check that the requested number of containers are running # Check that the requested number of containers are running
check_containers_are_up check_containers_are_up
resnet50_test # Check that the requested number of containers are running
local timeout_launch="10"
check_containers_are_up & pid=$!
(sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$!
axelnet_test if wait "${pid}" 2>/dev/null; then
pkill -HUP -P "${pid_tout}"
wait "${pid_tout}"
else
warn "Time out exceeded"
return 1
fi
# Get the initial number of pids in a single container before the workload starts
INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2)
((INITIAL_NUM_PIDS++))
tensorflow_test
metrics_json_save metrics_json_save
rm -rf "${src_dir}"
clean_env_ctr clean_env_ctr
} }
main "$@" main "$@"