diff --git a/tests/metrics/README.md b/tests/metrics/README.md index d017ed3fc6..d67904620c 100644 --- a/tests/metrics/README.md +++ b/tests/metrics/README.md @@ -79,6 +79,8 @@ Test relating to measure reading and writing against clusters. Tests relating with TensorFlow and Pytorch implementations of several popular convolutional models. +For further details see the [machine learning tests documentation](machine_learning). + ## Saving Results In order to ensure continuity, and thus testing and historical tracking of results, diff --git a/tests/metrics/machine_learning/README.md b/tests/metrics/machine_learning/README.md new file mode 100644 index 0000000000..8c62b1566f --- /dev/null +++ b/tests/metrics/machine_learning/README.md @@ -0,0 +1,29 @@ +# Kata Containers TensorFlow Metrics + +Kata Containers provides a series of performance tests using the +TensorFlow reference benchmarks (tf_cnn_benchmarks). +The tf_cnn_benchmarks suite contains TensorFlow implementations of several +popular convolutional models. + +## Running the test + +Individual tests can be run by hand, for example: + +``` +$ cd metrics/machine_learning +$ ./tensorflow.sh 25 60 +``` +# Kata Containers Pytorch Metrics + +Kata Containers provides a series of performance tests using Pytorch +benchmarks based on a suite of Python high performance computing +benchmarks. 
#######################################
# Run the pyhpc "equation of state" benchmark in every container and add the
# per-container results and their average to the metrics JSON array.
# Globals:
#   containers            (read) array of container names set up by the caller
#   CTR_EXE               (read) container CLI used for "t exec"
#   CMD_RUN               (read) command that runs the benchmark and writes LOG
#   CMD_FILE              (read) command that counts the 'seconds' lines in LOG
#   CMD_RESULT            (read) command that prints LOG
#   NUM_CONTAINERS        (read) divisor used for the average
#   equation_pytorch_file (read) host file the container logs are appended to
# Outputs:
#   Diagnostics on stderr when a benchmark does not finish or yields no data.
# Returns:
#   0 on success, 1 when a benchmark times out or no results were collected.
#######################################
function equation_of_state_pytorch_test() {
  local -r retries="200"
  local i
  local j
  local check_file
  local finished
  local equation_pytorch_results
  local equation_average_pytorch
  local json

  info "Running Equation of State Pytorch test"
  # Start the benchmark detached so all containers run it concurrently.
  for i in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
  done

  # Wait for every container to finish. The marker count must be re-read on
  # each retry: reading it once before the loop can never observe completion.
  for i in "${containers[@]}"; do
    finished=0
    for ((j = 1; j <= retries; j++)); do
      # A failing exec is treated as "not ready yet" instead of aborting.
      check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") || check_file=""
      check_file="${check_file//[[:space:]]/}"
      if [[ "${check_file}" =~ ^[0-9]+$ ]] && [ "${check_file}" -eq 1 ]; then
        finished=1
        break
      fi
      sleep 1
    done
    if [ "${finished}" -ne 1 ]; then
      echo >&2 "error: Equation of State benchmark did not finish in container ${i} after ${retries} retries"
      return 1
    fi
  done

  for i in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${equation_pytorch_file}"
  done

  # Keep the rows mentioning pytorch (minus the version banner) and take the
  # fourth column of each.
  # NOTE(review): column 4 is presumably the mean time; confirm against the
  # pyhpc-benchmarks output table.
  equation_pytorch_results=$(grep pytorch "${equation_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
  if [ -z "${equation_pytorch_results}" ]; then
    echo >&2 "error: no Equation of State results were collected"
    return 1
  fi
  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  equation_average_pytorch=$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

  json="$(cat << EOF
  {
    "Pytorch Equation of State": {
      "Result": "${equation_pytorch_results}",
      "Average": "${equation_average_pytorch}",
      "Units": "s"
    }
  }
EOF
)"
  metrics_json_add_array_element "$json"
}
#######################################
# Print the usage message for this script.
# Defined here on purpose: without it, calling "help" runs the Bash builtin
# of the same name instead of describing this script.
# Outputs:
#   Usage text on stdout.
#######################################
function help() {
cat << EOF
Usage: $0 <count> <timeout>
  Description:
    This script launches n number of containers
    to run the pyhpc benchmarks using a Pytorch
    container.
  Options:
    <count> : Number of containers to run.
    <timeout> : Timeout to launch the containers.
EOF
}

#######################################
# Entry point: create the containers, run both Pytorch benchmarks and save
# the metrics.
# Globals:
#   NUM_CONTAINERS, TIMEOUT (read) taken from the script arguments
#   IMAGE, DOCKERFILE, CTR_EXE, CTR_RUNTIME, PAYLOAD_ARGS (read)
# Arguments:
#   $1 - number of containers, $2 - launch timeout (only the count of
#   arguments is checked here; the values are read from the globals).
# Returns:
#   Exits 1 on bad arguments or when the containers do not come up.
#######################################
function main() {
  # Verify enough arguments
  if [ "$#" -ne 2 ]; then
    echo >&2 "error: Not enough arguments [$*]"
    help
    exit 1
  fi

  # Both values are used in arithmetic and as a divisor later on, so reject
  # anything that is not a positive integer up front.
  if ! [[ "${NUM_CONTAINERS}" =~ ^[1-9][0-9]*$ && "${TIMEOUT}" =~ ^[1-9][0-9]*$ ]]; then
    echo >&2 "error: container count and timeout must be positive integers [$*]"
    help
    exit 1
  fi

  local i=0
  local containers=()
  local not_started_count="${NUM_CONTAINERS}"
  local -a cmds=("awk" "docker" "bc")

  # Check tools/commands dependencies
  check_cmds "${cmds[@]}"
  check_ctr_images "${IMAGE}" "${DOCKERFILE}"

  init_env
  info "Creating ${NUM_CONTAINERS} containers"

  for ((i = 1; i <= NUM_CONTAINERS; i++)); do
    containers+=("$(random_name)")
    sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
    # Plain assignment: "((var--))" returns non-zero when the old value is 0,
    # which is fragile under "set -e".
    not_started_count=$((not_started_count - 1))
    info "$not_started_count remaining containers"
  done

  metrics_json_init
  metrics_json_start_array

  # Check that the requested number of containers are running. Clean up
  # before bailing out so a failed launch does not leave containers behind.
  # NOTE(review): assumes clean_env_ctr (from common.bash) is safe to call
  # at this point - confirm.
  if ! check_containers_are_up; then
    echo >&2 "error: ${NUM_CONTAINERS} containers were not running after ${TIMEOUT} seconds"
    clean_env_ctr
    exit 1
  fi

  equation_of_state_pytorch_test

  isoneural_pytorch_test

  metrics_json_save

  clean_env_ctr
}
#######################################
# Delete the temporary file that holds the collected benchmark output.
# Registered as the EXIT trap handler by the script.
# Globals:
#   tensorflow_file (read) path created with mktemp at script start
# Returns:
#   0 when the path is unset/empty or the file was removed.
#######################################
function remove_tmp_file() {
  # Nothing to clean up if the temporary file was never created.
  [ -n "${tensorflow_file:-}" ] || return 0
  # The path is a single regular file, so no recursive delete is needed;
  # "--" protects against a path that starts with a dash.
  rm -f -- "${tensorflow_file}"
}
#######################################
# Run the tf_cnn_benchmarks AlexNet model (forward only) in every container
# and add the per-container throughput and its average to the metrics JSON
# array, then close the "Results" array.
# The function name and the "AxelNet" JSON key are kept as they are so that
# existing callers and consumers of the results keep working.
# Globals:
#   containers      (read)  array of container names set up by the caller
#   CTR_EXE         (read)  container CLI used for "t exec"
#   NUM_BATCHES, BATCH_SIZE (read) benchmark parameters
#   CMD_FILE        (read)  command counting 'total images' lines in result
#   CMD_RESULT      (read)  command printing the result file
#   NUM_CONTAINERS  (read)  divisor used for the average
#   tensorflow_file (write) host file holding the collected output; it is
#                           truncated here before collecting
# Returns:
#   0 on success, 1 when a benchmark times out or no results were collected.
#######################################
function axelnet_test() {
  local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result"
  local -r cmd_reset="rm -f benchmarks/scripts/tf_cnn_benchmarks/result"
  local -r retries="200"
  local i
  local j
  local check_file
  local finished
  local axelnet_results
  local average_axelnet
  local json

  info "Running AxelNet Tensorflow test"
  for i in "${containers[@]}"; do
    # Drop the result file of any earlier benchmark first; otherwise its
    # 'total images' line satisfies the completion check below before this
    # run has produced anything.
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${cmd_reset}"
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
  done

  # Wait for every container to finish, re-reading the marker on each retry.
  for i in "${containers[@]}"; do
    finished=0
    for ((j = 1; j <= retries; j++)); do
      # A failing exec is treated as "not ready yet" instead of aborting.
      check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") || check_file=""
      check_file="${check_file//[[:space:]]/}"
      if [[ "${check_file}" =~ ^[0-9]+$ ]] && [ "${check_file}" -eq 1 ]; then
        finished=1
        break
      fi
      sleep 1
    done
    if [ "${finished}" -ne 1 ]; then
      echo >&2 "error: AlexNet benchmark did not finish in container ${i} after ${retries} retries"
      return 1
    fi
  done

  # The host file still contains the output of the previous benchmark, which
  # matches the same "total images/sec" pattern. Start from an empty file so
  # only AlexNet numbers are reported and averaged.
  : > "${tensorflow_file}"
  for i in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
  done

  axelnet_results=$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
  if [ -z "${axelnet_results}" ]; then
    echo >&2 "error: no AlexNet results were collected"
    return 1
  fi
  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

  # The reported value is a throughput (images per second), not a duration.
  json="$(cat << EOF
  {
    "AxelNet": {
      "Result": "${axelnet_results}",
      "Average": "${average_axelnet}",
      "Units": "images/s"
    }
  }
EOF
)"
  metrics_json_add_array_element "$json"
  metrics_json_end_array "Results"
}
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

# Image used by the TensorFlow metrics test: the Intel optimized TensorFlow
# base image plus a checkout of the TensorFlow benchmarks repository.

# Usage: FROM [image name]
FROM intel/intel-optimized-tensorflow:2.9.1

# Version of the Dockerfile
LABEL DOCKERFILE_VERSION="1.0"

# Keep apt from prompting during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install the build tools and fetch the benchmarks in a single layer.
# The apt caches are removed in the same layer so they do not end up in the
# image, and only the latest revision of the benchmarks is cloned because
# the history is not needed to run them.
RUN apt-get update && \
	apt-get install -y --no-install-recommends build-essential git && \
	apt-get remove -y unattended-upgrades && \
	apt-get clean && \
	rm -rf /var/lib/apt/lists/* && \
	git clone --depth 1 https://github.com/tensorflow/benchmarks

CMD ["/bin/bash"]