Merge pull request #7315 from GabyCT/topic/machinelearning

tests: Add machine learning performance tests
This commit is contained in:
David Esparza 2023-07-12 15:57:11 -06:00 committed by GitHub
commit f63673838b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 397 additions and 0 deletions

View File

@ -79,6 +79,8 @@ Test relating to measure reading and writing against clusters.
Tests relating with TensorFlow and Pytorch implementations of several popular Tests relating with TensorFlow and Pytorch implementations of several popular
convolutional models. convolutional models.
For further details see the [machine learning tests documentation](machine_learning).
## Saving Results ## Saving Results
In order to ensure continuity, and thus testing and historical tracking of results, In order to ensure continuity, and thus testing and historical tracking of results,

View File

@ -0,0 +1,29 @@
# Kata Containers Tensorflow Metrics
Kata Containers provides a series of performance tests using the
TensorFlow reference benchmarks (tf_cnn_benchmarks).
The tf_cnn_benchmarks suite contains TensorFlow implementations of several
popular convolutional models.
## Running the test
Individual tests can be run by hand, for example:
```
$ cd metrics/machine_learning
$ ./tensorflow.sh 25 60
```
# Kata Containers Pytorch Metrics
Kata Containers provides a series of performance tests using Pytorch
benchmarks based on a suite of Python high performance computing
benchmarks.
## Running the Pytorch test
Individual tests can be run by hand, for example:
```
$ cd metrics/machine_learning
$ ./pytorch.sh 40 100
```

View File

@ -0,0 +1,160 @@
#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Abort on the first command that fails outside a conditional context.
set -e
# General env
# Absolute directory of this script, with symlinks resolved.
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
# Shared helper library, located relative to this script.
source "${SCRIPT_PATH}/../lib/common.bash"
# Image reference used to start the containers, and the Dockerfile handed to
# check_ctr_images together with it.
IMAGE="docker.io/library/pytorch:latest"
DOCKERFILE="${SCRIPT_PATH}/pytorch_dockerfile/Dockerfile"
# Host-side scratch files, one per benchmark, that collect the logs read back
# from the containers. The mktemp template has no directory part, so the files
# are created in the current working directory; remove_tmp_file deletes them.
equation_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
isoneural_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
# Positional arguments: number of containers to launch and the number of
# one-second polls check_containers_are_up performs before giving up.
NUM_CONTAINERS="$1"
TIMEOUT="$2"
# NOTE(review): not referenced in this script; presumably read by the sourced
# library - confirm.
TEST_NAME="pytorch"
# Commands executed inside each container through "sh -c". Both benchmark
# commands redirect their output to the same LOG file.
CMD_RUN="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/equation_of_state --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
CMD_RUN_ISONEURAL="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/isoneutral_mixing --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
# Prints the benchmark log.
CMD_RESULT="cd pyhpc-benchmarks-3.0 && cat LOG"
# Prints how many lines of LOG contain "seconds"; the test functions treat a
# count of 1 as "benchmark finished".
CMD_FILE="cat pyhpc-benchmarks-3.0/LOG | grep 'seconds' | wc -l"
# Container payload that never exits, keeping the task alive for later execs.
PAYLOAD_ARGS="tail -f /dev/null"
# Delete the two host-side result files created for this run.
# Globals: equation_pytorch_file, isoneural_pytorch_file (read)
function remove_tmp_file() {
  local results_path
  for results_path in "${equation_pytorch_file}" "${isoneural_pytorch_file}"; do
    rm -rf "${results_path}"
  done
}

# Run the cleanup on every exit path of the script.
trap remove_tmp_file EXIT
# Poll the task list until exactly NUM_CONTAINERS tasks are reported as
# RUNNING.
# Globals:  CTR_EXE, NUM_CONTAINERS, TIMEOUT (read)
# Returns:  0 as soon as the expected number of tasks is running,
#           1 if that has not happened after TIMEOUT polls (one second apart).
function check_containers_are_up() {
  local containers_launched=0
  local attempt

  for ((attempt = 1; attempt <= TIMEOUT; attempt++)); do
    info "Verify that the containers are running"
    # grep -c prints 0 but exits with status 1 when nothing matches; without
    # "|| true" that status would abort the whole script under "set -e" the
    # first time no task is running yet.
    containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
    [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && return 0
    sleep 1
  done

  return 1
}
# Run the "equation of state" benchmark in every container and record the
# per-container timings plus their average as one metrics array element.
# Globals:  containers (array of container names set up by the caller),
#           CTR_EXE, CMD_RUN, CMD_FILE, CMD_RESULT, NUM_CONTAINERS,
#           equation_pytorch_file (read; the file is appended to)
# Outputs:  one JSON element passed to metrics_json_add_array_element
function equation_of_state_pytorch_test() {
  local container
  local check_file=""
  local attempt
  local -r retries="200"

  info "Running Equation of State Pytorch test"

  # Start the benchmark detached (-d) in every container so they all run at
  # the same time.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${container}" sh -c "${CMD_RUN}"
  done

  # Wait for each container to finish. The completion check has to be
  # repeated on every iteration; checking once before the loop would either
  # break immediately or sleep through all retries without ever looking again.
  for container in "${containers[@]}"; do
    for ((attempt = 1; attempt <= retries; attempt++)); do
      check_file="$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_FILE}")"
      [ "${check_file}" -eq 1 ] && break
      sleep 1
    done
    # Best effort: keep going so the remaining containers are still collected.
    [ "${check_file}" -eq 1 ] || echo >&2 "warning: ${container} did not report a finished benchmark after ${retries} polls"
  done

  # Gather every container's log into the host-side results file.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_RESULT}" >> "${equation_pytorch_file}"
  done

  # Keep the 4th column of every "pytorch" line except the version banner and
  # join the values with commas.
  local equation_pytorch_results
  equation_pytorch_results="$(grep pytorch "${equation_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')"

  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  local equation_average_pytorch
  equation_average_pytorch="$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)"

  local json="$(cat << EOF
{
"Pytorch Equation of State": {
"Result": "${equation_pytorch_results}",
"Average": "${equation_average_pytorch}",
"Units": "s"
}
}
EOF
)"
  metrics_json_add_array_element "$json"
}
# Run the isoneutral mixing benchmark in every container, record the
# per-container timings plus their average as one metrics array element, and
# close the "Results" array.
# Globals:  containers (array of container names set up by the caller),
#           CTR_EXE, CMD_RUN_ISONEURAL, CMD_FILE, CMD_RESULT, NUM_CONTAINERS,
#           isoneural_pytorch_file (read; the file is appended to)
# Outputs:  one JSON element passed to metrics_json_add_array_element
function isoneural_pytorch_test() {
  local container
  local check_file=""
  local attempt
  local -r retries="200"

  info "Running Isoneural Pytorch test"

  # Start the benchmark detached (-d) in every container so they all run at
  # the same time.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${container}" sh -c "${CMD_RUN_ISONEURAL}"
  done

  # Wait for each container to finish. The completion check has to be
  # repeated on every iteration; checking once before the loop would either
  # break immediately or sleep through all retries without ever looking again.
  for container in "${containers[@]}"; do
    for ((attempt = 1; attempt <= retries; attempt++)); do
      check_file="$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_FILE}")"
      [ "${check_file}" -eq 1 ] && break
      sleep 1
    done
    # Best effort: keep going so the remaining containers are still collected.
    [ "${check_file}" -eq 1 ] || echo >&2 "warning: ${container} did not report a finished benchmark after ${retries} polls"
  done

  # Gather every container's log into the host-side results file.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_RESULT}" >> "${isoneural_pytorch_file}"
  done

  # Keep the 4th column of every "pytorch" line except the version banner and
  # join the values with commas.
  local isoneural_pytorch_results
  isoneural_pytorch_results="$(grep pytorch "${isoneural_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')"

  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  local isoneural_average_pytorch
  isoneural_average_pytorch="$(echo "${isoneural_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)"

  local json="$(cat << EOF
{
"Pytorch Isoneural": {
"Result": "${isoneural_pytorch_results}",
"Average": "${isoneural_average_pytorch}",
"Units": "s"
}
}
EOF
)"
  metrics_json_add_array_element "$json"
  metrics_json_end_array "Results"
}
# Print the usage text. Defined here because this script has no other usage
# function and a bare "help" would otherwise run Bash's builtin of that name.
function help() {
cat << EOF
Usage: $0 <count> <timeout>
   Description:
        This script launches n number of containers
        to run the pyhpc benchmarks using a Pytorch
        container.
   Options:
        <count> : Number of containers to run.
        <timeout> : Timeout to launch the containers.
EOF
}

# Launch NUM_CONTAINERS containers, run both PyTorch benchmarks in them and
# save the collected metrics.
# Arguments: $1 - number of containers, $2 - timeout; the values actually used
#            are the NUM_CONTAINERS and TIMEOUT globals set from them.
# Globals:   NUM_CONTAINERS, TIMEOUT, IMAGE, DOCKERFILE, CTR_EXE, CTR_RUNTIME,
#            PAYLOAD_ARGS (read)
# Returns:   exits with status 1 on a usage error.
function main() {
  # Verify enough arguments
  if [ $# != 2 ]; then
    echo >&2 "error: Not enough arguments [$*]"
    help
    exit 1
  fi

  # Both values drive arithmetic later on (loop bounds, and the divisor of
  # the averages computed by the benchmark functions), so anything that is
  # not a positive integer is rejected up front.
  local value
  for value in "${NUM_CONTAINERS}" "${TIMEOUT}"; do
    if ! [[ "${value}" =~ ^[1-9][0-9]*$ ]]; then
      echo >&2 "error: <count> and <timeout> must be positive integers [$*]"
      help
      exit 1
    fi
  done

  local i=0
  # Read by the benchmark functions called below.
  local containers=()
  local not_started_count="${NUM_CONTAINERS}"

  # Check tools/commands dependencies
  local cmds=("awk" "docker" "bc")
  check_cmds "${cmds[@]}"
  check_ctr_images "${IMAGE}" "${DOCKERFILE}"

  init_env
  info "Creating ${NUM_CONTAINERS} containers"

  for ((i = 1; i <= NUM_CONTAINERS; i++)); do
    containers+=("$(random_name)")
    sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
    # Plain assignment: an arithmetic command evaluating to 0 returns a
    # non-zero status, which is a hazard under "set -e".
    not_started_count=$((not_started_count - 1))
    info "$not_started_count remaining containers"
  done

  metrics_json_init
  metrics_json_start_array

  # Check that the requested number of containers are running
  check_containers_are_up

  equation_of_state_pytorch_test
  isoneural_pytorch_test

  metrics_json_save
  clean_env_ctr
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"

View File

@ -0,0 +1,19 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Usage: FROM [image name]
FROM intel/intel-optimized-pytorch:1.12.100

# Version of the Dockerfile
# NOTE(review): bump this if the test harness uses the label to detect stale
# images - confirm against the image-check helper.
LABEL DOCKERFILE_VERSION="1.0"

# ca-certificates is installed explicitly because --no-install-recommends
# does not pull it in; with it curl can verify the server certificate, so the
# insecure -k flag is not needed. -f makes curl fail on HTTP errors instead of
# saving an error page as the tarball. The apt lists and the tarball are
# removed in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential ca-certificates curl git && \
    apt-get remove -y unattended-upgrades && \
    rm -rf /var/lib/apt/lists/* && \
    curl -fsSLO https://github.com/dionhaefner/pyhpc-benchmarks/archive/refs/tags/v3.0.tar.gz && \
    tar -xf v3.0.tar.gz && \
    rm -f v3.0.tar.gz && \
    pip install --no-cache-dir click==8.1.3 && \
    cd pyhpc-benchmarks-3.0 && pip3 install --no-cache-dir --user torch==1.10.0

CMD ["/bin/bash"]

View File

@ -0,0 +1,169 @@
#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Abort on the first command that fails outside a conditional context.
set -e
# General env
# Absolute directory of this script, with symlinks resolved.
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
# Shared helper library, located relative to this script.
source "${SCRIPT_PATH}/../lib/common.bash"
# Image reference used to start the containers, and the Dockerfile handed to
# check_ctr_images together with it.
IMAGE="docker.io/library/tensorflow:latest"
DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile"
# Values passed to tf_cnn_benchmarks.py as --batch_size and --num_batches.
BATCH_SIZE="512"
NUM_BATCHES="300"
# Commands executed inside each container through "sh -c".
# Prints the benchmark output file.
CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result"
# Prints how many lines of the output file contain "total images"; the test
# functions treat a count of 1 as "benchmark finished".
CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l"
# Host-side scratch file shared by both benchmarks. The mktemp template has no
# directory part, so it is created in the current working directory;
# remove_tmp_file deletes it.
tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX)
# Positional arguments: number of containers to launch and the number of
# one-second polls check_containers_are_up performs before giving up.
NUM_CONTAINERS="$1"
TIMEOUT="$2"
# NOTE(review): not referenced in this script; presumably read by the sourced
# library - confirm.
TEST_NAME="tensorflow"
# Container payload that never exits, keeping the task alive for later execs.
PAYLOAD_ARGS="tail -f /dev/null"
# Delete the host-side results file created for this run.
# Globals: tensorflow_file (read)
function remove_tmp_file() {
  local -r results_path="${tensorflow_file}"
  rm -rf "${results_path}"
}

# Run the cleanup on every exit path of the script.
trap remove_tmp_file EXIT
# Print the usage text of this script on standard output.
function help() {
  local -a usage_lines=(
    "Usage: $0 <count> <timeout>"
    "Description:"
    "This script launches n number of containers"
    "to run the tf cnn benchmarks using a Tensorflow"
    "container."
    "Options:"
    "<count> : Number of containers to run."
    "<timeout> : Timeout to launch the containers."
  )
  printf '%s\n' "${usage_lines[@]}"
}
# Run tf_cnn_benchmarks with its default model in every container and record
# the per-container throughput plus the average as one metrics array element.
# Globals:  containers (array of container names set up by the caller),
#           CTR_EXE, CMD_FILE, CMD_RESULT, BATCH_SIZE, NUM_BATCHES,
#           NUM_CONTAINERS, tensorflow_file (read; the file is rewritten)
# Outputs:  one JSON element passed to metrics_json_add_array_element
function resnet50_test() {
  local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result"
  local container
  local check_file=""
  local attempt
  local -r retries="200"

  info "Running Resnet50 Tensorflow test"

  # Start the benchmark detached (-d) in every container so they all run at
  # the same time.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${container}" sh -c "${CMD_RUN}"
  done

  # Wait for each container to finish. The completion check has to be
  # repeated on every iteration; checking once before the loop would either
  # break immediately or sleep through all retries without ever looking again.
  for container in "${containers[@]}"; do
    for ((attempt = 1; attempt <= retries; attempt++)); do
      check_file="$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_FILE}")"
      [ "${check_file}" -eq 1 ] && break
      sleep 1
    done
    # Best effort: keep going so the remaining containers are still collected.
    [ "${check_file}" -eq 1 ] || echo >&2 "warning: ${container} did not report a finished benchmark after ${retries} polls"
  done

  # The results file is shared between benchmarks; start from an empty file
  # so only this benchmark's output is parsed below.
  : > "${tensorflow_file}"
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
  done

  # Take the value after "total images/sec:" from every container and join
  # the values with commas.
  local resnet50_results
  resnet50_results="$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')"

  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  local average_resnet50
  average_resnet50="$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)"

  # The parsed figure is a throughput (images per second), not a duration.
  local json="$(cat << EOF
{
"Resnet50": {
"Result": "${resnet50_results}",
"Average": "${average_resnet50}",
"Units": "images/s"
}
}
EOF
)"
  metrics_json_add_array_element "$json"
}
# Run tf_cnn_benchmarks with --model=alexnet (forward only) in every
# container, record the per-container throughput plus the average as one
# metrics array element, and close the "Results" array.
# Globals:  containers (array of container names set up by the caller),
#           CTR_EXE, CMD_FILE, CMD_RESULT, BATCH_SIZE, NUM_BATCHES,
#           NUM_CONTAINERS, tensorflow_file (read; the file is rewritten)
# Outputs:  one JSON element passed to metrics_json_add_array_element
function axelnet_test() {
  local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result"
  local container
  local check_file=""
  local attempt
  local -r retries="200"

  info "Running AxelNet Tensorflow test"

  # Start the benchmark detached (-d) in every container so they all run at
  # the same time.
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${container}" sh -c "${CMD_RUN}"
  done

  # Wait for each container to finish. The completion check has to be
  # repeated on every iteration; checking once before the loop would either
  # break immediately or sleep through all retries without ever looking again.
  for container in "${containers[@]}"; do
    for ((attempt = 1; attempt <= retries; attempt++)); do
      check_file="$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_FILE}")"
      [ "${check_file}" -eq 1 ] && break
      sleep 1
    done
    # Best effort: keep going so the remaining containers are still collected.
    [ "${check_file}" -eq 1 ] || echo >&2 "warning: ${container} did not report a finished benchmark after ${retries} polls"
  done

  # The results file is shared with the other benchmark in this script.
  # Empty it first: appending to what an earlier benchmark left behind would
  # mix its figures into this Result and inflate the average, which divides
  # by NUM_CONTAINERS only.
  : > "${tensorflow_file}"
  for container in "${containers[@]}"; do
    sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${container}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
  done

  # Take the value after "total images/sec:" from every container and join
  # the values with commas.
  local axelnet_results
  axelnet_results="$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')"

  # Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
  local average_axelnet
  average_axelnet="$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)"

  # The parsed figure is a throughput (images per second), not a duration.
  local json="$(cat << EOF
{
"AxelNet": {
"Result": "${axelnet_results}",
"Average": "${average_axelnet}",
"Units": "images/s"
}
}
EOF
)"
  metrics_json_add_array_element "$json"
  metrics_json_end_array "Results"
}
# Poll the task list until exactly NUM_CONTAINERS tasks are reported as
# RUNNING.
# Globals:  CTR_EXE, NUM_CONTAINERS, TIMEOUT (read)
# Returns:  0 as soon as the expected number of tasks is running,
#           1 if that has not happened after TIMEOUT polls (one second apart).
function check_containers_are_up() {
  local containers_launched=0
  local attempt

  for ((attempt = 1; attempt <= TIMEOUT; attempt++)); do
    info "Verify that the containers are running"
    # grep -c prints 0 but exits with status 1 when nothing matches; without
    # "|| true" that status would abort the whole script under "set -e" the
    # first time no task is running yet.
    containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
    [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && return 0
    sleep 1
  done

  return 1
}
# Launch NUM_CONTAINERS containers, run both TensorFlow benchmarks in them and
# save the collected metrics.
# Arguments: $1 - number of containers, $2 - timeout; the values actually used
#            are the NUM_CONTAINERS and TIMEOUT globals set from them.
# Globals:   NUM_CONTAINERS, TIMEOUT, IMAGE, DOCKERFILE, CTR_EXE, CTR_RUNTIME,
#            PAYLOAD_ARGS (read)
# Returns:   exits with status 1 on a usage error.
function main() {
  # Verify enough arguments
  if [ $# != 2 ]; then
    echo >&2 "error: Not enough arguments [$*]"
    help
    exit 1
  fi

  # Both values drive arithmetic later on (loop bounds, and the divisor of
  # the averages computed by the benchmark functions), so anything that is
  # not a positive integer is rejected up front.
  local value
  for value in "${NUM_CONTAINERS}" "${TIMEOUT}"; do
    if ! [[ "${value}" =~ ^[1-9][0-9]*$ ]]; then
      echo >&2 "error: <count> and <timeout> must be positive integers [$*]"
      help
      exit 1
    fi
  done

  local i=0
  # Read by the benchmark functions called below.
  local containers=()
  local not_started_count="${NUM_CONTAINERS}"

  # Check tools/commands dependencies
  local cmds=("awk" "docker" "bc")
  check_cmds "${cmds[@]}"
  check_ctr_images "${IMAGE}" "${DOCKERFILE}"

  init_env
  info "Creating ${NUM_CONTAINERS} containers"

  for ((i = 1; i <= NUM_CONTAINERS; i++)); do
    containers+=("$(random_name)")
    sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
    # Plain assignment: an arithmetic command evaluating to 0 returns a
    # non-zero status, which is a hazard under "set -e".
    not_started_count=$((not_started_count - 1))
    info "$not_started_count remaining containers"
  done

  metrics_json_init
  metrics_json_start_array

  # Check that the requested number of containers are running
  check_containers_are_up

  resnet50_test
  axelnet_test

  metrics_json_save
  clean_env_ctr
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"

View File

@ -0,0 +1,18 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Usage: FROM [image name]
FROM intel/intel-optimized-tensorflow:2.9.1

# Version of the Dockerfile
# NOTE(review): bump this if the test harness uses the label to detect stale
# images - confirm against the image-check helper.
LABEL DOCKERFILE_VERSION="1.0"

ENV DEBIAN_FRONTEND=noninteractive

# The apt lists are removed in the same layer to keep the image small, and the
# benchmarks repository is cloned without history because only the working
# tree is used.
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential git && \
    apt-get remove -y unattended-upgrades && \
    rm -rf /var/lib/apt/lists/* && \
    git clone --depth 1 https://github.com/tensorflow/benchmarks

CMD ["/bin/bash"]