mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-06-26 15:32:30 +00:00
Merge pull request #7315 from GabyCT/topic/machinelearning
tests: Add machine learning performance tests
This commit is contained in:
commit
f63673838b
@ -79,6 +79,8 @@ Test relating to measure reading and writing against clusters.
|
||||
Tests relating to TensorFlow and Pytorch implementations of several popular
|
||||
convolutional models.
|
||||
|
||||
For further details see the [machine learning tests documentation](machine_learning).
|
||||
|
||||
## Saving Results
|
||||
|
||||
In order to ensure continuity, and thus testing and historical tracking of results,
|
||||
|
29
tests/metrics/machine_learning/README.md
Normal file
29
tests/metrics/machine_learning/README.md
Normal file
@ -0,0 +1,29 @@
|
||||
# Kata Containers Tensorflow Metrics
|
||||
|
||||
Kata Containers provides a series of performance tests using the
|
||||
TensorFlow reference benchmarks (tf_cnn_benchmarks).
|
||||
The tf_cnn_benchmarks contains TensorFlow implementations of several
|
||||
popular convolutional models.
|
||||
|
||||
## Running the test
|
||||
|
||||
Individual tests can be run by hand, for example:
|
||||
|
||||
```
|
||||
$ cd metrics/machine_learning
|
||||
$ ./tensorflow.sh 25 60
|
||||
```
|
||||
# Kata Containers Pytorch Metrics
|
||||
|
||||
Kata Containers provides a series of performance tests using Pytorch
|
||||
benchmarks based on a suite of Python high performance computing
|
||||
benchmarks.
|
||||
|
||||
## Running the Pytorch test
|
||||
|
||||
Individual tests can be run by hand, for example:
|
||||
|
||||
```
|
||||
$ cd metrics/machine_learning
|
||||
$ ./pytorch.sh 40 100
|
||||
```
|
160
tests/metrics/machine_learning/pytorch.sh
Executable file
160
tests/metrics/machine_learning/pytorch.sh
Executable file
@ -0,0 +1,160 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
|
||||
# General env
|
||||
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
|
||||
source "${SCRIPT_PATH}/../lib/common.bash"
|
||||
|
||||
IMAGE="docker.io/library/pytorch:latest"
|
||||
DOCKERFILE="${SCRIPT_PATH}/pytorch_dockerfile/Dockerfile"
|
||||
equation_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
|
||||
isoneural_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
|
||||
NUM_CONTAINERS="$1"
|
||||
TIMEOUT="$2"
|
||||
TEST_NAME="pytorch"
|
||||
CMD_RUN="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/equation_of_state --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
|
||||
CMD_RUN_ISONEURAL="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/isoneutral_mixing --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
|
||||
CMD_RESULT="cd pyhpc-benchmarks-3.0 && cat LOG"
|
||||
CMD_FILE="cat pyhpc-benchmarks-3.0/LOG | grep 'seconds' | wc -l"
|
||||
PAYLOAD_ARGS="tail -f /dev/null"
|
||||
|
||||
function remove_tmp_file() {
|
||||
rm -rf "${equation_pytorch_file}" "${isoneural_pytorch_file}"
|
||||
}
|
||||
|
||||
trap remove_tmp_file EXIT
|
||||
|
||||
function check_containers_are_up() {
|
||||
local containers_launched=0
|
||||
for i in $(seq "${TIMEOUT}") ; do
|
||||
info "Verify that the containers are running"
|
||||
containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")"
|
||||
[ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break
|
||||
sleep 1
|
||||
[ "${i}" == "${TIMEOUT}" ] && return 1
|
||||
done
|
||||
}
|
||||
|
||||
function equation_of_state_pytorch_test() {
|
||||
info "Running Equation of State Pytorch test"
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
|
||||
retries="200"
|
||||
for j in $(seq 1 "${retries}"); do
|
||||
[ "${check_file}" -eq 1 ] && break
|
||||
sleep 1
|
||||
done
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${equation_pytorch_file}"
|
||||
done
|
||||
|
||||
local equation_pytorch_results=$(cat "${equation_pytorch_file}" | grep pytorch | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
|
||||
local equation_average_pytorch=$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
|
||||
|
||||
local json="$(cat << EOF
|
||||
{
|
||||
"Pytorch Equation of State": {
|
||||
"Result": "${equation_pytorch_results}",
|
||||
"Average": "${equation_average_pytorch}",
|
||||
"Units": "s"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
)"
|
||||
metrics_json_add_array_element "$json"
|
||||
|
||||
}
|
||||
|
||||
function isoneural_pytorch_test() {
|
||||
info "Running Isoneural Pytorch test"
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN_ISONEURAL}"
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
|
||||
retries="200"
|
||||
for j in $(seq 1 "${retries}"); do
|
||||
[ "${check_file}" -eq 1 ] && break
|
||||
sleep 1
|
||||
done
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${isoneural_pytorch_file}"
|
||||
done
|
||||
|
||||
local isoneural_pytorch_results=$(cat "${isoneural_pytorch_file}" | grep pytorch | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
|
||||
local isoneural_average_pytorch=$(echo "${isoneural_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
|
||||
|
||||
local json="$(cat << EOF
|
||||
{
|
||||
"Pytorch Isoneural": {
|
||||
"Result": "${isoneural_pytorch_results}",
|
||||
"Average": "${isoneural_average_pytorch}",
|
||||
"Units": "s"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
)"
|
||||
metrics_json_add_array_element "$json"
|
||||
metrics_json_end_array "Results"
|
||||
|
||||
}
|
||||
|
||||
|
||||
function main() {
|
||||
# Verify enough arguments
|
||||
if [ $# != 2 ]; then
|
||||
echo >&2 "error: Not enough arguments [$@]"
|
||||
help
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local i=0
|
||||
local containers=()
|
||||
local not_started_count="${NUM_CONTAINERS}"
|
||||
|
||||
# Check tools/commands dependencies
|
||||
cmds=("awk" "docker" "bc")
|
||||
check_cmds "${cmds[@]}"
|
||||
check_ctr_images "${IMAGE}" "${DOCKERFILE}"
|
||||
|
||||
init_env
|
||||
info "Creating ${NUM_CONTAINERS} containers"
|
||||
|
||||
for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
|
||||
containers+=($(random_name))
|
||||
sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
|
||||
((not_started_count--))
|
||||
info "$not_started_count remaining containers"
|
||||
done
|
||||
|
||||
metrics_json_init
|
||||
metrics_json_start_array
|
||||
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
check_containers_are_up
|
||||
|
||||
equation_of_state_pytorch_test
|
||||
|
||||
isoneural_pytorch_test
|
||||
|
||||
metrics_json_save
|
||||
|
||||
clean_env_ctr
|
||||
|
||||
}
|
||||
main "$@"
|
19
tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile
Normal file
19
tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile
Normal file
@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Usage: FROM [image name]
|
||||
FROM intel/intel-optimized-pytorch:1.12.100
|
||||
|
||||
# Version of the Dockerfile
|
||||
LABEL DOCKERFILE_VERSION="1.0"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends build-essential curl git && \
|
||||
apt-get remove -y unattended-upgrades && \
|
||||
curl -OkL https://github.com/dionhaefner/pyhpc-benchmarks/archive/refs/tags/v3.0.tar.gz && \
|
||||
tar -xf v3.0.tar.gz && \
|
||||
pip install --no-cache-dir click==8.1.3 && \
|
||||
cd pyhpc-benchmarks-3.0 && pip3 install --no-cache-dir --user torch==1.10.0
|
||||
|
||||
CMD ["/bin/bash"]
|
169
tests/metrics/machine_learning/tensorflow.sh
Executable file
169
tests/metrics/machine_learning/tensorflow.sh
Executable file
@ -0,0 +1,169 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
|
||||
# General env
|
||||
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
|
||||
source "${SCRIPT_PATH}/../lib/common.bash"
|
||||
|
||||
IMAGE="docker.io/library/tensorflow:latest"
|
||||
DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile"
|
||||
BATCH_SIZE="512"
|
||||
NUM_BATCHES="300"
|
||||
CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result"
|
||||
CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l"
|
||||
tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX)
|
||||
NUM_CONTAINERS="$1"
|
||||
TIMEOUT="$2"
|
||||
TEST_NAME="tensorflow"
|
||||
PAYLOAD_ARGS="tail -f /dev/null"
|
||||
|
||||
function remove_tmp_file() {
|
||||
rm -rf "${tensorflow_file}"
|
||||
}
|
||||
|
||||
trap remove_tmp_file EXIT
|
||||
|
||||
function help() {
|
||||
cat << EOF
|
||||
Usage: $0 <count> <timeout>
|
||||
Description:
|
||||
This script launches n number of containers
|
||||
to run the tf cnn benchmarks using a Tensorflow
|
||||
container.
|
||||
Options:
|
||||
<count> : Number of containers to run.
|
||||
<timeout> : Timeout to launch the containers.
|
||||
EOF
|
||||
}
|
||||
|
||||
function resnet50_test() {
|
||||
local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result"
|
||||
info "Running Resnet50 Tensorflow test"
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
|
||||
retries="200"
|
||||
for j in $(seq 1 "${retries}"); do
|
||||
[ "${check_file}" -eq 1 ] && break
|
||||
sleep 1
|
||||
done
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
|
||||
done
|
||||
|
||||
local resnet50_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
|
||||
local average_resnet50=$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
|
||||
|
||||
local json="$(cat << EOF
|
||||
{
|
||||
"Resnet50": {
|
||||
"Result": "${resnet50_results}",
|
||||
"Average": "${average_resnet50}",
|
||||
"Units": "s"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
)"
|
||||
metrics_json_add_array_element "$json"
|
||||
}
|
||||
|
||||
function axelnet_test() {
|
||||
local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result"
|
||||
info "Running AxelNet Tensorflow test"
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
|
||||
retries="200"
|
||||
for j in $(seq 1 "${retries}"); do
|
||||
[ "${check_file}" -eq 1 ] && break
|
||||
sleep 1
|
||||
done
|
||||
done
|
||||
|
||||
for i in "${containers[@]}"; do
|
||||
sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
|
||||
done
|
||||
|
||||
local axelnet_results=$(cat "${tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
|
||||
local average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
|
||||
|
||||
local json="$(cat << EOF
|
||||
{
|
||||
"AxelNet": {
|
||||
"Result": "${axelnet_results}",
|
||||
"Average": "${average_axelnet}",
|
||||
"Units": "s"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
)"
|
||||
metrics_json_add_array_element "$json"
|
||||
metrics_json_end_array "Results"
|
||||
}
|
||||
|
||||
function check_containers_are_up() {
|
||||
local containers_launched=0
|
||||
for i in $(seq "${TIMEOUT}") ; do
|
||||
info "Verify that the containers are running"
|
||||
containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")"
|
||||
[ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break
|
||||
sleep 1
|
||||
[ "${i}" == "${TIMEOUT}" ] && return 1
|
||||
done
|
||||
}
|
||||
|
||||
function main() {
|
||||
# Verify enough arguments
|
||||
if [ $# != 2 ]; then
|
||||
echo >&2 "error: Not enough arguments [$@]"
|
||||
help
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local i=0
|
||||
local containers=()
|
||||
local not_started_count="${NUM_CONTAINERS}"
|
||||
|
||||
# Check tools/commands dependencies
|
||||
cmds=("awk" "docker" "bc")
|
||||
check_cmds "${cmds[@]}"
|
||||
check_ctr_images "${IMAGE}" "${DOCKERFILE}"
|
||||
|
||||
init_env
|
||||
info "Creating ${NUM_CONTAINERS} containers"
|
||||
|
||||
for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
|
||||
containers+=($(random_name))
|
||||
sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
|
||||
((not_started_count--))
|
||||
info "$not_started_count remaining containers"
|
||||
done
|
||||
|
||||
metrics_json_init
|
||||
metrics_json_start_array
|
||||
|
||||
# Check that the requested number of containers are running
|
||||
check_containers_are_up
|
||||
|
||||
resnet50_test
|
||||
|
||||
axelnet_test
|
||||
|
||||
metrics_json_save
|
||||
|
||||
clean_env_ctr
|
||||
}
|
||||
main "$@"
|
@ -0,0 +1,18 @@
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Usage: FROM [image name]
|
||||
FROM intel/intel-optimized-tensorflow:2.9.1
|
||||
|
||||
# Version of the Dockerfile
|
||||
LABEL DOCKERFILE_VERSION="1.0"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends build-essential git && \
|
||||
apt-get remove -y unattended-upgrades && \
|
||||
git clone https://github.com/tensorflow/benchmarks
|
||||
|
||||
CMD ["/bin/bash"]
|
Loading…
Reference in New Issue
Block a user