Merge pull request #7315 from GabyCT/topic/machinelearning

tests: Add machine learning performance tests
2025-08-15 22:53:43 +00:00 · 2023-07-12 15:57:11 -06:00 · 2023-07-12 15:57:11 -06:00 · f63673838b
commit f63673838b
parent 3f38f75918 7f961461bd
6 changed files with 397 additions and 0 deletions
--- a/tests/metrics/README.md
+++ b/tests/metrics/README.md
@ -79,6 +79,8 @@ Test relating to measure reading and writing against clusters.
 Tests relating to TensorFlow and Pytorch implementations of several popular
 convolutional models.
 For further details see the [machine learning tests documentation](machine_learning).
 ## Saving Results
 In order to ensure continuity, and thus testing and historical tracking of results,
--- a/tests/metrics/machine_learning/README.md
+++ b/tests/metrics/machine_learning/README.md
@ -0,0 +1,29 @@
 # Kata Containers Tensorflow Metrics
 Kata Containers provides a series of performance tests using the
 TensorFlow reference benchmarks (tf_cnn_benchmarks).
 The tf_cnn_benchmarks suite contains TensorFlow implementations of several
 popular convolutional models.
 ## Running the test
 Individual tests can be run by hand, for example:
 ```
 $ cd metrics/machine_learning
 $ ./tensorflow.sh 25 60
 ```
 # Kata Containers Pytorch Metrics
 Kata Containers provides a series of performance tests using Pytorch
 benchmarks based on a suite of Python high performance computing
 benchmarks.
 ## Running the Pytorch test
 Individual tests can be run by hand, for example:
 ```
 $ cd metrics/machine_learning
 $ ./pytorch.sh 40 100
 ```
--- a/tests/metrics/machine_learning/pytorch.sh
+++ b/tests/metrics/machine_learning/pytorch.sh
@ -0,0 +1,160 @@
 #!/bin/bash
 #
 # Copyright (c) 2023 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 # Abort the script as soon as an unchecked command exits non-zero.
 set -e
 # General env
 SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
 # Shared helper library, located relative to this script's own directory.
 source "${SCRIPT_PATH}/../lib/common.bash"
 # Image reference and Dockerfile that main hands to check_ctr_images.
 IMAGE="docker.io/library/pytorch:latest"
 DOCKERFILE="${SCRIPT_PATH}/pytorch_dockerfile/Dockerfile"
 # Scratch files that collect the raw benchmark logs of all containers. The
 # template has no directory part, so mktemp creates them in the current
 # working directory; remove_tmp_file deletes them when the script exits.
 equation_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
 isoneural_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
 # Positional arguments: how many containers to launch, and how many
 # one-second polls check_containers_are_up may spend waiting for them.
 NUM_CONTAINERS="$1"
 TIMEOUT="$2"
 # Not referenced again in this script; presumably read by the sourced
 # helpers - TODO confirm.
 TEST_NAME="pytorch"
 # Benchmark commands executed inside each container. Both runs redirect their
 # output to the same file, pyhpc-benchmarks-3.0/LOG.
 CMD_RUN="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/equation_of_state --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
 CMD_RUN_ISONEURAL="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/isoneutral_mixing  --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
 # Prints the benchmark log of a container.
 CMD_RESULT="cd pyhpc-benchmarks-3.0 && cat LOG"
 # Prints how many lines of the log contain 'seconds'; the tests treat a count
 # of 1 as "benchmark finished".
 CMD_FILE="cat pyhpc-benchmarks-3.0/LOG | grep 'seconds' | wc -l"
 # Main process of every container: blocks forever so the container stays up
 # while the benchmarks are exec'ed into it.
 PAYLOAD_ARGS="tail -f /dev/null"
 # Delete both scratch files that hold the collected benchmark logs.
 # Registered below as the EXIT handler, so it runs on every exit path.
 function remove_tmp_file() {
 	local scratch_files=("${equation_pytorch_file}" "${isoneural_pytorch_file}")
 	rm -rf "${scratch_files[@]}"
 }
 trap remove_tmp_file EXIT
 # Poll the container runtime until exactly NUM_CONTAINERS tasks are RUNNING.
 # Globals:   CTR_EXE (read), NUM_CONTAINERS (read),
 #            TIMEOUT (read) - number of polls, spaced one second apart
 # Arguments: none
 # Returns:   0 once the RUNNING count matches NUM_CONTAINERS,
 #            1 when TIMEOUT polls elapsed without a match
 function check_containers_are_up() {
 	local containers_launched=0
 	local i
 	for i in $(seq "${TIMEOUT}") ; do
 		info "Verify that the containers are running"
 		# grep -c prints 0 but exits 1 when nothing matches. Without the
 		# '|| true' that status would become the status of the assignment and
 		# 'set -e' would kill the script on the first poll instead of retrying.
 		containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
 		[ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break
 		sleep 1
 		[ "${i}" == "${TIMEOUT}" ] && return 1
 	done
 }
 # Run the pyhpc "equation of state" benchmark in every container and add the
 # per-container results plus their average to the metrics JSON array.
 # Globals:   containers (read; array declared by main), CTR_EXE, CMD_RUN,
 #            CMD_FILE, CMD_RESULT, NUM_CONTAINERS (read),
 #            equation_pytorch_file (appended to)
 # Arguments: none
 function equation_of_state_pytorch_test() {
 	local retries="200"
 	local i j check_file
 	info "Running Equation of State Pytorch test"
 	# Start the benchmark detached in every container so all of them run at
 	# the same time.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
 	done
 	# Wait for each container's log to contain its result line. The log is
 	# re-read on every attempt; reading it only once before the loop would
 	# make the loop compare the same stale value for all ${retries} attempts.
 	for i in "${containers[@]}"; do
 		for j in $(seq 1 "${retries}"); do
 			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
 			[ "${check_file}" -eq 1 ] && break
 			sleep 1
 		done
 	done
 	# Collect the raw logs of all containers into the scratch file.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}"  >> "${equation_pytorch_file}"
 	done
 	# Keep the 4th column of every "pytorch" line (minus the version banner)
 	# and join the values with commas.
 	local equation_pytorch_results
 	equation_pytorch_results=$(grep pytorch "${equation_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
 	# Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
 	local equation_average_pytorch
 	equation_average_pytorch=$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
 	local json="$(cat << EOF
 	{
 		"Pytorch Equation of State": {
 			"Result": "${equation_pytorch_results}",
 			"Average": "${equation_average_pytorch}",
 			"Units": "s"
 		}
 	}
 EOF
 )"
 	metrics_json_add_array_element "$json"
 }
 # Run the pyhpc "isoneutral mixing" benchmark in every container, add the
 # per-container results plus their average to the metrics JSON array and
 # close the "Results" array.
 # Globals:   containers (read; array declared by main), CTR_EXE,
 #            CMD_RUN_ISONEURAL, CMD_FILE, CMD_RESULT, NUM_CONTAINERS (read),
 #            isoneural_pytorch_file (appended to)
 # Arguments: none
 # NOTE(review): CMD_RUN_ISONEURAL writes to the same LOG file as the equation
 # of state run, so a poll that happens before the detached command truncates
 # LOG could still see the previous result line - confirm whether that window
 # matters in practice.
 function isoneural_pytorch_test() {
 	local retries="200"
 	local i j check_file
 	info "Running Isoneural Pytorch test"
 	# Start the benchmark detached in every container so all of them run at
 	# the same time.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN_ISONEURAL}"
 	done
 	# Wait for each container's log to contain its result line. The log is
 	# re-read on every attempt; reading it only once before the loop would
 	# make the loop compare the same stale value for all ${retries} attempts.
 	for i in "${containers[@]}"; do
 		for j in $(seq 1 "${retries}"); do
 			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
 			[ "${check_file}" -eq 1 ] && break
 			sleep 1
 		done
 	done
 	# Collect the raw logs of all containers into the scratch file.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}"  >> "${isoneural_pytorch_file}"
 	done
 	# Keep the 4th column of every "pytorch" line (minus the version banner)
 	# and join the values with commas.
 	local isoneural_pytorch_results
 	isoneural_pytorch_results=$(grep pytorch "${isoneural_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
 	# Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
 	local isoneural_average_pytorch
 	isoneural_average_pytorch=$(echo "${isoneural_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
 	local json="$(cat << EOF
 	{
 		"Pytorch Isoneural": {
 			"Result": "${isoneural_pytorch_results}",
 			"Average": "${isoneural_average_pytorch}",
 			"Units": "s"
 		}
 	}
 EOF
 )"
 	metrics_json_add_array_element "$json"
 	metrics_json_end_array "Results"
 }
 # Print the command-line usage of this script to stdout.
 # Defined here because main calls 'help' on bad arguments; without a function
 # of that name bash would run its builtin 'help' instead.
 function help() {
 	printf 'Usage: %s <count> <timeout>\n' "$0"
 	printf '\tDescription:\n'
 	printf '\t\tThis script launches n number of containers\n'
 	printf '\t\tto run the pyhpc benchmarks using a Pytorch\n'
 	printf '\t\tcontainer.\n'
 	printf '\tOptions:\n'
 	printf '\t\t<count> : Number of containers to run.\n'
 	printf '\t\t<timeout> : Timeout to launch the containers.\n'
 }
 # Entry point: validate the arguments, launch NUM_CONTAINERS containers, run
 # both Pytorch benchmarks in them and save the collected metrics.
 # Arguments: $1 - number of containers, $2 - launch timeout
 # Exits with status 1 when the argument count is not exactly two.
 function main() {
 	# Verify enough arguments
 	if [ $# != 2 ]; then
 		echo >&2 "error: Not enough arguments [$*]"
 		help
 		exit 1
 	fi
 	local i=0
 	# Names of the launched containers; the test functions read this array.
 	local containers=()
 	local not_started_count="${NUM_CONTAINERS}"
 	# Check tools/commands dependencies
 	local cmds=("awk" "docker" "bc")
 	check_cmds "${cmds[@]}"
 	check_ctr_images "${IMAGE}" "${DOCKERFILE}"
 	init_env
 	info "Creating ${NUM_CONTAINERS} containers"
 	for ((i=1; i<= NUM_CONTAINERS; i++)); do
 		containers+=("$(random_name)")
 		# Each container only runs PAYLOAD_ARGS; the benchmarks are exec'ed
 		# into it later by the test functions.
 		sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
 		not_started_count=$((not_started_count - 1))
 		info "$not_started_count remaining containers"
 	done
 	metrics_json_init
 	metrics_json_start_array
 	# Check that the requested number of containers are running
 	check_containers_are_up
 	equation_of_state_pytorch_test
 	isoneural_pytorch_test
 	metrics_json_save
 	clean_env_ctr
 }
 main "$@"
--- a/tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile
+++ b/tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile
@ -0,0 +1,19 @@
 # Copyright (c) 2023 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 # Image for the Pytorch metrics test: the Intel Pytorch base image plus the
 # pyhpc-benchmarks v3.0 sources unpacked into pyhpc-benchmarks-3.0.
 # Usage: FROM [image name]
 FROM intel/intel-optimized-pytorch:1.12.100
 # Version of the Dockerfile
 # NOTE(review): bump this if the test harness decides on image rebuilds from
 # this label - confirm against check_ctr_images.
 LABEL DOCKERFILE_VERSION="1.0"
 # Stop apt from prompting for input during the build.
 ENV DEBIAN_FRONTEND=noninteractive
 # ca-certificates is installed explicitly so that curl can verify the TLS
 # certificate of the download host instead of skipping verification (-k).
 # curl -f turns an HTTP error into a failed build rather than saving the
 # error page under the tarball's name.
 RUN apt-get update && \
 	apt-get install -y --no-install-recommends build-essential ca-certificates curl git && \
 	apt-get remove -y unattended-upgrades && \
 	curl -fOL https://github.com/dionhaefner/pyhpc-benchmarks/archive/refs/tags/v3.0.tar.gz  && \
 	tar -xf v3.0.tar.gz && \
 	pip install --no-cache-dir click==8.1.3 && \
 	cd pyhpc-benchmarks-3.0 && pip3 install --no-cache-dir --user torch==1.10.0
 CMD ["/bin/bash"]
--- a/tests/metrics/machine_learning/tensorflow.sh
+++ b/tests/metrics/machine_learning/tensorflow.sh
@ -0,0 +1,169 @@
 #!/bin/bash
 #
 # Copyright (c) 2023 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 # Abort the script as soon as an unchecked command exits non-zero.
 set -e
 # General env
 SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
 # Shared helper library, located relative to this script's own directory.
 source "${SCRIPT_PATH}/../lib/common.bash"
 # Image reference and Dockerfile that main hands to check_ctr_images.
 IMAGE="docker.io/library/tensorflow:latest"
 DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile"
 # Passed to tf_cnn_benchmarks.py as --batch_size and --num_batches.
 BATCH_SIZE="512"
 NUM_BATCHES="300"
 # Prints the benchmark output file of a container.
 CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result"
 # Prints how many lines of the output file contain 'total images'; the tests
 # treat a count of 1 as "benchmark finished".
 CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l"
 # Scratch file that collects the raw benchmark output of all containers. The
 # template has no directory part, so mktemp creates it in the current working
 # directory; remove_tmp_file deletes it when the script exits.
 tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX)
 # Positional arguments: how many containers to launch, and how many
 # one-second polls check_containers_are_up may spend waiting for them.
 NUM_CONTAINERS="$1"
 TIMEOUT="$2"
 # Not referenced again in this script; presumably read by the sourced
 # helpers - TODO confirm.
 TEST_NAME="tensorflow"
 # Main process of every container: blocks forever so the container stays up
 # while the benchmarks are exec'ed into it.
 PAYLOAD_ARGS="tail -f /dev/null"
 # Delete the scratch file that holds the collected benchmark output.
 # Registered below as the EXIT handler, so it runs on every exit path.
 function remove_tmp_file() {
 	local scratch_file="${tensorflow_file}"
 	rm -rf "${scratch_file}"
 }
 trap remove_tmp_file EXIT
 # Print the usage text of this script to stdout: the invocation line (built
 # from $0), a short description and the meaning of the two positional
 # arguments. main calls it when the argument count is wrong.
 function help() {
 cat << EOF
 Usage: $0 <count> <timeout>
 	Description:
 		This script launches n number of containers
 		to run the tf cnn benchmarks using a Tensorflow
 		container.
 	Options:
 		<count> : Number of containers to run.
 		<timeout> : Timeout to launch the containers.
 EOF
 }
 # Run tf_cnn_benchmarks in every container and add the per-container
 # "total images/sec" values plus their average to the metrics JSON array
 # under the "Resnet50" key.
 # Globals:   containers (read; array declared by main), CTR_EXE, CMD_FILE,
 #            CMD_RESULT, BATCH_SIZE, NUM_BATCHES, NUM_CONTAINERS (read),
 #            tensorflow_file (truncated, then appended to)
 # Arguments: none
 # NOTE(review): the command passes no --model flag, so tf_cnn_benchmarks uses
 # its built-in default model - confirm that this default is ResNet-50.
 function resnet50_test() {
 	local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result"
 	local retries="200"
 	local i j check_file
 	info "Running Resnet50 Tensorflow test"
 	# Start the benchmark detached in every container so all of them run at
 	# the same time.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
 	done
 	# Wait for each container's output to contain its result line. The file is
 	# re-read on every attempt; reading it only once before the loop would
 	# make the loop compare the same stale value for all ${retries} attempts.
 	for i in "${containers[@]}"; do
 		for j in $(seq 1 "${retries}"); do
 			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
 			[ "${check_file}" -eq 1 ] && break
 			sleep 1
 		done
 	done
 	# Start from an empty scratch file so only this test's output is parsed.
 	: > "${tensorflow_file}"
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}"  >> "${tensorflow_file}"
 	done
 	# Take the value after the colon of every "total images/sec" line and
 	# join the values with commas.
 	local resnet50_results
 	resnet50_results=$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
 	# Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
 	local average_resnet50
 	average_resnet50=$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
 	# The values are throughputs (images per second), not durations.
 	local json="$(cat << EOF
 	{
 		"Resnet50": {
 			"Result": "${resnet50_results}",
 			"Average": "${average_resnet50}",
 			"Units": "images/s"
 		}
 	}
 EOF
 )"
 	metrics_json_add_array_element "$json"
 }
 # Run tf_cnn_benchmarks with --model=alexnet (forward pass only) in every
 # container, add the per-container "total images/sec" values plus their
 # average to the metrics JSON array and close the "Results" array.
 # Globals:   containers (read; array declared by main), CTR_EXE, CMD_FILE,
 #            CMD_RESULT, BATCH_SIZE, NUM_BATCHES, NUM_CONTAINERS (read),
 #            tensorflow_file (truncated, then appended to)
 # Arguments: none
 function axelnet_test() {
 	local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result"
 	local retries="200"
 	local i j check_file
 	info "Running AxelNet Tensorflow test"
 	# Start the benchmark detached in every container so all of them run at
 	# the same time.
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
 	done
 	# Wait for each container's output to contain its result line. The file is
 	# re-read on every attempt; reading it only once before the loop would
 	# make the loop compare the same stale value for all ${retries} attempts.
 	for i in "${containers[@]}"; do
 		for j in $(seq 1 "${retries}"); do
 			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
 			[ "${check_file}" -eq 1 ] && break
 			sleep 1
 		done
 	done
 	# The scratch file is shared with resnet50_test. Empty it first, otherwise
 	# the earlier test's lines are parsed again and both Result and Average
 	# mix the two benchmarks.
 	: > "${tensorflow_file}"
 	for i in "${containers[@]}"; do
 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}"  >> "${tensorflow_file}"
 	done
 	# Take the value after the colon of every "total images/sec" line and
 	# join the values with commas.
 	local axelnet_results
 	axelnet_results=$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
 	# Turn "a,b,c" into "(a+b+c)/NUM_CONTAINERS" and let bc evaluate it.
 	local average_axelnet
 	average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
 	# The values are throughputs (images per second), not durations.
 	local json="$(cat << EOF
 	{
 		"AxelNet": {
 			"Result": "${axelnet_results}",
 			"Average": "${average_axelnet}",
 			"Units": "images/s"
 		}
 	}
 EOF
 )"
 	metrics_json_add_array_element "$json"
 	metrics_json_end_array "Results"
 }
 # Poll the container runtime until exactly NUM_CONTAINERS tasks are RUNNING.
 # Globals:   CTR_EXE (read), NUM_CONTAINERS (read),
 #            TIMEOUT (read) - number of polls, spaced one second apart
 # Arguments: none
 # Returns:   0 once the RUNNING count matches NUM_CONTAINERS,
 #            1 when TIMEOUT polls elapsed without a match
 function check_containers_are_up() {
 	local containers_launched=0
 	local i
 	for i in $(seq "${TIMEOUT}") ; do
 		info "Verify that the containers are running"
 		# grep -c prints 0 but exits 1 when nothing matches. Without the
 		# '|| true' that status would become the status of the assignment and
 		# 'set -e' would kill the script on the first poll instead of retrying.
 		containers_launched="$(sudo "${CTR_EXE}" t list | grep -c "RUNNING" || true)"
 		[ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break
 		sleep 1
 		[ "${i}" == "${TIMEOUT}" ] && return 1
 	done
 }
 # Entry point: validate the arguments, launch NUM_CONTAINERS containers, run
 # both TensorFlow benchmarks in them and save the collected metrics.
 # Arguments: $1 - number of containers, $2 - launch timeout
 # Exits with status 1 when the argument count is not exactly two.
 function main() {
 	# Exactly two positional arguments are required
 	if [[ "$#" -ne 2 ]]; then
 		echo >&2 "error: Not enough arguments [$@]"
 		help
 		exit 1
 	fi
 	local i=0
 	# Names of the launched containers; the test functions read this array.
 	local containers=()
 	local not_started_count="${NUM_CONTAINERS}"
 	# Make sure the external tools and the container image are available
 	cmds=("awk" "docker" "bc")
 	check_cmds "${cmds[@]}"
 	check_ctr_images "${IMAGE}" "${DOCKERFILE}"
 	init_env
 	info "Creating ${NUM_CONTAINERS} containers"
 	for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
 		containers+=($(random_name))
 		# Each container only runs PAYLOAD_ARGS; the benchmarks are exec'ed
 		# into it later by the test functions.
 		sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
 		((not_started_count--))
 		info "${not_started_count} remaining containers"
 	done
 	# Open the metrics document before any result is recorded
 	metrics_json_init
 	metrics_json_start_array
 	# Do not start measuring until every requested container is running
 	check_containers_are_up
 	resnet50_test
 	axelnet_test
 	# Persist the metrics, then tear the containers down
 	metrics_json_save
 	clean_env_ctr
 }
 main "$@"
--- a/tests/metrics/machine_learning/tensorflow_dockerfile/Dockerfile
+++ b/tests/metrics/machine_learning/tensorflow_dockerfile/Dockerfile
@ -0,0 +1,18 @@
 # Copyright (c) 2023 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 # Image for the TensorFlow metrics test: the Intel TensorFlow base image plus
 # a clone of the tensorflow/benchmarks repository.
 # Usage: FROM [image name]
 FROM intel/intel-optimized-tensorflow:2.9.1
 # Version of the Dockerfile
 LABEL DOCKERFILE_VERSION="1.0"
 # Stop apt from prompting for input during the build.
 ENV DEBIAN_FRONTEND=noninteractive
 # Install the build tools and git, then clone the benchmarks repository into
 # the image's working directory. The clone is not pinned to a tag or commit,
 # so the benchmark code is whatever the default branch holds at build time.
 RUN apt-get update && \
 	apt-get install -y --no-install-recommends build-essential git && \
 	apt-get remove -y unattended-upgrades && \
 	git clone https://github.com/tensorflow/benchmarks
 # Default command when the container is started without arguments.
 CMD ["/bin/bash"]