Merge pull request #7315 from GabyCT/topic/machinelearning

tests: Add machine learning performance tests
This commit is contained in:
David Esparza 2023-07-12 15:57:11 -06:00 committed by GitHub
commit f63673838b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 397 additions and 0 deletions

View File

@ -79,6 +79,8 @@ Test relating to measure reading and writing against clusters.
Tests relating to the TensorFlow and PyTorch implementations of several popular
convolutional models.
For further details see the [machine learning tests documentation](machine_learning).
## Saving Results
In order to ensure continuity, and thus testing and historical tracking of results,

View File

@ -0,0 +1,29 @@
# Kata Containers Tensorflow Metrics
Kata Containers provides a series of performance tests using the
TensorFlow reference benchmarks (tf_cnn_benchmarks).
The tf_cnn_benchmarks contains TensorFlow implementations of several
popular convolutional models.
## Running the test
Individual tests can be run by hand, for example:
```
$ cd metrics/machine_learning
$ ./tensorflow.sh 25 60
```
# Kata Containers Pytorch Metrics
Kata Containers provides a series of performance tests using Pytorch
benchmarks based on a suite of Python high performance computing
benchmarks.
## Running the Pytorch test
Individual tests can be run by hand, for example:
```
$ cd metrics/machine_learning
$ ./pytorch.sh 40 100
```

View File

@ -0,0 +1,160 @@
#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Runs the pyhpc-benchmarks "equation of state" and "isoneutral mixing"
# benchmarks (pytorch backend) inside N containers and records the
# per-container timings plus their average as metrics JSON.
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
# Shared helpers — presumably provides info, check_cmds, check_ctr_images,
# init_env, random_name, metrics_json_*, clean_env_ctr, CTR_EXE and
# CTR_RUNTIME; confirm against ../lib/common.bash.
source "${SCRIPT_PATH}/../lib/common.bash"
IMAGE="docker.io/library/pytorch:latest"
DOCKERFILE="${SCRIPT_PATH}/pytorch_dockerfile/Dockerfile"
# Host-side scratch files that accumulate the raw benchmark logs
# gathered from every container; removed by the EXIT trap below.
equation_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
isoneural_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX)
# $1: number of containers to launch; $2: seconds to wait for them to be up.
NUM_CONTAINERS="$1"
TIMEOUT="$2"
TEST_NAME="pytorch"
# Commands run INSIDE each container: launch a benchmark writing to LOG,
# dump LOG, and count the 'seconds' marker lines that signal completion.
CMD_RUN="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/equation_of_state --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
CMD_RUN_ISONEURAL="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/isoneutral_mixing --burnin 20 --device cpu -b pytorch -s 524288 > LOG"
CMD_RESULT="cd pyhpc-benchmarks-3.0 && cat LOG"
CMD_FILE="cat pyhpc-benchmarks-3.0/LOG | grep 'seconds' | wc -l"
# Keep the payload container alive until we exec the benchmarks into it.
PAYLOAD_ARGS="tail -f /dev/null"
# Delete the temporary result files created at startup; registered as an
# EXIT trap so cleanup happens on every exit path, including failures.
remove_tmp_file() {
	rm -rf -- "${equation_pytorch_file}" "${isoneural_pytorch_file}"
}
trap remove_tmp_file EXIT
# Poll once per second, for at most TIMEOUT seconds, until NUM_CONTAINERS
# tasks report RUNNING. Returns 1 if the deadline passes first.
check_containers_are_up() {
	local running_count=0
	local second
	for ((second = 1; second <= TIMEOUT; second++)); do
		info "Verify that the containers are running"
		running_count="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")"
		if [ "${running_count}" -eq "${NUM_CONTAINERS}" ]; then
			break
		fi
		sleep 1
		if [ "${second}" -eq "${TIMEOUT}" ]; then
			return 1
		fi
	done
}
# Run the pyhpc "equation of state" benchmark (pytorch backend) in every
# container, wait for each container's LOG to be complete, then parse the
# per-container timings and add them (plus their average) to the metrics
# JSON array. Reads the 'containers' array set by main().
function equation_of_state_pytorch_test() {
	local i
	local j
	local check_file
	local retries=200

	info "Running Equation of State Pytorch test"

	# Start the benchmark detached in every container.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
	done

	# Wait (up to ${retries}s per container) for the completion marker.
	# Fix: re-evaluate CMD_FILE on every retry; previously the check was
	# sampled once before the loop, so the stale value made the loop sleep
	# through the whole retry budget regardless of benchmark progress.
	for i in "${containers[@]}"; do
		for ((j = 1; j <= retries; j++)); do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
			[ "${check_file}" -eq 1 ] && break
			sleep 1
		done
	done

	# Gather the raw logs from every container into one host-side file.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${equation_pytorch_file}"
	done

	# Extract the timing (4th field of each 'pytorch' result line) as a
	# comma-separated list, then average it with bc. Declarations are split
	# from assignments so a failing pipeline is not masked by 'local'.
	local equation_pytorch_results
	local equation_average_pytorch
	equation_pytorch_results=$(grep pytorch "${equation_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
	equation_average_pytorch=$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

	local json="$(cat <<-EOF
	{
		"Pytorch Equation of State": {
			"Result": "${equation_pytorch_results}",
			"Average": "${equation_average_pytorch}",
			"Units": "s"
		}
	}
	EOF
	)"
	metrics_json_add_array_element "$json"
}
# Run the pyhpc "isoneutral mixing" benchmark (pytorch backend) in every
# container, wait for completion, parse the timings and append them to the
# metrics JSON, then close the "Results" array (this is the last test).
# Reads the 'containers' array set by main().
function isoneural_pytorch_test() {
	local i
	local j
	local check_file
	local retries=200

	info "Running Isoneural Pytorch test"

	# Start the benchmark detached in every container.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN_ISONEURAL}"
	done

	# Wait (up to ${retries}s per container) for the completion marker.
	# Fix: re-evaluate CMD_FILE on every retry; previously the check was
	# sampled once before the loop, making every retry compare a stale value.
	for i in "${containers[@]}"; do
		for ((j = 1; j <= retries; j++)); do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
			[ "${check_file}" -eq 1 ] && break
			sleep 1
		done
	done

	# Gather the raw logs from every container into one host-side file.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${isoneural_pytorch_file}"
	done

	# Extract timings as a comma-separated list and average with bc.
	# Declarations split from assignments so pipeline failures are visible.
	local isoneural_pytorch_results
	local isoneural_average_pytorch
	isoneural_pytorch_results=$(grep pytorch "${isoneural_pytorch_file}" | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//')
	isoneural_average_pytorch=$(echo "${isoneural_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

	local json="$(cat <<-EOF
	{
		"Pytorch Isoneural": {
			"Result": "${isoneural_pytorch_results}",
			"Average": "${isoneural_average_pytorch}",
			"Units": "s"
		}
	}
	EOF
	)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Results"
}
# Entry point: validate the CLI arguments, launch NUM_CONTAINERS pytorch
# containers, run both benchmarks, save the metrics and clean up.
function main() {
	local i=0
	local containers=()
	local not_started_count="${NUM_CONTAINERS}"
	local cmds=("awk" "docker" "bc")

	# Verify enough arguments.
	if [ $# != 2 ]; then
		echo >&2 "error: Not enough arguments [$@]"
		# Fix: this script defines no help() function; calling it aborted
		# with 'command not found' under 'set -e' before the usage could be
		# shown. Print the usage text inline instead.
		cat >&2 <<-EOF
		Usage: $0 <count> <timeout>
		Description:
			This script launches n number of containers
			to run the pyhpc benchmarks using a Pytorch
			container.
		Options:
			<count> : Number of containers to run.
			<timeout> : Timeout to launch the containers.
		EOF
		exit 1
	fi

	# Check tools/commands dependencies.
	check_cmds "${cmds[@]}"
	check_ctr_images "${IMAGE}" "${DOCKERFILE}"
	init_env

	info "Creating ${NUM_CONTAINERS} containers"
	for ((i = 1; i <= NUM_CONTAINERS; i++)); do
		containers+=("$(random_name)")
		sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
		((not_started_count--))
		info "$not_started_count remaining containers"
	done

	metrics_json_init
	metrics_json_start_array

	# Check that the requested number of containers are running.
	check_containers_are_up

	equation_of_state_pytorch_test
	isoneural_pytorch_test

	metrics_json_save
	clean_env_ctr
}
main "$@"

View File

@ -0,0 +1,19 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

# Image used by ../pytorch.sh to run the pyhpc-benchmarks suite.
# Usage: FROM [image name]
FROM intel/intel-optimized-pytorch:1.12.100

# Version of the Dockerfile
LABEL DOCKERFILE_VERSION="1.0"

# Keep apt from prompting during the build (consistent with the
# tensorflow Dockerfile in this test suite).
ENV DEBIAN_FRONTEND=noninteractive

# Security fix: dropped curl's '-k' (--insecure) flag so the TLS
# certificate of github.com is verified when fetching the benchmark tarball.
# NOTE(review): this installs torch==1.10.0 on top of the optimized base
# image's own torch — presumably intentional to pin the benchmark version;
# confirm before changing.
RUN apt-get update && \
	apt-get install -y --no-install-recommends build-essential curl git && \
	apt-get remove -y unattended-upgrades && \
	curl -OL https://github.com/dionhaefner/pyhpc-benchmarks/archive/refs/tags/v3.0.tar.gz && \
	tar -xf v3.0.tar.gz && \
	pip install --no-cache-dir click==8.1.3 && \
	cd pyhpc-benchmarks-3.0 && pip3 install --no-cache-dir --user torch==1.10.0

CMD ["/bin/bash"]

View File

@ -0,0 +1,169 @@
#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Runs the tf_cnn_benchmarks ResNet50 and AlexNet benchmarks inside N
# containers and records the per-container throughput plus the average
# as metrics JSON.
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
# Shared helpers — presumably provides info, check_cmds, check_ctr_images,
# init_env, random_name, metrics_json_*, clean_env_ctr, CTR_EXE and
# CTR_RUNTIME; confirm against ../lib/common.bash.
source "${SCRIPT_PATH}/../lib/common.bash"
IMAGE="docker.io/library/tensorflow:latest"
DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile"
# Benchmark knobs passed to tf_cnn_benchmarks.py inside the containers.
BATCH_SIZE="512"
NUM_BATCHES="300"
# Commands run INSIDE each container: dump the benchmark's 'result' file,
# and count its 'total images' lines to detect completion.
CMD_RESULT="cd benchmarks/scripts/tf_cnn_benchmarks/ && cat result"
CMD_FILE="cat benchmarks/scripts/tf_cnn_benchmarks/result | grep 'total images' | wc -l"
# Host-side scratch file that accumulates the raw benchmark output from
# every container; removed by the EXIT trap below.
tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX)
# $1: number of containers to launch; $2: seconds to wait for them to be up.
NUM_CONTAINERS="$1"
TIMEOUT="$2"
TEST_NAME="tensorflow"
# Keep the payload container alive until we exec the benchmarks into it.
PAYLOAD_ARGS="tail -f /dev/null"
# Delete the temporary results file created at startup; registered as an
# EXIT trap so cleanup happens on every exit path, including failures.
remove_tmp_file() {
	rm -rf -- "${tensorflow_file}"
}
trap remove_tmp_file EXIT
# Print usage information for this script on stdout.
help() {
	cat << EOF
Usage: $0 <count> <timeout>
Description:
This script launches n number of containers
to run the tf cnn benchmarks using a Tensorflow
container.
Options:
<count> : Number of containers to run.
<timeout> : Timeout to launch the containers.
EOF
}
# Run the tf_cnn_benchmarks ResNet50 (default model) CPU benchmark in every
# container, wait for each container's 'result' file to be complete, then
# parse the per-container throughput and add it (plus the average) to the
# metrics JSON array. Reads the 'containers' array set by main().
function resnet50_test() {
	local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > result"
	local i
	local j
	local check_file
	local retries=200

	info "Running Resnet50 Tensorflow test"

	# Start the benchmark detached in every container.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
	done

	# Wait (up to ${retries}s per container) for the completion marker.
	# Fix: re-evaluate CMD_FILE on every retry; previously the check was
	# sampled once before the loop, so the stale value made the loop sleep
	# through the whole retry budget regardless of benchmark progress.
	for i in "${containers[@]}"; do
		for ((j = 1; j <= retries; j++)); do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
			[ "${check_file}" -eq 1 ] && break
			sleep 1
		done
	done

	# Gather the raw output from every container into one host-side file.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
	done

	# Extract the 'total images/sec' value per container as a comma-separated
	# list, then average with bc. Declarations are split from assignments so
	# a failing pipeline is not masked by 'local'.
	local resnet50_results
	local average_resnet50
	resnet50_results=$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
	average_resnet50=$(echo "${resnet50_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

	local json="$(cat <<-EOF
	{
		"Resnet50": {
			"Result": "${resnet50_results}",
			"Average": "${average_resnet50}",
			"Units": "s"
		}
	}
	EOF
	)"
	metrics_json_add_array_element "$json"
}
# Run the tf_cnn_benchmarks AlexNet (forward-only) CPU benchmark in every
# container, wait for completion, parse the per-container throughput and
# append it to the metrics JSON, then close the "Results" array (this is
# the last test). Reads the 'containers' array set by main().
function axelnet_test() {
	local CMD_RUN="cd benchmarks/scripts/tf_cnn_benchmarks/ && python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > result"
	local i
	local j
	local check_file
	local retries=200

	info "Running AxelNet Tensorflow test"

	# Start the benchmark detached in every container.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN}"
	done

	# Wait (up to ${retries}s per container) for the completion marker.
	# Fix: re-evaluate CMD_FILE on every retry; previously the check was
	# sampled once before the loop and never refreshed.
	# NOTE(review): CMD_RUN truncates 'result' almost immediately, but there
	# is a short window where the previous test's file could satisfy the
	# check — confirm if stricter synchronization is needed.
	for i in "${containers[@]}"; do
		for ((j = 1; j <= retries; j++)); do
			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}")
			[ "${check_file}" -eq 1 ] && break
			sleep 1
		done
	done

	# Fix: truncate the shared scratch file before collecting. It still holds
	# the resnet50 output, which previously leaked into axelnet_results and
	# skewed the average (2N samples divided by NUM_CONTAINERS).
	: > "${tensorflow_file}"

	# Gather the raw output from every container into the host-side file.
	for i in "${containers[@]}"; do
		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${tensorflow_file}"
	done

	# Extract the 'total images/sec' value per container as a comma-separated
	# list, then average with bc. Declarations split from assignments so a
	# failing pipeline is not masked by 'local'.
	local axelnet_results
	local average_axelnet
	axelnet_results=$(grep "total images/sec" "${tensorflow_file}" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//')
	average_axelnet=$(echo "${axelnet_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)

	local json="$(cat <<-EOF
	{
		"AxelNet": {
			"Result": "${axelnet_results}",
			"Average": "${average_axelnet}",
			"Units": "s"
		}
	}
	EOF
	)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Results"
}
# Poll once per second, for at most TIMEOUT seconds, until NUM_CONTAINERS
# tasks report RUNNING. Returns 1 if the deadline passes first.
check_containers_are_up() {
	local running_count=0
	local second
	for ((second = 1; second <= TIMEOUT; second++)); do
		info "Verify that the containers are running"
		running_count="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")"
		if [ "${running_count}" -eq "${NUM_CONTAINERS}" ]; then
			break
		fi
		sleep 1
		if [ "${second}" -eq "${TIMEOUT}" ]; then
			return 1
		fi
	done
}
# Entry point: validate the CLI arguments, launch NUM_CONTAINERS tensorflow
# containers, run both benchmarks, save the metrics and clean up.
main() {
	local idx=0
	local containers=()
	local remaining="${NUM_CONTAINERS}"
	local required_cmds=("awk" "docker" "bc")

	# Bail out with usage unless exactly two arguments were supplied.
	if [ $# != 2 ]; then
		echo >&2 "error: Not enough arguments [$@]"
		help
		exit 1
	fi

	# Check tools/commands dependencies.
	check_cmds "${required_cmds[@]}"
	check_ctr_images "${IMAGE}" "${DOCKERFILE}"
	init_env

	info "Creating ${NUM_CONTAINERS} containers"
	for ((idx = 1; idx <= NUM_CONTAINERS; idx++)); do
		containers+=("$(random_name)")
		sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
		((remaining--))
		info "$remaining remaining containers"
	done

	metrics_json_init
	metrics_json_start_array

	# Check that the requested number of containers are running.
	check_containers_are_up

	resnet50_test
	axelnet_test

	metrics_json_save
	clean_env_ctr
}
main "$@"

View File

@ -0,0 +1,18 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

# Image used by ../tensorflow.sh: clones the tensorflow/benchmarks repo,
# whose scripts/tf_cnn_benchmarks directory the test script executes.
# Usage: FROM [image name]
FROM intel/intel-optimized-tensorflow:2.9.1

# Version of the Dockerfile
LABEL DOCKERFILE_VERSION="1.0"

# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive

# NOTE(review): the clone is unpinned, so builds track the benchmarks repo's
# default branch — confirm whether a fixed tag/commit is wanted.
RUN apt-get update && \
apt-get install -y --no-install-recommends build-essential git && \
apt-get remove -y unattended-upgrades && \
git clone https://github.com/tensorflow/benchmarks

CMD ["/bin/bash"]