mirror of
				https://github.com/kata-containers/kata-containers.git
				synced 2025-10-31 01:13:02 +00:00 
			
		
		
		
	metrics: compute tensorflow statistics
This PR computes average results for TF bench. Additionally, it improves the data parsing from all running containers. Fixes: #7603 Signed-off-by: David Esparza <david.esparza.borquez@intel.com>
This commit is contained in:
		| @@ -31,11 +31,16 @@ CMD_RESNET="$dst_dir/$resnet_start_script" | |||||||
| CMD_ALEXNET="$dst_dir/$alexnet_start_script" | CMD_ALEXNET="$dst_dir/$alexnet_start_script" | ||||||
| timeout=600 | timeout=600 | ||||||
| INITIAL_NUM_PIDS=1 | INITIAL_NUM_PIDS=1 | ||||||
| CMD_FILE="cat alexnet_results | grep 'total images' | wc -l" | ALEXNET_FILE="alexnet_results" | ||||||
| RESNET_CMD_FILE="cat resnet_results | grep 'total images' | wc -l" | ALEXNET_CHECK_FILE_CMD="cat /${ALEXNET_FILE} | grep 'total images' | wc -l" | ||||||
|  | RESNET_FILE="resnet_results" | ||||||
|  | RESNET_CHECK_FILE_CMD="cat /${RESNET_FILE} | grep 'total images' | wc -l" | ||||||
|  | MAX_RETRIES=300 | ||||||
|  |  | ||||||
| function remove_tmp_file() { | function remove_tmp_file() { | ||||||
| 	rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}" | 	rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}" | ||||||
|  | 	rm -rf "${src_dir}" | ||||||
|  | 	clean_env_ctr | ||||||
| } | } | ||||||
|  |  | ||||||
| trap remove_tmp_file EXIT | trap remove_tmp_file EXIT | ||||||
| @@ -59,7 +64,8 @@ function create_resnet_start_script() { | |||||||
|  |  | ||||||
| cat <<EOF >>"${script}" | cat <<EOF >>"${script}" | ||||||
| #!/bin/bash | #!/bin/bash | ||||||
| python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > resnet_results | pushd "benchmarks/scripts/tf_cnn_benchmarks" | ||||||
|  | python tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > "/${RESNET_FILE}" | ||||||
| EOF | EOF | ||||||
| 	chmod +x "${script}" | 	chmod +x "${script}" | ||||||
| } | } | ||||||
| @@ -70,78 +76,82 @@ function create_alexnet_start_script() { | |||||||
|  |  | ||||||
| cat <<EOF >>"${script}" | cat <<EOF >>"${script}" | ||||||
| #!/bin/bash | #!/bin/bash | ||||||
| python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results | pushd "benchmarks/scripts/tf_cnn_benchmarks" | ||||||
|  | python tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > "/${ALEXNET_FILE}" | ||||||
| EOF | EOF | ||||||
| 	chmod +x "${script}" | 	chmod +x "${script}" | ||||||
| } | } | ||||||
|  |  | ||||||
|  | function launch_workload() { | ||||||
|  | 	WORKLOAD=${1} | ||||||
|  | 	[[ -z ${WORKLOAD} ]] && die "Container workload is missing" | ||||||
|  |  | ||||||
|  | 	local pids=() | ||||||
|  | 	local j=0 | ||||||
|  | 	for i in "${containers[@]}"; do | ||||||
|  | 		$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${WORKLOAD}")& | ||||||
|  | 		pids["${j}"]=$! | ||||||
|  | 		((j++)) | ||||||
|  | 	done | ||||||
|  |  | ||||||
|  | 	# wait for all pids | ||||||
|  | 	for pid in ${pids[*]}; do | ||||||
|  | 		wait "${pid}" | ||||||
|  | 	done | ||||||
|  | } | ||||||
|  |  | ||||||
|  | function collect_results() { | ||||||
|  | 	WORKLOAD=${1} | ||||||
|  | 	[[ -z ${WORKLOAD} ]] && die "Container workload is missing" | ||||||
|  |  | ||||||
|  | 	local tasks_running=("${containers[@]}") | ||||||
|  | 	local retries=${MAX_RETRIES} | ||||||
|  |  | ||||||
|  | 	while [ "${#tasks_running[@]}" -gt 0 ] && [ "${retries}" -gt 0 ]; do | ||||||
|  | 		for i in "${!tasks_running[@]}"; do | ||||||
|  | 			check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${tasks_running[i]}" sh -c "${WORKLOAD}") | ||||||
|  |  | ||||||
|  | 			# if the current task is done, remove the corresponding container from the active list | ||||||
|  | 			[ "${check_file}" -eq "1" ] && unset 'tasks_running[i]' | ||||||
|  | 		done | ||||||
|  | 		((retries--)) | ||||||
|  | 		sleep 3 | ||||||
|  | 		echo -n "." | ||||||
|  | 	done | ||||||
|  | 	echo -e "\n" | ||||||
|  | } | ||||||
|  |  | ||||||
| function tensorflow_test() { | function tensorflow_test() { | ||||||
| 	info "Copy Resnet Tensorflow test" | 	# Resnet section | ||||||
| 	local pids=() | 	info "Running TF-Resnet test" | ||||||
| 	local j=0 | 	launch_workload "${CMD_RESNET}" | ||||||
|  | 	collect_results "${RESNET_CHECK_FILE_CMD}" | ||||||
|  |  | ||||||
|  | 	# Alexnet section | ||||||
|  | 	info "Running TF-Alexnet test" | ||||||
|  | 	launch_workload "${CMD_ALEXNET}" | ||||||
|  | 	collect_results "${ALEXNET_CHECK_FILE_CMD}" | ||||||
|  |  | ||||||
|  | 	info "Tensorflow workload completed" | ||||||
|  | 	# Retrieving results | ||||||
| 	for i in "${containers[@]}"; do | 	for i in "${containers[@]}"; do | ||||||
| 		$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESNET}")& | 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat /${RESNET_FILE}"  >> "${resnet_tensorflow_file}" | ||||||
| 		pids["${j}"]=$! | 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat /${ALEXNET_FILE}"  >> "${alexnet_tensorflow_file}" | ||||||
| 		((j++)) |  | ||||||
| 	done | 	done | ||||||
|  |  | ||||||
| 	# wait for all pids | 	# Parsing resnet results | ||||||
| 	for pid in ${pids[*]}; do | 	local resnet_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') | ||||||
| 		wait "${pid}" | 	local res_sum="$(sed -e 's/,/\n/g' <<< ${resnet_results} | awk 'BEGIN {total=0} {total += $1} END {print total}')" | ||||||
| 	done | 	local num_elements="$(awk '{print NF}' FS=',' <<<${resnet_results})" | ||||||
|  | 	local average_resnet="$(echo "scale=2 ; ${res_sum} / ${num_elements}" | bc)" | ||||||
|  |  | ||||||
| 	info "All containers are running the workload..." | 	# Parsing alexnet results | ||||||
|  | 	local alexnet_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') | ||||||
| 	for i in "${containers[@]}"; do | 	local alex_sum="$(sed -e 's/,/\n/g' <<< ${alexnet_results} | awk 'BEGIN {total=0} {total += $1} END {print total}')" | ||||||
| 		check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${RESNET_CMD_FILE}") | 	num_elements="$(awk '{print NF}' FS=',' <<< ${alexnet_results})" | ||||||
| 		retries="300" | 	local average_alexnet="$(echo " scale=2 ; ${alex_sum} / ${num_elements}" | bc)" | ||||||
| 		for j in $(seq 1 "${retries}"); do |  | ||||||
| 			[ "${check_file}" -eq "1" ] && break |  | ||||||
| 			sleep 1 |  | ||||||
| 		done |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	info "Copy Alexnet Tensorflow test" |  | ||||||
| 	local pids=() |  | ||||||
| 	local j=0 |  | ||||||
| 	for i in "${containers[@]}"; do |  | ||||||
| 		$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_ALEXNET}")& |  | ||||||
| 		pids["${j}"]=$! |  | ||||||
| 		((j++)) |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	# wait for all pids |  | ||||||
| 	for pid in ${pids[*]}; do |  | ||||||
| 		wait "${pid}" |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	for i in "${containers[@]}"; do |  | ||||||
| 		check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") |  | ||||||
| 		retries="300" |  | ||||||
| 		for j in $(seq 1 "${retries}"); do |  | ||||||
| 			[ "${check_file}" -eq "1" ] && break |  | ||||||
| 			sleep 1 |  | ||||||
| 		done |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	for i in "${containers[@]}"; do |  | ||||||
| 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat resnet_results"  >> "${resnet_tensorflow_file}" |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	local res_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') |  | ||||||
| 	local resnet_results=$(printf "%.0f\n" "${res_results}") |  | ||||||
| 	local res_average=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) |  | ||||||
| 	local average_resnet=$(printf "%.0f\n" "${res_average}") |  | ||||||
|  |  | ||||||
| 	for i in "${containers[@]}"; do |  | ||||||
| 		sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results"  >> "${alexnet_tensorflow_file}" |  | ||||||
| 	done |  | ||||||
|  |  | ||||||
| 	local alex_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') |  | ||||||
| 	local alexnet_results=$(printf "%.0f\n" "${alex_results}") |  | ||||||
| 	local alex_average=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) |  | ||||||
| 	local average_alexnet=$(printf "%.0f\n" "${alex_average}") |  | ||||||
|  |  | ||||||
|  | 	# writing json results file | ||||||
| 	local json="$(cat << EOF | 	local json="$(cat << EOF | ||||||
| 	{ | 	{ | ||||||
| 		"resnet": { | 		"resnet": { | ||||||
| @@ -174,7 +184,7 @@ function check_containers_are_up() { | |||||||
|  |  | ||||||
| function main() { | function main() { | ||||||
| 	# Verify enough arguments | 	# Verify enough arguments | ||||||
| 	if [ $# != 2 ]; then | 	if [ "$#" -lt 2 ]; then | ||||||
| 		echo >&2 "error: Not enough arguments [$@]" | 		echo >&2 "error: Not enough arguments [$@]" | ||||||
| 		help | 		help | ||||||
| 		exit 1 | 		exit 1 | ||||||
| @@ -224,13 +234,8 @@ function main() { | |||||||
| 	# Get the initial number of pids in a single container before the workload starts | 	# Get the initial number of pids in a single container before the workload starts | ||||||
| 	INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) | 	INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) | ||||||
| 	((INITIAL_NUM_PIDS++)) | 	((INITIAL_NUM_PIDS++)) | ||||||
|  |  | ||||||
| 	tensorflow_test | 	tensorflow_test | ||||||
|  |  | ||||||
| 	metrics_json_save | 	metrics_json_save | ||||||
|  |  | ||||||
| 	rm -rf "${src_dir}" |  | ||||||
|  |  | ||||||
| 	clean_env_ctr |  | ||||||
| } | } | ||||||
|  |  | ||||||
| main "$@" | main "$@" | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user