tests: Introduce retry mechanism for helm install

Kata-deploy often fails due to a transiently unreachable k8s cluster
for the qemu-coco-dev test on s390x.
(e.g. https://github.com/kata-containers/kata-containers/actions/runs/10831142906/job/30058527098?pr=10009)
This commit introduces a retry mechanism to mitigate these failures by
retrying the command two more times with a 10-second interval as a workaround.

Signed-off-by: Hyounggyu Choi <Hyounggyu.Choi@ibm.com>
This commit is contained in:
Hyounggyu Choi 2024-09-13 12:32:47 +02:00
parent e937cb1ded
commit 4c933a5611

View File

@ -207,7 +207,25 @@ function deploy_kata() {
[ "$(yq .image.tag ${values_yaml})" = "${DOCKER_TAG}" ] || die "Failed to set image tag"
echo "::endgroup::"
helm install kata-deploy "${helm_chart_dir}" --values "${values_yaml}" --namespace kube-system --debug
local max_tries=3
local interval=10
local i=0
# Retry loop for helm install to prevent transient failures due to instantly unreachable cluster
set +e # Disable immediate exit on failure
while true; do
helm upgrade --install kata-deploy "${helm_chart_dir}" --values "${values_yaml}" --namespace kube-system --debug
if [ $? -eq 0 ]; then
echo "Helm install succeeded!"
break
fi
i=$((i+1))
[ $i -lt $max_tries ] && echo "Retrying after $interval seconds (Attempt $i of $(($max_tries - 1)))" || break
sleep $interval
done
set -e # Re-enable immediate exit on failure
if [ $i -eq $max_tries ]; then
die "Failed to deploy kata-deploy after $max_tries tries"
fi
# `helm install --wait` does not take effect on single replicas and maxUnavailable=1 DaemonSets
# like kata-deploy on CI. So wait for pods being Running in the "tradicional" way.