CI: Introduce retry mechanism for kubectl in gha-run.sh

The following errors have frequently been observed during the k8s e2e tests:

- The connection to the server 127.0.0.1:6443 was refused - did you specify the right host or port?
- Error from server (ServiceUnavailable): the server is currently unable to handle the request
- Error from server (NotFound): the server could not find the requested resource

These errors are usually transient and can be resolved by retrying the kubectl command.

This commit introduces a wrapper function, kubectl_retry(), in common.sh that runs kubectl
up to 3 times with a 5-second interval between attempts. Initially, this change only covers
gha-run.sh for Kubernetes.
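
As an illustration, a call site in gha-run.sh changes from

    kubectl get runtimeclass

to

    kubectl_retry get runtimeclass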

Signed-off-by: Hyounggyu Choi <Hyounggyu.Choi@ibm.com>

File: common.sh

@@ -80,6 +80,22 @@ function handle_error() {
 }
 trap 'handle_error $LINENO' ERR
 
+# A wrapper function for kubectl with retry logic:
+# it runs the command up to 3 times with a 5-second
+# interval to ride out transient API server errors.
+function kubectl_retry() {
+	local max_tries=3
+	local interval=5
+	local i=0
+	while true; do
+		kubectl "$@" && return 0 || true
+		i=$((i + 1))
+		[ $i -lt $max_tries ] && echo "'kubectl $*' failed, retrying in $interval seconds" 1>&2 || break
+		sleep $interval
+	done
+	echo "'kubectl $*' failed after $max_tries tries" 1>&2 && return 1
+}
+
 function waitForProcess() {
 	wait_time="$1"
 	sleep_time="$2"
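
With the defaults above (max_tries=3, interval=5), a persistently failing command is
attempted three times over roughly ten seconds before the function gives up and returns 1.
For a hypothetical failing `kubectl get pods`, the echo statements would emit on stderr:

    'kubectl get pods' failed, retrying in 5 seconds
    'kubectl get pods' failed, retrying in 5 seconds
    'kubectl get pods' failed after 3 tries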

File: gha-run.sh

@@ -220,12 +220,12 @@ function deploy_kata() {
 	grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" || die "Failed to setup the tests image"
 	echo "::endgroup::"
 
-	kubectl apply -f "${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml"
+	kubectl_retry apply -f "${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml"
 	case "${KUBERNETES}" in
-		k0s) kubectl apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k0s" ;;
-		k3s) kubectl apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s" ;;
-		rke2) kubectl apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/rke2" ;;
-		*) kubectl apply -f "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
+		k0s) kubectl_retry apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k0s" ;;
+		k3s) kubectl_retry apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s" ;;
+		rke2) kubectl_retry apply -k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/rke2" ;;
+		*) kubectl_retry apply -f "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
 	esac
 
 	local cmd="kubectl -n kube-system get -l name=kata-deploy pod 2>/dev/null | grep '\<Running\>'"
@@ -241,11 +241,11 @@ function deploy_kata() {
 	fi
 
 	echo "::group::kata-deploy logs"
-	kubectl -n kube-system logs --tail=100 -l name=kata-deploy
+	kubectl_retry -n kube-system logs --tail=100 -l name=kata-deploy
 	echo "::endgroup::"
 
 	echo "::group::Runtime classes"
-	kubectl get runtimeclass
+	kubectl_retry get runtimeclass
 	echo "::endgroup::"
 }
@ -395,7 +395,7 @@ function cleanup_kata_deploy() {
esac esac
# shellcheck disable=2086 # shellcheck disable=2086
kubectl delete ${deploy_spec} kubectl_retry delete --ignore-not-found ${deploy_spec}
kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod
# Let the `kata-deploy` script take care of the runtime class creation / removal # Let the `kata-deploy` script take care of the runtime class creation / removal
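
Note that the cleanup paths do not merely swap in kubectl_retry: the delete calls also gain
--ignore-not-found. Without it, a delete that succeeded on an earlier attempt (or targeted a
resource that was already gone) would fail with "Error from server (NotFound)" on the next
try, so the flag keeps the deletes idempotent under retry.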
@@ -419,12 +419,12 @@ function cleanup_kata_deploy() {
 	cat "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml"
 	grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml" || die "Failed to setup the tests image"
 
 	# shellcheck disable=2086
-	kubectl apply ${cleanup_spec}
+	kubectl_retry apply ${cleanup_spec}
 	sleep 180s
 
 	# shellcheck disable=2086
-	kubectl delete ${cleanup_spec}
-	kubectl delete -f "${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml"
+	kubectl_retry delete --ignore-not-found ${cleanup_spec}
+	kubectl_retry delete --ignore-not-found -f "${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml"
 }
 
 function cleanup() {
@@ -513,11 +513,11 @@ function deploy_nydus_snapshotter() {
 		misc/snapshotter/base/nydus-snapshotter.yaml
 
 	# Deploy nydus snapshotter as a daemonset
-	kubectl create -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
+	kubectl_retry create -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
 	if [ "${KUBERNETES}" = "k3s" ]; then
-		kubectl apply -k "misc/snapshotter/overlays/k3s"
+		kubectl_retry apply -k "misc/snapshotter/overlays/k3s"
 	else
-		kubectl apply -f "misc/snapshotter/base/nydus-snapshotter.yaml"
+		kubectl_retry apply -f "misc/snapshotter/base/nydus-snapshotter.yaml"
 	fi
 
 	popd
@@ -525,9 +525,9 @@ function deploy_nydus_snapshotter() {
 	echo "::endgroup::"
 
 	echo "::group::nydus snapshotter logs"
-	pods_name=$(kubectl get pods --selector=app=nydus-snapshotter -n nydus-system -o=jsonpath='{.items[*].metadata.name}')
-	kubectl logs "${pods_name}" -n nydus-system
-	kubectl describe pod "${pods_name}" -n nydus-system
+	pods_name=$(kubectl_retry get pods --selector=app=nydus-snapshotter -n nydus-system -o=jsonpath='{.items[*].metadata.name}')
+	kubectl_retry logs "${pods_name}" -n nydus-system
+	kubectl_retry describe pod "${pods_name}" -n nydus-system
 	echo "::endgroup::"
 }
@@ -542,13 +542,13 @@ function cleanup_nydus_snapshotter() {
 	pushd "$nydus_snapshotter_install_dir"
 	if [ "${KUBERNETES}" = "k3s" ]; then
-		kubectl delete -k "misc/snapshotter/overlays/k3s"
+		kubectl_retry delete --ignore-not-found -k "misc/snapshotter/overlays/k3s"
 	else
-		kubectl delete -f "misc/snapshotter/base/nydus-snapshotter.yaml"
+		kubectl_retry delete --ignore-not-found -f "misc/snapshotter/base/nydus-snapshotter.yaml"
 	fi
 
 	sleep 180s
-	kubectl delete -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
-	kubectl get namespace nydus-system -o json | jq 'del(.spec.finalizers)' | kubectl replace --raw "/api/v1/namespaces/nydus-system/finalize" -f - || true
+	kubectl_retry delete --ignore-not-found -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
+	kubectl_retry get namespace nydus-system -o json | jq 'del(.spec.finalizers)' | kubectl_retry replace --raw "/api/v1/namespaces/nydus-system/finalize" -f - || true
 	popd
 	sleep 30s
 	echo "::endgroup::"