mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 12:43:23 +00:00
Add retries to validate-cluster
E2e shows occasional kubectl failures here, so add some retries. We may want to make this more general, but I think we should try it out in a small scope first. Also clean up the retry loop so it doesn't treat errors as successful runs (discovered in testing), and simplify a bit of Go template syntax. Testing: I made kubectl fail randomly 50% of the time (`$RANDOM % 2 == 0`) and iterated until this gave me more helpful results. Still not perfect, but better.
This commit is contained in:
parent
88a68e99f3
commit
7e0f66bd0e
@ -27,12 +27,26 @@ set -o pipefail
|
|||||||
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
|
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
|
||||||
|
|
||||||
if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
|
if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
|
||||||
source "${KUBE_ROOT}/cluster/env.sh"
|
source "${KUBE_ROOT}/cluster/env.sh"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
source "${KUBE_ROOT}/cluster/lib/util.sh"
|
source "${KUBE_ROOT}/cluster/lib/util.sh"
|
||||||
source "${KUBE_ROOT}/cluster/kube-util.sh"
|
source "${KUBE_ROOT}/cluster/kube-util.sh"
|
||||||
|
|
||||||
|
# Run kubectl and retry upon failure.
# Retries up to 3 times, sleeping 1s between attempts, before giving up.
# Globals:
#   KUBE_ROOT (read) - repo root; kubectl.sh is resolved relative to it
# Arguments:
#   $@ - passed through verbatim to kubectl
# Outputs:
#   kubectl's own output on stdout; retry/give-up diagnostics on stderr
# Returns:
#   0 (kubectl's status) on success; 1 after exhausting all retries
function kubectl_retry() {
  # 'local' so the counter does not leak into (or clobber) the caller's scope.
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries-1))
    if [[ ${tries} -le 0 ]]; then
      # $* (not $@) inside a string: join the args into one word for the
      # message (SC2145); the displayed text is unchanged.
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}
|
||||||
|
|
||||||
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
|
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
|
||||||
|
|
||||||
EXPECTED_NUM_NODES="${NUM_NODES}"
|
EXPECTED_NUM_NODES="${NUM_NODES}"
|
||||||
@ -43,6 +57,12 @@ fi
|
|||||||
return_value=0
|
return_value=0
|
||||||
attempt=0
|
attempt=0
|
||||||
while true; do
|
while true; do
|
||||||
|
# Pause between iterations of this large outer loop.
|
||||||
|
if [[ ${attempt} -gt 0 ]]; then
|
||||||
|
sleep 15
|
||||||
|
fi
|
||||||
|
attempt=$((attempt+1))
|
||||||
|
|
||||||
# The "kubectl get nodes -o template" exports node information.
|
# The "kubectl get nodes -o template" exports node information.
|
||||||
#
|
#
|
||||||
# Echo the output and gather 2 counts:
|
# Echo the output and gather 2 counts:
|
||||||
@ -52,35 +72,36 @@ while true; do
|
|||||||
# Suppress errors from kubectl output because during cluster bootstrapping
|
# Suppress errors from kubectl output because during cluster bootstrapping
|
||||||
# for clusters where the master node is registered, the apiserver will become
|
# for clusters where the master node is registered, the apiserver will become
|
||||||
# available and then get restarted as the kubelet configures the docker bridge.
|
# available and then get restarted as the kubelet configures the docker bridge.
|
||||||
node=$("${KUBE_ROOT}/cluster/kubectl.sh" get nodes) || true
|
node=$(kubectl_retry get nodes) || continue
|
||||||
found=$(($(echo "${node}" | wc -l) - 1)) || true
|
found=$(($(echo "${node}" | wc -l) - 1))
|
||||||
ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1)) || true
|
ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1))
|
||||||
|
|
||||||
if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
|
if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
|
||||||
break
|
break
|
||||||
elif (( "${found}" > "${EXPECTED_NUM_NODES}" )) && (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
|
elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
|
||||||
echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Found more nodes than expected, your cluster may not behave correctly.${color_norm}"
|
echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
|
||||||
|
break
|
||||||
|
elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
|
||||||
|
echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
|
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
|
||||||
if (( attempt > 100 )); then
|
if (( attempt > 100 )); then
|
||||||
echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
|
echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
|
||||||
"${KUBE_ROOT}/cluster/kubectl.sh" get nodes
|
kubectl_retry get nodes
|
||||||
if [ "$((${EXPECTED_NUM_NODES} - ${ready}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
|
if [ "$((${EXPECTED_NUM_NODES} - ${ready}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
return_value=2
|
return_value=2
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
|
echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
|
||||||
fi
|
fi
|
||||||
attempt=$((attempt+1))
|
|
||||||
sleep 15
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
echo "Found ${found} node(s)."
|
echo "Found ${found} node(s)."
|
||||||
"${KUBE_ROOT}/cluster/kubectl.sh" get nodes
|
kubectl_retry get nodes
|
||||||
|
|
||||||
attempt=0
|
attempt=0
|
||||||
while true; do
|
while true; do
|
||||||
@ -89,9 +110,9 @@ while true; do
|
|||||||
# Echo the output and gather 2 counts:
|
# Echo the output and gather 2 counts:
|
||||||
# - Total number of componentstatuses.
|
# - Total number of componentstatuses.
|
||||||
# - Number of "healthy" components.
|
# - Number of "healthy" components.
|
||||||
cs_status=$("${KUBE_ROOT}/cluster/kubectl.sh" get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}},{{end}}{{end}}') || true
|
cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
|
||||||
componentstatuses=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:') || true
|
componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
|
||||||
healthy=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:True') || true
|
healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true
|
||||||
|
|
||||||
if ((componentstatuses > healthy)); then
|
if ((componentstatuses > healthy)); then
|
||||||
if ((attempt < 5)); then
|
if ((attempt < 5)); then
|
||||||
@ -100,7 +121,7 @@ while true; do
|
|||||||
sleep 30
|
sleep 30
|
||||||
else
|
else
|
||||||
echo -e " ${color_yellow}Validate output:${color_norm}"
|
echo -e " ${color_yellow}Validate output:${color_norm}"
|
||||||
"${KUBE_ROOT}/cluster/kubectl.sh" get cs
|
kubectl_retry get cs
|
||||||
echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
|
echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
@ -110,7 +131,7 @@ while true; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
echo "Validate output:"
|
echo "Validate output:"
|
||||||
"${KUBE_ROOT}/cluster/kubectl.sh" get cs
|
kubectl_retry get cs
|
||||||
if [ "${return_value}" == "0" ]; then
|
if [ "${return_value}" == "0" ]; then
|
||||||
echo -e "${color_green}Cluster validation succeeded${color_norm}"
|
echo -e "${color_green}Cluster validation succeeded${color_norm}"
|
||||||
else
|
else
|
||||||
|
Loading…
Reference in New Issue
Block a user