diff --git a/cluster/validate-cluster.sh b/cluster/validate-cluster.sh index b84356af536..38af0a3aaaf 100755 --- a/cluster/validate-cluster.sh +++ b/cluster/validate-cluster.sh @@ -73,7 +73,9 @@ REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES)) # Make several attempts to deal with slow cluster birth. return_value=0 attempt=0 +# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters. PAUSE_BETWEEN_ITERATIONS_SECONDS=15 +MAX_ATTEMPTS=100 ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS)) while true; do # Pause between iterations of this large outer loop. @@ -91,7 +93,18 @@ while true; do # Suppress errors from kubectl output because during cluster bootstrapping # for clusters where the master node is registered, the apiserver will become # available and then get restarted as the kubelet configures the docker bridge. - node=$(kubectl_retry get nodes) || continue + # + # We are assigning the result of kubectl_retry get nodes operation to the res + # varaible in that way, to prevent stopping the whole script on an error. + node=$(kubectl_retry get nodes) && res="$?" || res="$?" + if [ "${res}" -ne "0" ]; then + if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then + echo -e "${color_red} Failed to get nodes.${color_norm}" + exit 1 + else + continue + fi + fi found=$(($(echo "${node}" | wc -l) - 1)) ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1)) @@ -110,8 +123,7 @@ while true; do echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}" last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}" fi - # Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters. - if [[ "${attempt}" -gt "${last_run:-100}" ]]; then + if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}" kubectl_retry get nodes if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then