Fixed infinite loop in cluster validation.

Fixed infinite loop in cluster validation when getting nodes failed.
2025-07-21 10:51:29 +00:00 · 2016-12-15 14:34:02 +01:00 · 2016-12-15 14:34:02 +01:00 · 3dbfa94798
commit 3dbfa94798
parent 59ad9a30ca
1 changed files with 15 additions and 3 deletions
--- a/cluster/validate-cluster.sh
+++ b/cluster/validate-cluster.sh
@ -73,7 +73,9 @@ REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
 # Make several attempts to deal with slow cluster birth.
 return_value=0
 attempt=0
+# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
 PAUSE_BETWEEN_ITERATIONS_SECONDS=15
+MAX_ATTEMPTS=100
 ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
 while true; do
  # Pause between iterations of this large outer loop.
@ -91,7 +93,18 @@ while true; do
  # Suppress errors from kubectl output because during cluster bootstrapping
  # for clusters where the master node is registered, the apiserver will become
  # available and then get restarted as the kubelet configures the docker bridge.
-  node=$(kubectl_retry get nodes) || continue
+  #
+  # We are assigning the result of kubectl_retry get nodes operation to the res
+  # varaible in that way, to prevent stopping the whole script on an error.
+  node=$(kubectl_retry get nodes) && res="$?" || res="$?"
+  if [ "${res}" -ne "0" ]; then
+    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
+      echo -e "${color_red} Failed to get nodes.${color_norm}"
+      exit 1
+    else
+      continue
+    fi
+  fi
  found=$(($(echo "${node}" | wc -l) - 1))
  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1))

@ -110,8 +123,7 @@ while true; do
      echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
-    # Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
-    if [[ "${attempt}" -gt "${last_run:-100}" ]]; then
+    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
      kubectl_retry get nodes
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then