From 5b5d3b9410ff4eaf10a36f792a099c227ca658e2 Mon Sep 17 00:00:00 2001
From: Shyam Jeedigunta
Date: Mon, 3 Jul 2017 14:33:28 +0200
Subject: [PATCH] Add parallelism to GCE cluster upgrade

---
 cluster/gce/upgrade.sh | 213 ++++++++++++++++++++++++-----------------
 1 file changed, 124 insertions(+), 89 deletions(-)

diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh
index ad84f1ff090..f6a1299564c 100755
--- a/cluster/gce/upgrade.sh
+++ b/cluster/gce/upgrade.sh
@@ -39,6 +39,7 @@ function usage() {
   echo "  -M:  Upgrade master only"
   echo "  -N:  Upgrade nodes only"
   echo "  -P:  Node upgrade prerequisites only (create a new instance template)"
+  echo "  -c:  Upgrade NODE_UPGRADE_PARALLELISM nodes in parallel (default=1) within a single instance group. The MIGs themselves are dealt with serially."
   echo "  -o:  Use os distro specified in KUBE_NODE_OS_DISTRIBUTION for new nodes. Options include 'debian' or 'gci'"
   echo "  -l:  Use local(dev) binaries. This is only supported for master upgrades."
   echo ""
@@ -254,7 +255,7 @@ function setup-base-image() {
     source "${KUBE_ROOT}/cluster/gce/${NODE_OS_DISTRIBUTION}/node-helper.sh"
     # Reset the node image based on current os distro
     set-node-image
-fi
+  fi
 }
 
 # prepare-node-upgrade creates a new instance template suitable for upgrading
@@ -327,10 +328,105 @@ function upgrade-node-env() {
   fi
 }
 
+# Upgrades a single node.
+# $1: The name of the node
+#
+# Note: This is called multiple times from do-node-upgrade() in parallel, so it
+# must be thread-safe.
+function do-single-node-upgrade() {
+  local -r instance="$1"
+  instance_id=$(gcloud compute instances describe "${instance}" \
+    --format='get(id)' \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+  if [[ "${describe_rc}" != 0 ]]; then
+    echo "== FAILED to describe ${instance} =="
+    echo "${instance_id}"
+    return ${describe_rc}
+  fi
+
+  # Drain node
+  echo "== Draining ${instance}. == " >&2
+  "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
+    && drain_rc=$? || drain_rc=$?
+  if [[ "${drain_rc}" != 0 ]]; then
+    echo "== FAILED to drain ${instance} =="
+    return ${drain_rc}
+  fi
+
+  # Recreate instance
+  echo "== Recreating instance ${instance}. ==" >&2
+  recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
+  if [[ "${recreate_rc}" != 0 ]]; then
+    echo "== FAILED to recreate ${instance} =="
+    echo "${recreate}"
+    return ${recreate_rc}
+  fi
+
+  # Wait for instance to be recreated
+  echo "== Waiting for instance ${instance} to be recreated. ==" >&2
+  while true; do
+    new_instance_id=$(gcloud compute instances describe "${instance}" \
+      --format='get(id)' \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+    if [[ "${describe_rc}" != 0 ]]; then
+      echo "== FAILED to describe ${instance} =="
+      echo "${new_instance_id}"
+      echo "  (Will retry.)"
+    elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
+      echo -n .
+    else
+      echo "Instance ${instance} recreated."
+      break
+    fi
+    sleep 1
+  done
+
+  # Wait for k8s node object to reflect new instance id
+  echo "== Waiting for new node to be added to k8s. ==" >&2
+  while true; do
+    external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
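+    # .spec.externalID holds the numeric GCE instance id. Comparing it against
+    # the cached pre-upgrade id tells us whether the apiserver has seen the
+    # recreated VM yet: the old id means not yet, the new id means it is done.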
+    if [[ "${kubectl_rc}" != 0 ]]; then
+      echo "== FAILED to get node ${instance} =="
+      echo "${external_id}"
+      echo "  (Will retry.)"
+    elif [[ "${external_id}" == "${new_instance_id}" ]]; then
+      echo "Node ${instance} recreated."
+      break
+    elif [[ "${external_id}" == "${instance_id}" ]]; then
+      echo -n .
+    else
+      echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
+      echo "  (Will retry.)"
+    fi
+    sleep 1
+  done
+
+  # Wait for the node to not have SchedulingDisabled=True and also to have
+  # Ready=True.
+  echo "== Waiting for ${instance} to become ready. ==" >&2
+  while true; do
+    cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}')
+    ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}')
+    if [[ "${cordoned}" == 'True' ]]; then
+      echo "Node ${instance} is still not ready: SchedulingDisabled=${cordoned}"
+    elif [[ "${ready}" != 'True' ]]; then
+      echo "Node ${instance} is still not ready: Ready=${ready}"
+    else
+      echo "Node ${instance} Ready=${ready}"
+      break
+    fi
+    sleep 1
+  done
+}
+
 # Prereqs:
 # - prepare-node-upgrade should have been called successfully
 function do-node-upgrade() {
-  echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
+  echo "== Upgrading nodes to ${KUBE_VERSION} with max parallelism of ${node_upgrade_parallelism}. ==" >&2
   # Do the actual upgrade.
   # NOTE(zmerlynn): If you are changing this gcloud command, update
   #                 test/e2e/cluster_upgrade.go to match this EXACTLY.
@@ -362,95 +458,30 @@ function do-node-upgrade() {
       echo "${instances}"
       return ${list_instances_rc}
     fi
+
+    process_count_left=${node_upgrade_parallelism}
+    pids=()
+    ret_code_sum=0  # Should stay 0 in the loop iff all parallel node upgrades succeed.
     for instance in ${instances[@]}; do
-      # Cache instance id for later
-      instance_id=$(gcloud compute instances describe "${instance}" \
-        --format='get(id)' \
-        --project="${PROJECT}" \
-        --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
-      if [[ "${describe_rc}" != 0 ]]; then
-        echo "== FAILED to describe ${instance} =="
-        echo "${instance_id}"
-        return ${describe_rc}
-      fi
+      do-single-node-upgrade "${instance}" & pids+=("$!")
 
-      # Drain node
-      echo "== Draining ${instance}. == " >&2
-      "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
-        && drain_rc=$? || drain_rc=$?
-      if [[ "${drain_rc}" != 0 ]]; then
-        echo "== FAILED to drain ${instance} =="
-        return ${drain_rc}
-      fi
-
-      # Recreate instance
-      echo "== Recreating instance ${instance}. ==" >&2
-      recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
-        --project="${PROJECT}" \
-        --zone="${ZONE}" \
-        --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
-      if [[ "${recreate_rc}" != 0 ]]; then
-        echo "== FAILED to recreate ${instance} =="
-        echo "${recreate}"
-        return ${recreate_rc}
-      fi
-
-      # Wait for instance to be recreated
-      echo "== Waiting for instance ${instance} to be recreated. ==" >&2
-      while true; do
-        new_instance_id=$(gcloud compute instances describe "${instance}" \
-          --format='get(id)' \
-          --project="${PROJECT}" \
-          --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
-        if [[ "${describe_rc}" != 0 ]]; then
-          echo "== FAILED to describe ${instance} =="
-          echo "${new_instance_id}"
-          echo "  (Will retry.)"
-        elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
-          echo -n .
-        else
-          echo "Instance ${instance} recreated."
-          break
+      # We don't want to run more than ${node_upgrade_parallelism} upgrades at a time,
+      # so wait once we hit that many nodes. This isn't ideal, since one might take much
+      # longer than the others, but it should help.
+      process_count_left=$((process_count_left - 1))
+      if [[ ${process_count_left} -eq 0 || "${instance}" == "${instances[-1]}" ]]; then
+        # Wait for each of the parallel node upgrades to finish.
+        for pid in "${pids[@]}"; do
+          wait $pid
+          ret_code_sum=$(( ret_code_sum + $? ))
+        done
+        # Return if any of the parallel node upgrades failed.
+        if [[ ${ret_code_sum} != 0 ]]; then
+          echo "== Some of the ${node_upgrade_parallelism} parallel node upgrades failed. =="
+          return ${ret_code_sum}
         fi
-        sleep 1
-      done
-
-      # Wait for k8s node object to reflect new instance id
-      echo "== Waiting for new node to be added to k8s. ==" >&2
-      while true; do
-        external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
-        if [[ "${kubectl_rc}" != 0 ]]; then
-          echo "== FAILED to get node ${instance} =="
-          echo "${external_id}"
-          echo "  (Will retry.)"
-        elif [[ "${external_id}" == "${new_instance_id}" ]]; then
-          echo "Node ${instance} recreated."
-          break
-        elif [[ "${external_id}" == "${instance_id}" ]]; then
-          echo -n .
-        else
-          echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
-          echo "  (Will retry.)"
-        fi
-        sleep 1
-      done
-
-      # Wait for the node to not have SchedulingDisabled=True and also to have
-      # Ready=True.
-      echo "== Waiting for ${instance} to become ready. ==" >&2
-      while true; do
-        cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}')
-        ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}')
-        if [[ "${cordoned}" == 'True' ]]; then
-          echo "Node ${instance} is still not ready: SchedulingDisabled=${ready}"
-        elif [[ "${ready}" != 'True' ]]; then
-          echo "Node ${instance} is still not ready: Ready=${ready}"
-        else
-          echo "Node ${instance} Ready=${ready}"
-          break
-        fi
-        sleep 1
-      done
+        process_count_left=${node_upgrade_parallelism}
+        pids=()
+      fi
     done
   done
@@ -471,8 +502,9 @@ node_upgrade=true
 node_prereqs=false
 local_binaries=false
 env_os_distro=false
+node_upgrade_parallelism=1
 
-while getopts ":MNPlho" opt; do
+while getopts ":MNPlcho" opt; do
   case ${opt} in
     M)
      node_upgrade=false
@@ -486,6 +518,9 @@ while getopts ":MNPlho" opt; do
     l)
       local_binaries=true
       ;;
+    c)
+      node_upgrade_parallelism=${NODE_UPGRADE_PARALLELISM:-1}
+      ;;
     o)
       env_os_distro=true
      ;;
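
---

Note for reviewers: the batching logic added to do-node-upgrade() above boils
down to the following standalone sketch. It is only an illustration; the names
upgrade_one, items, and parallelism are stand-ins and not part of the patch.

    #!/usr/bin/env bash
    # Batch-and-wait parallelism: start at most ${parallelism} background jobs,
    # reap the whole batch, and abort if any job in the batch failed.
    # (${items[-1]} needs bash 4.3+.)
    parallelism=2
    items=(node-a node-b node-c node-d node-e)   # stand-in for ${instances[@]}

    upgrade_one() {   # stand-in for do-single-node-upgrade()
      echo "upgrading $1"
      sleep 1
    }

    pids=()
    count_left=${parallelism}
    ret_code_sum=0
    for item in "${items[@]}"; do
      upgrade_one "${item}" & pids+=("$!")
      count_left=$((count_left - 1))
      # Reap once a full batch is in flight, or on the last item.
      if [[ ${count_left} -eq 0 || "${item}" == "${items[-1]}" ]]; then
        for pid in "${pids[@]}"; do
          wait "${pid}"
          ret_code_sum=$((ret_code_sum + $?))
        done
        if [[ ${ret_code_sum} != 0 ]]; then
          echo "a job in this batch failed" >&2
          exit 1
        fi
        pids=()   # reset, or the next batch would re-wait already-reaped pids
        count_left=${parallelism}
      fi
    done

The trade-off, as the in-tree comment says, is that one slow node stalls its
entire batch; a work queue (e.g. xargs -P) would avoid that, at the cost of the
simple per-batch failure barrier used here.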