Add parallelism to GCE cluster upgrade

This commit is contained in:
Shyam Jeedigunta 2017-07-03 14:33:28 +02:00
parent a92123c530
commit 5b5d3b9410

View File

@@ -39,6 +39,7 @@ function usage() {
echo " -M: Upgrade master only"
echo " -N: Upgrade nodes only"
echo " -P: Node upgrade prerequisites only (create a new instance template)"
echo " -c: Upgrade NODE_UPGRADE_PARALLELISM nodes in parallel (default=1) within a single instance group. The MIGs themselves are handled serially."
echo " -o: Use os distro specified in KUBE_NODE_OS_DISTRIBUTION for new nodes. Options include 'debian' or 'gci'"
echo " -l: Use local(dev) binaries. This is only supported for master upgrades."
echo ""
@@ -254,7 +255,7 @@ function setup-base-image() {
source "${KUBE_ROOT}/cluster/gce/${NODE_OS_DISTRIBUTION}/node-helper.sh"
# Reset the node image based on current os distro
set-node-image
fi
fi
}
# prepare-node-upgrade creates a new instance template suitable for upgrading
@@ -327,43 +328,12 @@ function upgrade-node-env() {
fi
}
# Prereqs:
# - prepare-node-upgrade should have been called successfully
function do-node-upgrade() {
echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
# Do the actual upgrade.
# NOTE(zmerlynn): If you are changing this gcloud command, update
# test/e2e/cluster_upgrade.go to match this EXACTLY.
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
local old_templates=()
local updates=()
for group in ${INSTANCE_GROUPS[@]}; do
old_templates+=($(gcloud compute instance-groups managed list \
--project="${PROJECT}" \
--zones="${ZONE}" \
--regexp="${group}" \
--format='value(instanceTemplate)' || true))
set_instance_template_out=$(gcloud compute instance-groups managed set-instance-template "${group}" \
--template="${template_name}" \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1) && set_instance_template_rc=$? || set_instance_template_rc=$?
if [[ "${set_instance_template_rc}" != 0 ]]; then
echo "== FAILED to set-instance-template for ${group} to ${template_name} =="
echo "${set_instance_template_out}"
return ${set_instance_template_rc}
fi
instances=()
instances+=($(gcloud compute instance-groups managed list-instances "${group}" \
--format='value(instance)' \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1)) && list_instances_rc=$? || list_instances_rc=$?
if [[ "${list_instances_rc}" != 0 ]]; then
echo "== FAILED to list instances in group ${group} =="
echo "${instances}"
return ${list_instances_rc}
fi
for instance in ${instances[@]}; do
# Cache instance id for later
# Upgrades a single node.
# $1: The name of the node
#
# Note: This is called multiple times from do-node-upgrade() in parallel, so should be thread-safe.
function do-single-node-upgrade() {
local -r instance="$1"
instance_id=$(gcloud compute instances describe "${instance}" \
--format='get(id)' \
--project="${PROJECT}" \
@@ -451,6 +421,67 @@ function do-node-upgrade() {
fi
sleep 1
done
}
# Prereqs:
# - prepare-node-upgrade should have been called successfully
function do-node-upgrade() {
echo "== Upgrading nodes to ${KUBE_VERSION} with max parallelism of ${node_upgrade_parallelism}. ==" >&2
# Do the actual upgrade.
# NOTE(zmerlynn): If you are changing this gcloud command, update
# test/e2e/cluster_upgrade.go to match this EXACTLY.
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
local old_templates=()
local updates=()
for group in ${INSTANCE_GROUPS[@]}; do
old_templates+=($(gcloud compute instance-groups managed list \
--project="${PROJECT}" \
--zones="${ZONE}" \
--regexp="${group}" \
--format='value(instanceTemplate)' || true))
set_instance_template_out=$(gcloud compute instance-groups managed set-instance-template "${group}" \
--template="${template_name}" \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1) && set_instance_template_rc=$? || set_instance_template_rc=$?
if [[ "${set_instance_template_rc}" != 0 ]]; then
echo "== FAILED to set-instance-template for ${group} to ${template_name} =="
echo "${set_instance_template_out}"
return ${set_instance_template_rc}
fi
instances=()
instances+=($(gcloud compute instance-groups managed list-instances "${group}" \
--format='value(instance)' \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1)) && list_instances_rc=$? || list_instances_rc=$?
if [[ "${list_instances_rc}" != 0 ]]; then
echo "== FAILED to list instances in group ${group} =="
echo "${instances}"
return ${list_instances_rc}
fi
process_count_left=${node_upgrade_parallelism}
pids=()
ret_code_sum=0 # Should stay 0 in the loop iff all parallel node upgrades succeed.
for instance in ${instances[@]}; do
do-single-node-upgrade "${instance}" & pids+=("$!")
# We don't want to run more than ${node_upgrade_parallelism} upgrades at a time,
# so wait once we hit that many nodes. This isn't ideal, since one might take much
# longer than the others, but it should help.
process_count_left=$((process_count_left - 1))
if [[ process_count_left -eq 0 || "${instance}" == "${instances[-1]}" ]]; then
# Wait for each of the parallel node upgrades to finish.
for pid in "${pids[@]}"; do
wait $pid
ret_code_sum=$(( ret_code_sum + $? ))
done
# Return even if at least one of the node upgrades failed.
if [[ ${ret_code_sum} != 0 ]]; then
echo "== Some of the ${node_upgrade_parallelism} parallel node upgrades failed. =="
return ${ret_code_sum}
fi
process_count_left=${node_upgrade_parallelism}
fi
done
done
@@ -471,8 +502,9 @@ node_upgrade=true
node_prereqs=false
local_binaries=false
env_os_distro=false
node_upgrade_parallelism=1
while getopts ":MNPlho" opt; do
while getopts ":MNPlcho" opt; do
case ${opt} in
M)
node_upgrade=false
@@ -486,6 +518,9 @@ while getopts ":MNPlho" opt; do
l)
local_binaries=true
;;
c)
node_upgrade_parallelism=${NODE_UPGRADE_PARALLELISM:-1}
;;
o)
env_os_distro=true
;;