Add parallelism to GCE cluster upgrade
commit 5b5d3b9410 (parent a92123c530)
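This change splits the per-node upgrade logic out of do-node-upgrade() into a new do-single-node-upgrade() function and runs it as a background job for up to NODE_UPGRADE_PARALLELISM nodes at a time within each managed instance group; the instance groups themselves are still processed serially. The parallel path is opt-in via a new -c flag, and the default parallelism remains 1.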
@@ -39,6 +39,7 @@ function usage() {
   echo "  -M:  Upgrade master only"
   echo "  -N:  Upgrade nodes only"
   echo "  -P:  Node upgrade prerequisites only (create a new instance template)"
+  echo "  -c:  Upgrade NODE_UPGRADE_PARALLELISM nodes in parallel (default=1) within a single instance group. The MIGs themselves are dealt with serially."
   echo "  -o:  Use os distro specified in KUBE_NODE_OS_DISTRIBUTION for new nodes. Options include 'debian' or 'gci'"
   echo "  -l:  Use local(dev) binaries. This is only supported for master upgrades."
   echo ""
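As a usage sketch (the version string is illustrative), a node-only upgrade running four nodes at a time would look like:

    # -c itself takes no argument; the degree of parallelism is read from the
    # NODE_UPGRADE_PARALLELISM environment variable (default 1).
    NODE_UPGRADE_PARALLELISM=4 cluster/gce/upgrade.sh -N -c v1.6.4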
@@ -254,7 +255,7 @@ function setup-base-image() {
     source "${KUBE_ROOT}/cluster/gce/${NODE_OS_DISTRIBUTION}/node-helper.sh"
     # Reset the node image based on current os distro
     set-node-image
   fi
 }
 
 # prepare-node-upgrade creates a new instance template suitable for upgrading
@@ -327,10 +328,105 @@ function upgrade-node-env() {
   fi
 }
 
+# Upgrades a single node.
+# $1: The name of the node
+#
+# Note: This is called multiple times from do-node-upgrade() in parallel, so should be thread-safe.
+function do-single-node-upgrade() {
+  local -r instance="$1"
+  instance_id=$(gcloud compute instances describe "${instance}" \
+    --format='get(id)' \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+  if [[ "${describe_rc}" != 0 ]]; then
+    echo "== FAILED to describe ${instance} =="
+    echo "${instance_id}"
+    return ${describe_rc}
+  fi
+
+  # Drain node
+  echo "== Draining ${instance}. ==" >&2
+  "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
+    && drain_rc=$? || drain_rc=$?
+  if [[ "${drain_rc}" != 0 ]]; then
+    echo "== FAILED to drain ${instance} =="
+    return ${drain_rc}
+  fi
+
+  # Recreate instance
+  echo "== Recreating instance ${instance}. ==" >&2
+  recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
+  if [[ "${recreate_rc}" != 0 ]]; then
+    echo "== FAILED to recreate ${instance} =="
+    echo "${recreate}"
+    return ${recreate_rc}
+  fi
+
+  # Wait for instance to be recreated
+  echo "== Waiting for instance ${instance} to be recreated. ==" >&2
+  while true; do
+    new_instance_id=$(gcloud compute instances describe "${instance}" \
+      --format='get(id)' \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+    if [[ "${describe_rc}" != 0 ]]; then
+      echo "== FAILED to describe ${instance} =="
+      echo "${new_instance_id}"
+      echo "  (Will retry.)"
+    elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
+      echo -n .
+    else
+      echo "Instance ${instance} recreated."
+      break
+    fi
+    sleep 1
+  done
+
+  # Wait for k8s node object to reflect new instance id
+  echo "== Waiting for new node to be added to k8s. ==" >&2
+  while true; do
+    external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
+    if [[ "${kubectl_rc}" != 0 ]]; then
+      echo "== FAILED to get node ${instance} =="
+      echo "${external_id}"
+      echo "  (Will retry.)"
+    elif [[ "${external_id}" == "${new_instance_id}" ]]; then
+      echo "Node ${instance} recreated."
+      break
+    elif [[ "${external_id}" == "${instance_id}" ]]; then
+      echo -n .
+    else
+      echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
+      echo "  (Will retry.)"
+    fi
+    sleep 1
+  done
+
+  # Wait for the node to not have SchedulingDisabled=True and also to have
+  # Ready=True.
+  echo "== Waiting for ${instance} to become ready. ==" >&2
+  while true; do
+    cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}')
+    ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}')
+    if [[ "${cordoned}" == 'True' ]]; then
+      echo "Node ${instance} is still not ready: SchedulingDisabled=${cordoned}"
+    elif [[ "${ready}" != 'True' ]]; then
+      echo "Node ${instance} is still not ready: Ready=${ready}"
+    else
+      echo "Node ${instance} Ready=${ready}"
+      break
+    fi
+    sleep 1
+  done
+}
+
 # Prereqs:
 # - prepare-node-upgrade should have been called successfully
 function do-node-upgrade() {
-  echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
+  echo "== Upgrading nodes to ${KUBE_VERSION} with max parallelism of ${node_upgrade_parallelism}. ==" >&2
   # Do the actual upgrade.
   # NOTE(zmerlynn): If you are changing this gcloud command, update
   #                 test/e2e/cluster_upgrade.go to match this EXACTLY.
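The new helper repeatedly uses the `cmd && rc=$? || rc=$?` idiom to capture an exit status without aborting under `set -o errexit`. A minimal, self-contained sketch of the idiom (the failing command is illustrative):

    #!/usr/bin/env bash
    set -o errexit
    # A failing command inside an && / || list does not trigger errexit;
    # the || branch records the non-zero status so the caller can branch on it.
    out=$(ls /no/such/path 2>&1) && rc=$? || rc=$?
    echo "command exited with rc=${rc}; output was: ${out}"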
@@ -362,95 +458,30 @@ function do-node-upgrade() {
       echo "${instances}"
       return ${list_instances_rc}
     fi
+
+    process_count_left=${node_upgrade_parallelism}
+    pids=()
+    ret_code_sum=0  # Should stay 0 in the loop iff all parallel node upgrades succeed.
     for instance in ${instances[@]}; do
-      # Cache instance id for later
-      instance_id=$(gcloud compute instances describe "${instance}" \
-        --format='get(id)' \
-        --project="${PROJECT}" \
-        --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
-      if [[ "${describe_rc}" != 0 ]]; then
-        echo "== FAILED to describe ${instance} =="
-        echo "${instance_id}"
-        return ${describe_rc}
-      fi
+      do-single-node-upgrade "${instance}" & pids+=("$!")
 
-      # Drain node
-      echo "== Draining ${instance}. ==" >&2
-      "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
-        && drain_rc=$? || drain_rc=$?
-      if [[ "${drain_rc}" != 0 ]]; then
-        echo "== FAILED to drain ${instance} =="
-        return ${drain_rc}
-      fi
-
-      # Recreate instance
-      echo "== Recreating instance ${instance}. ==" >&2
-      recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
-        --project="${PROJECT}" \
-        --zone="${ZONE}" \
-        --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
-      if [[ "${recreate_rc}" != 0 ]]; then
-        echo "== FAILED to recreate ${instance} =="
-        echo "${recreate}"
-        return ${recreate_rc}
-      fi
-
-      # Wait for instance to be recreated
-      echo "== Waiting for instance ${instance} to be recreated. ==" >&2
-      while true; do
-        new_instance_id=$(gcloud compute instances describe "${instance}" \
-          --format='get(id)' \
-          --project="${PROJECT}" \
-          --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
-        if [[ "${describe_rc}" != 0 ]]; then
-          echo "== FAILED to describe ${instance} =="
-          echo "${new_instance_id}"
-          echo "  (Will retry.)"
-        elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
-          echo -n .
-        else
-          echo "Instance ${instance} recreated."
-          break
-        fi
-        sleep 1
-      done
-
-      # Wait for k8s node object to reflect new instance id
-      echo "== Waiting for new node to be added to k8s. ==" >&2
-      while true; do
-        external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
-        if [[ "${kubectl_rc}" != 0 ]]; then
-          echo "== FAILED to get node ${instance} =="
-          echo "${external_id}"
-          echo "  (Will retry.)"
-        elif [[ "${external_id}" == "${new_instance_id}" ]]; then
-          echo "Node ${instance} recreated."
-          break
-        elif [[ "${external_id}" == "${instance_id}" ]]; then
-          echo -n .
-        else
-          echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
-          echo "  (Will retry.)"
-        fi
-        sleep 1
-      done
-
-      # Wait for the node to not have SchedulingDisabled=True and also to have
-      # Ready=True.
-      echo "== Waiting for ${instance} to become ready. ==" >&2
-      while true; do
-        cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}')
-        ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}')
-        if [[ "${cordoned}" == 'True' ]]; then
-          echo "Node ${instance} is still not ready: SchedulingDisabled=${ready}"
-        elif [[ "${ready}" != 'True' ]]; then
-          echo "Node ${instance} is still not ready: Ready=${ready}"
-        else
-          echo "Node ${instance} Ready=${ready}"
-          break
-        fi
-        sleep 1
-      done
+
+      # We don't want to run more than ${node_upgrade_parallelism} upgrades at a time,
+      # so wait once we hit that many nodes. This isn't ideal, since one might take much
+      # longer than the others, but it should help.
+      process_count_left=$((process_count_left - 1))
+      if [[ process_count_left -eq 0 || "${instance}" == "${instances[-1]}" ]]; then
+        # Wait for each of the parallel node upgrades to finish.
+        for pid in "${pids[@]}"; do
+          wait $pid
+          ret_code_sum=$(( ret_code_sum + $? ))
+        done
+        # Return an error if at least one of the node upgrades failed.
+        if [[ ${ret_code_sum} != 0 ]]; then
+          echo "== Some of the ${node_upgrade_parallelism} parallel node upgrades failed. =="
+          return ${ret_code_sum}
+        fi
+        process_count_left=${node_upgrade_parallelism}
+      fi
     done
   done
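Note that the batching above waits for an entire group of background jobs before starting more, so one slow node can stall its whole batch, as the comment concedes. A minimal standalone sketch of the same fork-then-reap pattern, with illustrative names and a sleep standing in for the real upgrade:

    #!/usr/bin/env bash
    parallelism=3
    pids=()
    failures=0
    for node in node-1 node-2 node-3 node-4 node-5; do
      ( sleep 1; echo "upgraded ${node}" ) & pids+=("$!")
      if (( ${#pids[@]} == parallelism )); then
        for pid in "${pids[@]}"; do
          wait "${pid}" || failures=$((failures + 1))
        done
        pids=()   # start the next batch with a clean slate
      fi
    done
    for pid in "${pids[@]}"; do   # reap a final, partial batch
      wait "${pid}" || failures=$((failures + 1))
    done
    echo "${failures} upgrade(s) failed"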
@@ -471,8 +502,9 @@ node_upgrade=true
 node_prereqs=false
 local_binaries=false
 env_os_distro=false
+node_upgrade_parallelism=1
 
-while getopts ":MNPlho" opt; do
+while getopts ":MNPlcho" opt; do
   case ${opt} in
     M)
       node_upgrade=false
@@ -486,6 +518,9 @@ while getopts ":MNPlho" opt; do
     l)
      local_binaries=true
      ;;
+    c)
+      node_upgrade_parallelism=${NODE_UPGRADE_PARALLELISM:-1}
+      ;;
     o)
      env_os_distro=true
      ;;