1
0
mirror of https://github.com/rancher/rke.git synced 2025-09-16 15:10:12 +00:00

Attempt upgrade on NotReady hosts

This commit is contained in:
rajashree
2020-02-26 13:33:22 -08:00
parent ca3a3b1814
commit e27a05f8b1
10 changed files with 313 additions and 177 deletions

View File

@@ -17,7 +17,6 @@ import (
"github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"
apierrors "k8s.io/apimachinery/pkg/api/errors"
k8sutil "k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes"
"k8s.io/kubectl/pkg/drain"
)
@@ -56,19 +55,7 @@ func RunWorkerPlane(ctx context.Context, allHosts []*hosts.Host, localConnDialer
func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *kubernetes.Clientset, mixedRolesHosts []*hosts.Host, workerOnlyHosts []*hosts.Host, inactiveHosts map[string]bool, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage string, upgradeStrategy *v3.NodeUpgradeStrategy, newHosts map[string]bool, maxUnavailable int) (string, error) {
log.Infof(ctx, "[%s] Upgrading Worker Plane..", WorkerRole)
var errMsgMaxUnavailableNotFailed string
if maxUnavailable > WorkerThreads {
/* upgrading a large number of nodes in parallel leads to a large number of goroutines, which has led to errors regarding too many open sockets
Because of this RKE switched to using workerpools. 50 workerthreads has been sufficient to optimize rke up, upgrading at most 50 nodes in parallel.
So the user configurable maxUnavailable will be respected only as long as it's less than 50 and capped at 50 */
maxUnavailable = WorkerThreads
logrus.Info("Setting maxUnavailable to 50, to avoid issues related to upgrading large number of nodes in parallel")
}
if len(inactiveHosts) > 0 {
maxUnavailable -= len(inactiveHosts)
logrus.Infof("Resetting maxUnavailable to %v since %v host(s) are found to be inactive/unavailable prior to upgrade", maxUnavailable, len(inactiveHosts))
}
maxUnavailable = resetMaxUnavailable(maxUnavailable, len(inactiveHosts))
updateNewHostsList(kubeClient, append(mixedRolesHosts, workerOnlyHosts...), newHosts)
if len(mixedRolesHosts) > 0 {
log.Infof(ctx, "First checking and processing worker components for upgrades on nodes with etcd role one at a time")
@@ -95,23 +82,6 @@ func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *ku
return errMsgMaxUnavailableNotFailed, nil
}
func CalculateMaxUnavailable(maxUnavailableVal string, numHosts int) (int, error) {
// if maxUnavailable is given in percent, round down
maxUnavailableParsed := k8sutil.Parse(maxUnavailableVal)
logrus.Debugf("Provided value for maxUnavailable: %v", maxUnavailableParsed)
maxUnavailable, err := k8sutil.GetValueFromIntOrPercent(&maxUnavailableParsed, numHosts, false)
if err != nil {
logrus.Errorf("Unable to parse max_unavailable, should be a number or percentage of nodes, error: %v", err)
return 0, err
}
if maxUnavailable == 0 {
// In case there is only one node and rounding down maxUnvailable percentage led to 0
maxUnavailable = 1
}
logrus.Debugf("Parsed value of maxUnavailable: %v", maxUnavailable)
return maxUnavailable, nil
}
func updateNewHostsList(kubeClient *kubernetes.Clientset, allHosts []*hosts.Host, newHosts map[string]bool) {
for _, h := range allHosts {
_, err := k8s.GetNode(kubeClient, h.HostnameOverride)
@@ -159,7 +129,13 @@ func processWorkerPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.Cl
}
continue
}
nodes, err := getNodeListForUpgrade(kubeClient, &hostsFailed, newHosts, inactiveHosts)
if err := CheckNodeReady(kubeClient, runHost, WorkerRole); err != nil {
errList = append(errList, err)
hostsFailed.Store(runHost.HostnameOverride, true)
hostsFailedToUpgrade <- runHost.HostnameOverride
break
}
nodes, err := getNodeListForUpgrade(kubeClient, &hostsFailed, newHosts, inactiveHosts, WorkerRole)
if err != nil {
errList = append(errList, err)
}
@@ -214,9 +190,6 @@ func processWorkerPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.Cl
func upgradeWorkerHost(ctx context.Context, kubeClient *kubernetes.Clientset, runHost *hosts.Host, drainFlag bool, drainHelper drain.Helper,
localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool,
alpineImage string) error {
if err := checkNodeReady(kubeClient, runHost, WorkerRole); err != nil {
return err
}
// cordon and drain
if err := cordonAndDrainNode(kubeClient, runHost, drainFlag, drainHelper, WorkerRole); err != nil {
return err
@@ -226,7 +199,7 @@ func upgradeWorkerHost(ctx context.Context, kubeClient *kubernetes.Clientset, ru
return err
}
// consider upgrade done when kubeclient lists node as ready
if err := checkNodeReady(kubeClient, runHost, WorkerRole); err != nil {
if err := CheckNodeReady(kubeClient, runHost, WorkerRole); err != nil {
return err
}
// uncordon node