diff --git a/services/controlplane.go b/services/controlplane.go
index 645609f4..62adec85 100644
--- a/services/controlplane.go
+++ b/services/controlplane.go
@@ -72,7 +72,10 @@ func UpgradeControlPlaneNodes(ctx context.Context, kubeClient *kubernetes.Client
 		drainHelper = getDrainHelper(kubeClient, *upgradeStrategy)
 		log.Infof(ctx, "[%s] Parameters provided to drain command: %#v", ControlRole, fmt.Sprintf("Force: %v, IgnoreAllDaemonSets: %v, DeleteLocalData: %v, Timeout: %v, GracePeriodSeconds: %v", drainHelper.Force, drainHelper.IgnoreAllDaemonSets, drainHelper.DeleteLocalData, drainHelper.Timeout, drainHelper.GracePeriodSeconds))
 	}
-	maxUnavailable = resetMaxUnavailable(maxUnavailable, len(inactiveHosts))
+	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), ControlRole)
+	if err != nil {
+		return errMsgMaxUnavailableNotFailed, err
+	}
 	hostsFailedToUpgrade, err := processControlPlaneForUpgrade(ctx, kubeClient, controlHosts, localConnDialerFactory, prsMap, cpNodePlanMap, updateWorkersOnly, alpineImage, certMap, upgradeStrategy, newHosts, inactiveHosts, maxUnavailable, drainHelper)
 	if err != nil {
diff --git a/services/node_util.go b/services/node_util.go
index abd44f46..d468c4a2 100644
--- a/services/node_util.go
+++ b/services/node_util.go
@@ -104,18 +104,21 @@ func CalculateMaxUnavailable(maxUnavailableVal string, numHosts int) (int, error
 	return maxUnavailable, nil
 }
 
-func resetMaxUnavailable(maxUnavailable, lenInactiveHosts int) int {
+func resetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
 	if maxUnavailable > WorkerThreads {
 		/* upgrading a large number of nodes in parallel leads to a large number of goroutines, which has led to errors regarding too many open sockets
 		Because of this RKE switched to using workerpools. 50 workerthreads has been sufficient to optimize rke up, upgrading at most 50 nodes in parallel.
 		So the user configurable maxUnavailable will be respected only as long as it's less than 50 and capped at 50 */
 		maxUnavailable = WorkerThreads
-		logrus.Info("Resetting maxUnavailable to 50, to avoid issues related to upgrading large number of nodes in parallel")
+		logrus.Infof("Resetting %s to 50, to avoid issues related to upgrading large number of nodes in parallel", "max_unavailable_"+component)
 	}
 	if lenInactiveHosts > 0 {
+		if maxUnavailable == lenInactiveHosts {
+			return 0, fmt.Errorf("cannot proceed with upgrade of %s since %v host(s) are found to be inactive prior to upgrade", component, lenInactiveHosts)
+		}
 		maxUnavailable -= lenInactiveHosts
-		logrus.Infof("Resetting maxUnavailable to %v since %v host(s) are found to be inactive/unavailable prior to upgrade", maxUnavailable, lenInactiveHosts)
+		logrus.Infof("Resetting %s to %v since %v host(s) are found to be inactive prior to upgrade", "max_unavailable_"+component, maxUnavailable, lenInactiveHosts)
 	}
-	return maxUnavailable
+	return maxUnavailable, nil
 }
diff --git a/services/workerplane.go b/services/workerplane.go
index 06038e4e..a9039b3f 100644
--- a/services/workerplane.go
+++ b/services/workerplane.go
@@ -55,7 +55,10 @@ func RunWorkerPlane(ctx context.Context, allHosts []*hosts.Host, localConnDialer
 func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *kubernetes.Clientset, mixedRolesHosts []*hosts.Host, workerOnlyHosts []*hosts.Host, inactiveHosts map[string]bool, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage string, upgradeStrategy *v3.NodeUpgradeStrategy, newHosts map[string]bool, maxUnavailable int) (string, error) {
 	log.Infof(ctx, "[%s] Upgrading Worker Plane..", WorkerRole)
 	var errMsgMaxUnavailableNotFailed string
-	maxUnavailable = resetMaxUnavailable(maxUnavailable, len(inactiveHosts))
+	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), WorkerRole)
+	if err != nil {
+		return errMsgMaxUnavailableNotFailed, err
+	}
 	updateNewHostsList(kubeClient, append(mixedRolesHosts, workerOnlyHosts...), newHosts)
 	if len(mixedRolesHosts) > 0 {
 		log.Infof(ctx, "First checking and processing worker components for upgrades on nodes with etcd role one at a time")
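
For reference, below is a minimal table-driven sketch of how the new resetMaxUnavailable signature and its error path could be exercised. It is not part of this change: the test name and cases are illustrative, and it assumes the test lives in the services package next to the existing WorkerThreads (50, per the comment above) and WorkerRole identifiers.

package services

import "testing"

// TestResetMaxUnavailable is a hypothetical sketch covering the three behaviors
// touched by this diff: capping at WorkerThreads, subtracting inactive hosts,
// and erroring out when inactive hosts consume the whole maxUnavailable budget.
func TestResetMaxUnavailable(t *testing.T) {
	tests := []struct {
		name             string
		maxUnavailable   int
		lenInactiveHosts int
		want             int
		wantErr          bool
	}{
		// values above the worker pool size are capped at WorkerThreads (50)
		{"capped at WorkerThreads", 80, 0, WorkerThreads, false},
		// inactive hosts are subtracted from the budget
		{"inactive hosts subtracted", 10, 3, 7, false},
		// budget fully consumed by inactive hosts: upgrade must not proceed
		{"all unavailable hosts inactive", 3, 3, 0, true},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := resetMaxUnavailable(tt.maxUnavailable, tt.lenInactiveHosts, WorkerRole)
			if (err != nil) != tt.wantErr {
				t.Fatalf("unexpected error state: %v", err)
			}
			if !tt.wantErr && got != tt.want {
				t.Errorf("got %d, want %d", got, tt.want)
			}
		})
	}
}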