From 1ecf6effbfc8aba97b48b6de9c31d2e7ab5a3c25 Mon Sep 17 00:00:00 2001 From: rajashree Date: Fri, 20 Mar 2020 11:11:59 -0700 Subject: [PATCH] Reconcile workerplane for NotReady control hosts --- cluster/cluster.go | 30 ++++++++++++++++++++++++++---- services/controlplane.go | 6 +++++- services/workerplane.go | 4 ++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/cluster/cluster.go b/cluster/cluster.go index 4abfc766..00647981 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -193,16 +193,32 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete if len(notReadyHostNames) > 0 { // attempt upgrade on NotReady hosts without respecting max_unavailable_controlplane logrus.Infof("Attempting upgrade of controlplane components on following hosts in NotReady status: %v", strings.Join(notReadyHostNames, ",")) - services.RunControlPlane(ctx, notReadyHosts, + err = services.RunControlPlane(ctx, notReadyHosts, c.LocalConnDialerFactory, c.PrivateRegistriesMap, cpNodePlanMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, c.Certificates) + if err != nil { + logrus.Errorf("Failed to upgrade controlplane components on NotReady hosts, error: %v", err) + } + err = services.RunWorkerPlane(ctx, notReadyHosts, + c.LocalConnDialerFactory, + c.PrivateRegistriesMap, + cpNodePlanMap, + c.Certificates, + c.UpdateWorkersOnly, + c.SystemImages.Alpine) + if err != nil { + logrus.Errorf("Failed to upgrade worker components on NotReady hosts, error: %v", err) + } // Calling CheckNodeReady wil give some time for nodes to get in Ready state for _, host := range notReadyHosts { - services.CheckNodeReady(kubeClient, host, services.ControlRole) + err = services.CheckNodeReady(kubeClient, host, services.ControlRole) + if err != nil { + logrus.Errorf("Host %v failed to report Ready status with error: %v", host.HostnameOverride, err) + } } } // rolling upgrade respecting maxUnavailable @@ -294,16 +310,22 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes if len(notReadyHostNames) > 0 { // attempt upgrade on NotReady hosts without respecting max_unavailable_worker logrus.Infof("Attempting upgrade of worker components on following hosts in NotReady status: %v", strings.Join(notReadyHostNames, ",")) - services.RunWorkerPlane(ctx, notReadyHosts, + err = services.RunWorkerPlane(ctx, notReadyHosts, c.LocalConnDialerFactory, c.PrivateRegistriesMap, workerNodePlanMap, c.Certificates, c.UpdateWorkersOnly, c.SystemImages.Alpine) + if err != nil { + logrus.Errorf("Failed to upgrade worker components on NotReady hosts, error: %v", err) + } // Calling CheckNodeReady wil give some time for nodes to get in Ready state for _, host := range notReadyHosts { - services.CheckNodeReady(kubeClient, host, services.WorkerRole) + err = services.CheckNodeReady(kubeClient, host, services.WorkerRole) + if err != nil { + logrus.Errorf("Host %v failed to report Ready status with error: %v", host.HostnameOverride, err) + } } } diff --git a/services/controlplane.go b/services/controlplane.go index ecfd2db8..884bb43c 100644 --- a/services/controlplane.go +++ b/services/controlplane.go @@ -56,7 +56,7 @@ func UpgradeControlPlaneNodes(ctx context.Context, kubeClient *kubernetes.Client } var errMsgMaxUnavailableNotFailed string var drainHelper drain.Helper - log.Infof(ctx, "[%s] Processing controlplane hosts for upgrade one at a time", ControlRole) + log.Infof(ctx, "[%s] Processing controlplane hosts for upgrade %v at a time", ControlRole, maxUnavailable) if len(newHosts) > 0 { var nodes []string for _, host := range controlHosts { @@ -163,6 +163,10 @@ func processControlPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.C } if !controlPlaneUpgradable && !workerPlaneUpgradable { log.Infof(ctx, "Upgrade not required for controlplane and worker components of host %v", runHost.HostnameOverride) + if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, false); err != nil { + // This node didn't undergo an upgrade, so RKE will only log any error after uncordoning it and won't count this in maxUnavailable + logrus.Errorf("[controlplane] Failed to uncordon node %v, error: %v", runHost.HostnameOverride, err) + } continue } if err := upgradeControlHost(ctx, kubeClient, runHost, upgradeStrategy.Drain, drainHelper, localConnDialerFactory, prsMap, cpNodePlanMap, updateWorkersOnly, alpineImage, certMap, controlPlaneUpgradable, workerPlaneUpgradable); err != nil { diff --git a/services/workerplane.go b/services/workerplane.go index 2c828234..9001667d 100644 --- a/services/workerplane.go +++ b/services/workerplane.go @@ -163,6 +163,10 @@ func processWorkerPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.Cl } if !upgradable { logrus.Infof("[workerplane] Upgrade not required for worker components of host %v", runHost.HostnameOverride) + if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, false); err != nil { + // This node didn't undergo an upgrade, so RKE will only log any error after uncordoning it and won't count this in maxUnavailable + logrus.Errorf("[workerplane] Failed to uncordon node %v, error: %v", runHost.HostnameOverride, err) + } continue } if err := upgradeWorkerHost(ctx, kubeClient, runHost, upgradeStrategy.Drain, drainHelper, localConnDialerFactory, prsMap, workerNodePlanMap, certMap, updateWorkersOnly, alpineImage); err != nil {