From bb6873ce489a2c0aad49f836e772ebf082278e76 Mon Sep 17 00:00:00 2001
From: rajashree
Date: Sat, 7 Mar 2020 12:11:41 -0800
Subject: [PATCH] Addresses the following issues:

1. Compare maxUnavailable with powered-off hosts before attempting to
   reconcile NotReady hosts.
2. Include powered-off hosts among the failed hosts so the controlplane
   upgrade returns an error.
3. Change the CoreDNS upgrade strategy. With the addons changes it had been
   switched to the Kubernetes deployment default of 25% maxUnavailable and
   25% maxSurge; this commit changes it back to a maxUnavailable of 1 to
   avoid DNS addon upgrade issues.
---
 cluster/cluster.go       | 10 ++++++++++
 cluster/defaults.go      | 42 +++++++++++++++++++++++++++++++++++++++-
 services/controlplane.go | 20 ++++++++++++-------
 services/node_util.go    |  2 +-
 services/workerplane.go  |  4 ----
 5 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/cluster/cluster.go b/cluster/cluster.go
index bb7d1333..e55ee966 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -166,6 +166,7 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 	inactiveHosts := make(map[string]bool)
 	var controlPlaneHosts, notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error
 
 	for _, host := range c.InactiveHosts {
 		// include only hosts with controlplane role
@@ -173,6 +174,10 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForControlNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForControlNodes, len(inactiveHosts), services.ControlRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range c.ControlPlaneHosts {
 		if !c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			controlPlaneHosts = append(controlPlaneHosts, host)
@@ -265,6 +270,7 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 	inactiveHosts := make(map[string]bool)
 	var notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error
 
 	for _, host := range c.InactiveHosts {
 		// if host has controlplane role, it already has worker components upgraded
@@ -272,6 +278,10 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForWorkerNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForWorkerNodes, len(inactiveHosts), services.WorkerRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range append(etcdAndWorkerHosts, workerOnlyHosts...) {
 		if c.NewHosts[host.HostnameOverride] {
 			continue
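Note: the two call sites above now compare the unavailability budget against powered-off hosts up front (issue 1 in the commit message). The snippet below is a minimal, standalone sketch of that adjustment, assuming services.ResetMaxUnavailable subtracts the inactive-host count and errors out once no budget remains; the real helper lives in services/node_util.go and is only exported (renamed) by this patch, so treat the logic here as illustrative rather than the actual implementation.

package main

import (
	"fmt"
)

// resetMaxUnavailableSketch mirrors the intent of services.ResetMaxUnavailable:
// powered-off (inactive) hosts already consume part of the unavailability
// budget, so they are subtracted before any NotReady reconciliation starts.
// Illustrative sketch only, not the RKE implementation.
func resetMaxUnavailableSketch(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
	if lenInactiveHosts == 0 {
		return maxUnavailable, nil
	}
	if lenInactiveHosts >= maxUnavailable {
		// Every allowed-unavailable slot is taken by hosts that cannot be
		// contacted, so there is no room left to take healthy nodes down.
		return 0, fmt.Errorf("cannot proceed with upgrade of %s: %v host(s) cannot be reached and maxUnavailable is %v",
			component, lenInactiveHosts, maxUnavailable)
	}
	return maxUnavailable - lenInactiveHosts, nil
}

func main() {
	// Example: a maxUnavailable of 3 with 1 powered-off controlplane host
	// leaves a budget of 2 hosts that may be upgraded in parallel.
	remaining, err := resetMaxUnavailableSketch(3, 1, "controlplane")
	fmt.Println(remaining, err) // 2 <nil>
}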
diff --git a/cluster/defaults.go b/cluster/defaults.go
index 30e96df0..a673f890 100644
--- a/cluster/defaults.go
+++ b/cluster/defaults.go
@@ -603,7 +603,7 @@ func GetExternalFlags(local, updateOnly, disablePortCheck bool, configDir, clust
 func (c *Cluster) setAddonsDefaults() {
 	c.Ingress.UpdateStrategy = setDaemonsetAddonDefaults(c.Ingress.UpdateStrategy)
 	c.Network.UpdateStrategy = setDaemonsetAddonDefaults(c.Network.UpdateStrategy)
-	c.DNS.UpdateStrategy = setDeploymentAddonDefaults(c.DNS.UpdateStrategy)
+	c.DNS.UpdateStrategy = setDNSDeploymentAddonDefaults(c.DNS.UpdateStrategy, c.DNS.Provider)
 	if c.DNS.LinearAutoscalerParams == nil {
 		c.DNS.LinearAutoscalerParams = &DefaultClusterProportionalAutoscalerLinearParams
 	}
@@ -638,3 +638,43 @@ func setDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy) *apps
 	}
 	return updateStrategy
 }
+
+func setDNSDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy, dnsProvider string) *appsv1.DeploymentStrategy {
+	var (
+		coreDNSMaxUnavailable, coreDNSMaxSurge = intstr.FromInt(1), intstr.FromInt(0)
+		kubeDNSMaxSurge, kubeDNSMaxUnavailable = intstr.FromString("10%"), intstr.FromInt(0)
+	)
+	if updateStrategy != nil && updateStrategy.Type != appsv1.RollingUpdateDeploymentStrategyType {
+		return updateStrategy
+	}
+	switch dnsProvider {
+	case CoreDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &coreDNSMaxUnavailable,
+					MaxSurge:       &coreDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxUnavailable == nil {
+			updateStrategy.RollingUpdate.MaxUnavailable = &coreDNSMaxUnavailable
+		}
+	case KubeDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &kubeDNSMaxUnavailable,
+					MaxSurge:       &kubeDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxSurge == nil {
+			updateStrategy.RollingUpdate.MaxSurge = &kubeDNSMaxSurge
+		}
+	}
+
+	return updateStrategy
+}
diff --git a/services/controlplane.go b/services/controlplane.go
index a8b791de..473af400 100644
--- a/services/controlplane.go
+++ b/services/controlplane.go
@@ -72,16 +72,22 @@ func UpgradeControlPlaneNodes(ctx context.Context, kubeClient *kubernetes.Client
 		drainHelper = getDrainHelper(kubeClient, *upgradeStrategy)
 		log.Infof(ctx, "[%s] Parameters provided to drain command: %#v", ControlRole, fmt.Sprintf("Force: %v, IgnoreAllDaemonSets: %v, DeleteLocalData: %v, Timeout: %v, GracePeriodSeconds: %v", drainHelper.Force, drainHelper.IgnoreAllDaemonSets, drainHelper.DeleteLocalData, drainHelper.Timeout, drainHelper.GracePeriodSeconds))
 	}
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), ControlRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
+	var inactiveHostErr error
+	if len(inactiveHosts) > 0 {
+		var inactiveHostNames []string
+		for hostName := range inactiveHosts {
+			inactiveHostNames = append(inactiveHostNames, hostName)
+		}
+		inactiveHostErr = fmt.Errorf("provisioning incomplete, host(s) [%s] skipped because they could not be contacted", strings.Join(inactiveHostNames, ","))
 	}
 	hostsFailedToUpgrade, err := processControlPlaneForUpgrade(ctx, kubeClient, controlHosts, localConnDialerFactory, prsMap,
 		cpNodePlanMap, updateWorkersOnly, alpineImage, certMap, upgradeStrategy, newHosts, inactiveHosts, maxUnavailable, drainHelper)
-	if err != nil {
-		logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		return errMsgMaxUnavailableNotFailed, err
+	if err != nil || inactiveHostErr != nil {
+		if len(hostsFailedToUpgrade) > 0 {
+			logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+			errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+		}
+		return errMsgMaxUnavailableNotFailed, util.ErrList([]error{err, inactiveHostErr})
 	}
 	log.Infof(ctx, "[%s] Successfully upgraded Controller Plane..", ControlRole)
 	return errMsgMaxUnavailableNotFailed, nil
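Note: the services/controlplane.go hunk above combines the upgrade error and the inactive-host error with util.ErrList from RKE's util package, so an upgrade that succeeds on every reachable host still fails overall when powered-off hosts were skipped (issue 2). The standalone sketch below shows the assumed combining behaviour (non-nil errors joined into one); it is a stand-in for illustration, not the util package implementation.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// errList is a stand-in for rke's util.ErrList, assumed here to collapse a
// slice of errors into a single error while skipping nil entries.
func errList(errs []error) error {
	var msgs []string
	for _, err := range errs {
		if err != nil {
			msgs = append(msgs, err.Error())
		}
	}
	if len(msgs) == 0 {
		return nil
	}
	return errors.New(strings.Join(msgs, ", "))
}

func main() {
	// With the controlplane change above, an upgrade that succeeds on all
	// reachable hosts still returns an error if powered-off hosts were skipped.
	var upgradeErr error // nil: every reachable host upgraded fine
	inactiveHostErr := fmt.Errorf("provisioning incomplete, host(s) [node-3] skipped because they could not be contacted")
	fmt.Println(errList([]error{upgradeErr, inactiveHostErr}))
}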
diff --git a/services/node_util.go b/services/node_util.go
index 6a373050..15b7ff85 100644
--- a/services/node_util.go
+++ b/services/node_util.go
@@ -115,7 +115,7 @@ func CalculateMaxUnavailable(maxUnavailableVal string, numHosts int, role string
 	return maxUnavailable, nil
 }
 
-func resetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
+func ResetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
 	if maxUnavailable > WorkerThreads {
 		/* upgrading a large number of nodes in parallel leads to a large number of goroutines, which has led to errors regarding too many open sockets
 		Because of this RKE switched to using workerpools. 50 workerthreads has been sufficient to optimize rke up, upgrading at most 50 nodes in parallel.
diff --git a/services/workerplane.go b/services/workerplane.go
index a9039b3f..2c828234 100644
--- a/services/workerplane.go
+++ b/services/workerplane.go
@@ -55,10 +55,6 @@ func RunWorkerPlane(ctx context.Context, allHosts []*hosts.Host, localConnDialer
 func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *kubernetes.Clientset, mixedRolesHosts []*hosts.Host, workerOnlyHosts []*hosts.Host, inactiveHosts map[string]bool, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage string, upgradeStrategy *v3.NodeUpgradeStrategy, newHosts map[string]bool, maxUnavailable int) (string, error) {
 	log.Infof(ctx, "[%s] Upgrading Worker Plane..", WorkerRole)
 	var errMsgMaxUnavailableNotFailed string
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), WorkerRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
-	}
 	updateNewHostsList(kubeClient, append(mixedRolesHosts, workerOnlyHosts...), newHosts)
 	if len(mixedRolesHosts) > 0 {
 		log.Infof(ctx, "First checking and processing worker components for upgrades on nodes with etcd role one at a time")
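Note: for issue 3, setDNSDeploymentAddonDefaults in the cluster/defaults.go hunk pins CoreDNS to maxUnavailable=1 / maxSurge=0 when no update strategy is configured, instead of the 25% Kubernetes deployment defaults. The standalone sketch below reproduces just that default path with the upstream appsv1/intstr types so the resulting strategy can be inspected; CoreDNSProvider and the surrounding cluster types are not pulled in, so this is an illustration rather than the RKE code.

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// coreDNSDefaultStrategy mirrors the CoreDNS branch of
// setDNSDeploymentAddonDefaults for the case where the cluster config
// supplies no update strategy: at most one coredns replica is taken down at
// a time and no surge replica is created. Illustrative sketch only.
func coreDNSDefaultStrategy() *appsv1.DeploymentStrategy {
	maxUnavailable, maxSurge := intstr.FromInt(1), intstr.FromInt(0)
	return &appsv1.DeploymentStrategy{
		Type: appsv1.RollingUpdateDeploymentStrategyType,
		RollingUpdate: &appsv1.RollingUpdateDeployment{
			MaxUnavailable: &maxUnavailable,
			MaxSurge:       &maxSurge,
		},
	}
}

func main() {
	s := coreDNSDefaultStrategy()
	fmt.Printf("maxUnavailable=%s maxSurge=%s\n",
		s.RollingUpdate.MaxUnavailable.String(), s.RollingUpdate.MaxSurge.String())
	// Output: maxUnavailable=1 maxSurge=0
}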