Mirror of https://github.com/rancher/rke.git, synced 2025-09-05 00:40:10 +00:00
Addresses the following issues:

1. Compare maxUnavailable with powered-off hosts before attempting to reconcile NotReady hosts (illustrated by the sketch after this list).
2. Include powered-off hosts as failed hosts so that the controlplane upgrade returns an error.
3. Change the coredns upgrade strategy. With the addons changes it had been set to the Kubernetes default for Deployments (25% maxUnavailable and maxSurge); this commit changes it back to a maxUnavailable of 1 to avoid DNS addon upgrade issues.
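A minimal, hypothetical sketch of the comparison described in item 1. The helper name effectiveMaxUnavailable and the error wording are illustrative only; the real logic is services.ResetMaxUnavailable in the diff below and may differ in details such as logging and role-specific messages.

// Hypothetical illustration of item 1; the real implementation is services.ResetMaxUnavailable.
package main

import "fmt"

// effectiveMaxUnavailable reserves one "unavailable" slot per powered-off host and
// refuses to proceed when no capacity is left for upgrading reachable nodes.
func effectiveMaxUnavailable(maxUnavailable, poweredOffHosts int) (int, error) {
	if poweredOffHosts >= maxUnavailable {
		return 0, fmt.Errorf("cannot proceed: %d host(s) unreachable with max unavailable %d", poweredOffHosts, maxUnavailable)
	}
	return maxUnavailable - poweredOffHosts, nil
}

func main() {
	// max unavailable 3, two workers powered off: only one reachable node may be upgraded at a time.
	fmt.Println(effectiveMaxUnavailable(3, 2))
	// max unavailable 2, two workers powered off: no capacity left, so an error is returned.
	fmt.Println(effectiveMaxUnavailable(2, 2))
}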
@@ -166,6 +166,7 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 	inactiveHosts := make(map[string]bool)
 	var controlPlaneHosts, notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error

 	for _, host := range c.InactiveHosts {
 		// include only hosts with controlplane role

@@ -173,6 +174,10 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForControlNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForControlNodes, len(inactiveHosts), services.ControlRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range c.ControlPlaneHosts {
 		if !c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			controlPlaneHosts = append(controlPlaneHosts, host)

@@ -265,6 +270,7 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 	inactiveHosts := make(map[string]bool)
 	var notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error

 	for _, host := range c.InactiveHosts {
 		// if host has controlplane role, it already has worker components upgraded

@@ -272,6 +278,10 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForWorkerNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForWorkerNodes, len(inactiveHosts), services.WorkerRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range append(etcdAndWorkerHosts, workerOnlyHosts...) {
 		if c.NewHosts[host.HostnameOverride] {
 			continue

@@ -603,7 +603,7 @@ func GetExternalFlags(local, updateOnly, disablePortCheck bool, configDir, clust
 func (c *Cluster) setAddonsDefaults() {
 	c.Ingress.UpdateStrategy = setDaemonsetAddonDefaults(c.Ingress.UpdateStrategy)
 	c.Network.UpdateStrategy = setDaemonsetAddonDefaults(c.Network.UpdateStrategy)
-	c.DNS.UpdateStrategy = setDeploymentAddonDefaults(c.DNS.UpdateStrategy)
+	c.DNS.UpdateStrategy = setDNSDeploymentAddonDefaults(c.DNS.UpdateStrategy, c.DNS.Provider)
 	if c.DNS.LinearAutoscalerParams == nil {
 		c.DNS.LinearAutoscalerParams = &DefaultClusterProportionalAutoscalerLinearParams
 	}

@@ -638,3 +638,43 @@ func setDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy) *apps
 	}
 	return updateStrategy
 }
+
+func setDNSDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy, dnsProvider string) *appsv1.DeploymentStrategy {
+	var (
+		coreDNSMaxUnavailable, coreDNSMaxSurge = intstr.FromInt(1), intstr.FromInt(0)
+		kubeDNSMaxSurge, kubeDNSMaxUnavailable = intstr.FromString("10%"), intstr.FromInt(0)
+	)
+	if updateStrategy != nil && updateStrategy.Type != appsv1.RollingUpdateDeploymentStrategyType {
+		return updateStrategy
+	}
+	switch dnsProvider {
+	case CoreDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &coreDNSMaxUnavailable,
+					MaxSurge:       &coreDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxUnavailable == nil {
+			updateStrategy.RollingUpdate.MaxUnavailable = &coreDNSMaxUnavailable
+		}
+	case KubeDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &kubeDNSMaxUnavailable,
+					MaxSurge:       &kubeDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxSurge == nil {
+			updateStrategy.RollingUpdate.MaxSurge = &kubeDNSMaxSurge
+		}
+	}
+
+	return updateStrategy
+}

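To see the effect of the new defaulting, here is a minimal sketch of a unit test, assuming it sits in the same package as setDNSDeploymentAddonDefaults and the CoreDNSProvider constant; it is not part of this commit. With no user-supplied strategy, CoreDNS defaults to maxUnavailable 1 and maxSurge 0 rather than the Kubernetes Deployment default of 25%.

package cluster

import (
	"testing"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// TestCoreDNSDefaultUpdateStrategy checks the defaults applied when the user
// configures no update strategy for the coredns addon.
func TestCoreDNSDefaultUpdateStrategy(t *testing.T) {
	got := setDNSDeploymentAddonDefaults(nil, CoreDNSProvider)
	if got == nil || got.RollingUpdate == nil {
		t.Fatal("expected a RollingUpdate strategy to be defaulted")
	}
	if got.Type != appsv1.RollingUpdateDeploymentStrategyType {
		t.Errorf("expected RollingUpdate strategy type, got %v", got.Type)
	}
	if want := intstr.FromInt(1); *got.RollingUpdate.MaxUnavailable != want {
		t.Errorf("expected maxUnavailable 1, got %v", *got.RollingUpdate.MaxUnavailable)
	}
	if want := intstr.FromInt(0); *got.RollingUpdate.MaxSurge != want {
		t.Errorf("expected maxSurge 0, got %v", *got.RollingUpdate.MaxSurge)
	}
}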
@@ -72,16 +72,22 @@ func UpgradeControlPlaneNodes(ctx context.Context, kubeClient *kubernetes.Client
 		drainHelper = getDrainHelper(kubeClient, *upgradeStrategy)
 		log.Infof(ctx, "[%s] Parameters provided to drain command: %#v", ControlRole, fmt.Sprintf("Force: %v, IgnoreAllDaemonSets: %v, DeleteLocalData: %v, Timeout: %v, GracePeriodSeconds: %v", drainHelper.Force, drainHelper.IgnoreAllDaemonSets, drainHelper.DeleteLocalData, drainHelper.Timeout, drainHelper.GracePeriodSeconds))
 	}
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), ControlRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
+	var inactiveHostErr error
+	if len(inactiveHosts) > 0 {
+		var inactiveHostNames []string
+		for hostName := range inactiveHosts {
+			inactiveHostNames = append(inactiveHostNames, hostName)
+		}
+		inactiveHostErr = fmt.Errorf("provisioning incomplete, host(s) [%s] skipped because they could not be contacted", strings.Join(inactiveHostNames, ","))
 	}
 	hostsFailedToUpgrade, err := processControlPlaneForUpgrade(ctx, kubeClient, controlHosts, localConnDialerFactory, prsMap, cpNodePlanMap, updateWorkersOnly, alpineImage, certMap,
 		upgradeStrategy, newHosts, inactiveHosts, maxUnavailable, drainHelper)
-	if err != nil {
-		logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		return errMsgMaxUnavailableNotFailed, err
+	if err != nil || inactiveHostErr != nil {
+		if len(hostsFailedToUpgrade) > 0 {
+			logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+			errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+		}
+		return errMsgMaxUnavailableNotFailed, util.ErrList([]error{err, inactiveHostErr})
 	}
 	log.Infof(ctx, "[%s] Successfully upgraded Controller Plane..", ControlRole)
 	return errMsgMaxUnavailableNotFailed, nil

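The combined return value is what lets the controlplane upgrade fail when hosts are powered off even if every reachable host upgraded cleanly (item 2 of the commit message). The aggregator below is a hypothetical stand-in with the semantics the call site appears to rely on; RKE's actual util.ErrList may differ.

// Hypothetical stand-in for an error aggregator; not RKE's actual util.ErrList.
package main

import (
	"errors"
	"fmt"
	"strings"
)

// errList joins all non-nil errors into one and returns nil only when every entry is nil.
func errList(errs []error) error {
	var msgs []string
	for _, e := range errs {
		if e != nil {
			msgs = append(msgs, e.Error())
		}
	}
	if len(msgs) == 0 {
		return nil
	}
	return errors.New(strings.Join(msgs, ", "))
}

func main() {
	var upgradeErr error // upgrading the reachable hosts succeeded
	// Example message mirroring the one built for skipped hosts; "node-3" is a made-up host name.
	inactiveHostErr := fmt.Errorf("provisioning incomplete, host(s) [node-3] skipped because they could not be contacted")
	// The caller still receives a non-nil error, so the upgrade is not reported as successful.
	fmt.Println(errList([]error{upgradeErr, inactiveHostErr}))
}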
@@ -115,7 +115,7 @@ func CalculateMaxUnavailable(maxUnavailableVal string, numHosts int, role string
 	return maxUnavailable, nil
 }

-func resetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
+func ResetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
 	if maxUnavailable > WorkerThreads {
 		/* upgrading a large number of nodes in parallel leads to a large number of goroutines, which has led to errors regarding too many open sockets
 		Because of this RKE switched to using workerpools. 50 workerthreads has been sufficient to optimize rke up, upgrading at most 50 nodes in parallel.

@@ -55,10 +55,6 @@ func RunWorkerPlane(ctx context.Context, allHosts []*hosts.Host, localConnDialer
 func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *kubernetes.Clientset, mixedRolesHosts []*hosts.Host, workerOnlyHosts []*hosts.Host, inactiveHosts map[string]bool, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage string, upgradeStrategy *v3.NodeUpgradeStrategy, newHosts map[string]bool, maxUnavailable int) (string, error) {
 	log.Infof(ctx, "[%s] Upgrading Worker Plane..", WorkerRole)
 	var errMsgMaxUnavailableNotFailed string
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), WorkerRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
-	}
 	updateNewHostsList(kubeClient, append(mixedRolesHosts, workerOnlyHosts...), newHosts)
 	if len(mixedRolesHosts) > 0 {
 		log.Infof(ctx, "First checking and processing worker components for upgrades on nodes with etcd role one at a time")