Attempt upgrade on NotReady hosts
@@ -69,6 +69,7 @@ type Cluster struct {
 	EncryptionConfig encryptionConfig
 	NewHosts map[string]bool
 	MaxUnavailableForWorkerNodes int
+	MaxUnavailableForControlNodes int
 	HostsLabeledToIgnoreUpgrade map[string]bool
 }

@@ -109,10 +110,10 @@ const (
 	networkAddon = "network"
 )

-func (c *Cluster) DeployControlPlane(ctx context.Context, svcOptionData map[string]*v3.KubernetesServicesOptions, reconcileCluster bool) error {
+func (c *Cluster) DeployControlPlane(ctx context.Context, svcOptionData map[string]*v3.KubernetesServicesOptions, reconcileCluster bool) (string, error) {
 	kubeClient, err := k8s.NewClient(c.LocalKubeConfigPath, c.K8sWrapTransport)
 	if err != nil {
-		return fmt.Errorf("failed to initialize new kubernetes client: %v", err)
+		return "", fmt.Errorf("failed to initialize new kubernetes client: %v", err)
 	}

 	// Deploy Etcd Plane
@@ -126,15 +127,19 @@ func (c *Cluster) DeployControlPlane(ctx context.Context, svcOptionData map[stri
 		log.Infof(ctx, "[etcd] External etcd connection string has been specified, skipping etcd plane")
 	} else {
 		if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, c.Services.Etcd, c.Certificates); err != nil {
-			return fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err)
+			return "", fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err)
 		}
 	}

 	// Deploy Control plane
 	cpNodePlanMap := make(map[string]v3.RKEConfigNodePlan)
 	// Build cp node plan map
+	var notReadyHosts []*hosts.Host
 	for _, cpHost := range c.ControlPlaneHosts {
 		cpNodePlanMap[cpHost.Address] = BuildRKEConfigNodePlan(ctx, c, cpHost, cpHost.DockerInfo, svcOptionData)
+		if err := services.CheckNodeReady(kubeClient, cpHost, services.ControlRole); err != nil {
+			notReadyHosts = append(notReadyHosts, cpHost)
+		}
 	}

 	if !reconcileCluster {
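The hunk above collects controlplane hosts whose Kubernetes node is not Ready before deciding how to upgrade them. Below is a minimal client-go sketch of such a readiness check, assuming a recent client-go and a plain node-name lookup; the real services.CheckNodeReady also maps the RKE host to its node object and is not shown in this commit.

```go
package readiness

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// isNodeReady reports whether a node's NodeReady condition is True.
// This is an assumption about the kind of check CheckNodeReady performs,
// not a copy of the RKE helper.
func isNodeReady(ctx context.Context, client kubernetes.Interface, nodeName string) (bool, error) {
	node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return false, fmt.Errorf("failed to get node %s: %v", nodeName, err)
	}
	for _, cond := range node.Status.Conditions {
		if cond.Type == corev1.NodeReady {
			return cond.Status == corev1.ConditionTrue, nil
		}
	}
	// no NodeReady condition reported yet: treat the node as NotReady
	return false, nil
}
```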
@@ -145,12 +150,18 @@ func (c *Cluster) DeployControlPlane(ctx context.Context, svcOptionData map[stri
 			c.UpdateWorkersOnly,
 			c.SystemImages.Alpine,
 			c.Certificates); err != nil {
-			return fmt.Errorf("[controlPlane] Failed to bring up Control Plane: %v", err)
+			return "", fmt.Errorf("[controlPlane] Failed to bring up Control Plane: %v", err)
 		}
-		return nil
+		return "", nil
 	}
+	return c.UpgradeControlPlane(ctx, kubeClient, cpNodePlanMap, notReadyHosts)
+}
+
+func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernetes.Clientset, cpNodePlanMap map[string]v3.RKEConfigNodePlan, notReadyHosts []*hosts.Host) (string, error) {
 	inactiveHosts := make(map[string]bool)
 	var controlPlaneHosts []*hosts.Host
+	var notReadyHostNames []string
+
 	for _, host := range c.InactiveHosts {
 		if !c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			inactiveHosts[host.HostnameOverride] = true
@@ -161,16 +172,35 @@ func (c *Cluster) DeployControlPlane(ctx context.Context, svcOptionData map[stri
 			controlPlaneHosts = append(controlPlaneHosts, host)
 		}
 	}
-	if err := services.UpgradeControlPlaneNodes(ctx, kubeClient, controlPlaneHosts,
+
+	for _, host := range notReadyHosts {
+		notReadyHostNames = append(notReadyHostNames, host.HostnameOverride)
+	}
+	// attempt upgrade on NotReady hosts without respecting maxUnavailable
+	logrus.Infof("Attempting upgrade of controlplane components on following hosts in NotReady status: %v", strings.Join(notReadyHostNames, ","))
+
+	services.RunControlPlane(ctx, notReadyHosts,
 		c.LocalConnDialerFactory,
 		c.PrivateRegistriesMap,
 		cpNodePlanMap,
 		c.UpdateWorkersOnly,
 		c.SystemImages.Alpine,
-		c.Certificates, c.UpgradeStrategy, c.NewHosts, inactiveHosts); err != nil {
-		return fmt.Errorf("[controlPlane] Failed to upgrade Control Plane: %v", err)
+		c.Certificates)
+	for _, host := range notReadyHosts {
+		services.CheckNodeReady(kubeClient, host, services.ControlRole)
 	}
-	return nil
+	// rolling upgrade respecting maxUnavailable
+	errMsgMaxUnavailableNotFailed, err := services.UpgradeControlPlaneNodes(ctx, kubeClient, controlPlaneHosts,
+		c.LocalConnDialerFactory,
+		c.PrivateRegistriesMap,
+		cpNodePlanMap,
+		c.UpdateWorkersOnly,
+		c.SystemImages.Alpine,
+		c.Certificates, c.UpgradeStrategy, c.NewHosts, inactiveHosts, c.MaxUnavailableForControlNodes)
+	if err != nil {
+		return "", fmt.Errorf("[controlPlane] Failed to upgrade Control Plane: %v", err)
+	}
+	return errMsgMaxUnavailableNotFailed, nil
 }

 func (c *Cluster) DeployWorkerPlane(ctx context.Context, svcOptionData map[string]*v3.KubernetesServicesOptions, reconcileCluster bool) (string, error) {
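With this change DeployControlPlane and the new UpgradeControlPlane return (string, error): a non-nil error aborts provisioning, while errMsgMaxUnavailableNotFailed (going by its name) carries a non-fatal message about hosts that failed without exceeding maxUnavailable. A self-contained sketch of how a caller could treat the two channels; the deployControlPlane stub and its message are illustrative, not RKE's actual caller.

```go
package main

import (
	"fmt"

	"github.com/sirupsen/logrus"
)

// deployControlPlane stands in for the new (string, error) signature: hard
// failures come back as error, soft failures only as a warning string.
func deployControlPlane() (string, error) {
	// assumption for the demo: two nodes failed but stayed within maxUnavailable
	return "[controlPlane] hosts host-2,host-3 failed to upgrade but are within maxUnavailable limit", nil
}

func main() {
	warning, err := deployControlPlane()
	if err != nil {
		logrus.Fatalf("provisioning failed: %v", err) // hard failure: stop here
	}
	if warning != "" {
		logrus.Warnf("%v", warning) // soft failure: surface it and keep going
	}
	fmt.Println("controlplane deployment finished")
}
```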
@@ -182,12 +212,16 @@ func (c *Cluster) DeployWorkerPlane(ctx context.Context, svcOptionData map[strin
 	// Deploy Worker plane
 	workerNodePlanMap := make(map[string]v3.RKEConfigNodePlan)
 	// Build cp node plan map
+	var notReadyHosts []*hosts.Host
 	allHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, c.WorkerHosts)
 	for _, host := range allHosts {
 		workerNodePlanMap[host.Address] = BuildRKEConfigNodePlan(ctx, c, host, host.DockerInfo, svcOptionData)
 		if host.IsControl || c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			continue
 		}
+		if err := services.CheckNodeReady(kubeClient, host, services.WorkerRole); err != nil {
+			notReadyHosts = append(notReadyHosts, host)
+		}
 		if !host.IsEtcd {
 			// separating hosts with only worker role so they undergo upgrade in maxUnavailable batches
 			workerOnlyHosts = append(workerOnlyHosts, host)
@@ -211,12 +245,32 @@ func (c *Cluster) DeployWorkerPlane(ctx context.Context, svcOptionData map[strin
 		return "", nil
 	}

+	return c.UpgradeWorkerPlane(ctx, kubeClient, workerNodePlanMap, notReadyHosts, etcdAndWorkerHosts, workerOnlyHosts)
+}
+
+func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes.Clientset, workerNodePlanMap map[string]v3.RKEConfigNodePlan, notReadyHosts, etcdAndWorkerHosts, workerOnlyHosts []*hosts.Host) (string, error) {
 	inactiveHosts := make(map[string]bool)
+	var notReadyHostNames []string
 	for _, host := range c.InactiveHosts {
 		if !c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	for _, host := range notReadyHosts {
+		notReadyHostNames = append(notReadyHostNames, host.HostnameOverride)
+	}
+	// attempt upgrade on NotReady hosts without respecting maxUnavailable
+	logrus.Infof("Attempting upgrade of worker components on following hosts in NotReady status: %v", strings.Join(notReadyHostNames, ","))
+	services.RunWorkerPlane(ctx, notReadyHosts,
+		c.LocalConnDialerFactory,
+		c.PrivateRegistriesMap,
+		workerNodePlanMap,
+		c.Certificates,
+		c.UpdateWorkersOnly,
+		c.SystemImages.Alpine)
+	for _, host := range notReadyHosts {
+		services.CheckNodeReady(kubeClient, host, services.WorkerRole)
+	}
 	errMsgMaxUnavailableNotFailed, err := services.UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx, kubeClient, etcdAndWorkerHosts, workerOnlyHosts, inactiveHosts,
		c.LocalConnDialerFactory,
		c.PrivateRegistriesMap,
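Both the controlplane and worker paths now follow the same two-phase ordering: NotReady hosts are attempted first, all at once and outside the maxUnavailable budget, then the remaining hosts are upgraded in batches. A standalone sketch of that ordering with a stand-in upgrade step; RKE's real batching (services.UpgradeWorkerPlaneForWorkerAndEtcdNodes and UpgradeControlPlaneNodes) additionally handles cordon/drain and failure accounting, which is omitted here.

```go
package main

import "fmt"

type node struct {
	name  string
	ready bool
}

// upgradeInBatches: phase 1 upgrades NotReady nodes with no maxUnavailable
// accounting, phase 2 upgrades the Ready nodes at most maxUnavailable at a
// time. Assumes maxUnavailable >= 1; the upgrade func is a placeholder.
func upgradeInBatches(nodes []node, maxUnavailable int, upgrade func(node)) {
	var ready, notReady []node
	for _, n := range nodes {
		if n.ready {
			ready = append(ready, n)
		} else {
			notReady = append(notReady, n)
		}
	}
	// phase 1: NotReady nodes, attempted regardless of the budget
	for _, n := range notReady {
		upgrade(n)
	}
	// phase 2: rolling upgrade of Ready nodes in maxUnavailable-sized batches
	for i := 0; i < len(ready); i += maxUnavailable {
		end := i + maxUnavailable
		if end > len(ready) {
			end = len(ready)
		}
		for _, n := range ready[i:end] {
			upgrade(n)
		}
	}
}

func main() {
	nodes := []node{{"w1", true}, {"w2", false}, {"w3", true}, {"w4", true}, {"w5", false}}
	upgradeInBatches(nodes, 2, func(n node) { fmt.Println("upgrading", n.name) })
}
```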
@@ -79,10 +79,11 @@ const (
 	DefaultKubeAPIArgAuditLogPathValue = "/var/log/kube-audit/audit-log.json"
 	DefaultKubeAPIArgAuditPolicyFileValue = "/etc/kubernetes/audit-policy.yaml"

-	DefaultMaxUnavailable = "10%"
-	DefaultNodeDrainTimeout = 120
-	DefaultNodeDrainGracePeriod = -1
-	DefaultNodeDrainIgnoreDaemonsets = true
+	DefaultMaxUnavailableWorker = "10%"
+	DefaultMaxUnavailableControlplane = "1"
+	DefaultNodeDrainTimeout = 120
+	DefaultNodeDrainGracePeriod = -1
+	DefaultNodeDrainIgnoreDaemonsets = true
 )

 var (
@@ -220,14 +221,16 @@ func (c *Cluster) setClusterDefaults(ctx context.Context, flags ExternalFlags) e

 func (c *Cluster) setNodeUpgradeStrategy() {
 	if c.UpgradeStrategy == nil {
-		// we need to escape the "%" at the end of "10%" here so its not interpreted
-		logrus.Debugf("No input provided for maxUnavailable, setting it to default value of %v", DefaultMaxUnavailable+"%")
+		logrus.Debugf("No input provided for maxUnavailableWorker, setting it to default value of %v percent", strings.TrimRight(DefaultMaxUnavailableWorker, "%"))
+		logrus.Debugf("No input provided for maxUnavailableControlplane, setting it to default value of %v", DefaultMaxUnavailableControlplane)
 		c.UpgradeStrategy = &v3.NodeUpgradeStrategy{
-			MaxUnavailable: DefaultMaxUnavailable,
+			MaxUnavailableWorker: DefaultMaxUnavailableWorker,
+			MaxUnavailableControlplane: DefaultMaxUnavailableControlplane,
 		}
 		return
 	}
-	setDefaultIfEmpty(&c.UpgradeStrategy.MaxUnavailable, DefaultMaxUnavailable)
+	setDefaultIfEmpty(&c.UpgradeStrategy.MaxUnavailableWorker, DefaultMaxUnavailableWorker)
+	setDefaultIfEmpty(&c.UpgradeStrategy.MaxUnavailableControlplane, DefaultMaxUnavailableControlplane)
 	if !c.UpgradeStrategy.Drain {
 		return
 	}
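setNodeUpgradeStrategy now fills two separate defaults instead of one. A small self-contained sketch of the fill-if-empty behaviour, using a local stand-in for v3.NodeUpgradeStrategy and an assumed implementation of setDefaultIfEmpty inferred from its name and call sites, not copied from the repository.

```go
package main

import "fmt"

// nodeUpgradeStrategy is a local stand-in holding only the two fields this
// commit splits the old MaxUnavailable into.
type nodeUpgradeStrategy struct {
	MaxUnavailableWorker       string
	MaxUnavailableControlplane string
}

// setDefaultIfEmpty keeps a user-provided value and fills the default only
// when the field is empty (an assumption about the helper's behaviour).
func setDefaultIfEmpty(field *string, def string) {
	if len(*field) == 0 {
		*field = def
	}
}

func main() {
	// user set only the worker budget in cluster.yml
	s := nodeUpgradeStrategy{MaxUnavailableWorker: "25%"}
	setDefaultIfEmpty(&s.MaxUnavailableWorker, "10%")     // DefaultMaxUnavailableWorker
	setDefaultIfEmpty(&s.MaxUnavailableControlplane, "1") // DefaultMaxUnavailableControlplane
	fmt.Printf("worker: %s, controlplane: %s\n", s.MaxUnavailableWorker, s.MaxUnavailableControlplane)
	// prints: worker: 25%, controlplane: 1
}
```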
@@ -67,10 +67,10 @@ func (c *Cluster) TunnelHosts(ctx context.Context, flags ExternalFlags) error {
 	return ValidateHostCount(c)
 }

-func (c *Cluster) RemoveHostsLabeledToIgnoreUpgrade(ctx context.Context) {
+func (c *Cluster) FindHostsLabeledToIgnoreUpgrade(ctx context.Context) {
 	kubeClient, err := k8s.NewClient(c.LocalKubeConfigPath, c.K8sWrapTransport)
 	if err != nil {
-		logrus.Errorf("Error generating kube client in RemoveHostsLabeledToIgnoreUpgrade: %v", err)
+		logrus.Errorf("Error generating kube client in FindHostsLabeledToIgnoreUpgrade: %v", err)
 		return
 	}
 	var nodes *v1.NodeList
@@ -152,6 +152,46 @@ func (c *Cluster) InvertIndexHosts() error {
 	return nil
 }

+func (c *Cluster) CalculateMaxUnavailable() (int, int, error) {
+	var inactiveControlPlaneHosts, inactiveWorkerHosts []string
+	var workerHosts, controlHosts, maxUnavailableWorker, maxUnavailableControl int
+
+	for _, host := range c.InactiveHosts {
+		if host.IsControl && !c.HostsLabeledToIgnoreUpgrade[host.Address] {
+			inactiveControlPlaneHosts = append(inactiveControlPlaneHosts, host.HostnameOverride)
+		}
+		if !host.IsWorker && !c.HostsLabeledToIgnoreUpgrade[host.Address] {
+			inactiveWorkerHosts = append(inactiveWorkerHosts, host.HostnameOverride)
+		}
+		// not breaking out of the loop so we can log all of the inactive hosts
+	}
+
+	for _, host := range c.WorkerHosts {
+		if c.HostsLabeledToIgnoreUpgrade[host.Address] {
+			continue
+		}
+		workerHosts++
+	}
+	// maxUnavailable should be calculated against all hosts provided in cluster.yml except the ones labelled to be ignored for upgrade
+	workerHosts += len(inactiveWorkerHosts)
+	maxUnavailableWorker, err := services.CalculateMaxUnavailable(c.UpgradeStrategy.MaxUnavailableWorker, workerHosts)
+	if err != nil {
+		return maxUnavailableWorker, maxUnavailableControl, err
+	}
+	for _, host := range c.ControlPlaneHosts {
+		if c.HostsLabeledToIgnoreUpgrade[host.Address] {
+			continue
+		}
+		controlHosts++
+	}
+	controlHosts += len(inactiveControlPlaneHosts)
+	maxUnavailableControl, err = services.CalculateMaxUnavailable(c.UpgradeStrategy.MaxUnavailableControlplane, controlHosts)
+	if err != nil {
+		return maxUnavailableWorker, maxUnavailableControl, err
+	}
+	return maxUnavailableWorker, maxUnavailableControl, nil
+}
+
 func (c *Cluster) getConsolidatedAdmissionConfiguration() (*v1alpha1.AdmissionConfiguration, error) {
 	var err error
 	var admissionConfig *v1alpha1.AdmissionConfiguration
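CalculateMaxUnavailable resolves the configured values ("10%" for workers, "1" for controlplane by default) against the eligible host counts via services.CalculateMaxUnavailable. A runnable sketch of how a percentage-or-integer setting can be resolved with the intstr helpers from k8s.io/apimachinery; rounding down and flooring at 1 are assumptions about the behaviour, not a copy of the RKE helper.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/intstr"
)

// toMaxUnavailable resolves a maxUnavailable setting ("10%" or "1") against a
// node count, rounding down but never returning less than 1.
func toMaxUnavailable(setting string, nodes int) (int, error) {
	parsed := intstr.Parse(setting)
	v, err := intstr.GetValueFromIntOrPercent(&parsed, nodes, false)
	if err != nil {
		return 0, err
	}
	if v == 0 {
		v = 1 // always allow at least one node to be unavailable
	}
	return v, nil
}

func main() {
	for _, tc := range []struct {
		setting string
		nodes   int
	}{{"10%", 25}, {"10%", 3}, {"1", 3}} {
		v, _ := toMaxUnavailable(tc.setting, tc.nodes)
		fmt.Printf("maxUnavailable %q over %d nodes -> %d\n", tc.setting, tc.nodes, v)
	}
}
```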
@@ -196,40 +196,6 @@ func ValidateHostCount(c *Cluster) error {
 	return nil
 }

-func (c *Cluster) ValidateHostCountForUpgradeAndCalculateMaxUnavailable() (int, error) {
-	var inactiveControlPlaneHosts, inactiveWorkerOnlyHosts []string
-	var workerOnlyHosts, maxUnavailable int
-
-	for _, host := range c.InactiveHosts {
-		if host.IsControl && !c.HostsLabeledToIgnoreUpgrade[host.Address] {
-			inactiveControlPlaneHosts = append(inactiveControlPlaneHosts, host.HostnameOverride)
-		}
-		if !host.IsEtcd && !host.IsControl && !c.HostsLabeledToIgnoreUpgrade[host.Address] {
-			inactiveWorkerOnlyHosts = append(inactiveWorkerOnlyHosts, host.HostnameOverride)
-		}
-		// not breaking out of the loop so we can log all of the inactive hosts
-	}
-	if len(inactiveControlPlaneHosts) >= 1 {
-		return maxUnavailable, fmt.Errorf("cannot proceed with upgrade of controlplane if one or more controlplane hosts are inactive; found inactive hosts: %v", strings.Join(inactiveControlPlaneHosts, ","))
-	}
-	for _, host := range c.WorkerHosts {
-		if host.IsControl || host.IsEtcd || c.HostsLabeledToIgnoreUpgrade[host.Address] {
-			continue
-		}
-		workerOnlyHosts++
-	}
-	// maxUnavailable should be calculated against all hosts provided in cluster.yml except the ones labelled to be ignored for upgrade
-	workerOnlyHosts += len(inactiveWorkerOnlyHosts)
-	maxUnavailable, err := services.CalculateMaxUnavailable(c.UpgradeStrategy.MaxUnavailable, workerOnlyHosts)
-	if err != nil {
-		return maxUnavailable, err
-	}
-	if len(inactiveWorkerOnlyHosts) >= maxUnavailable {
-		return maxUnavailable, fmt.Errorf("cannot proceed with upgrade of worker components since %v (>=maxUnavailable) hosts are inactive; found inactive hosts: %v", len(inactiveWorkerOnlyHosts), strings.Join(inactiveWorkerOnlyHosts, ","))
-	}
-	return maxUnavailable, nil
-}
-
 func validateDuplicateNodes(c *Cluster) error {
 	for i := range c.Nodes {
 		for j := range c.Nodes {