diff --git a/cluster/reconcile.go b/cluster/reconcile.go index 787115d1..927dc5a7 100644 --- a/cluster/reconcile.go +++ b/cluster/reconcile.go @@ -194,7 +194,7 @@ func reconcileEtcd(ctx context.Context, currentCluster, kubeCluster *Cluster, ku } } // handle etcd member delete - if err := deleteEtcdMembers(ctx, currentCluster, kubeCluster, kubeClient, clientCert, clientKey, etcdToDelete); err != nil { + if err := deleteEtcdMembers(ctx, currentCluster, kubeCluster, kubeClient, svcOptionData, clientCert, clientKey, etcdToDelete); err != nil { return err } // handle etcd member add @@ -238,11 +238,20 @@ func addEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, k return nil } -func deleteEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, kubeClient *kubernetes.Clientset, clientCert, clientKey []byte, etcdToDelete []*hosts.Host) error { +func deleteEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, kubeClient *kubernetes.Clientset, svcOptionData map[string]*v3.KubernetesServicesOptions, clientCert, clientKey []byte, etcdToDelete []*hosts.Host) error { log.Infof(ctx, "[reconcile] Check etcd hosts to be deleted") + etcdNodePlanMap := make(map[string]v3.RKEConfigNodePlan) + for _, etcdMapHost := range kubeCluster.EtcdHosts { + svcOptions, err := kubeCluster.GetKubernetesServicesOptions(etcdMapHost.DockerInfo.OSType, svcOptionData) + if err != nil { + return err + } + etcdNodePlanMap[etcdMapHost.Address] = BuildRKEConfigNodePlan(ctx, kubeCluster, etcdMapHost, svcOptions) + } + for _, etcdHost := range etcdToDelete { etcdHost.IsEtcd = false - if err := services.RemoveEtcdMember(ctx, etcdHost, kubeCluster.EtcdHosts, currentCluster.LocalConnDialerFactory, clientCert, clientKey); err != nil { + if err := services.RemoveEtcdMember(ctx, etcdHost, kubeCluster.EtcdHosts, currentCluster.LocalConnDialerFactory, clientCert, clientKey, etcdNodePlanMap); err != nil { log.Warnf(ctx, "[reconcile] %v", err) continue } diff --git a/services/etcd.go b/services/etcd.go index 6186cca7..f4a20598 100644 --- a/services/etcd.go +++ b/services/etcd.go @@ -195,8 +195,8 @@ func AddEtcdMember(ctx context.Context, toAddEtcdHost *hosts.Host, etcdHosts []* return nil } -func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*hosts.Host, localConnDialerFactory hosts.DialerFactory, cert, key []byte) error { - log.Infof(ctx, "[remove/%s] Removing member [etcd-%s] from etcd cluster", ETCDRole, etcdHost.HostnameOverride) +func RemoveEtcdMember(ctx context.Context, toDeleteEtcdHost *hosts.Host, etcdHosts []*hosts.Host, localConnDialerFactory hosts.DialerFactory, cert, key []byte, etcdNodePlanMap map[string]v3.RKEConfigNodePlan) error { + log.Infof(ctx, "[remove/%s] Removing member [etcd-%s] from etcd cluster", ETCDRole, toDeleteEtcdHost.HostnameOverride) var mID string removed := false for _, host := range etcdHosts { @@ -212,7 +212,7 @@ func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*ho continue } for _, member := range members { - if member.Name == fmt.Sprintf("etcd-%s", etcdHost.HostnameOverride) { + if member.Name == fmt.Sprintf("etcd-%s", toDeleteEtcdHost.HostnameOverride) { mID = member.ID break } @@ -221,13 +221,28 @@ func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*ho logrus.Debugf("Failed to list etcd members from host [%s]: %v", host.Address, err) continue } - removed = true - break + etcdMemberDeletedTime := time.Now() + // Need to health check after successful member remove (especially for leader re-election) + // We will check all hosts to see if the cluster becomes healthy + var healthError error + _, _, healthCheckURL := GetProcessConfig(etcdNodePlanMap[host.Address].Processes[EtcdContainerName], host) + logrus.Infof("[remove/%s] Checking etcd cluster health on [etcd-%s] after removing [etcd-%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride) + logrus.Debugf("[remove/%s] healthCheckURL for checking etcd cluster health on [etcd-%s] after removing [%s]: [%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride, healthCheckURL) + healthError = isEtcdHealthy(localConnDialerFactory, host, cert, key, healthCheckURL) + if healthError == nil { + logrus.Infof("[remove/%s] etcd cluster health is healthy on [etcd-%s] after removing [etcd-%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride) + etcdHealthyTime := time.Now() + diffTime := etcdHealthyTime.Sub(etcdMemberDeletedTime) + logrus.Debugf("Total time between etcd member deleted and etcd cluster healthy is: [%s]", diffTime) + removed = true + break + } + logrus.Warn(healthError) } if !removed { - return fmt.Errorf("Failed to delete etcd member [etcd-%s] from etcd cluster", etcdHost.HostnameOverride) + return fmt.Errorf("Failed to delete etcd member [etcd-%s] from etcd cluster", toDeleteEtcdHost.HostnameOverride) } - log.Infof(ctx, "[remove/%s] Successfully removed member [etcd-%s] from etcd cluster", ETCDRole, etcdHost.HostnameOverride) + log.Infof(ctx, "[remove/%s] Successfully removed member [etcd-%s] from etcd cluster", ETCDRole, toDeleteEtcdHost.HostnameOverride) return nil } diff --git a/services/etcd_util.go b/services/etcd_util.go index ae6f80a7..aaca193b 100644 --- a/services/etcd_util.go +++ b/services/etcd_util.go @@ -44,7 +44,9 @@ func isEtcdHealthy(localConnDialerFactory hosts.DialerFactory, host *hosts.Host, logrus.Debugf("[etcd] check etcd cluster health on host [%s]", host.Address) var finalErr error var healthy string - for i := 0; i < 3; i++ { + // given a max election timeout of 50000ms (50s), max re-election of 77 seconds was seen + // this allows for 18 * 5 seconds = 90 seconds of re-election + for i := 0; i < 18; i++ { dialer, err := getEtcdDialer(localConnDialerFactory, host) if err != nil { return err @@ -67,12 +69,12 @@ func isEtcdHealthy(localConnDialerFactory hosts.DialerFactory, host *hosts.Host, time.Sleep(5 * time.Second) continue } - // log in debug here as we don't want to log in warn on every iteration - // the error will be logged in the caller stack - logrus.Debugf("[etcd] etcd host [%s] reported healthy=%s", host.Address, healthy) + // Changed this from Debug to Info to inform user on what is happening + logrus.Infof("[etcd] etcd host [%s] reported healthy=%s", host.Address, healthy) if healthy == "true" { return nil } + time.Sleep(5 * time.Second) } if finalErr != nil { return fmt.Errorf("[etcd] host [%s] failed to check etcd health: %v", host.Address, finalErr)