mirror of https://github.com/rancher/rke.git

Check etcd cluster health after member delete

commit 422dfff0fd
parent 33c69c0108
@@ -194,7 +194,7 @@ func reconcileEtcd(ctx context.Context, currentCluster, kubeCluster *Cluster, ku
 		}
 	}
 	// handle etcd member delete
-	if err := deleteEtcdMembers(ctx, currentCluster, kubeCluster, kubeClient, clientCert, clientKey, etcdToDelete); err != nil {
+	if err := deleteEtcdMembers(ctx, currentCluster, kubeCluster, kubeClient, svcOptionData, clientCert, clientKey, etcdToDelete); err != nil {
 		return err
 	}
 	// handle etcd member add
@@ -238,11 +238,20 @@ func addEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, k
 	return nil
 }
 
-func deleteEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, kubeClient *kubernetes.Clientset, clientCert, clientKey []byte, etcdToDelete []*hosts.Host) error {
+func deleteEtcdMembers(ctx context.Context, currentCluster, kubeCluster *Cluster, kubeClient *kubernetes.Clientset, svcOptionData map[string]*v3.KubernetesServicesOptions, clientCert, clientKey []byte, etcdToDelete []*hosts.Host) error {
 	log.Infof(ctx, "[reconcile] Check etcd hosts to be deleted")
+	etcdNodePlanMap := make(map[string]v3.RKEConfigNodePlan)
+	for _, etcdMapHost := range kubeCluster.EtcdHosts {
+		svcOptions, err := kubeCluster.GetKubernetesServicesOptions(etcdMapHost.DockerInfo.OSType, svcOptionData)
+		if err != nil {
+			return err
+		}
+		etcdNodePlanMap[etcdMapHost.Address] = BuildRKEConfigNodePlan(ctx, kubeCluster, etcdMapHost, svcOptions)
+	}
+
 	for _, etcdHost := range etcdToDelete {
 		etcdHost.IsEtcd = false
-		if err := services.RemoveEtcdMember(ctx, etcdHost, kubeCluster.EtcdHosts, currentCluster.LocalConnDialerFactory, clientCert, clientKey); err != nil {
+		if err := services.RemoveEtcdMember(ctx, etcdHost, kubeCluster.EtcdHosts, currentCluster.LocalConnDialerFactory, clientCert, clientKey, etcdNodePlanMap); err != nil {
 			log.Warnf(ctx, "[reconcile] %v", err)
 			continue
 		}
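The hunk above threads `svcOptionData` into the delete path so that a node plan can be rebuilt for every etcd host before any member is removed; `RemoveEtcdMember` later reads the plan for the host it is probing to derive a health-check URL. The following is a minimal, self-contained sketch of that precompute-then-look-up pattern; `Host`, `Plan`, and `buildPlan` are hypothetical stand-ins, not RKE's real types.

```go
// Build a per-host plan map keyed by address before the work loop, so each
// later lookup is O(1) and any plan-building error aborts the operation
// up front, before anything has been mutated.
package main

import "fmt"

type Host struct{ Address string }
type Plan struct{ HealthCheckURL string }

// buildPlan is a hypothetical stand-in for plan construction.
func buildPlan(h Host) (Plan, error) {
	return Plan{HealthCheckURL: "https://" + h.Address + ":2379/health"}, nil
}

func main() {
	hosts := []Host{{Address: "10.0.0.1"}, {Address: "10.0.0.2"}}

	// Precompute plans for every host, failing fast on the first error.
	planMap := make(map[string]Plan, len(hosts))
	for _, h := range hosts {
		p, err := buildPlan(h)
		if err != nil {
			panic(err) // the real code returns the error to its caller
		}
		planMap[h.Address] = p
	}

	// Later consumers just look plans up by address.
	for _, h := range hosts {
		fmt.Println(h.Address, "->", planMap[h.Address].HealthCheckURL)
	}
}
```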
@@ -195,8 +195,8 @@ func AddEtcdMember(ctx context.Context, toAddEtcdHost *hosts.Host, etcdHosts []*
 	return nil
 }
 
-func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*hosts.Host, localConnDialerFactory hosts.DialerFactory, cert, key []byte) error {
-	log.Infof(ctx, "[remove/%s] Removing member [etcd-%s] from etcd cluster", ETCDRole, etcdHost.HostnameOverride)
+func RemoveEtcdMember(ctx context.Context, toDeleteEtcdHost *hosts.Host, etcdHosts []*hosts.Host, localConnDialerFactory hosts.DialerFactory, cert, key []byte, etcdNodePlanMap map[string]v3.RKEConfigNodePlan) error {
+	log.Infof(ctx, "[remove/%s] Removing member [etcd-%s] from etcd cluster", ETCDRole, toDeleteEtcdHost.HostnameOverride)
 	var mID string
 	removed := false
 	for _, host := range etcdHosts {
@@ -212,7 +212,7 @@ func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*ho
 			continue
 		}
 		for _, member := range members {
-			if member.Name == fmt.Sprintf("etcd-%s", etcdHost.HostnameOverride) {
+			if member.Name == fmt.Sprintf("etcd-%s", toDeleteEtcdHost.HostnameOverride) {
 				mID = member.ID
 				break
 			}
@@ -221,13 +221,28 @@ func RemoveEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*ho
 			logrus.Debugf("Failed to list etcd members from host [%s]: %v", host.Address, err)
 			continue
 		}
-		removed = true
-		break
+		etcdMemberDeletedTime := time.Now()
+		// Need to health check after successful member remove (especially for leader re-election)
+		// We will check all hosts to see if the cluster becomes healthy
+		var healthError error
+		_, _, healthCheckURL := GetProcessConfig(etcdNodePlanMap[host.Address].Processes[EtcdContainerName], host)
+		logrus.Infof("[remove/%s] Checking etcd cluster health on [etcd-%s] after removing [etcd-%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride)
+		logrus.Debugf("[remove/%s] healthCheckURL for checking etcd cluster health on [etcd-%s] after removing [%s]: [%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride, healthCheckURL)
+		healthError = isEtcdHealthy(localConnDialerFactory, host, cert, key, healthCheckURL)
+		if healthError == nil {
+			logrus.Infof("[remove/%s] etcd cluster health is healthy on [etcd-%s] after removing [etcd-%s]", ETCDRole, host.HostnameOverride, toDeleteEtcdHost.HostnameOverride)
+			etcdHealthyTime := time.Now()
+			diffTime := etcdHealthyTime.Sub(etcdMemberDeletedTime)
+			logrus.Debugf("Total time between etcd member deleted and etcd cluster healthy is: [%s]", diffTime)
+			removed = true
+			break
+		}
+		logrus.Warn(healthError)
 	}
 	if !removed {
-		return fmt.Errorf("Failed to delete etcd member [etcd-%s] from etcd cluster", etcdHost.HostnameOverride)
+		return fmt.Errorf("Failed to delete etcd member [etcd-%s] from etcd cluster", toDeleteEtcdHost.HostnameOverride)
 	}
-	log.Infof(ctx, "[remove/%s] Successfully removed member [etcd-%s] from etcd cluster", ETCDRole, etcdHost.HostnameOverride)
+	log.Infof(ctx, "[remove/%s] Successfully removed member [etcd-%s] from etcd cluster", ETCDRole, toDeleteEtcdHost.HostnameOverride)
 	return nil
 }
 
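This is the core of the commit: instead of declaring a member removed as soon as the etcd API call succeeds, the loop now polls cluster health on the serving host and only sets `removed = true` once the cluster reports healthy, timing how long recovery (including any leader re-election) took. Below is a stand-alone sketch of that remove-then-verify pattern; `checkHealth` is a hypothetical probe, not RKE's `isEtcdHealthy`, and the sleep intervals are shortened for demonstration.

```go
// After a mutation succeeds, poll a health probe and only mark the operation
// done once the cluster reports healthy, logging how long recovery took.
package main

import (
	"errors"
	"fmt"
	"time"
)

var calls int

// checkHealth is a hypothetical probe that becomes healthy on the third call.
func checkHealth() error {
	calls++
	if calls < 3 {
		return errors.New("etcd cluster not healthy yet")
	}
	return nil
}

func main() {
	deletedAt := time.Now() // timestamp taken right after the member delete

	removed := false
	for i := 0; i < 5; i++ {
		if err := checkHealth(); err != nil {
			fmt.Println("warn:", err)
			time.Sleep(10 * time.Millisecond) // the real code waits inside the probe
			continue
		}
		// Sub/Since on the two timestamps gives the recovery duration.
		fmt.Printf("healthy after %s\n", time.Since(deletedAt))
		removed = true
		break
	}
	if !removed {
		fmt.Println("failed to confirm removal")
	}
}
```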
@@ -44,7 +44,9 @@ func isEtcdHealthy(localConnDialerFactory hosts.DialerFactory, host *hosts.Host,
 	logrus.Debugf("[etcd] check etcd cluster health on host [%s]", host.Address)
 	var finalErr error
 	var healthy string
-	for i := 0; i < 3; i++ {
+	// given a max election timeout of 50000ms (50s), max re-election of 77 seconds was seen
+	// this allows for 18 * 5 seconds = 90 seconds of re-election
+	for i := 0; i < 18; i++ {
 		dialer, err := getEtcdDialer(localConnDialerFactory, host)
 		if err != nil {
 			return err
@@ -67,12 +69,12 @@ func isEtcdHealthy(localConnDialerFactory hosts.DialerFactory, host *hosts.Host,
 			time.Sleep(5 * time.Second)
 			continue
 		}
-		// log in debug here as we don't want to log in warn on every iteration
-		// the error will be logged in the caller stack
-		logrus.Debugf("[etcd] etcd host [%s] reported healthy=%s", host.Address, healthy)
+		// Changed this from Debug to Info to inform user on what is happening
+		logrus.Infof("[etcd] etcd host [%s] reported healthy=%s", host.Address, healthy)
 		if healthy == "true" {
 			return nil
 		}
+		time.Sleep(5 * time.Second)
 	}
 	if finalErr != nil {
 		return fmt.Errorf("[etcd] host [%s] failed to check etcd health: %v", host.Address, finalErr)
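The new loop bound encodes a time budget rather than a retry count: 18 iterations with a 5-second sleep give roughly 90 seconds, enough to outlast the 77-second worst-case re-election the comment says was observed at a 50-second max election timeout (the added `time.Sleep` also makes healthy=false iterations wait, not just errored ones). An equivalent way to express the same budget is a deadline-based loop; this sketch uses a hypothetical `probe()` stand-in and will take several probe intervals to finish when run.

```go
// Deadline-based retry: the 90s budget matches 18 * 5s from the diff above.
package main

import (
	"errors"
	"fmt"
	"time"
)

var attempts int

// probe is a hypothetical health check that succeeds on the fourth call.
func probe() error {
	attempts++
	if attempts < 4 {
		return errors.New("healthy=false")
	}
	return nil
}

func main() {
	const (
		budget   = 90 * time.Second // 18 * 5s, matching the loop above
		interval = 5 * time.Second
	)
	deadline := time.Now().Add(budget)

	var lastErr error
	for time.Now().Before(deadline) {
		if lastErr = probe(); lastErr == nil {
			fmt.Println("healthy")
			return
		}
		time.Sleep(interval) // wait between attempts, as the real loop does
	}
	fmt.Println("gave up:", lastErr)
}
```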