1
0
mirror of https://github.com/rancher/rke.git synced 2025-09-15 22:49:13 +00:00

Return proper error in etcd healthcheck

This commit is contained in:
Alena Prokharchyk
2019-10-30 15:11:26 -07:00
parent 1bee3e452f
commit fa9410faeb
2 changed files with 40 additions and 24 deletions

View File

@@ -81,16 +81,21 @@ func RunEtcdPlane(
}
log.Infof(ctx, "[%s] Successfully started etcd plane.. Checking etcd cluster health", ETCDRole)
clientCert := cert.EncodeCertPEM(certMap[pki.KubeNodeCertName].Certificate)
clientkey := cert.EncodePrivateKeyPEM(certMap[pki.KubeNodeCertName].Key)
var healthy bool
clientKey := cert.EncodePrivateKeyPEM(certMap[pki.KubeNodeCertName].Key)
var healthError error
var hosts []string
for _, host := range etcdHosts {
_, _, healthCheckURL := GetProcessConfig(etcdNodePlanMap[host.Address].Processes[EtcdContainerName], host)
if healthy = isEtcdHealthy(ctx, localConnDialerFactory, host, clientCert, clientkey, healthCheckURL); healthy {
healthError = isEtcdHealthy(localConnDialerFactory, host, clientCert, clientKey, healthCheckURL)
if healthError == nil {
break
}
logrus.Warn(healthError)
hosts = append(hosts, host.Address)
}
if !healthy {
return fmt.Errorf("[etcd] Etcd Cluster is not healthy")
if healthError != nil {
return fmt.Errorf("etcd cluster is unhealthy: hosts [%s] failed to report healthy."+
" Check etcd container logs on each host for more information", strings.Join(hosts, ","))
}
return nil
}
@@ -239,15 +244,20 @@ func ReloadEtcdCluster(ctx context.Context, readyEtcdHosts []*hosts.Host, newHos
return err
}
time.Sleep(EtcdInitWaitTime * time.Second)
var healthy bool
var healthError error
var hosts []string
for _, host := range readyEtcdHosts {
_, _, healthCheckURL := GetProcessConfig(etcdNodePlanMap[host.Address].Processes[EtcdContainerName], host)
if healthy = isEtcdHealthy(ctx, localConnDialerFactory, host, cert, key, healthCheckURL); healthy {
healthError = isEtcdHealthy(localConnDialerFactory, host, cert, key, healthCheckURL)
if healthError == nil {
break
}
logrus.Warn(healthError)
hosts = append(hosts, host.Address)
}
if !healthy {
return fmt.Errorf("[etcd] Etcd Cluster is not healthy")
if healthError != nil {
return fmt.Errorf("etcd cluster is unhealthy: hosts [%s] failed to report healthy."+
" Check etcd container logs on each host for more information", strings.Join(hosts, ","))
}
return nil
}

View File

@@ -19,7 +19,7 @@ import (
func getEtcdClient(ctx context.Context, etcdHost *hosts.Host, localConnDialerFactory hosts.DialerFactory, cert, key []byte) (etcdclient.Client, error) {
dialer, err := getEtcdDialer(localConnDialerFactory, etcdHost)
if err != nil {
return nil, fmt.Errorf("Failed to create a dialer for host [%s]: %v", etcdHost.Address, err)
return nil, fmt.Errorf("failed to create a dialer for host [%s]: %v", etcdHost.Address, err)
}
tlsConfig, err := getEtcdTLSConfig(cert, key)
if err != nil {
@@ -40,17 +40,18 @@ func getEtcdClient(ctx context.Context, etcdHost *hosts.Host, localConnDialerFac
return etcdclient.New(cfg)
}
func isEtcdHealthy(ctx context.Context, localConnDialerFactory hosts.DialerFactory, host *hosts.Host, cert, key []byte, url string) bool {
logrus.Debugf("[etcd] Check etcd cluster health")
func isEtcdHealthy(localConnDialerFactory hosts.DialerFactory, host *hosts.Host, cert, key []byte, url string) error {
logrus.Debugf("[etcd] check etcd cluster health on host [%s]", host.Address)
var finalErr error
var healthy string
for i := 0; i < 3; i++ {
dialer, err := getEtcdDialer(localConnDialerFactory, host)
if err != nil {
return false
return err
}
tlsConfig, err := getEtcdTLSConfig(cert, key)
if err != nil {
logrus.Debugf("[etcd] Failed to create etcd tls config for host [%s]: %v", host.Address, err)
return false
return fmt.Errorf("[etcd] failed to create etcd tls config for host [%s]: %v", host.Address, err)
}
hc := http.Client{
@@ -60,33 +61,38 @@ func isEtcdHealthy(ctx context.Context, localConnDialerFactory hosts.DialerFacto
TLSHandshakeTimeout: 10 * time.Second,
},
}
healthy, err := getHealthEtcd(hc, host, url)
if err != nil {
logrus.Debug(err)
healthy, finalErr = getHealthEtcd(hc, host, url)
if finalErr != nil {
logrus.Debugf("[etcd] failed to check health for etcd host [%s]: %v", host.Address, finalErr)
time.Sleep(5 * time.Second)
continue
}
// log in debug here as we don't want to log in warn on every iteration
// the error will be logged in the caller stack
logrus.Debugf("[etcd] etcd host [%s] reported healthy=%s", host.Address, healthy)
if healthy == "true" {
logrus.Debugf("[etcd] etcd cluster is healthy")
return true
return nil
}
}
return false
if finalErr != nil {
return fmt.Errorf("[etcd] host [%s] failed to check etcd health: %v", host.Address, finalErr)
}
return fmt.Errorf("[etcd] host [%s] reported healthy=%s", host.Address, healthy)
}
func getHealthEtcd(hc http.Client, host *hosts.Host, url string) (string, error) {
healthy := struct{ Health string }{}
resp, err := hc.Get(url)
if err != nil {
return healthy.Health, fmt.Errorf("Failed to get /health for host [%s]: %v", host.Address, err)
return healthy.Health, fmt.Errorf("failed to get /health for host [%s]: %v", host.Address, err)
}
bytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
return healthy.Health, fmt.Errorf("Failed to read response of /health for host [%s]: %v", host.Address, err)
return healthy.Health, fmt.Errorf("failed to read response of /health for host [%s]: %v", host.Address, err)
}
resp.Body.Close()
if err := json.Unmarshal(bytes, &healthy); err != nil {
return healthy.Health, fmt.Errorf("Failed to unmarshal response of /health for host [%s]: %v", host.Address, err)
return healthy.Health, fmt.Errorf("failed to unmarshal response of /health for host [%s]: %v", host.Address, err)
}
return healthy.Health, nil
}