diff --git a/cmd/kubeadm/app/phases/etcd/local.go b/cmd/kubeadm/app/phases/etcd/local.go index 599cd2a6ac6..b18fc2575eb 100644 --- a/cmd/kubeadm/app/phases/etcd/local.go +++ b/cmd/kubeadm/app/phases/etcd/local.go @@ -150,7 +150,7 @@ func CreateStackedEtcdStaticPodManifestFile(client clientset.Interface, manifest } else { klog.V(1).Infof("[etcd] Adding etcd member: %s", etcdPeerAddress) if features.Enabled(cfg.FeatureGates, features.EtcdLearnerMode) { - cluster, err = etcdClient.AddMemberAsLeanerAndPromote(nodeName, etcdPeerAddress) + cluster, err = etcdClient.AddMemberAsLearner(nodeName, etcdPeerAddress) } else { cluster, err = etcdClient.AddMember(nodeName, etcdPeerAddress) } @@ -172,6 +172,17 @@ func CreateStackedEtcdStaticPodManifestFile(client clientset.Interface, manifest return nil } + if features.Enabled(cfg.FeatureGates, features.EtcdLearnerMode) { + learnerID, err := etcdClient.GetMemberID(etcdPeerAddress) + if err != nil { + return err + } + err = etcdClient.MemberPromote(learnerID) + if err != nil { + return err + } + } + fmt.Printf("[etcd] Waiting for the new etcd member to join the cluster. This can take up to %v\n", etcdHealthyCheckInterval*etcdHealthyCheckRetries) if _, err := etcdClient.WaitForClusterAvailable(etcdHealthyCheckRetries, etcdHealthyCheckInterval); err != nil { return err diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go index db9c6dcfd11..986d11df6f1 100644 --- a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go +++ b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go @@ -251,7 +251,7 @@ func (c fakeTLSEtcdClient) ListMembers() ([]etcdutil.Member, error) { return []etcdutil.Member{}, nil } -func (c fakeTLSEtcdClient) AddMemberAsLeanerAndPromote(name string, peerAddrs string) ([]etcdutil.Member, error) { +func (c fakeTLSEtcdClient) AddMemberAsLearner(name string, peerAddrs string) ([]etcdutil.Member, error) { return []etcdutil.Member{}, nil } @@ -259,6 +259,10 @@ func (c fakeTLSEtcdClient) AddMember(name string, peerAddrs string) ([]etcdutil. return []etcdutil.Member{}, nil } +func (c fakeTLSEtcdClient) MemberPromote(learnerID uint64) error { + return nil +} + func (c fakeTLSEtcdClient) GetMemberID(peerURL string) (uint64, error) { return 0, nil } @@ -290,7 +294,7 @@ func (c fakePodManifestEtcdClient) ListMembers() ([]etcdutil.Member, error) { return []etcdutil.Member{}, nil } -func (c fakePodManifestEtcdClient) AddMemberAsLeanerAndPromote(name string, peerAddrs string) ([]etcdutil.Member, error) { +func (c fakePodManifestEtcdClient) AddMemberAsLearner(name string, peerAddrs string) ([]etcdutil.Member, error) { return []etcdutil.Member{}, nil } @@ -298,6 +302,10 @@ func (c fakePodManifestEtcdClient) AddMember(name string, peerAddrs string) ([]e return []etcdutil.Member{}, nil } +func (c fakePodManifestEtcdClient) MemberPromote(learnerID uint64) error { + return nil +} + func (c fakePodManifestEtcdClient) GetMemberID(peerURL string) (uint64, error) { return 0, nil } diff --git a/cmd/kubeadm/app/util/etcd/etcd.go b/cmd/kubeadm/app/util/etcd/etcd.go index 7cea7b42d83..c2439536d06 100644 --- a/cmd/kubeadm/app/util/etcd/etcd.go +++ b/cmd/kubeadm/app/util/etcd/etcd.go @@ -60,7 +60,8 @@ type ClusterInterrogator interface { Sync() error ListMembers() ([]Member, error) AddMember(name string, peerAddrs string) ([]Member, error) - AddMemberAsLeanerAndPromote(name string, peerAddrs string) ([]Member, error) + AddMemberAsLearner(name string, peerAddrs string) ([]Member, error) + MemberPromote(learnerID uint64) error GetMemberID(peerURL string) (uint64, error) RemoveMember(id uint64) ([]Member, error) } @@ -347,8 +348,8 @@ func (c *Client) AddMember(name string, peerAddrs string) ([]Member, error) { return c.addMember(name, peerAddrs, false) } -// AddMemberAsLeanerAndPromote adds a new learner member into the etcd cluster and promotes it to a voting member -func (c *Client) AddMemberAsLeanerAndPromote(name string, peerAddrs string) ([]Member, error) { +// AddMemberAsLearner adds a new learner member into the etcd cluster. +func (c *Client) AddMemberAsLearner(name string, peerAddrs string) ([]Member, error) { return c.addMember(name, peerAddrs, true) } @@ -364,30 +365,29 @@ func (c *Client) addMember(name string, peerAddrs string, isLearner bool) ([]Mem return nil, errors.Wrapf(err, "error parsing peer address %s", peerAddrs) } + cli, err := clientv3.New(clientv3.Config{ + Endpoints: c.Endpoints, + DialTimeout: etcdTimeout, + DialOptions: []grpc.DialOption{ + grpc.WithBlock(), // block until the underlying connection is up + }, + TLS: c.TLS, + }) + if err != nil { + return nil, err + } + defer cli.Close() + // Adds a new member to the cluster var ( lastError error respMembers []*etcdserverpb.Member learnerID uint64 + resp *clientv3.MemberAddResponse ) err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) { - cli, err := clientv3.New(clientv3.Config{ - Endpoints: c.Endpoints, - DialTimeout: etcdTimeout, - DialOptions: []grpc.DialOption{ - grpc.WithBlock(), // block until the underlying connection is up - }, - TLS: c.TLS, - }) - if err != nil { - lastError = err - return false, nil - } - defer cli.Close() - ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout) defer cancel() - var resp *clientv3.MemberAddResponse if isLearner { // if learnerID is set, it means the etcd member is already added successfully. if learnerID == 0 { @@ -399,11 +399,6 @@ func (c *Client) addMember(name string, peerAddrs string, isLearner bool) ([]Mem } learnerID = resp.Member.ID } - err = memberPromote(ctx, cli, learnerID) - if err != nil { - lastError = err - return false, nil - } respMembers = resp.Members return true, nil } @@ -459,21 +454,73 @@ func (c *Client) addMember(name string, peerAddrs string, isLearner bool) ([]Mem return ret, nil } -func memberPromote(ctx context.Context, cli *clientv3.Client, learnerID uint64) error { +// isLearner returns true if the given member ID is a learner. +func (c *Client) isLearner(memberID uint64) (bool, error) { + resp, err := c.listMembers() + if err != nil { + return false, err + } + + for _, member := range resp.Members { + if member.ID == memberID && member.IsLearner { + return true, nil + } + } + return false, nil +} + +// MemberPromote promotes a member as a voting member. If the given member ID is already a voting member this method +// will return early and do nothing. +func (c *Client) MemberPromote(learnerID uint64) error { + isLearner, err := c.isLearner(learnerID) + if err != nil { + return err + } + if !isLearner { + klog.V(1).Infof("[etcd] Member %016x already promoted.", learnerID) + return nil + } + klog.V(1).Infof("[etcd] Promoting a learner as a voting member: %016x", learnerID) + cli, err := clientv3.New(clientv3.Config{ + Endpoints: c.Endpoints, + DialTimeout: etcdTimeout, + DialOptions: []grpc.DialOption{ + grpc.WithBlock(), // block until the underlying connection is up + }, + TLS: c.TLS, + }) + if err != nil { + return err + } + defer cli.Close() + // TODO: warning logs from etcd client should be removed. // The warning logs are printed by etcd client code for several reasons, including // 1. can not promote yet(no synced) // 2. context deadline exceeded // 3. peer URLs already exists // Once the client provides a way to check if the etcd learner is ready to promote, the retry logic can be revisited. - _, err := cli.MemberPromote(ctx, learnerID) - if err == nil { - klog.V(1).Infof("[etcd] The learner was promoted as a voting member: %016x", learnerID) - return nil + var ( + lastError error + ) + err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) { + ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout) + defer cancel() + + _, err = cli.MemberPromote(ctx, learnerID) + if err == nil { + klog.V(1).Infof("[etcd] The learner was promoted as a voting member: %016x", learnerID) + return true, nil + } + klog.V(5).Infof("[etcd] Promoting the learner %016x failed: %v", learnerID, err) + lastError = err + return false, nil + }) + if err != nil { + return lastError } - klog.V(5).Infof("[etcd] Promoting the learner %016x failed: %v", learnerID, err) - return err + return nil } // CheckClusterHealth returns nil for status Up or error for status Down