From 25668531f783ccaa31c5ef9877b6abf72abac30f Mon Sep 17 00:00:00 2001 From: "Lubomir I. Ivanov" Date: Thu, 13 Jun 2019 03:14:42 +0300 Subject: [PATCH] kubeadm: run MemberAdd/Remove for etcd clients with exp-backoff retry When adding a new etcd member the etcd cluster can enter a state of vote, where any new members added at the exact same time will fail with an error right away. Implement exponential backoff retry around the MemberAdd call. This solves a kubeadm problem when concurrently joining control-plane nodes with stacked etcd members. From experiment, a few retries with milliseconds apart are sufficient to achieve the concurrent join of a 3xCP cluster. Apply the same backoff to MemberRemove in case the concurrent removal of members fails for similar reasons. --- cmd/kubeadm/app/util/etcd/BUILD | 1 + cmd/kubeadm/app/util/etcd/etcd.go | 40 ++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cmd/kubeadm/app/util/etcd/BUILD b/cmd/kubeadm/app/util/etcd/BUILD index 3168a0c50e3..5492f26fd68 100644 --- a/cmd/kubeadm/app/util/etcd/BUILD +++ b/cmd/kubeadm/app/util/etcd/BUILD @@ -9,6 +9,7 @@ go_library( "//cmd/kubeadm/app/apis/kubeadm:go_default_library", "//cmd/kubeadm/app/constants:go_default_library", "//cmd/kubeadm/app/util/config:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//staging/src/k8s.io/client-go/kubernetes:go_default_library", "//vendor/github.com/coreos/etcd/clientv3:go_default_library", "//vendor/github.com/coreos/etcd/pkg/transport:go_default_library", diff --git a/cmd/kubeadm/app/util/etcd/etcd.go b/cmd/kubeadm/app/util/etcd/etcd.go index d6656d7a7ac..c8ac23a929f 100644 --- a/cmd/kubeadm/app/util/etcd/etcd.go +++ b/cmd/kubeadm/app/util/etcd/etcd.go @@ -29,6 +29,7 @@ import ( "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/pkg/transport" "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog" kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm" @@ -36,6 +37,15 @@ import ( "k8s.io/kubernetes/cmd/kubeadm/app/util/config" ) +// Exponential backoff for MemberAdd/Remove (values exclude jitter): +// 0, 50, 150, 350, 750, 1550, 3150, 6350, 12750 ms +var addRemoveBackoff = wait.Backoff{ + Steps: 8, + Duration: 50 * time.Millisecond, + Factor: 2.0, + Jitter: 0.1, +} + // ClusterInterrogator is an interface to get etcd cluster related information type ClusterInterrogator interface { ClusterAvailable() (bool, error) @@ -187,11 +197,18 @@ func (c *Client) RemoveMember(id uint64) ([]Member, error) { defer cli.Close() // Remove an existing member from the cluster - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - resp, err := cli.MemberRemove(ctx, id) - cancel() + var lastError error + var resp *clientv3.MemberRemoveResponse + err = wait.ExponentialBackoff(addRemoveBackoff, func() (bool, error) { + resp, err = cli.MemberRemove(context.Background(), id) + if err == nil { + return true, nil + } + lastError = err + return false, nil + }) if err != nil { - return nil, err + return nil, lastError } // Returns the updated list of etcd members @@ -224,11 +241,18 @@ func (c *Client) AddMember(name string, peerAddrs string) ([]Member, error) { defer cli.Close() // Adds a new member to the cluster - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - resp, err := cli.MemberAdd(ctx, []string{peerAddrs}) - cancel() + var lastError error + var resp *clientv3.MemberAddResponse + err = wait.ExponentialBackoff(addRemoveBackoff, func() (bool, error) { + resp, err = cli.MemberAdd(context.Background(), []string{peerAddrs}) + if err == nil { + return true, nil + } + lastError = err + return false, nil + }) if err != nil { - return nil, err + return nil, lastError } // Returns the updated list of etcd members