From 186b4858b4b3a01f32b9b86fb58bf8bcf19a45e4 Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Tue, 28 Jun 2016 21:11:55 -0700 Subject: [PATCH 1/2] better handle etcd compaction in multi-apiserver --- pkg/storage/etcd3/compact.go | 111 ++++++++++++++++++++++-------- pkg/storage/etcd3/compact_test.go | 2 +- 2 files changed, 85 insertions(+), 28 deletions(-) diff --git a/pkg/storage/etcd3/compact.go b/pkg/storage/etcd3/compact.go index fcaf83aae53..ad1ac1ac2a2 100644 --- a/pkg/storage/etcd3/compact.go +++ b/pkg/storage/etcd3/compact.go @@ -17,6 +17,7 @@ limitations under the License. package etcd3 import ( + "strconv" "sync" "time" @@ -25,7 +26,10 @@ import ( "golang.org/x/net/context" ) -const compactInterval = 10 * time.Minute +const ( + compactInterval = 10 * time.Minute + compactRevKey = "compact_rev_key" +) var ( endpointsMapMu sync.Mutex @@ -36,19 +40,16 @@ func init() { endpointsMap = make(map[string]struct{}) } -// StartCompactor starts a compactor in the background in order to compact keys -// older than fixed time. -// We need to compact keys because we can't let on disk data grow forever. -// We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst. -// TODO: We might keep a longer history (12h) in the future once storage API can take -// advantage of multi-version key. +// StartCompactor starts a compactor in the background to compact old version of keys that's not needed. +// By default, we save the most recent 10 minutes data and compact versions > 10minutes ago. +// It should be enough for slow watchers and to tolerate burst. +// TODO: We might keep a longer history (12h) in the future once storage API can take advantage of past version of keys. func StartCompactor(ctx context.Context, client *clientv3.Client) { endpointsMapMu.Lock() defer endpointsMapMu.Unlock() - // We can't have multiple compaction jobs for the same cluster. + // In one process, we can have only one compactor for one cluster. 
// Currently we rely on endpoints to differentiate clusters. - var emptyStruct struct{} for _, ep := range client.Endpoints() { if _, ok := endpointsMap[ep]; ok { glog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints()) @@ -56,18 +57,59 @@ func StartCompactor(ctx context.Context, client *clientv3.Client) { } } for _, ep := range client.Endpoints() { - endpointsMap[ep] = emptyStruct + endpointsMap[ep] = struct{}{} } go compactor(ctx, client, compactInterval) } // compactor periodically compacts historical versions of keys in etcd. -// After compaction, old versions of keys set before given interval will be gone. -// Any API call for the old versions of keys will return error. -// interval: the interval between each compaction. The first compaction happens after "interval". +// It will compact keys with versions older than given interval. +// In other words, after compaction, it will only contain keys set during last interval. +// Any API call for the older versions of keys will return error. +// Interval is the time interval between each compaction. The first compaction happens after "interval". func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) { - var curRev int64 + // Technical definitions: + // We have a special key in etcd defined as *compactRevKey*. + // compactRevKey's value will be set to the string of last compacted revision. + // compactRevKey's version will be used as logical time for comparison. THe version is referred as compact time. + // Initially, because the key doesn't exist, the compact time (version) is 0. + // + // Algorithm: + // - Compare to see if (local compact_time) = (remote compact_time). + // - If yes, increment both local and remote compact_time, and do a compaction. + // - If not, set local to remote compact_time. + // + // Technical details/insights: + // + // The protocol here is lease based. 
If one compactor CAS successfully, the others would know it when they fail in + // CAS later and would try again in 10 minutes. If an APIServer crashed, another one would "take over" the lease. + // + // For example, in the following diagram, we have a compactor C1 doing compaction in t1, t2. Another compactor C2 + // at t1' (t1 < t1' < t2) would CAS fail, set its known oldRev to rev at t1, and try again in t2' (t2' > t2). + // If C1 crashed and wouldn't compact at t2, C2 would CAS successfully at t2'. + // + // oldRev(t2) curRev(t2) + // + + // oldRev curRev | + // + + | + // | | | + // | | t1' | t2' + // +---v-------------v----^---------v------^----> + // t0 t1 t2 + // + // We have the guarantees: + // - in normal cases, the interval is 10 minutes. + // - in failover, the interval is >10m and <20m + // + // FAQ: + // - What if time is not accurate? We don't care as long as someone did the compaction. Atomicity is ensured using + // etcd API. + // - What happened under heavy load scenarios? Initially, each apiserver will do only one compaction + // every 10 minutes. This is very unlikely affecting or affected w.r.t. server load. + + var compactTime int64 + var rev int64 var err error for { select { @@ -76,29 +118,44 @@ func compactor(ctx context.Context, client *clientv3.Client, interval time.Durat return } - curRev, err = compact(ctx, client, curRev) + compactTime, rev, err = compact(ctx, client, compactTime, rev) if err != nil { - glog.Error(err) + glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err) continue } } } // compact compacts etcd store and returns current rev. -// If it couldn't get current revision, the old rev will be returned. -func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) { - resp, err := client.Get(ctx, "/") +// It will return the current compact time and global revision if no error occurred. +// Note that CAS fail will not incur any error. 
+func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) { + resp, err := client.KV.Txn(ctx).If( + clientv3.Compare(clientv3.Version(compactRevKey), "=", t), + ).Then( + clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expect side effect: increment Version + ).Else( + clientv3.OpGet(compactRevKey), + ).Commit() if err != nil { - return oldRev, err + return t, rev, err } + curRev := resp.Header.Revision - if oldRev == 0 { - return curRev, nil + + if !resp.Succeeded { + curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version + return curTime, curRev, nil } - err = client.Compact(ctx, oldRev) - if err != nil { - return curRev, err + curTime := t + 1 + + if rev == 0 { + // We don't compact on bootstrap. + return curTime, curRev, nil } - glog.Infof("etcd: Compacted rev %d, endpoints %v", oldRev, client.Endpoints()) - return curRev, nil + if err = client.Compact(ctx, rev); err != nil { + return curTime, curRev, err + } + glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints()) + return curTime, curRev, nil } diff --git a/pkg/storage/etcd3/compact_test.go b/pkg/storage/etcd3/compact_test.go index 3992940b379..9972fa5c115 100644 --- a/pkg/storage/etcd3/compact_test.go +++ b/pkg/storage/etcd3/compact_test.go @@ -36,7 +36,7 @@ func TestCompact(t *testing.T) { t.Fatalf("Put failed: %v", err) } - _, err = compact(ctx, client, putResp.Header.Revision) + _, _, err = compact(ctx, client, 0, putResp.Header.Revision) if err != nil { t.Fatalf("compact failed: %v", err) } From 54025ce8b3aadf27914a84c336a822975a69a158 Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Fri, 1 Jul 2016 10:01:41 -0700 Subject: [PATCH 2/2] etcd3/store: Add test for compact conflict --- pkg/storage/etcd3/compact_test.go | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/storage/etcd3/compact_test.go b/pkg/storage/etcd3/compact_test.go index 9972fa5c115..d6fa099d1f5 100644 --- 
a/pkg/storage/etcd3/compact_test.go +++ b/pkg/storage/etcd3/compact_test.go @@ -46,3 +46,37 @@ func TestCompact(t *testing.T) { t.Errorf("Expecting ErrCompacted, but get=%v", err) } } + +// TestCompactConflict tests the case where two compactors (call them C1, C2) try to compact the etcd cluster at the same +// logical time. +// - C1 compacts first. It will succeed. +// - C2 compacts after. It will fail, but it will get the latest logical time, which should be larger by one. +func TestCompactConflict(t *testing.T) { + cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1}) + defer cluster.Terminate(t) + client := cluster.RandClient() + ctx := context.Background() + + putResp, err := client.Put(ctx, "/somekey", "data") + if err != nil { + t.Fatalf("Put failed: %v", err) + } + + // Compact first. It will do the compaction and return the compact time, which is incremented by 1. + curTime, _, err := compact(ctx, client, 0, putResp.Header.Revision) + if err != nil { + t.Fatalf("compact failed: %v", err) + } + if curTime != 1 { + t.Errorf("Expect current logical time = 1, get = %v", curTime) + } + + // Compact again with the same parameters. It won't do the compaction but will return the latest compact time. + curTime2, _, err := compact(ctx, client, 0, putResp.Header.Revision) + if err != nil { + t.Fatalf("compact failed: %v", err) + } + if curTime != curTime2 { + t.Errorf("Unexpected curTime (%v) != curTime2 (%v)", curTime, curTime2) + } +}