From 186b4858b4b3a01f32b9b86fb58bf8bcf19a45e4 Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Tue, 28 Jun 2016 21:11:55 -0700 Subject: [PATCH 1/2] better handle etcd compaction in multi-apiserver --- pkg/storage/etcd3/compact.go | 111 ++++++++++++++++++++++-------- pkg/storage/etcd3/compact_test.go | 2 +- 2 files changed, 85 insertions(+), 28 deletions(-) diff --git a/pkg/storage/etcd3/compact.go b/pkg/storage/etcd3/compact.go index fcaf83aae53..ad1ac1ac2a2 100644 --- a/pkg/storage/etcd3/compact.go +++ b/pkg/storage/etcd3/compact.go @@ -17,6 +17,7 @@ limitations under the License. package etcd3 import ( + "strconv" "sync" "time" @@ -25,7 +26,10 @@ import ( "golang.org/x/net/context" ) -const compactInterval = 10 * time.Minute +const ( + compactInterval = 10 * time.Minute + compactRevKey = "compact_rev_key" +) var ( endpointsMapMu sync.Mutex @@ -36,19 +40,16 @@ func init() { endpointsMap = make(map[string]struct{}) } -// StartCompactor starts a compactor in the background in order to compact keys -// older than fixed time. -// We need to compact keys because we can't let on disk data grow forever. -// We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst. -// TODO: We might keep a longer history (12h) in the future once storage API can take -// advantage of multi-version key. +// StartCompactor starts a compactor in the background to compact old version of keys that's not needed. +// By default, we save the most recent 10 minutes data and compact versions > 10minutes ago. +// It should be enough for slow watchers and to tolerate burst. +// TODO: We might keep a longer history (12h) in the future once storage API can take advantage of past version of keys. func StartCompactor(ctx context.Context, client *clientv3.Client) { endpointsMapMu.Lock() defer endpointsMapMu.Unlock() - // We can't have multiple compaction jobs for the same cluster. + // In one process, we can have only one compactor for one cluster. 
// Currently we rely on endpoints to differentiate clusters. - var emptyStruct struct{} for _, ep := range client.Endpoints() { if _, ok := endpointsMap[ep]; ok { glog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints()) @@ -56,18 +57,59 @@ func StartCompactor(ctx context.Context, client *clientv3.Client) { } } for _, ep := range client.Endpoints() { - endpointsMap[ep] = emptyStruct + endpointsMap[ep] = struct{}{} } go compactor(ctx, client, compactInterval) } // compactor periodically compacts historical versions of keys in etcd. -// After compaction, old versions of keys set before given interval will be gone. -// Any API call for the old versions of keys will return error. -// interval: the interval between each compaction. The first compaction happens after "interval". +// It will compact keys with versions older than given interval. +// In other words, after compaction, it will only contain keys set during last interval. +// Any API call for the older versions of keys will return error. +// Interval is the time interval between each compaction. The first compaction happens after "interval". func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) { - var curRev int64 + // Technical definitions: + // We have a special key in etcd defined as *compactRevKey*. + // compactRevKey's value will be set to the string of last compacted revision. + // compactRevKey's version will be used as logical time for comparison. THe version is referred as compact time. + // Initially, because the key doesn't exist, the compact time (version) is 0. + // + // Algorithm: + // - Compare to see if (local compact_time) = (remote compact_time). + // - If yes, increment both local and remote compact_time, and do a compaction. + // - If not, set local to remote compact_time. + // + // Technical details/insights: + // + // The protocol here is lease based. 
If one compactor CAS successfully, the others would know it when they fail in + // CAS later and would try again in 10 minutes. If an APIServer crashed, another one would "take over" the lease. + // + // For example, in the following diagram, we have a compactor C1 doing compaction in t1, t2. Another compactor C2 + // at t1' (t1 < t1' < t2) would CAS fail, set its known oldRev to rev at t1, and try again in t2' (t2' > t2). + // If C1 crashed and wouldn't compact at t2, C2 would CAS successfully at t2'. + // + // oldRev(t2) curRev(t2) + // + + // oldRev curRev | + // + + | + // | | | + // | | t1' | t2' + // +---v-------------v----^---------v------^----> + // t0 t1 t2 + // + // We have the guarantees: + // - in normal cases, the interval is 10 minutes. + // - in failover, the interval is >10m and <20m + // + // FAQ: + // - What if time is not accurate? We don't care as long as someone did the compaction. Atomicity is ensured using + // etcd API. + // - What happened under heavy load scenarios? Initially, each apiserver will do only one compaction + // every 10 minutes. This is very unlikely affecting or affected w.r.t. server load. + + var compactTime int64 + var rev int64 var err error for { select { @@ -76,29 +118,44 @@ func compactor(ctx context.Context, client *clientv3.Client, interval time.Durat return } - curRev, err = compact(ctx, client, curRev) + compactTime, rev, err = compact(ctx, client, compactTime, rev) if err != nil { - glog.Error(err) + glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err) continue } } } // compact compacts etcd store and returns current rev. -// If it couldn't get current revision, the old rev will be returned. -func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) { - resp, err := client.Get(ctx, "/") +// It will return the current compact time and global revision if no error occurred. +// Note that CAS fail will not incur any error. 
+func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) { + resp, err := client.KV.Txn(ctx).If( + clientv3.Compare(clientv3.Version(compactRevKey), "=", t), + ).Then( + clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expect side effect: increment Version + ).Else( + clientv3.OpGet(compactRevKey), + ).Commit() if err != nil { - return oldRev, err + return t, rev, err } + curRev := resp.Header.Revision - if oldRev == 0 { - return curRev, nil + + if !resp.Succeeded { + curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version + return curTime, curRev, nil } - err = client.Compact(ctx, oldRev) - if err != nil { - return curRev, err + curTime := t + 1 + + if rev == 0 { + // We don't compact on bootstrap. + return curTime, curRev, nil } - glog.Infof("etcd: Compacted rev %d, endpoints %v", oldRev, client.Endpoints()) - return curRev, nil + if err = client.Compact(ctx, rev); err != nil { + return curTime, curRev, err + } + glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints()) + return curTime, curRev, nil } diff --git a/pkg/storage/etcd3/compact_test.go b/pkg/storage/etcd3/compact_test.go index 3992940b379..9972fa5c115 100644 --- a/pkg/storage/etcd3/compact_test.go +++ b/pkg/storage/etcd3/compact_test.go @@ -36,7 +36,7 @@ func TestCompact(t *testing.T) { t.Fatalf("Put failed: %v", err) } - _, err = compact(ctx, client, putResp.Header.Revision) + _, _, err = compact(ctx, client, 0, putResp.Header.Revision) if err != nil { t.Fatalf("compact failed: %v", err) } From 54025ce8b3aadf27914a84c336a822975a69a158 Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Fri, 1 Jul 2016 10:01:41 -0700 Subject: [PATCH 2/2] etcd3/store: Add test for compact conflict --- pkg/storage/etcd3/compact_test.go | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/storage/etcd3/compact_test.go b/pkg/storage/etcd3/compact_test.go index 9972fa5c115..d6fa099d1f5 100644 --- 
a/pkg/storage/etcd3/compact_test.go +++ b/pkg/storage/etcd3/compact_test.go @@ -46,3 +46,37 @@ func TestCompact(t *testing.T) { t.Errorf("Expecting ErrCompacted, but get=%v", err) } } + +// TestCompactConflict tests the case where two compactors (call them C1, C2) try to compact the etcd cluster at the same +// logical time. +// - C1 compacts first. It will succeed. +// - C2 compacts after. It will fail, but it will get the latest logical time, which should be larger by one. +func TestCompactConflict(t *testing.T) { + cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1}) + defer cluster.Terminate(t) + client := cluster.RandClient() + ctx := context.Background() + + putResp, err := client.Put(ctx, "/somekey", "data") + if err != nil { + t.Fatalf("Put failed: %v", err) + } + + // Compact first. It will do the compaction and return the compact time, which is incremented by 1. + curTime, _, err := compact(ctx, client, 0, putResp.Header.Revision) + if err != nil { + t.Fatalf("compact failed: %v", err) + } + if curTime != 1 { + t.Errorf("Expect current logical time = 1, get = %v", curTime) + } + + // Compact again with the same parameters. It won't do the compaction but will return the latest compact time. + curTime2, _, err := compact(ctx, client, 0, putResp.Header.Revision) + if err != nil { + t.Fatalf("compact failed: %v", err) + } + if curTime != curTime2 { + t.Errorf("Unexpected curTime (%v) != curTime2 (%v)", curTime, curTime2) + } +}