better handle etcd compaction in multi-apiserver

Mirror of https://github.com/k3s-io/kubernetes.git
commit 186b4858b4
parent 87d3c74715
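In brief: the compactor previously tracked only a local revision, so every apiserver compacted etcd independently. This change coordinates compaction across apiservers through a shared key, compact_rev_key, whose etcd Version acts as a logical clock: each compactor CASes on that version before compacting, so only one compaction happens per interval cluster-wide.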
@@ -17,6 +17,7 @@ limitations under the License.
 package etcd3
 
 import (
+	"strconv"
 	"sync"
 	"time"
 
@@ -25,7 +26,10 @@ import (
 	"golang.org/x/net/context"
 )
 
-const compactInterval = 10 * time.Minute
+const (
+	compactInterval = 10 * time.Minute
+	compactRevKey   = "compact_rev_key"
+)
 
 var (
 	endpointsMapMu sync.Mutex
@@ -36,19 +40,16 @@ func init() {
 	endpointsMap = make(map[string]struct{})
 }
 
-// StartCompactor starts a compactor in the background in order to compact keys
-// older than fixed time.
-// We need to compact keys because we can't let on disk data grow forever.
-// We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst.
-// TODO: We might keep a longer history (12h) in the future once storage API can take
-// advantage of multi-version key.
+// StartCompactor starts a compactor in the background to compact old versions of keys that are no longer needed.
+// By default, we keep the most recent 10 minutes of data and compact versions older than that.
+// This should be enough for slow watchers and to tolerate bursts.
+// TODO: We might keep a longer history (12h) in the future once the storage API can take advantage of past versions of keys.
 func StartCompactor(ctx context.Context, client *clientv3.Client) {
 	endpointsMapMu.Lock()
 	defer endpointsMapMu.Unlock()
 
-	// We can't have multiple compaction jobs for the same cluster.
+	// Within one process, we run only one compactor per cluster.
 	// Currently we rely on endpoints to differentiate clusters.
-	var emptyStruct struct{}
 	for _, ep := range client.Endpoints() {
 		if _, ok := endpointsMap[ep]; ok {
 			glog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints())
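For context, a minimal usage sketch, not part of this diff: StartCompactor is meant to be called once when an apiserver wires up its etcd3 storage, and repeated calls for the same endpoints are deduplicated via endpointsMap. The etcd3 import path, the etcd address, and the clientv3 import path below are assumptions for illustration.

package main

import (
	"time"

	"github.com/coreos/etcd/clientv3" // clientv3 import path assumed for the era of this commit
	"golang.org/x/net/context"

	"k8s.io/kubernetes/pkg/storage/etcd3" // hypothetical import path for this package
)

func main() {
	client, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"}, // assumed local etcd
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer client.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// The first call starts one background compactor for these endpoints;
	// the second call is a no-op (logged at V(4)).
	etcd3.StartCompactor(ctx, client)
	etcd3.StartCompactor(ctx, client)

	select {} // keep the process alive so the compactor keeps running
}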
@@ -56,18 +57,59 @@ func StartCompactor(ctx context.Context, client *clientv3.Client) {
 		}
 	}
 	for _, ep := range client.Endpoints() {
-		endpointsMap[ep] = emptyStruct
+		endpointsMap[ep] = struct{}{}
 	}
 
 	go compactor(ctx, client, compactInterval)
 }
 
 // compactor periodically compacts historical versions of keys in etcd.
-// After compaction, old versions of keys set before given interval will be gone.
-// Any API call for the old versions of keys will return error.
-// interval: the interval between each compaction. The first compaction happens after "interval".
+// It compacts keys with versions older than the given interval.
+// In other words, after compaction it only retains keys set during the last interval.
+// Any API call for older versions of keys will return an error.
+// interval is the time between compactions. The first compaction happens after "interval".
 func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
-	var curRev int64
+	// Technical definitions:
+	// We have a special key in etcd defined as *compactRevKey*.
+	// compactRevKey's value is set to the string form of the last compacted revision.
+	// compactRevKey's version is used as logical time for comparison. The version is referred to as the compact time.
+	// Initially, because the key doesn't exist, the compact time (version) is 0.
+	//
+	// Algorithm:
+	// - Compare to see if (local compact_time) = (remote compact_time).
+	// - If yes, increment both local and remote compact_time, and do a compaction.
+	// - If not, set local to remote compact_time.
+	//
+	// Technical details/insights:
+	//
+	// The protocol here is lease based. If one compactor CASes successfully, the others learn of it when their own
+	// CAS fails later, and they try again in 10 minutes. If an apiserver crashes, another one "takes over" the lease.
+	//
+	// For example, in the following diagram, compactor C1 does compactions at t1 and t2. Another compactor C2
+	// at t1' (t1 < t1' < t2) would fail its CAS, set its known oldRev to the rev at t1, and try again at t2' (t2' > t2).
+	// If C1 crashed and didn't compact at t2, C2 would CAS successfully at t2'.
+	//
+	//               oldRev(t2)     curRev(t2)
+	//                                 +
+	//   oldRev        curRev          |
+	//     +             +             |
+	//     |             |             |
+	//     |             |    t1'      |      t2'
+	// +---v-------------v----^--------v------^---->
+	//     t0            t1            t2
+	//
+	// We have the guarantees:
+	// - in normal cases, the interval between compactions is 10 minutes.
+	// - in failover, the interval is >10m and <20m.
+	//
+	// FAQ:
+	// - What if time is not accurate? We don't care, as long as someone does the compaction. Atomicity is ensured by the
+	//   etcd API.
+	// - What happens under heavy load? Each apiserver still does only one compaction
+	//   every 10 minutes, so this is very unlikely to affect, or be affected by, server load.
+
+	var compactTime int64
+	var rev int64
 	var err error
 	for {
 		select {
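To make the algorithm above concrete, here is a standalone sketch of one CAS round that uses a key's Version as the logical compact time. The etcd address and the clientv3 import path are assumptions; the key name matches compactRevKey above.

package main

import (
	"fmt"
	"strconv"
	"time"

	"github.com/coreos/etcd/clientv3" // clientv3 import path assumed for the era of this commit
	"golang.org/x/net/context"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"}, // assumed local etcd
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	const key = "compact_rev_key"
	var localTime int64 // local compact time; 0 until the key exists
	var rev int64       // revision we would compact; 0 on bootstrap

	// One CAS round: if our local compact time equals the key's Version, Put
	// the key (each Put increments Version by exactly one), claiming this
	// compaction round. Otherwise read the key to adopt the remote compact time.
	resp, err := cli.Txn(context.Background()).If(
		clientv3.Compare(clientv3.Version(key), "=", localTime),
	).Then(
		clientv3.OpPut(key, strconv.FormatInt(rev, 10)),
	).Else(
		clientv3.OpGet(key),
	).Commit()
	if err != nil {
		panic(err)
	}

	if resp.Succeeded {
		localTime++ // we won this round; the real compactor now calls Compact(rev) unless rev == 0
		fmt.Println("won CAS, local compact time is now", localTime)
	} else {
		// Another compactor won this round; adopt its compact time and retry
		// after the next interval.
		kvs := resp.Responses[0].GetResponseRange().Kvs
		if len(kvs) > 0 { // empty only if the key were deleted out of band
			localTime = kvs[0].Version
		}
		fmt.Println("lost CAS, adopted remote compact time", localTime)
	}
	rev = resp.Header.Revision // the revision this round would pass to the next one
	_ = rev
}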
@@ -76,29 +118,44 @@ func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
 			return
 		}
 
-		curRev, err = compact(ctx, client, curRev)
+		compactTime, rev, err = compact(ctx, client, compactTime, rev)
 		if err != nil {
-			glog.Error(err)
+			glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err)
 			continue
 		}
 	}
 }
 
 // compact compacts etcd store and returns current rev.
-// If it couldn't get current revision, the old rev will be returned.
-func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) {
-	resp, err := client.Get(ctx, "/")
+// It returns the current compact time and global revision if no error occurred.
+// Note that a CAS failure does not incur an error.
+func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) {
+	resp, err := client.KV.Txn(ctx).If(
+		clientv3.Compare(clientv3.Version(compactRevKey), "=", t),
+	).Then(
+		clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expected side effect: increment Version
+	).Else(
+		clientv3.OpGet(compactRevKey),
+	).Commit()
 	if err != nil {
-		return oldRev, err
+		return t, rev, err
 	}
 
 	curRev := resp.Header.Revision
-	if oldRev == 0 {
-		return curRev, nil
+
+	if !resp.Succeeded {
+		curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version
+		return curTime, curRev, nil
 	}
-	err = client.Compact(ctx, oldRev)
-	if err != nil {
-		return curRev, err
+	curTime := t + 1
+
+	if rev == 0 {
+		// We don't compact on bootstrap.
+		return curTime, curRev, nil
 	}
-	glog.Infof("etcd: Compacted rev %d, endpoints %v", oldRev, client.Endpoints())
-	return curRev, nil
+	if err = client.Compact(ctx, rev); err != nil {
+		return curTime, curRev, err
+	}
+	glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints())
+	return curTime, curRev, nil
 }
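A quick walkthrough of the protocol as implemented above: suppose apiservers A and B both start with compactTime = 0 and rev = 0. A's first compact call wins the CAS (the key's Version is still 0), Puts compactRevKey so its Version becomes 1, and returns compact time 1 without compacting, since rev == 0 on bootstrap. B's first call then loses the CAS and also converges on compact time 1, recording the current revision. One interval later, whichever apiserver fires first wins the CAS with t = 1 and compacts the revision it recorded in the previous round; the other merely refreshes its state. If the winner crashes, the survivor wins at its own next tick, which is why the failover gap stays above 10 and below 20 minutes.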
@@ -36,7 +36,7 @@ func TestCompact(t *testing.T) {
 		t.Fatalf("Put failed: %v", err)
 	}
 
-	_, err = compact(ctx, client, putResp.Header.Revision)
+	_, _, err = compact(ctx, client, 0, putResp.Header.Revision)
 	if err != nil {
 		t.Fatalf("compact failed: %v", err)
 	}
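A hypothetical companion test, not part of this commit, could exercise the CAS-fail path with the same harness as TestCompact: a compactor holding a stale compact time should adopt the remote compact time instead of returning an error.

	// Sketch only; assumes ctx, client, and putResp as in TestCompact above.
	curTime, _, err := compact(ctx, client, 0, 0) // first round: CAS succeeds, no compaction on bootstrap
	if err != nil {
		t.Fatalf("compact failed: %v", err)
	}
	staleTime, _, err := compact(ctx, client, 0, putResp.Header.Revision) // stale t=0: CAS fails
	if err != nil {
		t.Fatalf("compact failed: %v", err)
	}
	if staleTime != curTime {
		t.Errorf("expected stale compactor to adopt remote compact time %d, got %d", curTime, staleTime)
	}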