Merge pull request #28202 from hongchaodeng/cpa

Automatic merge from submit-queue

Better handle etcd compaction in multi-apiserver

What:
- Change etcd compaction routine to better handle multi-apiserver (HA) scenarios. See the docs in code.
k8s-merge-robot 2016-07-15 13:33:58 -07:00 committed by GitHub
commit d3dbe9c716
2 changed files with 119 additions and 28 deletions
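
At its core, the change elects at most one compactor per 10-minute interval by doing a compare-and-swap on a shared etcd key, whose per-key Version serves as a logical clock. Before the diff itself, here is a minimal standalone sketch of that protocol; the package and function names are made up for illustration, and the bootstrap and retry handling of the real code is omitted:

package compactsketch

import (
	"strconv"

	"github.com/coreos/etcd/clientv3"
	"golang.org/x/net/context"
)

// tryCompact sketches the CAS protocol: a Put to the shared key bumps its
// Version by one, so the Txn below succeeds for exactly one caller per tick.
func tryCompact(ctx context.Context, client *clientv3.Client, localTime, rev int64) (int64, error) {
	resp, err := client.KV.Txn(ctx).If(
		clientv3.Compare(clientv3.Version("compact_rev_key"), "=", localTime),
	).Then(
		clientv3.OpPut("compact_rev_key", strconv.FormatInt(rev, 10)), // win: bump the logical clock
	).Else(
		clientv3.OpGet("compact_rev_key"), // lose: read the winner's clock
	).Commit()
	if err != nil {
		return localTime, err
	}
	if !resp.Succeeded {
		// Another apiserver compacted this interval; resync our clock and retry later.
		return resp.Responses[0].GetResponseRange().Kvs[0].Version, nil
	}
	// We won the CAS: compact all revisions older than rev.
	if err = client.Compact(ctx, rev); err != nil {
		return localTime + 1, err
	}
	return localTime + 1, nil
}

Losers simply adopt the winner's clock and retry on the next tick, which is what bounds the failover gap between 10 and 20 minutes.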


@@ -17,6 +17,7 @@ limitations under the License.
 package etcd3
 import (
+    "strconv"
     "sync"
     "time"
@@ -25,7 +26,10 @@ import (
     "golang.org/x/net/context"
 )
-const compactInterval = 10 * time.Minute
+const (
+    compactInterval = 10 * time.Minute
+    compactRevKey   = "compact_rev_key"
+)
 var (
     endpointsMapMu sync.Mutex
@@ -36,19 +40,16 @@ func init() {
     endpointsMap = make(map[string]struct{})
 }
-// StartCompactor starts a compactor in the background in order to compact keys
-// older than fixed time.
-// We need to compact keys because we can't let on disk data grow forever.
-// We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst.
-// TODO: We might keep a longer history (12h) in the future once storage API can take
-// advantage of multi-version key.
+// StartCompactor starts a compactor in the background to compact old versions of keys that are no longer needed.
+// By default, we keep the most recent 10 minutes of data and compact versions more than 10 minutes old.
+// That should be enough for slow watchers and to tolerate bursts.
+// TODO: We might keep a longer history (12h) in the future once the storage API can take advantage of past versions of keys.
 func StartCompactor(ctx context.Context, client *clientv3.Client) {
     endpointsMapMu.Lock()
     defer endpointsMapMu.Unlock()
-    // We can't have multiple compaction jobs for the same cluster.
+    // In one process, we can have only one compactor for one cluster.
     // Currently we rely on endpoints to differentiate clusters.
-    var emptyStruct struct{}
     for _, ep := range client.Endpoints() {
         if _, ok := endpointsMap[ep]; ok {
             glog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints())
@@ -56,18 +57,59 @@ func StartCompactor(ctx context.Context, client *clientv3.Client) {
         }
     }
     for _, ep := range client.Endpoints() {
-        endpointsMap[ep] = emptyStruct
+        endpointsMap[ep] = struct{}{}
     }
     go compactor(ctx, client, compactInterval)
 }
 // compactor periodically compacts historical versions of keys in etcd.
-// After compaction, old versions of keys set before given interval will be gone.
-// Any API call for the old versions of keys will return error.
-// interval: the interval between each compaction. The first compaction happens after "interval".
+// It will compact keys with versions older than the given interval.
+// In other words, after compaction, the store will only contain keys set during the last interval.
+// Any API call for older versions of keys will return an error.
+// Interval is the time interval between each compaction. The first compaction happens after "interval".
 func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
-    var curRev int64
+    // Technical definitions:
+    // We have a special key in etcd defined as *compactRevKey*.
+    // compactRevKey's value is set to the string form of the last compacted revision.
+    // compactRevKey's version is used as the logical time for comparison. The version is referred to as the compact time.
+    // Initially, because the key doesn't exist, the compact time (version) is 0.
+    //
+    // Algorithm:
+    // - Compare to see if (local compact_time) = (remote compact_time).
+    // - If yes, increment both the local and remote compact_time, and do a compaction.
+    // - If not, set local to the remote compact_time.
+    //
+    // Technical details/insights:
+    //
+    // The protocol here is lease-based. If one compactor CASes successfully, the others learn of it when their own
+    // CAS fails later, and they try again in 10 minutes. If an apiserver crashes, another one "takes over" the lease.
+    //
+    // For example, in the following diagram, compactor C1 does compaction at t1 and t2. Another compactor C2
+    // at t1' (t1 < t1' < t2) fails its CAS, sets its known oldRev to the rev at t1, and tries again at t2' (t2' > t2).
+    // If C1 crashed and didn't compact at t2, C2 would CAS successfully at t2'.
+    //
+    //                 oldRev(t2)      curRev(t2)
+    //                                     +
+    //     oldRev      curRev              |
+    //       +           +                 |
+    //       |           |                 |
+    //       |           |      t1'        |      t2'
+    //   +---v-----------v------^----------v------^---->
+    //       t0          t1                t2
+    //
+    // This gives the following guarantees:
+    // - in normal cases, the compaction interval is 10 minutes.
+    // - in failover, the interval is >10m and <20m.
+    //
+    // FAQ:
+    // - What if the clocks are not accurate? We don't care, as long as someone does the compaction;
+    //   atomicity is ensured by the etcd API.
+    // - What happens under heavy load? Each apiserver still does only one compaction every 10 minutes.
+    //   This is very unlikely to affect, or be affected by, server load.
+    var compactTime int64
+    var rev int64
+    var err error
     for {
         select {
@@ -76,29 +118,44 @@ func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
             return
         }
-        curRev, err = compact(ctx, client, curRev)
+        compactTime, rev, err = compact(ctx, client, compactTime, rev)
         if err != nil {
-            glog.Error(err)
+            glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err)
             continue
         }
     }
 }
 // compact compacts etcd store and returns current rev.
-// If it couldn't get current revision, the old rev will be returned.
-func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) {
-    resp, err := client.Get(ctx, "/")
+// It will return the current compact time and global revision if no error occurred.
+// Note that a CAS failure will not incur any error.
+func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) {
+    resp, err := client.KV.Txn(ctx).If(
+        clientv3.Compare(clientv3.Version(compactRevKey), "=", t),
+    ).Then(
+        clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expected side effect: increment Version
+    ).Else(
+        clientv3.OpGet(compactRevKey),
+    ).Commit()
     if err != nil {
-        return oldRev, err
+        return t, rev, err
     }
     curRev := resp.Header.Revision
-    if oldRev == 0 {
-        return curRev, nil
+    if !resp.Succeeded {
+        curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version
+        return curTime, curRev, nil
     }
-    err = client.Compact(ctx, oldRev)
-    if err != nil {
-        return curRev, err
-    }
-    glog.Infof("etcd: Compacted rev %d, endpoints %v", oldRev, client.Endpoints())
-    return curRev, nil
+    curTime := t + 1
+    if rev == 0 {
+        // We don't compact on bootstrap.
+        return curTime, curRev, nil
+    }
+    if err = client.Compact(ctx, rev); err != nil {
+        return curTime, curRev, err
+    }
+    glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints())
+    return curTime, curRev, nil
 }
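
For completeness, a hedged sketch of how a caller might wire the compactor up. The endpoint address, the surrounding program, and the import path for the etcd3 storage package are assumptions for illustration; a real apiserver builds its client elsewhere:

package main

import (
	"github.com/coreos/etcd/clientv3"
	"golang.org/x/net/context"

	"k8s.io/kubernetes/pkg/storage/etcd3" // assumed import path
)

func main() {
	client, err := clientv3.New(clientv3.Config{
		Endpoints: []string{"http://127.0.0.1:2379"}, // illustrative endpoint
	})
	if err != nil {
		panic(err)
	}
	defer client.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Safe to call from every apiserver: within one process the endpointsMap
	// guard makes repeated calls for the same endpoints a no-op, and across
	// processes the compactRevKey CAS allows only one compaction per interval.
	etcd3.StartCompactor(ctx, client)

	// ... serve requests; cancel() stops the compactor goroutine on shutdown.
}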


@@ -36,7 +36,7 @@ func TestCompact(t *testing.T) {
         t.Fatalf("Put failed: %v", err)
     }
-    _, err = compact(ctx, client, putResp.Header.Revision)
+    _, _, err = compact(ctx, client, 0, putResp.Header.Revision)
     if err != nil {
         t.Fatalf("compact failed: %v", err)
     }
@@ -46,3 +46,37 @@ func TestCompact(t *testing.T) {
         t.Errorf("Expecting ErrCompacted, but get=%v", err)
     }
 }
+
+// TestCompactConflict tests two compactors (call them C1 and C2) trying to compact the etcd cluster at the same
+// logical time.
+// - C1 compacts first. It will succeed.
+// - C2 compacts afterwards. It will fail, but it will get the latest logical time, which should be larger by one.
+func TestCompactConflict(t *testing.T) {
+    cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
+    defer cluster.Terminate(t)
+    client := cluster.RandClient()
+    ctx := context.Background()
+
+    putResp, err := client.Put(ctx, "/somekey", "data")
+    if err != nil {
+        t.Fatalf("Put failed: %v", err)
+    }
+
+    // Compact first. This does the compaction and returns a compact time incremented by 1.
+    curTime, _, err := compact(ctx, client, 0, putResp.Header.Revision)
+    if err != nil {
+        t.Fatalf("compact failed: %v", err)
+    }
+    if curTime != 1 {
+        t.Errorf("Expect current logical time = 1, get = %v", curTime)
+    }
+
+    // Compact again with the same parameters. This won't compact, but it returns the latest compact time.
+    curTime2, _, err := compact(ctx, client, 0, putResp.Header.Revision)
+    if err != nil {
+        t.Fatalf("compact failed: %v", err)
+    }
+    if curTime != curTime2 {
+        t.Errorf("Unexpected curTime (%v) != curTime2 (%v)", curTime, curTime2)
+    }
+}
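
A note on the etcd behavior these tests lean on: each Put to a key increments that key's Version by exactly one, independent of the cluster-wide revision in resp.Header.Revision, which is what makes compactRevKey's Version usable as a logical clock. A small illustrative check (the key name is arbitrary):

package compactsketch

import (
	"fmt"

	"github.com/coreos/etcd/clientv3"
	"golang.org/x/net/context"
)

// demoVersionClock shows the per-key Version acting as a counter: after three
// Puts to the same key, its Version is 3. This counter is what compact()
// compares against as its logical compact time.
func demoVersionClock(ctx context.Context, client *clientv3.Client) error {
	for i := 0; i < 3; i++ {
		if _, err := client.Put(ctx, "demo_key", "x"); err != nil {
			return err
		}
	}
	resp, err := client.Get(ctx, "demo_key")
	if err != nil {
		return err
	}
	fmt.Println("version:", resp.Kvs[0].Version) // prints: version: 3
	return nil
}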