better handle etcd compaction in multi-apiserver

Mirror of https://github.com/k3s-io/kubernetes.git
commit 186b4858b4
parent 87d3c74715
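In brief: the compactor previously tracked only a local revision, so every apiserver compacted etcd independently. This change coordinates compaction across apiservers through a shared key, compact_rev_key, whose etcd Version acts as a logical clock: each compactor CASes on that version before compacting, so only one compaction happens per interval cluster-wide.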
@@ -17,6 +17,7 @@ limitations under the License.
 package etcd3
 
 import (
+	"strconv"
 	"sync"
 	"time"
 
@@ -25,7 +26,10 @@ import (
 	"golang.org/x/net/context"
 )
 
-const compactInterval = 10 * time.Minute
+const (
+	compactInterval = 10 * time.Minute
+	compactRevKey   = "compact_rev_key"
+)
 
 var (
 	endpointsMapMu sync.Mutex
@@ -36,19 +40,16 @@ func init() {
 	endpointsMap = make(map[string]struct{})
 }
 
-// StartCompactor starts a compactor in the background in order to compact keys
-// older than fixed time.
-// We need to compact keys because we can't let on disk data grow forever.
-// We save the most recent 10 minutes data. It should be enough for slow watchers and to tolerate burst.
-// TODO: We might keep a longer history (12h) in the future once storage API can take
-// advantage of multi-version key.
+// StartCompactor starts a compactor in the background to compact old versions of keys that are no longer needed.
+// By default, we keep the most recent 10 minutes of data and compact versions older than that.
+// This should be enough for slow watchers and to tolerate bursts.
+// TODO: We might keep a longer history (12h) in the future once the storage API can take advantage of past versions of keys.
 func StartCompactor(ctx context.Context, client *clientv3.Client) {
 	endpointsMapMu.Lock()
 	defer endpointsMapMu.Unlock()
 
-	// We can't have multiple compaction jobs for the same cluster.
+	// Within one process, we run only one compactor per cluster.
 	// Currently we rely on endpoints to differentiate clusters.
-	var emptyStruct struct{}
 	for _, ep := range client.Endpoints() {
 		if _, ok := endpointsMap[ep]; ok {
 			glog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints())
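For context, a minimal usage sketch, not part of this diff: StartCompactor is meant to be called once when an apiserver wires up its etcd3 storage, and repeated calls for the same endpoints are deduplicated via endpointsMap. The etcd3 import path, the etcd address, and the clientv3 import path below are assumptions for illustration.

package main

import (
	"time"

	"github.com/coreos/etcd/clientv3" // clientv3 import path assumed for the era of this commit
	"golang.org/x/net/context"

	"k8s.io/kubernetes/pkg/storage/etcd3" // hypothetical import path for this package
)

func main() {
	client, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"}, // assumed local etcd
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer client.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// The first call starts one background compactor for these endpoints;
	// the second call is a no-op (logged at V(4)).
	etcd3.StartCompactor(ctx, client)
	etcd3.StartCompactor(ctx, client)

	select {} // keep the process alive so the compactor keeps running
}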
@@ -56,18 +57,59 @@ func StartCompactor(ctx context.Context, client *clientv3.Client) {
 		}
 	}
 	for _, ep := range client.Endpoints() {
-		endpointsMap[ep] = emptyStruct
+		endpointsMap[ep] = struct{}{}
 	}
 
 	go compactor(ctx, client, compactInterval)
 }
 
 // compactor periodically compacts historical versions of keys in etcd.
-// After compaction, old versions of keys set before given interval will be gone.
-// Any API call for the old versions of keys will return error.
-// interval: the interval between each compaction. The first compaction happens after "interval".
+// It compacts keys with versions older than the given interval.
+// In other words, after compaction it only retains keys set during the last interval.
+// Any API call for older versions of keys will return an error.
+// interval is the time between compactions. The first compaction happens after "interval".
 func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
-	var curRev int64
+	// Technical definitions:
+	// We have a special key in etcd defined as *compactRevKey*.
+	// compactRevKey's value is set to the string form of the last compacted revision.
+	// compactRevKey's version is used as logical time for comparison. The version is referred to as the compact time.
+	// Initially, because the key doesn't exist, the compact time (version) is 0.
+	//
+	// Algorithm:
+	// - Compare to see if (local compact_time) = (remote compact_time).
+	// - If yes, increment both local and remote compact_time, and do a compaction.
+	// - If not, set local to remote compact_time.
+	//
+	// Technical details/insights:
+	//
+	// The protocol here is lease based. If one compactor CASes successfully, the others learn of it when their own
+	// CAS fails later, and they try again in 10 minutes. If an apiserver crashes, another one "takes over" the lease.
+	//
+	// For example, in the following diagram, compactor C1 does compactions at t1 and t2. Another compactor C2
+	// at t1' (t1 < t1' < t2) would fail its CAS, set its known oldRev to the rev at t1, and try again at t2' (t2' > t2).
+	// If C1 crashed and didn't compact at t2, C2 would CAS successfully at t2'.
+	//
+	//               oldRev(t2)     curRev(t2)
+	//                                 +
+	//   oldRev        curRev          |
+	//     +             +             |
+	//     |             |             |
+	//     |             |    t1'      |      t2'
+	// +---v-------------v----^--------v------^---->
+	//     t0            t1            t2
+	//
+	// We have the guarantees:
+	// - in normal cases, the interval between compactions is 10 minutes.
+	// - in failover, the interval is >10m and <20m.
+	//
+	// FAQ:
+	// - What if time is not accurate? We don't care, as long as someone does the compaction. Atomicity is ensured by the
+	//   etcd API.
+	// - What happens under heavy load? Each apiserver still does only one compaction
+	//   every 10 minutes, so this is very unlikely to affect, or be affected by, server load.
+
+	var compactTime int64
+	var rev int64
 	var err error
 	for {
 		select {
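To make the algorithm above concrete, here is a standalone sketch of one CAS round that uses a key's Version as the logical compact time. The etcd address and the clientv3 import path are assumptions; the key name matches compactRevKey above.

package main

import (
	"fmt"
	"strconv"
	"time"

	"github.com/coreos/etcd/clientv3" // clientv3 import path assumed for the era of this commit
	"golang.org/x/net/context"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"}, // assumed local etcd
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	const key = "compact_rev_key"
	var localTime int64 // local compact time; 0 until the key exists
	var rev int64       // revision we would compact; 0 on bootstrap

	// One CAS round: if our local compact time equals the key's Version, Put
	// the key (each Put increments Version by exactly one), claiming this
	// compaction round. Otherwise read the key to adopt the remote compact time.
	resp, err := cli.Txn(context.Background()).If(
		clientv3.Compare(clientv3.Version(key), "=", localTime),
	).Then(
		clientv3.OpPut(key, strconv.FormatInt(rev, 10)),
	).Else(
		clientv3.OpGet(key),
	).Commit()
	if err != nil {
		panic(err)
	}

	if resp.Succeeded {
		localTime++ // we won this round; the real compactor now calls Compact(rev) unless rev == 0
		fmt.Println("won CAS, local compact time is now", localTime)
	} else {
		// Another compactor won this round; adopt its compact time and retry
		// after the next interval.
		kvs := resp.Responses[0].GetResponseRange().Kvs
		if len(kvs) > 0 { // empty only if the key were deleted out of band
			localTime = kvs[0].Version
		}
		fmt.Println("lost CAS, adopted remote compact time", localTime)
	}
	rev = resp.Header.Revision // the revision this round would pass to the next one
	_ = rev
}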
@@ -76,29 +118,44 @@ func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
 			return
 		}
 
-		curRev, err = compact(ctx, client, curRev)
+		compactTime, rev, err = compact(ctx, client, compactTime, rev)
 		if err != nil {
-			glog.Error(err)
+			glog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err)
 			continue
 		}
 	}
 }
 
 // compact compacts etcd store and returns current rev.
-// If it couldn't get current revision, the old rev will be returned.
-func compact(ctx context.Context, client *clientv3.Client, oldRev int64) (int64, error) {
-	resp, err := client.Get(ctx, "/")
+// It returns the current compact time and global revision if no error occurred.
+// Note that a CAS failure does not incur an error.
+func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) {
+	resp, err := client.KV.Txn(ctx).If(
+		clientv3.Compare(clientv3.Version(compactRevKey), "=", t),
+	).Then(
+		clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expected side effect: increment Version
+	).Else(
+		clientv3.OpGet(compactRevKey),
+	).Commit()
 	if err != nil {
-		return oldRev, err
+		return t, rev, err
 	}
 
 	curRev := resp.Header.Revision
-	if oldRev == 0 {
-		return curRev, nil
+
+	if !resp.Succeeded {
+		curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version
+		return curTime, curRev, nil
 	}
-	err = client.Compact(ctx, oldRev)
-	if err != nil {
-		return curRev, err
+	curTime := t + 1
+
+	if rev == 0 {
+		// We don't compact on bootstrap.
+		return curTime, curRev, nil
 	}
-	glog.Infof("etcd: Compacted rev %d, endpoints %v", oldRev, client.Endpoints())
-	return curRev, nil
+	if err = client.Compact(ctx, rev); err != nil {
+		return curTime, curRev, err
+	}
+	glog.Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints())
+	return curTime, curRev, nil
 }
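A quick walkthrough of the protocol as implemented above: suppose apiservers A and B both start with compactTime = 0 and rev = 0. A's first compact call wins the CAS (the key's Version is still 0), Puts compactRevKey so its Version becomes 1, and returns compact time 1 without compacting, since rev == 0 on bootstrap. B's first call then loses the CAS and also converges on compact time 1, recording the current revision. One interval later, whichever apiserver fires first wins the CAS with t = 1 and compacts the revision it recorded in the previous round; the other merely refreshes its state. If the winner crashes, the survivor wins at its own next tick, which is why the failover gap stays above 10 and below 20 minutes.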
@@ -36,7 +36,7 @@ func TestCompact(t *testing.T) {
 		t.Fatalf("Put failed: %v", err)
 	}
 
-	_, err = compact(ctx, client, putResp.Header.Revision)
+	_, _, err = compact(ctx, client, 0, putResp.Header.Revision)
 	if err != nil {
 		t.Fatalf("compact failed: %v", err)
 	}
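A hypothetical companion test, not part of this commit, could exercise the CAS-fail path with the same harness as TestCompact: a compactor holding a stale compact time should adopt the remote compact time instead of returning an error.

	// Sketch only; assumes ctx, client, and putResp as in TestCompact above.
	curTime, _, err := compact(ctx, client, 0, 0) // first round: CAS succeeds, no compaction on bootstrap
	if err != nil {
		t.Fatalf("compact failed: %v", err)
	}
	staleTime, _, err := compact(ctx, client, 0, putResp.Header.Revision) // stale t=0: CAS fails
	if err != nil {
		t.Fatalf("compact failed: %v", err)
	}
	if staleTime != curTime {
		t.Errorf("expected stale compactor to adopt remote compact time %d, got %d", curTime, staleTime)
	}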