Maintain 10 minute recovery threshold for container backoff

Signed-off-by: Laura Lorenz <lauralorenz@google.com>

Kubernetes-commit: a0b83a774102a6d8ce03ce03c9d0431b44559019
This commit is contained in:
Laura Lorenz 2024-10-26 02:17:23 +00:00 committed by Kubernetes Publisher
parent c57e0a82c3
commit ab2cdceca1
2 changed files with 143 additions and 10 deletions

View File

@ -32,7 +32,12 @@ type backoffEntry struct {
type Backoff struct {
sync.RWMutex
Clock clock.Clock
Clock clock.Clock
// HasExpiredFunc controls the logic that determines whether the backoff
// counter should be reset, and when to GC old backoff entries. If nil, the
// default hasExpired function restarts the backoff factor from the beginning
// once more than 2*maxDuration has elapsed since the entry's last update.
HasExpiredFunc func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool
defaultDuration time.Duration
maxDuration time.Duration
perItemBackoff map[string]*backoffEntry
@ -93,7 +98,7 @@ func (p *Backoff) Next(id string, eventTime time.Time) {
p.Lock()
defer p.Unlock()
entry, ok := p.perItemBackoff[id]
if !ok || hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if !ok || p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
entry = p.initEntryUnsafe(id)
entry.backoff += p.jitter(entry.backoff)
} else {
@ -119,7 +124,7 @@ func (p *Backoff) IsInBackOffSince(id string, eventTime time.Time) bool {
if !ok {
return false
}
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
return false
}
return p.Clock.Since(eventTime) < entry.backoff
@ -133,21 +138,21 @@ func (p *Backoff) IsInBackOffSinceUpdate(id string, eventTime time.Time) bool {
if !ok {
return false
}
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
return false
}
return eventTime.Sub(entry.lastUpdate) < entry.backoff
}
// Garbage collect records that have aged past maxDuration. Backoff users are expected
// to invoke this periodically.
// Garbage collect records that have aged past their expiration, which defaults
// to 2*maxDuration (see hasExpired godoc). Backoff users are expected to invoke
// this periodically.
func (p *Backoff) GC() {
p.Lock()
defer p.Unlock()
now := p.Clock.Now()
for id, entry := range p.perItemBackoff {
if now.Sub(entry.lastUpdate) > p.maxDuration*2 {
// GC when entry has not been updated for 2*maxDuration
if p.hasExpired(now, entry.lastUpdate, p.maxDuration) {
delete(p.perItemBackoff, id)
}
}
@ -174,7 +179,10 @@ func (p *Backoff) jitter(delay time.Duration) time.Duration {
return time.Duration(p.rand.Float64() * p.maxJitterFactor * float64(delay))
}
// After 2*maxDuration we restart the backoff factor to the beginning
func hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
// hasExpired reports whether an entry last updated at lastUpdate should be
// treated as expired as of eventTime.
//
// When the Backoff carries a HasExpiredFunc, the decision is delegated to it
// unchanged. Otherwise the entry is considered stable, and its backoff factor
// restarts from the beginning, once strictly more than twice maxDuration has
// elapsed between lastUpdate and eventTime.
func (p *Backoff) hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
	if custom := p.HasExpiredFunc; custom != nil {
		return custom(eventTime, lastUpdate, maxDuration)
	}

	// Default policy: quiet for longer than 2*maxDuration means expired.
	elapsed := eventTime.Sub(lastUpdate)
	return elapsed > 2*maxDuration
}

View File

@ -125,6 +125,67 @@ func TestBackoffGC(t *testing.T) {
}
}
// TestAlternateBackoffGC verifies that GC removes an entry only after the
// expiration policy in effect says it has expired: the default policy when
// HasExpiredFunc is left nil, and the supplied predicate otherwise.
//
// Each case runs as its own subtest so that failures are reported per case
// and individual cases can be selected with -run.
func TestAlternateBackoffGC(t *testing.T) {
	cases := []struct {
		name string
		// hasExpiredFunc, when non-nil, is installed as Backoff.HasExpiredFunc.
		hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
		maxDuration    time.Duration
		// nonExpiredTime is the first clock step; the entry must survive GC after it.
		nonExpiredTime time.Duration
		// expiredTime is a second, additional clock step; the entry must be
		// collected by GC after it.
		expiredTime time.Duration
	}{
		{
			name:           "default GC",
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    101 * time.Second,
		},
		{
			name: "GC later than 2*maxDuration",
			hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
				return eventTime.Sub(lastUpdate) >= 200*time.Second
			},
			maxDuration:    50 * time.Second,
			nonExpiredTime: 101 * time.Second,
			expiredTime:    501 * time.Second,
		},
	}

	for _, tt := range cases {
		// The subtest runs synchronously (no t.Parallel), so capturing tt is
		// safe regardless of the Go version's loop-variable semantics.
		t.Run(tt.name, func(t *testing.T) {
			tc := testingclock.NewFakeClock(time.Now())
			base := time.Second
			id := tt.name

			b := NewFakeBackOff(base, tt.maxDuration, tc)
			if tt.hasExpiredFunc != nil {
				b.HasExpiredFunc = tt.hasExpiredFunc
			}

			// initialize backoff
			lastUpdate := tc.Now()
			b.Next(id, lastUpdate)

			// increment to a time within GC expiration
			tc.Step(tt.nonExpiredTime)
			b.GC()

			// confirm we did not GC this entry
			if _, found := b.perItemBackoff[id]; !found {
				t.Errorf("[%s] expected GC to skip entry, elapsed time=%s", tt.name, tc.Since(lastUpdate))
			}

			// increment to a time beyond GC expiration
			tc.Step(tt.expiredTime)
			b.GC()

			// confirm the entry has now been collected
			if r, found := b.perItemBackoff[id]; found {
				t.Errorf("[%s] expected GC of entry after %s got entry %v", tt.name, tc.Since(lastUpdate), r)
			}
		})
	}
}
func TestIsInBackOffSinceUpdate(t *testing.T) {
id := "_idIsInBackOffSinceUpdate"
tc := testingclock.NewFakeClock(time.Now())
@ -250,3 +311,67 @@ func TestBackoffWithJitter(t *testing.T) {
t.Logf("exponentially backed off jittered delays: %v", delays)
}
// TestAlternateHasExpiredFunc verifies that Next keeps growing the backoff
// while the entry has not expired and resets it to the base duration once the
// expiration policy in effect reports the entry as expired. It covers the
// default policy (HasExpiredFunc left nil) and a custom predicate that
// expires sooner than maxDuration.
//
// Each case runs as its own subtest with a fresh fake clock and Backoff, so
// no per-case cleanup is needed.
func TestAlternateHasExpiredFunc(t *testing.T) {
	cases := []struct {
		name string
		// hasExpiredFunc, when non-nil, is installed as Backoff.HasExpiredFunc.
		hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
		maxDuration    time.Duration
		// nonExpiredTime is the first clock step; the backoff must have grown
		// after the following Next call.
		nonExpiredTime time.Duration
		// expiredTime is a second, additional clock step; the backoff must be
		// back at base after the following Next call.
		expiredTime time.Duration
	}{
		{
			name:           "default expiration",
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    101 * time.Second,
		},
		{
			name: "expires faster than maxDuration",
			hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
				return eventTime.Sub(lastUpdate) >= 8*time.Second
			},
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    9 * time.Second,
		},
	}

	for _, tt := range cases {
		// The subtest runs synchronously (no t.Parallel), so capturing tt is
		// safe regardless of the Go version's loop-variable semantics.
		t.Run(tt.name, func(t *testing.T) {
			tc := testingclock.NewFakeClock(time.Now())
			base := time.Second
			id := tt.name

			b := NewFakeBackOff(base, tt.maxDuration, tc)
			if tt.hasExpiredFunc != nil {
				b.HasExpiredFunc = tt.hasExpiredFunc
			}

			// initialize backoff
			b.Next(id, tc.Now())

			// increment to a time within expiration
			tc.Step(tt.nonExpiredTime)
			b.Next(id, tc.Now())

			// confirm we did a backoff
			w := b.Get(id)
			if w < base*2 {
				t.Errorf("case %v: backoff object has not incremented like expected: want %s, got %s", tt.name, base*2, w)
			}

			// increment to a time beyond expiration
			tc.Step(tt.expiredTime)
			b.Next(id, tc.Now())

			// confirm we have reset the backoff to base
			w = b.Get(id)
			if w != base {
				t.Errorf("case %v: hasexpired value: expected %s (backoff to be reset to initial), got %s", tt.name, base, w)
			}
		})
	}
}