Merge pull request #128356 from lauralorenz/crashloopbackoff-maintain10minuterecoverythreshold

KEP-4603: Maintain current 10 minute recovery threshold for container backoff regardless of changes to the maximum duration

Kubernetes-commit: ab30adcbae57fc498cb876979e232b422468af9a
This commit is contained in:
Kubernetes Publisher
2024-11-07 22:20:50 +00:00
4 changed files with 146 additions and 13 deletions

2
go.mod
View File

@@ -26,7 +26,7 @@ require (
golang.org/x/time v0.7.0
google.golang.org/protobuf v1.35.1
gopkg.in/evanphx/json-patch.v4 v4.12.0
k8s.io/api v0.0.0-20241108114310-4772861d607e
k8s.io/api v0.0.0-20241108114313-789a813a3da8
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83
k8s.io/klog/v2 v2.130.1
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f

4
go.sum
View File

@@ -150,8 +150,8 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.0.0-20241108114310-4772861d607e h1:SJxpg9FSTdq3qp/R+LRlZv+xjY+miO+30ZpC7QkSkoM=
k8s.io/api v0.0.0-20241108114310-4772861d607e/go.mod h1:h7yaPC7+0KxMELdLjLoo6n6m3EWq6AeHEY25PjH4cPI=
k8s.io/api v0.0.0-20241108114313-789a813a3da8 h1:+3HQBAIjBgEx+fcUE7qou+b97GTD0FwsRvdivPD4Fk8=
k8s.io/api v0.0.0-20241108114313-789a813a3da8/go.mod h1:h7yaPC7+0KxMELdLjLoo6n6m3EWq6AeHEY25PjH4cPI=
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83 h1:4KfMPmiiRIpvYJQ8cBYFEFht59EKysW1anuJWzHLHNg=
k8s.io/apimachinery v0.0.0-20241106231735-d941d9fb4c83/go.mod h1:HqhdaJUgQqky29T1V0o2yFkt/pZqLFIDyn9Zi/8rxoY=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=

View File

@@ -32,7 +32,12 @@ type backoffEntry struct {
type Backoff struct {
sync.RWMutex
Clock clock.Clock
Clock clock.Clock
// HasExpiredFunc controls the logic that determines whether the backoff
// counter should be reset, and when to GC old backoff entries. If nil, the
// default hasExpired function restarts the backoff factor from the
// beginning once more than 2*maxDuration has passed since the last update.
defaultDuration time.Duration
maxDuration time.Duration
perItemBackoff map[string]*backoffEntry
@@ -93,7 +98,7 @@ func (p *Backoff) Next(id string, eventTime time.Time) {
p.Lock()
defer p.Unlock()
entry, ok := p.perItemBackoff[id]
if !ok || hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if !ok || p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
entry = p.initEntryUnsafe(id)
entry.backoff += p.jitter(entry.backoff)
} else {
@@ -119,7 +124,7 @@ func (p *Backoff) IsInBackOffSince(id string, eventTime time.Time) bool {
if !ok {
return false
}
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
return false
}
return p.Clock.Since(eventTime) < entry.backoff
@@ -133,21 +138,21 @@ func (p *Backoff) IsInBackOffSinceUpdate(id string, eventTime time.Time) bool {
if !ok {
return false
}
if hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
if p.hasExpired(eventTime, entry.lastUpdate, p.maxDuration) {
return false
}
return eventTime.Sub(entry.lastUpdate) < entry.backoff
}
// Garbage collect records that have aged past maxDuration. Backoff users are expected
// to invoke this periodically.
// Garbage collect records that have aged past their expiration, which defaults
// to 2*maxDuration (see hasExpired godoc). Backoff users are expected to invoke
// this periodically.
func (p *Backoff) GC() {
p.Lock()
defer p.Unlock()
now := p.Clock.Now()
for id, entry := range p.perItemBackoff {
if now.Sub(entry.lastUpdate) > p.maxDuration*2 {
// GC when entry has not been updated for 2*maxDuration
if p.hasExpired(now, entry.lastUpdate, p.maxDuration) {
delete(p.perItemBackoff, id)
}
}
@@ -174,7 +179,10 @@ func (p *Backoff) jitter(delay time.Duration) time.Duration {
return time.Duration(p.rand.Float64() * p.maxJitterFactor * float64(delay))
}
// After 2*maxDuration we restart the backoff factor to the beginning
func hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
// hasExpired reports whether an entry last touched at lastUpdate counts as
// expired when observed at eventTime.
//
// When HasExpiredFunc is set it makes the decision and receives all three
// arguments unchanged. Otherwise the entry is considered expired (and the
// backoff factor restarts from the beginning) once strictly more than
// 2*maxDuration has elapsed between lastUpdate and eventTime.
func (p *Backoff) hasExpired(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
	if override := p.HasExpiredFunc; override != nil {
		return override(eventTime, lastUpdate, maxDuration)
	}
	// Default policy: consider the item stable if it has been ok for more
	// than twice the maxDuration.
	elapsed := eventTime.Sub(lastUpdate)
	return elapsed > maxDuration*2
}

View File

@@ -125,6 +125,67 @@ func TestBackoffGC(t *testing.T) {
}
}
// TestAlternateBackoffGC checks that GC keeps or drops an entry according to
// the expiration logic in effect: the built-in default when HasExpiredFunc is
// left nil, or the supplied function otherwise.
//
// Each case creates one entry, advances the fake clock by nonExpiredTime and
// expects GC to keep the entry, then advances by a further expiredTime and
// expects GC to remove it. The clock steps are cumulative, so the second GC
// runs at nonExpiredTime+expiredTime after the entry was created.
func TestAlternateBackoffGC(t *testing.T) {
	cases := []struct {
		name           string
		hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
		maxDuration    time.Duration
		nonExpiredTime time.Duration
		expiredTime    time.Duration
	}{
		{
			name:           "default GC",
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    101 * time.Second,
		},
		{
			// nonExpiredTime is already past 2*maxDuration here, so the entry
			// surviving the first GC shows the custom function is in control.
			name: "GC later than 2*maxDuration",
			hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
				return eventTime.Sub(lastUpdate) >= 200*time.Second
			},
			maxDuration:    50 * time.Second,
			nonExpiredTime: 101 * time.Second,
			expiredTime:    501 * time.Second,
		},
	}

	for _, tt := range cases {
		// Subtests keep a failure in one case from hiding the other and let a
		// single case be selected with -run.
		t.Run(tt.name, func(t *testing.T) {
			tc := testingclock.NewFakeClock(time.Now())
			base := time.Second
			id := tt.name
			b := NewFakeBackOff(base, tt.maxDuration, tc)

			if tt.hasExpiredFunc != nil {
				b.HasExpiredFunc = tt.hasExpiredFunc
			}

			// initialize backoff
			lastUpdate := tc.Now()
			b.Next(id, lastUpdate)

			// increment to a time within GC expiration
			tc.Step(tt.nonExpiredTime)
			b.GC()

			// confirm we did not GC this entry
			if _, found := b.perItemBackoff[id]; !found {
				t.Errorf("[%s] expected GC to skip entry, elapsed time=%s", tt.name, tc.Since(lastUpdate))
			}

			// increment to a time beyond GC expiration
			tc.Step(tt.expiredTime)
			b.GC()

			// confirm the entry is gone
			if r, found := b.perItemBackoff[id]; found {
				t.Errorf("[%s] expected GC of entry after %s got entry %v", tt.name, tc.Since(lastUpdate), r)
			}
		})
	}
}
func TestIsInBackOffSinceUpdate(t *testing.T) {
id := "_idIsInBackOffSinceUpdate"
tc := testingclock.NewFakeClock(time.Now())
@@ -250,3 +311,67 @@ func TestBackoffWithJitter(t *testing.T) {
t.Logf("exponentially backed off jittered delays: %v", delays)
}
// TestAlternateHasExpiredFunc checks that Next resets the backoff for an id
// according to the expiration logic in effect: the built-in default when
// HasExpiredFunc is left nil, or the supplied function otherwise.
//
// Each case calls Next once to create the entry, advances the fake clock by
// nonExpiredTime and calls Next again expecting the backoff to have grown to
// at least 2*base, then advances by expiredTime and calls Next a third time
// expecting the backoff to be back at base.
func TestAlternateHasExpiredFunc(t *testing.T) {
	cases := []struct {
		name           string
		hasExpiredFunc func(time.Time, time.Time, time.Duration) bool
		maxDuration    time.Duration
		nonExpiredTime time.Duration
		expiredTime    time.Duration
	}{
		{
			name:           "default expiration",
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    101 * time.Second,
		},
		{
			name: "expires faster than maxDuration",
			hasExpiredFunc: func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
				return eventTime.Sub(lastUpdate) >= 8*time.Second
			},
			maxDuration:    50 * time.Second,
			nonExpiredTime: 5 * time.Second,
			expiredTime:    9 * time.Second,
		},
	}

	for _, tt := range cases {
		// Subtests keep a failure in one case from hiding the other and let a
		// single case be selected with -run. The fake clock and the Backoff
		// are created fresh per case, so no cleanup is needed afterwards.
		t.Run(tt.name, func(t *testing.T) {
			tc := testingclock.NewFakeClock(time.Now())
			base := time.Second
			id := tt.name
			b := NewFakeBackOff(base, tt.maxDuration, tc)

			if tt.hasExpiredFunc != nil {
				b.HasExpiredFunc = tt.hasExpiredFunc
			}

			// initialize backoff
			b.Next(id, tc.Now())

			// increment to a time within expiration
			tc.Step(tt.nonExpiredTime)
			b.Next(id, tc.Now())

			// confirm we did a backoff
			if w := b.Get(id); w < base*2 {
				t.Errorf("case %v: backoff object has not incremented like expected: want %s, got %s", tt.name, base*2, w)
			}

			// increment to a time beyond expiration
			tc.Step(tt.expiredTime)
			b.Next(id, tc.Now())

			// confirm we have reset the backoff to base
			if w := b.Get(id); w != base {
				t.Errorf("case %v: hasexpired value: expected %s (backoff to be reset to initial), got %s", tt.name, base, w)
			}
		})
	}
}