Merge pull request #111387 from marseel/feature/retry_internal_errors

Add option to retry internal API errors in the reflector.
Kubernetes Prow Robot 2022-07-28 06:13:10 -07:00 committed by GitHub
commit 79a62d6235
5 changed files with 307 additions and 1 deletion

staging/src/k8s.io/apiserver/pkg/storage/cacher/cacher.go

@@ -377,6 +377,10 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
    // Configure the reflector's pager to use an appropriate pagination chunk size for fetching data from
    // storage. The pager falls back to a full list if paginated list calls fail due to an "Expired" error.
    reflector.WatchListPageSize = storageWatchListPageSize
    // When etcd loses its leader for 3 cycles, it returns a "no leader" error.
    // We don't want to terminate all watchers, as recreating all watchers puts a high load on the api-server.
    // In most cases, the leader is reelected within a few cycles.
    reflector.MaxInternalErrorRetryDuration = time.Second * 30

    cacher.watchCache = watchCache
    cacher.reflector = reflector
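
For comparison, a client-go consumer that builds its own Reflector can opt into the same behavior by setting the new exported field before starting it. A minimal sketch, assuming a ListerWatcher for Pods is already available; the helper name is illustrative and not part of this change:

package example

import (
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/client-go/tools/cache"
)

// newRetryingReflector mirrors the cacher configuration above: watches that fail
// with internal API errors (such as etcd's "no leader") are retried for up to 30s.
func newRetryingReflector(lw cache.ListerWatcher) (*cache.Reflector, cache.Store) {
    store := cache.NewStore(cache.MetaNamespaceKeyFunc)
    r := cache.NewReflector(lw, &v1.Pod{}, store, 0)
    r.MaxInternalErrorRetryDuration = 30 * time.Second // the zero default leaves these retries off
    return r, store
}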

staging/src/k8s.io/client-go/tools/cache/reflector.go

@@ -71,6 +71,8 @@ type Reflector struct {
    backoffManager wait.BackoffManager
    // initConnBackoffManager manages backoff for the initial connection with the Watch call of ListAndWatch.
    initConnBackoffManager wait.BackoffManager
    // MaxInternalErrorRetryDuration defines how long we should retry internal errors returned by watch.
    MaxInternalErrorRetryDuration time.Duration

    resyncPeriod time.Duration
    // ShouldResync is invoked periodically and whenever it returns `true` the Store's Resync operation is invoked
@@ -287,6 +289,7 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error {
        }
    }()

    retry := NewRetryWithDeadline(r.MaxInternalErrorRetryDuration, time.Minute, apierrors.IsInternalError, r.clock)
    for {
        // give the stopCh a chance to stop the loop, even in case of continue statements further down on errors
        select {
@@ -323,7 +326,9 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error {
            return err
        }

        if err := watchHandler(start, w, r.store, r.expectedType, r.expectedGVK, r.name, r.expectedTypeName, r.setLastSyncResourceVersion, r.clock, resyncerrc, stopCh); err != nil {
        err = watchHandler(start, w, r.store, r.expectedType, r.expectedGVK, r.name, r.expectedTypeName, r.setLastSyncResourceVersion, r.clock, resyncerrc, stopCh)
        retry.After(err)
        if err != nil {
            if err != errorStopRequested {
                switch {
                case isExpiredError(err):
@@ -335,6 +340,9 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error {
                    klog.V(2).Infof("%s: watch of %v returned 429 - backing off", r.name, r.expectedTypeName)
                    <-r.initConnBackoffManager.Backoff().C()
                    continue
                case apierrors.IsInternalError(err) && retry.ShouldRetry():
                    klog.V(2).Infof("%s: retrying watch of %v internal error: %v", r.name, r.expectedTypeName, err)
                    continue
                default:
                    klog.Warningf("%s: watch of %v ended with: %v", r.name, r.expectedTypeName, err)
                }
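
The new case above only fires for errors that apierrors.IsInternalError recognizes, which is how the etcd "no leader" condition reaches the reflector (the tests below construct exactly that error). A small illustrative sketch of the classification this branch relies on:

package example

import (
    "fmt"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
)

func classifyNoLeader() {
    // The "no leader" failure surfaces as a 500 InternalError status, so it is matched
    // by the new retry case rather than by the 429 backoff case above.
    err := apierrors.NewInternalError(fmt.Errorf("etcdserver: no leader"))
    fmt.Println(apierrors.IsInternalError(err))   // true
    fmt.Println(apierrors.IsTooManyRequests(err)) // false
}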

staging/src/k8s.io/client-go/tools/cache/reflector_test.go

@@ -487,6 +487,79 @@ func TestBackoffOnTooManyRequests(t *testing.T) {
    }
}

func TestRetryInternalError(t *testing.T) {
    testCases := []struct {
        name                string
        maxInternalDuration time.Duration
        rewindTime          int
        wantRetries         int
    }{
        {
            name:                "retries off",
            maxInternalDuration: time.Duration(0),
            wantRetries:         0,
        },
        {
            name:                "retries on, all calls fail",
            maxInternalDuration: time.Second * 30,
            wantRetries:         31,
        },
        {
            name:                "retries on, one call successful",
            maxInternalDuration: time.Second * 30,
            rewindTime:          10,
            wantRetries:         40,
        },
    }

    for _, tc := range testCases {
        err := apierrors.NewInternalError(fmt.Errorf("etcdserver: no leader"))
        fakeClock := testingclock.NewFakeClock(time.Now())
        bm := &fakeBackoff{clock: fakeClock}

        counter := 0
        lw := &testLW{
            ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
                return &v1.PodList{ListMeta: metav1.ListMeta{ResourceVersion: "1"}}, nil
            },
            WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
                counter = counter + 1
                t.Logf("Counter: %v", counter)
                if counter == tc.rewindTime {
                    t.Logf("Rewinding")
                    fakeClock.Step(time.Minute)
                }
                fakeClock.Step(time.Second)
                w := watch.NewFakeWithChanSize(1, false)
                status := err.Status()
                w.Error(&status)
                return w, nil
            },
        }

        r := &Reflector{
            name:                   "test-reflector",
            listerWatcher:          lw,
            store:                  NewFIFO(MetaNamespaceKeyFunc),
            initConnBackoffManager: bm,
            clock:                  fakeClock,
            watchErrorHandler:      WatchErrorHandler(DefaultWatchErrorHandler),
        }
        r.MaxInternalErrorRetryDuration = tc.maxInternalDuration

        stopCh := make(chan struct{})
        r.ListAndWatch(stopCh)
        close(stopCh)

        if counter-1 != tc.wantRetries {
            t.Errorf("%v unexpected number of retries: %d", tc, counter-1)
        }
    }
}

func TestReflectorResync(t *testing.T) {
    iteration := 0
    stopCh := make(chan struct{})

staging/src/k8s.io/client-go/tools/cache/retry_with_deadline.go

@@ -0,0 +1,78 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
    "time"

    "k8s.io/utils/clock"
)

// RetryWithDeadline tracks the errors a caller has seen and decides whether a
// retryable error should still be retried.
type RetryWithDeadline interface {
    // After records the latest error observed by the caller.
    After(error)
    // ShouldRetry reports whether a retry is still within the allowed window.
    ShouldRetry() bool
}

type retryWithDeadlineImpl struct {
    firstErrorTime   time.Time
    lastErrorTime    time.Time
    maxRetryDuration time.Duration
    minResetPeriod   time.Duration
    isRetryable      func(error) bool
    clock            clock.Clock
}

// NewRetryWithDeadline allows retrying errors matching isRetryable for up to
// maxRetryDuration after the first such error; a quiet period of at least
// minResetPeriod between retryable errors resets the window.
func NewRetryWithDeadline(maxRetryDuration, minResetPeriod time.Duration, isRetryable func(error) bool, clock clock.Clock) RetryWithDeadline {
    return &retryWithDeadlineImpl{
        firstErrorTime:   time.Time{},
        lastErrorTime:    time.Time{},
        maxRetryDuration: maxRetryDuration,
        minResetPeriod:   minResetPeriod,
        isRetryable:      isRetryable,
        clock:            clock,
    }
}

func (r *retryWithDeadlineImpl) reset() {
    r.firstErrorTime = time.Time{}
    r.lastErrorTime = time.Time{}
}

func (r *retryWithDeadlineImpl) After(err error) {
    if r.isRetryable(err) {
        if r.clock.Now().Sub(r.lastErrorTime) >= r.minResetPeriod {
            r.reset()
        }

        if r.firstErrorTime.IsZero() {
            r.firstErrorTime = r.clock.Now()
        }
        r.lastErrorTime = r.clock.Now()
    }
}

func (r *retryWithDeadlineImpl) ShouldRetry() bool {
    if r.maxRetryDuration <= time.Duration(0) {
        return false
    }

    if r.clock.Now().Sub(r.firstErrorTime) <= r.maxRetryDuration {
        return true
    }

    r.reset()
    return false
}
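
Taken on its own, the helper is a small piece of state: After records when retryable errors start and stop (resetting once they have been quiet for minResetPeriod), and ShouldRetry reports whether the first unreset error is still within maxRetryDuration. A usage sketch outside the reflector, assuming the exported constructor as merged here; the doWatch stand-in is illustrative:

package example

import (
    "fmt"
    "time"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/client-go/tools/cache"
    "k8s.io/utils/clock"
)

// watchWithRetries keeps re-invoking doWatch while internal errors stay inside a
// 10-second retry window; any other error, or an expired window, ends the loop.
func watchWithRetries(doWatch func() error) error {
    retry := cache.NewRetryWithDeadline(10*time.Second, time.Minute, apierrors.IsInternalError, clock.RealClock{})
    for {
        err := doWatch()
        retry.After(err)
        if err == nil {
            return nil
        }
        if apierrors.IsInternalError(err) && retry.ShouldRetry() {
            continue // still inside the retry window, try again
        }
        return fmt.Errorf("giving up: %w", err)
    }
}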

staging/src/k8s.io/client-go/tools/cache/retry_with_deadline_test.go

@@ -0,0 +1,143 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
    "fmt"
    "testing"
    "time"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    testingclock "k8s.io/utils/clock/testing"
)

type retryScenarioStep struct {
    clockStep time.Duration
    err       error
    wantRetry bool
}

func TestRetryWithDeadline(t *testing.T) {
    internalError := apierrors.NewInternalError(fmt.Errorf("etcdserver: no leader"))
    otherError := fmt.Errorf("some other error")

    testCases := []struct {
        name        string
        duration    time.Duration
        reset       time.Duration
        isRetryable func(error) bool
        scenario    []retryScenarioStep
    }{
        {
            name:        "Never retry when duration is zero",
            duration:    time.Duration(0),
            reset:       time.Second * 30,
            isRetryable: func(err error) bool { return false },
            scenario: []retryScenarioStep{
                {
                    clockStep: time.Second * 1,
                    err:       nil,
                    wantRetry: false,
                },
                {
                    clockStep: time.Second * 0,
                    err:       internalError,
                    wantRetry: false,
                },
                {
                    clockStep: time.Second * 1,
                    err:       otherError,
                    wantRetry: false,
                },
            },
        },
        {
            name:        "Retry when internal error happens only within duration",
            duration:    time.Second * 1,
            reset:       time.Second * 30,
            isRetryable: apierrors.IsInternalError,
            scenario: []retryScenarioStep{
                {
                    clockStep: time.Second * 1,
                    err:       internalError,
                    wantRetry: true,
                },
                {
                    clockStep: time.Second * 1,
                    err:       internalError,
                    wantRetry: true,
                },
                {
                    clockStep: time.Second * 1,
                    err:       internalError,
                    wantRetry: false,
                },
            },
        },
        {
            name:        "Don't retry when other error happens",
            duration:    time.Second * 1,
            reset:       time.Second * 30,
            isRetryable: apierrors.IsInternalError,
            scenario: []retryScenarioStep{
                {
                    clockStep: time.Second * 1,
                    err:       otherError,
                    wantRetry: false,
                },
            },
        },
        {
            name:        "Ignore other errors for retries",
            duration:    time.Second * 1,
            reset:       time.Second * 30,
            isRetryable: apierrors.IsInternalError,
            scenario: []retryScenarioStep{
                {
                    clockStep: time.Second * 1,
                    err:       internalError,
                    wantRetry: true,
                },
                {
                    clockStep: time.Second * 0,
                    err:       otherError,
                    wantRetry: true,
                },
                {
                    clockStep: time.Second * 1,
                    err:       internalError,
                    wantRetry: true,
                },
            },
        },
    }

    for _, tc := range testCases {
        fakeClock := testingclock.NewFakeClock(time.Now())
        retry := NewRetryWithDeadline(tc.duration, tc.reset, tc.isRetryable, fakeClock)

        for i, step := range tc.scenario {
            fakeClock.Step(step.clockStep)
            retry.After(step.err)
            result := retry.ShouldRetry()
            if result != step.wantRetry {
                t.Errorf("%v unexpected retry, step %d, result %v want %v", tc, i, result, step.wantRetry)
                break
            }
        }
    }
}