Add exp backoff for connection refused errors

Currently, when ListAndWatch() receives a connection refused error, it is assumed to be due to the apiserver being transiently unresponsive. In situations where a controller is running outside the k8s cluster it is controlling, it is more common for the controller to lose its connection to the apiserver permanently; in that case it needs to back off its retries exponentially rather than continuously spamming logs with Watch attempts that will never succeed. Kubernetes-commit: 1ff789f2bb9bf7fbb3df35977bc249c0dd019d31
2025-09-05 17:10:27 +00:00 · 2020-08-25 19:15:21 +00:00
parent bb860d7b6e
commit 4d01b584c3
2 changed files with 88 additions and 7 deletions
--- a/tools/cache/reflector_test.go
+++ b/tools/cache/reflector_test.go
@@ -22,15 +22,17 @@ import (
 	"math/rand"
 	"reflect"
 	"strconv"
+	"syscall"
 	"testing"
 	"time"

-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/util/clock"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apimachinery/pkg/watch"
 )
@@ -358,6 +360,82 @@ func TestReflectorListAndWatchWithErrors(t *testing.T) {
 	}
 }

+func TestReflectorListAndWatchInitConnBackoff(t *testing.T) { // checks how much fake-clock time ListAndWatch spends retrying Watch calls that fail with ECONNREFUSED
+	maxBackoff := 50 * time.Millisecond
+	table := []struct {
+		numConnFails  int // number of Watch calls that fail before one succeeds
+		expLowerBound time.Duration // smallest acceptable elapsed fake-clock time
+		expUpperBound time.Duration // largest acceptable elapsed fake-clock time
+	}{
+		{5, 32 * time.Millisecond, 64 * time.Millisecond}, // case where maxBackoff is not hit, time should grow exponentially
+		{40, 35 * 2 * maxBackoff, 40 * 2 * maxBackoff},    // case where maxBackoff is hit, backoff time should flatten
+
+	}
+	for _, test := range table {
+		t.Run(fmt.Sprintf("%d connection failures takes at least %d ms", test.numConnFails, 1<<test.numConnFails),
+			func(t *testing.T) {
+				stopCh := make(chan struct{}) // closed by WatchFunc once the failures are used up
+				connFails := test.numConnFails // remaining Watch failures; decremented by WatchFunc
+				fakeClock := clock.NewFakeClock(time.Unix(0, 0)) // shared by the backoff manager, the reflector and the stepping goroutine
+				bm := wait.NewExponentialBackoffManager(time.Millisecond, maxBackoff, 100*time.Millisecond, 2.0, 1.0, fakeClock)
+				done := make(chan struct{}) // tells the clock-stepping goroutine to exit
+				defer close(done)
+				go func() { // advances the fake clock whenever something is waiting on it
+					i := 0 // number of steps taken so far
+					for {
+						select {
+						case <-done:
+							return
+						default:
+						}
+						if fakeClock.HasWaiters() {
+							step := (1 << (i + 1)) * time.Millisecond // 2^(i+1) ms, doubling with every step
+							if step > maxBackoff*2 { // never step by more than twice maxBackoff
+								step = maxBackoff * 2
+							}
+							fakeClock.Step(step)
+							i++
+						}
+						time.Sleep(100 * time.Microsecond) // real-time pause between polls of the fake clock
+					}
+				}()
+				lw := &testLW{
+					WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
+						if connFails > 0 { // fail with "connection refused" until the budget is spent
+							connFails--
+							return nil, syscall.ECONNREFUSED
+						}
+						close(stopCh) // signal the stop channel that was passed to ListAndWatch
+						return watch.NewFake(), nil
+					},
+					ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
+						return &v1.PodList{ListMeta: metav1.ListMeta{ResourceVersion: "1"}}, nil // List always succeeds with an empty pod list
+					},
+				}
+				r := &Reflector{
+					name:                   "test-reflector",
+					listerWatcher:          lw,
+					store:                  NewFIFO(MetaNamespaceKeyFunc),
+					initConnBackoffManager: bm,
+					clock:                  fakeClock,
+					watchErrorHandler:      WatchErrorHandler(DefaultWatchErrorHandler),
+				}
+				start := fakeClock.Now() // elapsed time is measured on the fake clock, not wall time
+				err := r.ListAndWatch(stopCh)
+				elapsed := fakeClock.Since(start)
+				if err != nil {
+					t.Errorf("unexpected error %v", err)
+				}
+				if elapsed < (test.expLowerBound) {
+					t.Errorf("expected lower bound of ListAndWatch: %v, got %v", test.expLowerBound, elapsed)
+				}
+				if elapsed > (test.expUpperBound) {
+					t.Errorf("expected upper bound of ListAndWatch: %v, got %v", test.expUpperBound, elapsed)
+				}
+			})
+	}
+}
+
 func TestReflectorResync(t *testing.T) {
 	iteration := 0
 	stopCh := make(chan struct{})