diff --git a/tools/cache/controller.go b/tools/cache/controller.go index 9a706821..916ca9cc 100644 --- a/tools/cache/controller.go +++ b/tools/cache/controller.go @@ -69,6 +69,9 @@ type Config struct { // question to this interface as a parameter. This is probably moot // now that this functionality appears at a higher level. RetryOnError bool + + // Called whenever the ListAndWatch drops the connection with an error. + WatchErrorHandler WatchErrorHandler } // ShouldResyncFunc is a type of function that indicates if a reflector should perform a @@ -132,6 +135,9 @@ func (c *controller) Run(stopCh <-chan struct{}) { ) r.ShouldResync = c.config.ShouldResync r.clock = c.clock + if c.config.WatchErrorHandler != nil { + r.watchErrorHandler = c.config.WatchErrorHandler + } c.reflectorMutex.Lock() c.reflector = r diff --git a/tools/cache/reflector.go b/tools/cache/reflector.go index dfdc2e73..7c581ced 100644 --- a/tools/cache/reflector.go +++ b/tools/cache/reflector.go @@ -95,6 +95,37 @@ type Reflector struct { // etcd, which is significantly less efficient and may lead to serious performance and // scalability problems. WatchListPageSize int64 + // Called whenever the ListAndWatch drops the connection with an error. + watchErrorHandler WatchErrorHandler +} + +// The WatchErrorHandler is called whenever ListAndWatch drops the +// connection with an error. After calling this handler, the informer +// will backoff and retry. +// +// The default implementation looks at the error type and tries to log +// the error message at an appropriate level. +// +// Implementations of this handler may display the error message in other +// ways. Implementations should return quickly - any expensive processing +// should be offloaded. +type WatchErrorHandler func(r *Reflector, err error) + +// DefaultWatchErrorHandler is the default implementation of WatchErrorHandler +func DefaultWatchErrorHandler(r *Reflector, err error) { + switch { + case isExpiredError(err): + // Don't set LastSyncResourceVersionExpired - LIST call with ResourceVersion=RV already + // has a semantic that it returns data at least as fresh as provided RV. + // So first try to LIST with setting RV to resource version of last observed object. + klog.V(4).Infof("%s: watch of %v closed with: %v", r.name, r.expectedTypeName, err) + case err == io.EOF: + // watch closed normally + case err == io.ErrUnexpectedEOF: + klog.V(1).Infof("%s: Watch for %v closed with unexpected EOF: %v", r.name, r.expectedTypeName, err) + default: + utilruntime.HandleError(fmt.Errorf("%s: Failed to watch %v: %v", r.name, r.expectedTypeName, err)) + } } var ( @@ -135,9 +166,10 @@ func NewNamedReflector(name string, lw ListerWatcher, expectedType interface{}, // We used to make the call every 1sec (1 QPS), the goal here is to achieve ~98% traffic reduction when // API server is not healthy. With these parameters, backoff will stop at [30,60) sec interval which is // 0.22 QPS. If we don't backoff for 2min, assume API server is healthy and we reset the backoff. - backoffManager: wait.NewExponentialBackoffManager(800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 1.0, realClock), - resyncPeriod: resyncPeriod, - clock: realClock, + backoffManager: wait.NewExponentialBackoffManager(800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 1.0, realClock), + resyncPeriod: resyncPeriod, + clock: realClock, + watchErrorHandler: WatchErrorHandler(DefaultWatchErrorHandler), } r.setExpectedType(expectedType) return r @@ -175,7 +207,7 @@ func (r *Reflector) Run(stopCh <-chan struct{}) { klog.V(2).Infof("Starting reflector %s (%s) from %s", r.expectedTypeName, r.resyncPeriod, r.name) wait.BackoffUntil(func() { if err := r.ListAndWatch(stopCh); err != nil { - utilruntime.HandleError(err) + r.watchErrorHandler(r, err) } }, r.backoffManager, true, stopCh) klog.V(2).Infof("Stopping reflector %s (%s) from %s", r.expectedTypeName, r.resyncPeriod, r.name) @@ -275,7 +307,7 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error { case <-listCh: } if err != nil { - return fmt.Errorf("%s: Failed to list %v: %v", r.name, r.expectedTypeName, err) + return fmt.Errorf("failed to list %v: %v", r.expectedTypeName, err) } // We check if the list was paginated and if so set the paginatedResult based on that. @@ -296,17 +328,17 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error { initTrace.Step("Objects listed") listMetaInterface, err := meta.ListAccessor(list) if err != nil { - return fmt.Errorf("%s: Unable to understand list result %#v: %v", r.name, list, err) + return fmt.Errorf("unable to understand list result %#v: %v", list, err) } resourceVersion = listMetaInterface.GetResourceVersion() initTrace.Step("Resource version extracted") items, err := meta.ExtractList(list) if err != nil { - return fmt.Errorf("%s: Unable to understand list result %#v (%v)", r.name, list, err) + return fmt.Errorf("unable to understand list result %#v (%v)", list, err) } initTrace.Step("Objects extracted") if err := r.syncWith(items, resourceVersion); err != nil { - return fmt.Errorf("%s: Unable to sync list result: %v", r.name, err) + return fmt.Errorf("unable to sync list result: %v", err) } initTrace.Step("SyncWith done") r.setLastSyncResourceVersion(resourceVersion) @@ -366,19 +398,6 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error { w, err := r.listerWatcher.Watch(options) if err != nil { - switch { - case isExpiredError(err): - // Don't set LastSyncResourceVersionExpired - LIST call with ResourceVersion=RV already - // has a semantic that it returns data at least as fresh as provided RV. - // So first try to LIST with setting RV to resource version of last observed object. - klog.V(4).Infof("%s: watch of %v closed with: %v", r.name, r.expectedTypeName, err) - case err == io.EOF: - // watch closed normally - case err == io.ErrUnexpectedEOF: - klog.V(1).Infof("%s: Watch for %v closed with unexpected EOF: %v", r.name, r.expectedTypeName, err) - default: - utilruntime.HandleError(fmt.Errorf("%s: Failed to watch %v: %v", r.name, r.expectedTypeName, err)) - } // If this is "connection refused" error, it means that most likely apiserver is not responsive. // It doesn't make sense to re-list all objects because most likely we will be able to restart // watch where we ended. @@ -387,7 +406,7 @@ func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error { time.Sleep(time.Second) continue } - return nil + return err } if err := r.watchHandler(w, &resourceVersion, resyncerrc, stopCh); err != nil { diff --git a/tools/cache/shared_informer.go b/tools/cache/shared_informer.go index 8caa818e..9d96a09f 100644 --- a/tools/cache/shared_informer.go +++ b/tools/cache/shared_informer.go @@ -165,6 +165,21 @@ type SharedInformer interface { // store. The value returned is not synchronized with access to the underlying store and is not // thread-safe. LastSyncResourceVersion() string + + // The WatchErrorHandler is called whenever ListAndWatch drops the + // connection with an error. After calling this handler, the informer + // will backoff and retry. + // + // The default implementation looks at the error type and tries to log + // the error message at an appropriate level. + // + // There's only one handler, so if you call this multiple times, last one + // wins; calling after the informer has been started returns an error. + // + // The handler is intended for visibility, not to e.g. pause the consumers. + // The handler should return quickly - any expensive processing should be + // offloaded. + SetWatchErrorHandler(handler WatchErrorHandler) error } // SharedIndexInformer provides add and get Indexers ability based on SharedInformer. @@ -300,6 +315,9 @@ type sharedIndexInformer struct { // blockDeltas gives a way to stop all event distribution so that a late event handler // can safely join the shared informer. blockDeltas sync.Mutex + + // Called whenever the ListAndWatch drops the connection with an error. + watchErrorHandler WatchErrorHandler } // dummyController hides the fact that a SharedInformer is different from a dedicated one @@ -335,6 +353,18 @@ type deleteNotification struct { oldObj interface{} } +func (s *sharedIndexInformer) SetWatchErrorHandler(handler WatchErrorHandler) error { + s.startedLock.Lock() + defer s.startedLock.Unlock() + + if s.started { + return fmt.Errorf("informer has already started") + } + + s.watchErrorHandler = handler + return nil +} + func (s *sharedIndexInformer) Run(stopCh <-chan struct{}) { defer utilruntime.HandleCrash() @@ -351,7 +381,8 @@ func (s *sharedIndexInformer) Run(stopCh <-chan struct{}) { RetryOnError: false, ShouldResync: s.processor.shouldResync, - Process: s.HandleDeltas, + Process: s.HandleDeltas, + WatchErrorHandler: s.watchErrorHandler, } func() { diff --git a/tools/cache/shared_informer_test.go b/tools/cache/shared_informer_test.go index 5468fc74..5882e585 100644 --- a/tools/cache/shared_informer_test.go +++ b/tools/cache/shared_informer_test.go @@ -18,6 +18,7 @@ package cache import ( "fmt" + "strings" "sync" "testing" "time" @@ -330,3 +331,29 @@ func TestSharedInformerWatchDisruption(t *testing.T) { } } } + +func TestSharedInformerErrorHandling(t *testing.T) { + source := fcache.NewFakeControllerSource() + source.Add(&v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}) + source.ListError = fmt.Errorf("Access Denied") + + informer := NewSharedInformer(source, &v1.Pod{}, 1*time.Second).(*sharedIndexInformer) + + errCh := make(chan error) + _ = informer.SetWatchErrorHandler(func(_ *Reflector, err error) { + errCh <- err + }) + + stop := make(chan struct{}) + go informer.Run(stop) + + select { + case err := <-errCh: + if !strings.Contains(err.Error(), "Access Denied") { + t.Errorf("Expected 'Access Denied' error. Actual: %v", err) + } + case <-time.After(time.Second): + t.Errorf("Timeout waiting for error handler call") + } + close(stop) +} diff --git a/tools/cache/testing/fake_controller_source.go b/tools/cache/testing/fake_controller_source.go index 16e66fc6..6151582c 100644 --- a/tools/cache/testing/fake_controller_source.go +++ b/tools/cache/testing/fake_controller_source.go @@ -62,6 +62,9 @@ type FakeControllerSource struct { changes []watch.Event // one change per resourceVersion Broadcaster *watch.Broadcaster lastRV int + + // Set this to simulate an error on List() + ListError error } type FakePVControllerSource struct { @@ -174,6 +177,11 @@ func (f *FakeControllerSource) getListItemsLocked() ([]runtime.Object, error) { func (f *FakeControllerSource) List(options metav1.ListOptions) (runtime.Object, error) { f.lock.RLock() defer f.lock.RUnlock() + + if f.ListError != nil { + return nil, f.ListError + } + list, err := f.getListItemsLocked() if err != nil { return nil, err