diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go
index 500903de4d0..54c4a1dd7df 100644
--- a/cmd/kube-controller-manager/app/core.go
+++ b/cmd/kube-controller-manager/app/core.go
@@ -254,7 +254,11 @@ func startDeviceTaintEvictionController(ctx context.Context, controllerContext C
 		controllerContext.InformerFactory.Resource().V1beta1().DeviceClasses(),
 		controllerName,
 	)
-	go deviceTaintEvictionController.Run(ctx)
+	go func() {
+		if err := deviceTaintEvictionController.Run(ctx); err != nil {
+			klog.FromContext(ctx).Error(err, "Device taint processing leading to Pod eviction failed and is now paused")
+		}
+	}()
 	return nil, true, nil
 }
 
diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go
index a2cc9354a58..31da9712e8e 100644
--- a/pkg/controller/devicetainteviction/device_taint_eviction.go
+++ b/pkg/controller/devicetainteviction/device_taint_eviction.go
@@ -18,6 +18,7 @@ package devicetainteviction
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"math"
 	"slices"
@@ -319,7 +320,8 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 }
 
 // Run starts the controller which will run until the context is done.
-func (tc *Controller) Run(ctx context.Context) {
+// An error is returned for startup problems.
+func (tc *Controller) Run(ctx context.Context) error {
 	defer utilruntime.HandleCrash()
 	logger := klog.FromContext(ctx)
 	logger.Info("Starting", "controller", tc.name)
@@ -370,7 +372,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	// mutex serializes event processing.
 	var mutex sync.Mutex
 
-	claimHandler, _ := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	claimHandler, err := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			claim, ok := obj.(*resourceapi.ResourceClaim)
 			if !ok {
@@ -409,12 +411,15 @@
 			tc.handleClaimChange(claim, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding claim event handler: %w", err)
+	}
 	defer func() {
 		_ = tc.claimInformer.Informer().RemoveEventHandler(claimHandler)
 	}()
 	tc.haveSynced = append(tc.haveSynced, claimHandler.HasSynced)
 
-	podHandler, _ := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	podHandler, err := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			pod, ok := obj.(*v1.Pod)
 			if !ok {
@@ -453,6 +458,9 @@
 			tc.handlePodChange(pod, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding pod event handler: %w", err)
+	}
 	defer func() {
 		_ = tc.podInformer.Informer().RemoveEventHandler(podHandler)
 	}()
@@ -467,8 +475,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	}
 	sliceTracker, err := resourceslicetracker.StartTracker(ctx, opts)
 	if err != nil {
-		logger.Info("Failed to initialize ResourceSlice tracker; device taint processing leading to Pod eviction is now paused", "err", err)
-		return
+		return fmt.Errorf("initialize ResourceSlice tracker: %w", err)
 	}
 	tc.haveSynced = append(tc.haveSynced, sliceTracker.HasSynced)
 	defer sliceTracker.Stop()
@@ -478,11 +485,11 @@ func (tc *Controller) Run(ctx context.Context) {
 	// work which might be done as events get emitted for intermediate
 	// state.
 	if !cache.WaitForNamedCacheSyncWithContext(ctx, tc.haveSynced...) {
-		return
+		return errors.New("wait for cache sync timed out")
 	}
 	logger.V(1).Info("Underlying informers have synced")
 
-	_, _ = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
+	_, err = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			slice, ok := obj.(*resourceapi.ResourceSlice)
 			if !ok {
@@ -519,12 +526,16 @@
 			tc.handleSliceChange(slice, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("add slice event handler: %w", err)
+	}
 
 	// sliceTracker.AddEventHandler blocked while delivering events for all known
 	// ResourceSlices. Therefore our own state is up-to-date once we get here.
 	tc.hasSynced.Store(1)
 
 	<-ctx.Done()
+	return nil
 }
 
 func (tc *Controller) handleClaimChange(oldClaim, newClaim *resourceapi.ResourceClaim) {
diff --git a/pkg/controller/devicetainteviction/device_taint_eviction_test.go b/pkg/controller/devicetainteviction/device_taint_eviction_test.go
index ee1b5439fc5..95c9f626e59 100644
--- a/pkg/controller/devicetainteviction/device_taint_eviction_test.go
+++ b/pkg/controller/devicetainteviction/device_taint_eviction_test.go
@@ -1339,7 +1339,7 @@ func TestEviction(t *testing.T) {
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
-				controller.Run(tCtx)
+				assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 			}()
 
 			// Eventually the controller should have synced it's informers.
@@ -1450,7 +1450,7 @@ func testCancelEviction(tCtx ktesting.TContext, deletePod bool) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets scheduled for eviction.
@@ -1543,7 +1543,7 @@ func TestParallelPodDeletion(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets deleted, in this test by us.
@@ -1622,7 +1622,7 @@ func TestRetry(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets deleted.
@@ -1694,7 +1694,7 @@ func TestEvictionFailure(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually deletion is attempted a few times.