DRA taint eviction: improve error handling

There was one error path that led to a "controller has shut down" log
message. Other errors caused different log entries, or were so unlikely (event
handler registration failure!) that they weren't checked at all.

It's clearer to let Run return an error in all cases and then log the
"controller has shut down" error at the call site. This also enables tests to
mark themselves as failed, should that ever happen.
Patrick Ohly 2025-03-20 17:44:38 +01:00
parent 473533adaa
commit ac6e47cb14
3 changed files with 28 additions and 13 deletions
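
In essence, the commit moves to the pattern sketched below. This is a minimal, self-contained illustration rather than the actual controller code: the controller type, the simulated startup failure, and the log call are stand-ins for what the diffs that follow do with the devicetainteviction controller and klog.

package main

import (
	"context"
	"errors"
	"log"
	"time"
)

// controller stands in for the device taint eviction controller.
type controller struct {
	failStartup bool // simulates a startup problem such as event handler registration failing
}

// Run returns an error for startup problems instead of only logging them,
// then blocks until the context is done; a normal shutdown is not an error.
func (c *controller) Run(ctx context.Context) error {
	if c.failStartup {
		return errors.New("adding event handler: registration failed")
	}
	<-ctx.Done()
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	done := make(chan struct{})
	go func() {
		defer close(done)
		// The call site decides how to report a failure (klog in the real code).
		if err := (&controller{}).Run(ctx); err != nil {
			log.Printf("controller failed: %v", err)
		}
	}()
	<-done
}

With Run reporting errors this way, a test can simply assert on its return value instead of scraping logs, which is what the test changes below do with assert.NoError.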

@@ -254,7 +254,11 @@ func startDeviceTaintEvictionController(ctx context.Context, controllerContext C
 		controllerContext.InformerFactory.Resource().V1beta1().DeviceClasses(),
 		controllerName,
 	)
-	go deviceTaintEvictionController.Run(ctx)
+	go func() {
+		if err := deviceTaintEvictionController.Run(ctx); err != nil {
+			klog.FromContext(ctx).Error(err, "Device taint processing leading to Pod eviction failed and is now paused")
+		}
+	}()
 	return nil, true, nil
 }

@@ -18,6 +18,7 @@ package devicetainteviction
 import (
 	"context"
 	"errors"
+	"fmt"
 	"math"
 	"slices"
@@ -319,7 +320,8 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 }

 // Run starts the controller which will run until the context is done.
-func (tc *Controller) Run(ctx context.Context) {
+// An error is returned for startup problems.
+func (tc *Controller) Run(ctx context.Context) error {
 	defer utilruntime.HandleCrash()
 	logger := klog.FromContext(ctx)
 	logger.Info("Starting", "controller", tc.name)
@@ -370,7 +372,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	// mutex serializes event processing.
 	var mutex sync.Mutex

-	claimHandler, _ := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	claimHandler, err := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			claim, ok := obj.(*resourceapi.ResourceClaim)
 			if !ok {
@@ -409,12 +411,15 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handleClaimChange(claim, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding claim event handler: %w", err)
+	}
 	defer func() {
 		_ = tc.claimInformer.Informer().RemoveEventHandler(claimHandler)
 	}()
 	tc.haveSynced = append(tc.haveSynced, claimHandler.HasSynced)

-	podHandler, _ := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	podHandler, err := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			pod, ok := obj.(*v1.Pod)
 			if !ok {
@@ -453,6 +458,9 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handlePodChange(pod, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding pod event handler: %w", err)
+	}
 	defer func() {
 		_ = tc.podInformer.Informer().RemoveEventHandler(podHandler)
 	}()
@@ -467,8 +475,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	}
 	sliceTracker, err := resourceslicetracker.StartTracker(ctx, opts)
 	if err != nil {
-		logger.Info("Failed to initialize ResourceSlice tracker; device taint processing leading to Pod eviction is now paused", "err", err)
-		return
+		return fmt.Errorf("initialize ResourceSlice tracker: %w", err)
 	}
 	tc.haveSynced = append(tc.haveSynced, sliceTracker.HasSynced)
 	defer sliceTracker.Stop()
@@ -478,11 +485,11 @@ func (tc *Controller) Run(ctx context.Context) {
 	// work which might be done as events get emitted for intermediate
 	// state.
 	if !cache.WaitForNamedCacheSyncWithContext(ctx, tc.haveSynced...) {
-		return
+		return errors.New("wait for cache sync timed out")
 	}
 	logger.V(1).Info("Underlying informers have synced")

-	_, _ = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
+	_, err = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			slice, ok := obj.(*resourceapi.ResourceSlice)
 			if !ok {
@@ -519,12 +526,16 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handleSliceChange(slice, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("add slice event handler: %w", err)
+	}

 	// sliceTracker.AddEventHandler blocked while delivering events for all known
 	// ResourceSlices. Therefore our own state is up-to-date once we get here.
 	tc.hasSynced.Store(1)

 	<-ctx.Done()
+	return nil
 }

 func (tc *Controller) handleClaimChange(oldClaim, newClaim *resourceapi.ResourceClaim) {

@@ -1339,7 +1339,7 @@ func TestEviction(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()

 	// Eventually the controller should have synced it's informers.
@@ -1450,7 +1450,7 @@ func testCancelEviction(tCtx ktesting.TContext, deletePod bool) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()

 	// Eventually the pod gets scheduled for eviction.
@@ -1543,7 +1543,7 @@ func TestParallelPodDeletion(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()

 	// Eventually the pod gets deleted, in this test by us.
@@ -1622,7 +1622,7 @@ func TestRetry(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()

 	// Eventually the pod gets deleted.
@@ -1694,7 +1694,7 @@ func TestEvictionFailure(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()

 	// Eventually deletion is attempted a few times.