diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go
index 858ad1eff55..d6075e98b31 100644
--- a/pkg/controller/devicetainteviction/device_taint_eviction.go
+++ b/pkg/controller/devicetainteviction/device_taint_eviction.go
@@ -127,14 +127,14 @@ type Controller struct {
 	// pools indexes all slices by driver and pool name.
 	pools map[poolID]pool
 
-	// taintRuleStats tracks information about work that was done for a specific DeviceTaintRule instance.
-	taintRuleStats map[types.UID]taintRuleStats
+	// evictingRules tracks all DeviceTaintRules which cause pod eviction, keyed by name.
+	evictingRules map[string]*resourcealpha.DeviceTaintRule
 
-	// simulateRule is set only during simulation of a None effect.
+	// taintRuleStats tracks information about work that was done for a specific DeviceTaintRule instance.
 	//
-	// During such a simulation the corresponding rule from ruleLister
-	// has EffectNone and this one here has EffectNoExecute.
-	simulateRule *resourcealpha.DeviceTaintRule
+	// This is potentially a different set of rules than evictingRules because a rule which is no
+	// longer evicting might have evicted in the past and thus can still have relevant stats.
+	taintRuleStats map[types.UID]taintRuleStats
 }
 
 type taintRuleStats struct {
@@ -550,7 +550,7 @@ func (tc *Controller) maybeUpdateRuleStatus(ctx context.Context, ruleRef taintev
 		deletePodAt:     make(map[tainteviction.NamespacedObject]evictionAndReason),
 		allocatedClaims: maps.Clone(tc.allocatedClaims),
 		pools:           tc.pools,
-		simulateRule:    ruleEvict,
+		evictingRules:   make(map[string]*resourcealpha.DeviceTaintRule),
 		workqueue:       &NOPQueue[workItem]{},
 	}
 	defer tc.workqueue.ShutDown()
@@ -729,6 +729,7 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 		deletePodAt:     make(map[tainteviction.NamespacedObject]evictionAndReason),
 		allocatedClaims: make(map[types.NamespacedName]allocatedClaim),
 		pools:           make(map[poolID]pool),
+		evictingRules:   make(map[string]*resourcealpha.DeviceTaintRule),
 		taintRuleStats:  make(map[types.UID]taintRuleStats),
 		// Instantiate all informers now to ensure that they get started.
 		haveSynced: []cache.InformerSynced{
@@ -1222,23 +1223,7 @@ func (tc *Controller) claimEvictionTime(claim *resourceapi.ResourceClaim) *evict
 // allEvictingDeviceTaints allows iterating over all DeviceTaintRules with NoExecute effect which affect the allocated device.
 // A taint may come from either the ResourceSlice informer (not the tracker!) or from a DeviceTaintRule, but not both.
 func (tc *Controller) allEvictingDeviceTaints(allocatedDevice resourceapi.DeviceRequestAllocationResult, slice *resourceapi.ResourceSlice, device *resourceapi.Device) iter.Seq[trackedTaint] {
-	var rules []*resourcealpha.DeviceTaintRule
-	var err error
-	if tc.ruleLister != nil {
-		rules, err = tc.ruleLister.List(labels.Everything())
-		if err != nil {
-			// TODO: instead of listing and handling an error, keep track of rules in the informer event handler?
-			utilruntime.HandleErrorWithLogger(tc.logger, err, "Local cache failed to list DeviceTaintRules")
-			return func(yield func(trackedTaint) bool) {}
-		}
-	}
-
-	if tc.simulateRule != nil {
-		// Mix the rule for which we simulate EffectNoExecute into the set of
-		// rules which will be evaluated. Typically this is the only rule
-		// during simulation.
-		rules = append(rules, tc.simulateRule)
-	}
+	evictingRules := tc.evictingRules
 
 	return func(yield func(trackedTaint) bool) {
 		if device != nil {
@@ -1253,9 +1238,8 @@
 			}
 		}
 
-		for _, rule := range rules {
-			if rule.Spec.Taint.Effect != resourcealpha.DeviceTaintEffectNoExecute ||
-				!ruleMatchesDevice(rule, allocatedDevice.Driver, allocatedDevice.Pool, allocatedDevice.Device) {
+		for _, rule := range evictingRules {
+			if !ruleMatchesDevice(rule, allocatedDevice.Driver, allocatedDevice.Pool, allocatedDevice.Device) {
 				continue
 			}
 
@@ -1298,6 +1282,7 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
 	if newRule == nil {
 		// Clean up to avoid memory leak.
 		delete(tc.taintRuleStats, oldRule.UID)
+		delete(tc.evictingRules, oldRule.Name)
 		// Removal from the work queue is handled when a worker handles the work item.
 		// A work queue does not support canceling work.
 	}
@@ -1310,6 +1295,13 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
 	}
 
 	// Rule spec changes should be rare. Simply do a brute-force re-evaluation of all allocated claims.
+	// The same applies to avoiding delete+add in evictingRules: the logic just becomes unnecessarily complex.
+	if oldRule != nil && oldRule.Spec.Taint.Effect == resourcealpha.DeviceTaintEffectNoExecute {
+		delete(tc.evictingRules, oldRule.Name)
+	}
+	if newRule != nil && newRule.Spec.Taint.Effect == resourcealpha.DeviceTaintEffectNoExecute {
+		tc.evictingRules[newRule.Name] = newRule
+	}
 	for name, oldAllocatedClaim := range tc.allocatedClaims {
 		newAllocatedClaim := allocatedClaim{
 			ResourceClaim: oldAllocatedClaim.ResourceClaim,
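
The net effect of the change is easiest to see in isolation: the informer event handler now keeps tc.evictingRules consistent, so allEvictingDeviceTaints iterates over a ready-made map instead of listing from the informer cache on every device lookup, and the per-rule NoExecute check disappears because only such rules ever enter the map. Below is a minimal, runnable sketch of that bookkeeping pattern; Effect, Rule, and controller are simplified stand-ins for illustration, not the real resourcealpha types.

	package main

	import "fmt"

	// Simplified stand-ins for the DeviceTaint effect and DeviceTaintRule types.
	type Effect string

	const (
		EffectNone      Effect = "None"
		EffectNoExecute Effect = "NoExecute"
	)

	type Rule struct {
		Name   string
		Effect Effect
	}

	// controller holds the map that mirrors Controller.evictingRules:
	// only rules whose effect is NoExecute are present, keyed by name.
	type controller struct {
		evictingRules map[string]*Rule
	}

	// handleRuleChange mirrors the delete+add bookkeeping from the diff:
	// drop the old entry if it was evicting, then re-add the new rule if
	// it (still) evicts. A nil oldRule means "added", a nil newRule "deleted".
	func (tc *controller) handleRuleChange(oldRule, newRule *Rule) {
		if oldRule != nil && oldRule.Effect == EffectNoExecute {
			delete(tc.evictingRules, oldRule.Name)
		}
		if newRule != nil && newRule.Effect == EffectNoExecute {
			tc.evictingRules[newRule.Name] = newRule
		}
	}

	func main() {
		tc := &controller{evictingRules: map[string]*Rule{}}

		r1 := &Rule{Name: "example", Effect: EffectNoExecute}
		tc.handleRuleChange(nil, r1)       // rule added
		fmt.Println(len(tc.evictingRules)) // 1: the rule evicts

		// Changing the effect away from NoExecute removes the rule from
		// the evicting set; stats keyed by UID can be kept independently,
		// which is why taintRuleStats may cover a different set of rules.
		r2 := &Rule{Name: "example", Effect: EffectNone}
		tc.handleRuleChange(r1, r2)
		fmt.Println(len(tc.evictingRules)) // 0: no longer evicting
	}

The unconditional delete-then-add matches the comment added in the diff: special-casing rules whose effect did not change would complicate the logic for what should be a rare event.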