DRA device taint eviction: track evicting rules
This avoids having to call the rule lister (which can theoretically, though not in practice, fail) and having to iterate over rules which can be ignored (might be a small performance boost).
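The idea in isolation: mirror the NoExecute rules into a map maintained from the informer event handlers, where no List call (and hence no list error) is involved, so lookups only ever iterate over rules that can actually evict. A minimal stand-alone sketch of that pattern — Rule and the wiring below are stand-ins, not the real resourcealpha.DeviceTaintRule API:

    package ruletracker

    import "k8s.io/client-go/tools/cache"

    // Rule is a stand-in for resourcealpha.DeviceTaintRule.
    type Rule struct{ Name, Effect string }

    type controller struct {
        // evictingRules holds only rules whose effect is NoExecute.
        evictingRules map[string]*Rule
    }

    // handleRuleChange mirrors rule events into evictingRules;
    // oldRule == nil means the rule was added, newRule == nil that it was deleted.
    func (tc *controller) handleRuleChange(oldRule, newRule *Rule) {
        if oldRule != nil && oldRule.Effect == "NoExecute" {
            delete(tc.evictingRules, oldRule.Name)
        }
        if newRule != nil && newRule.Effect == "NoExecute" {
            tc.evictingRules[newRule.Name] = newRule
        }
    }

    // register wires the event handlers; no lister, no List error to handle.
    func (tc *controller) register(informer cache.SharedIndexInformer) {
        informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
            AddFunc: func(obj interface{}) { tc.handleRuleChange(nil, obj.(*Rule)) },
            UpdateFunc: func(oldObj, newObj interface{}) {
                tc.handleRuleChange(oldObj.(*Rule), newObj.(*Rule))
            },
            // A real DeleteFunc must also unwrap cache.DeletedFinalStateUnknown.
            DeleteFunc: func(obj interface{}) { tc.handleRuleChange(obj.(*Rule), nil) },
        })
    }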
@@ -127,14 +127,14 @@ type Controller struct {
 	// pools indexes all slices by driver and pool name.
 	pools map[poolID]pool
 
-	// taintRuleStats tracks information about work that was done for a specific DeviceTaintRule instance.
-	taintRuleStats map[types.UID]taintRuleStats
+	// evictingRules tracks all DeviceTaintRules by name which cause pod eviction.
+	evictingRules map[string]*resourcealpha.DeviceTaintRule
 
-	// simulateRule is set only during simulation of a None effect.
+	// taintRuleStats tracks information about work that was done for a specific DeviceTaintRule instance.
 	//
-	// During such a simulation the corresponding rule from ruleLister
-	// has EffectNone and this one here has EffectNoExecute.
-	simulateRule *resourcealpha.DeviceTaintRule
+	// This is potentially a different set of rules than in evictingRules because a rule which is no
+	// longer evicting might have evicted in the past and thus can have some relevant stats.
+	taintRuleStats map[types.UID]taintRuleStats
 }
 
 type taintRuleStats struct {
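Note that the two maps deliberately use different keys: evictingRules is keyed by object name, while taintRuleStats is keyed by UID so that stats stay attached to a specific rule instance even after it stops evicting. A small illustration with stand-in types (the evictedPods stats field is hypothetical):

    package main

    import "fmt"

    type uid string

    // ruleStats is a stand-in; the evictedPods field is hypothetical.
    type ruleStats struct{ evictedPods int }

    func main() {
        evictingRules := map[string]bool{}    // rules currently evicting, by name
        taintRuleStats := map[uid]ruleStats{} // per-instance stats, by UID

        // Rule "r" (UID 42) evicts two pods, then its effect is changed away
        // from NoExecute: it leaves evictingRules ...
        evictingRules["r"] = true
        taintRuleStats["42"] = ruleStats{evictedPods: 2}
        delete(evictingRules, "r")

        // ... but its stats survive until the rule object itself is deleted.
        fmt.Println(len(evictingRules), taintRuleStats["42"].evictedPods) // prints: 0 2
    }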
@@ -550,7 +550,7 @@ func (tc *Controller) maybeUpdateRuleStatus(ctx context.Context, ruleRef taintev
 		deletePodAt:     make(map[tainteviction.NamespacedObject]evictionAndReason),
 		allocatedClaims: maps.Clone(tc.allocatedClaims),
 		pools:           tc.pools,
-		simulateRule:    ruleEvict,
+		evictingRules:   make(map[string]*resourcealpha.DeviceTaintRule),
 		workqueue:       &NOPQueue[workItem]{},
 	}
 	defer tc.workqueue.ShutDown()
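For the simulated controller the work queue is a no-op implementation: it satisfies the controller's queue dependency but discards all work, so the dry run presumably cannot schedule real evictions. A sketch of that idea with an illustrative interface (the real NOPQueue implements whatever work queue interface the controller actually uses):

    package sim

    // queue is an illustrative stand-in for the work queue interface the
    // controller depends on (the real one is a typed client-go workqueue).
    type queue[T comparable] interface {
        Add(item T)
        ShutDown()
    }

    // nopQueue satisfies queue but drops all work: exactly what a dry-run
    // controller that must not act on the outside world needs.
    type nopQueue[T comparable] struct{}

    func (q *nopQueue[T]) Add(item T) {} // discard
    func (q *nopQueue[T]) ShutDown()  {}

    var _ queue[string] = &nopQueue[string]{} // compile-time interface check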
@@ -729,6 +729,7 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 		deletePodAt:     make(map[tainteviction.NamespacedObject]evictionAndReason),
 		allocatedClaims: make(map[types.NamespacedName]allocatedClaim),
 		pools:           make(map[poolID]pool),
+		evictingRules:   make(map[string]*resourcealpha.DeviceTaintRule),
 		taintRuleStats:  make(map[types.UID]taintRuleStats),
 		// Instantiate all informers now to ensure that they get started.
 		haveSynced: []cache.InformerSynced{
@@ -1222,23 +1223,7 @@ func (tc *Controller) claimEvictionTime(claim *resourceapi.ResourceClaim) *evict
 // allEvictingDeviceTaints allows iterating over all DeviceTaintRules with NoExecute effect which affect the allocated device.
 // A taint may come from either the ResourceSlice informer (not the tracker!) or from a DeviceTaintRule, but not both.
 func (tc *Controller) allEvictingDeviceTaints(allocatedDevice resourceapi.DeviceRequestAllocationResult, slice *resourceapi.ResourceSlice, device *resourceapi.Device) iter.Seq[trackedTaint] {
-	var rules []*resourcealpha.DeviceTaintRule
-	var err error
-	if tc.ruleLister != nil {
-		rules, err = tc.ruleLister.List(labels.Everything())
-		if err != nil {
-			// TODO: instead of listing and handling an error, keep track of rules in the informer event handler?
-			utilruntime.HandleErrorWithLogger(tc.logger, err, "Local cache failed to list DeviceTaintRules")
-			return func(yield func(trackedTaint) bool) {}
-		}
-	}
-
-	if tc.simulateRule != nil {
-		// Mix the rule for which we simulate EffectNoExecute into the set of
-		// rules which will be evaluated. Typically this is the only rule
-		// during simulation.
-		rules = append(rules, tc.simulateRule)
-	}
+	evictingRules := tc.evictingRules
 
 	return func(yield func(trackedTaint) bool) {
 		if device != nil {
@@ -1253,9 +1238,8 @@ func (tc *Controller) allEvictingDeviceTaints(allocatedDevice resourceapi.Device
 			}
 		}
 
-		for _, rule := range rules {
-			if rule.Spec.Taint.Effect != resourcealpha.DeviceTaintEffectNoExecute ||
-				!ruleMatchesDevice(rule, allocatedDevice.Driver, allocatedDevice.Pool, allocatedDevice.Device) {
+		for _, rule := range evictingRules {
+			if !ruleMatchesDevice(rule, allocatedDevice.Driver, allocatedDevice.Pool, allocatedDevice.Device) {
 				continue
 			}
 
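allEvictingDeviceTaints returns an iter.Seq[trackedTaint], Go 1.23's function-iterator form: the caller ranges over the returned closure, which invokes yield once per taint and stops as soon as yield returns false. A self-contained sketch of the same shape, with trackedTaint and the map contents as stand-ins:

    package main

    import (
        "fmt"
        "iter"
    )

    // trackedTaint is a stand-in for the controller's type.
    type trackedTaint struct{ rule string }

    // allTaints mimics the shape of allEvictingDeviceTaints: it iterates over a
    // map that is maintained elsewhere instead of listing rules on every call.
    func allTaints(evictingRules map[string]trackedTaint) iter.Seq[trackedTaint] {
        return func(yield func(trackedTaint) bool) {
            for _, taint := range evictingRules {
                if !yield(taint) { // caller broke out of its range loop
                    return
                }
            }
        }
    }

    func main() {
        rules := map[string]trackedTaint{"a": {rule: "a"}, "b": {rule: "b"}}
        for taint := range allTaints(rules) { // range over the function iterator
            fmt.Println(taint.rule)
        }
    }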
@@ -1298,6 +1282,7 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
 	if newRule == nil {
 		// Clean up to avoid memory leak.
 		delete(tc.taintRuleStats, oldRule.UID)
+		delete(tc.evictingRules, oldRule.Name)
 		// Removal from the work queue is handled when a worker handles the work item.
 		// A work queue does not support canceling work.
 	}
@@ -1310,6 +1295,13 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
 	}
 
 	// Rule spec changes should be rare. Simply do a brute-force re-evaluation of all allocated claims.
+	// Same with trying to avoid delete+add in evictingRules, the logic just becomes unnecessarily complex.
+	if oldRule != nil && oldRule.Spec.Taint.Effect == resourcealpha.DeviceTaintEffectNoExecute {
+		delete(tc.evictingRules, oldRule.Name)
+	}
+	if newRule != nil && newRule.Spec.Taint.Effect == resourcealpha.DeviceTaintEffectNoExecute {
+		tc.evictingRules[newRule.Name] = newRule
+	}
 	for name, oldAllocatedClaim := range tc.allocatedClaims {
 		newAllocatedClaim := allocatedClaim{
 			ResourceClaim: oldAllocatedClaim.ResourceClaim,