Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #122292 from sanposhiho/nodeupdate
register Node/UpdateTaint event to plugins which have Node/Add only and don't have Node/UpdateTaint
This commit is contained in commit aa73f3163a.
@@ -396,7 +396,16 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint
 {Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPodSchedulingContextChange},
 // A resource might depend on node labels for topology filtering.
 // A new or updated node may make pods schedulable.
-{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+//
+// A note about UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// As a common problematic scenario,
+// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+// In such cases, this plugin may miss some events that actually make pods schedulable.
+// As a workaround, we add UpdateNodeTaint event to catch the case.
+// We can remove UpdateNodeTaint when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
+{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 // A pod might be waiting for a class to get created or modified.
 {Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
 }
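The same pattern recurs in every plugin touched below: anything that reacts to Node/Add also subscribes to Node/UpdateNodeTaint, so that a node whose creation event was filtered out by preCheck is still noticed once its not-ready taint is removed. A minimal sketch of what that registration looks like for a hypothetical plugin (the plugin name is illustrative and not part of this change):

    // EventsToRegister for a hypothetical plugin that only cares about new Nodes.
    // Combining UpdateNodeTaint with Add is the workaround described in the comment above.
    func (pl *examplePlugin) EventsToRegister() []framework.ClusterEventWithHint {
    	return []framework.ClusterEventWithHint{
    		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
    		// No QueueingHintFn here; the scheduler falls back to the default hint for this event.
    	}
    }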
@@ -64,8 +64,17 @@ func (pl *InterPodAffinity) EventsToRegister() []framework.ClusterEventWithHint
 // an unschedulable Pod schedulable.
 // - Add. An unschedulable Pod may fail due to violating pod-affinity constraints,
 // adding an assigned Pod may make it schedulable.
+//
+// A note about UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// As a common problematic scenario,
+// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+// In such cases, this plugin may miss some events that actually make pods schedulable.
+// As a workaround, we add UpdateNodeTaint event to catch the case.
+// We can remove UpdateNodeTaint when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
 {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}},
-{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 }
 }
 
@@ -146,7 +146,16 @@ func (pl *PodTopologySpread) EventsToRegister() []framework.ClusterEventWithHint
 {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}, QueueingHintFn: pl.isSchedulableAfterPodChange},
 // Node add|delete|update maybe lead an topology key changed,
 // and make these pod in scheduling schedulable or unschedulable.
-{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.Update}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
+//
+// A note about UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// As a common problematic scenario,
+// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+// In such cases, this plugin may miss some events that actually make pods schedulable.
+// As a workaround, we add UpdateNodeTaint event to catch the case.
+// We can remove UpdateNodeTaint when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
+{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
 }
 }
 
@@ -103,7 +103,16 @@ func (pl *VolumeBinding) EventsToRegister() []framework.ClusterEventWithHint {
 // Pods may fail to find available PVs because the node labels do not
 // match the storage class's allowed topologies or PV's node affinity.
 // A new or updated node may make pods schedulable.
-{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+//
+// A note about UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// As a common problematic scenario,
+// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+// In such cases, this plugin may miss some events that actually make pods schedulable.
+// As a workaround, we add UpdateNodeTaint event to catch the case.
+// We can remove UpdateNodeTaint when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
+{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 // We rely on CSI node to translate in-tree PV to CSI.
 {Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}},
 // When CSIStorageCapacity is enabled, pods may become schedulable
@@ -280,7 +280,16 @@ func (pl *VolumeZone) EventsToRegister() []framework.ClusterEventWithHint {
 // Due to immutable field `storageClass.volumeBindingMode`, storageClass update events are ignored.
 {Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add}},
 // A new node or updating a node's volume zone labels may make a pod schedulable.
-{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+//
+// A note about UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// As a common problematic scenario,
+// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+// In such cases, this plugin may miss some events that actually make pods schedulable.
+// As a workaround, we add UpdateNodeTaint event to catch the case.
+// We can remove UpdateNodeTaint when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
+{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 // A new pvc may make a pod schedulable.
 // Due to fields are immutable except `spec.resources`, pvc update events are ignored.
 {Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}},
@@ -73,6 +73,15 @@ const (
 // - a Pod that was assumed, but gets un-assumed due to some errors in the binding cycle.
 // - an existing Pod that was unscheduled but gets scheduled to a Node.
 Pod GVK = "Pod"
+// A note about NodeAdd event and UpdateNodeTaint event:
+// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+// It's definitely not something expected for plugin developers,
+// and registering UpdateNodeTaint event is the only mitigation for now.
+// So, kube-scheduler registers the UpdateNodeTaint event for plugins that have a NodeAdded event but don't have an UpdateNodeTaint event.
+// It has a bad impact on requeuing efficiency, but it's a lot better than having some Pods stuck in the
+// unschedulable pod pool.
+// This behavior will be removed when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
 Node GVK = "Node"
 PersistentVolume GVK = "PersistentVolume"
 PersistentVolumeClaim GVK = "PersistentVolumeClaim"
@@ -118,6 +118,7 @@ type SchedulingQueue interface {
 AssignedPodAdded(logger klog.Logger, pod *v1.Pod)
 AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod)
 PendingPods() ([]*v1.Pod, string)
+PodsInActiveQ() []*v1.Pod
 // Close closes the SchedulingQueue so that the goroutine which is
 // waiting to pop items can exit gracefully.
 Close()
@@ -1227,6 +1228,18 @@ func (p *PriorityQueue) getUnschedulablePodsWithMatchingAffinityTerm(logger klog
 return podsToMove
 }
 
+// PodsInActiveQ returns all the Pods in the activeQ.
+// This function is only used in tests.
+func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
+p.lock.RLock()
+defer p.lock.RUnlock()
+var result []*v1.Pod
+for _, pInfo := range p.activeQ.List() {
+result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
+}
+return result
+}
+
 var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
 
 // PendingPods returns all the pending pods in the queue; accompanied by a debugging string
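The new PodsInActiveQ accessor is what lets the rewritten integration test observe which Pods were requeued without popping them off the queue. A rough usage sketch, assuming the testCtx, ctx, and wantRequeuedPods values from that test (none of these names are new API):

    requeuedPods := sets.New[string]()
    if err := wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
    	requeuedPods = sets.New[string]() // rebuild the set on every poll
    	for _, pod := range testCtx.Scheduler.SchedulingQueue.PodsInActiveQ() {
    		requeuedPods.Insert(pod.Name)
    	}
    	return requeuedPods.Equal(wantRequeuedPods), nil
    }); err != nil {
    	t.Fatalf("expected %v to be requeued, got %v", wantRequeuedPods, requeuedPods)
    }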
@@ -62,8 +62,8 @@ const (
 numberOfHighestScoredNodesToReport = 3
 )
 
-// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
-func (sched *Scheduler) scheduleOne(ctx context.Context) {
+// ScheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
+func (sched *Scheduler) ScheduleOne(ctx context.Context) {
 logger := klog.FromContext(ctx)
 podInfo, err := sched.NextPod(logger)
 if err != nil {
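Exporting scheduleOne as ScheduleOne is what allows the integration test below to drive scheduling one Pod at a time instead of starting the scheduler's run loop. Both call sites appear later in this diff; shown together here for contrast (sched, ctx, and testCtx are the names used there):

    // Production: Scheduler.Run keeps calling ScheduleOne until the context is cancelled.
    go wait.UntilWithContext(ctx, sched.ScheduleOne, 0)

    // Tests: each direct call pops and schedules exactly one pending Pod.
    testCtx.Scheduler.ScheduleOne(testCtx.Ctx)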
@@ -811,7 +811,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
 if err != nil {
 t.Fatal(err)
 }
-sched.scheduleOne(ctx)
+sched.ScheduleOne(ctx)
 <-called
 if e, a := item.expectAssumedPod, gotAssumedPod; !reflect.DeepEqual(e, a) {
 t.Errorf("assumed pod: wanted %v, got %v", e, a)
@@ -884,7 +884,7 @@ func TestSchedulerNoPhantomPodAfterExpire(t *testing.T) {
 // We use conflicted pod ports to incur fit predicate failure if first pod not removed.
 secondPod := podWithPort("bar", "", 8080)
 queuedPodStore.Add(secondPod)
-scheduler.scheduleOne(ctx)
+scheduler.ScheduleOne(ctx)
 select {
 case b := <-bindingChan:
 expectBinding := &v1.Binding{
@@ -921,7 +921,7 @@ func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) {
 // queuedPodStore: [bar:8080]
 // cache: [(assumed)foo:8080]
 
-scheduler.scheduleOne(ctx)
+scheduler.ScheduleOne(ctx)
 select {
 case err := <-errChan:
 expectErr := &framework.FitError{
@@ -954,7 +954,7 @@ func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) {
 }
 
 queuedPodStore.Add(secondPod)
-scheduler.scheduleOne(ctx)
+scheduler.ScheduleOne(ctx)
 select {
 case b := <-bindingChan:
 expectBinding := &v1.Binding{
@@ -1030,7 +1030,7 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 scheduler, _, errChan := setupTestScheduler(ctx, t, queuedPodStore, scache, informerFactory, nil, fns...)
 
 queuedPodStore.Add(podWithTooBigResourceRequests)
-scheduler.scheduleOne(ctx)
+scheduler.ScheduleOne(ctx)
 select {
 case err := <-errChan:
 expectErr := &framework.FitError{
@@ -1160,7 +1160,7 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
 if err != nil {
 t.Fatal(err)
 }
-s.scheduleOne(ctx)
+s.ScheduleOne(ctx)
 // Wait for pod to succeed or fail scheduling
 select {
 case <-eventChan:
@@ -3481,7 +3481,7 @@ func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queue
 // queuedPodStore: [foo:8080]
 // cache: []
 
-scheduler.scheduleOne(ctx)
+scheduler.ScheduleOne(ctx)
 // queuedPodStore: []
 // cache: [(assumed)foo:8080]
 
@@ -389,17 +389,47 @@ func buildQueueingHintMap(es []framework.EnqueueExtensions) internalqueue.Queuei
 // cannot be moved by any regular cluster event.
 // So, we can just ignore such EventsToRegister here.
 
+registerNodeAdded := false
+registerNodeTaintUpdated := false
 for _, event := range events {
 fn := event.QueueingHintFn
 if fn == nil || !utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
 fn = defaultQueueingHintFn
 }
 
+if event.Event.Resource == framework.Node {
+if event.Event.ActionType&framework.Add != 0 {
+registerNodeAdded = true
+}
+if event.Event.ActionType&framework.UpdateNodeTaint != 0 {
+registerNodeTaintUpdated = true
+}
+}
+
 queueingHintMap[event.Event] = append(queueingHintMap[event.Event], &internalqueue.QueueingHintFunction{
 PluginName: e.Name(),
 QueueingHintFn: fn,
 })
 }
+if registerNodeAdded && !registerNodeTaintUpdated {
+// Temporary fix for the issue https://github.com/kubernetes/kubernetes/issues/109437
+// NodeAdded QueueingHint isn't always called because of preCheck.
+// It's definitely not something expected for plugin developers,
+// and registering UpdateNodeTaint event is the only mitigation for now.
+//
+// So, here we register the UpdateNodeTaint event for plugins that have a NodeAdded event but don't have an UpdateNodeTaint event.
+// It has a bad impact on requeuing efficiency, but it's a lot better than having some Pods stuck in the
+// unschedulable pod pool.
+// This behavior will be removed when we remove the preCheck feature.
+// See: https://github.com/kubernetes/kubernetes/issues/110175
+queueingHintMap[framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}] =
+append(queueingHintMap[framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}],
+&internalqueue.QueueingHintFunction{
+PluginName: e.Name(),
+QueueingHintFn: defaultQueueingHintFn,
+},
+)
+}
 }
 return queueingHintMap
 }
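buildQueueingHintMap can inspect a single registered event for several actions because ActionType is a bit-flag type: actions are combined with OR at registration time and tested with AND here. A small self-contained sketch of that pattern; the constant values below are illustrative stand-ins, not the framework's real ones:

    package main

    import "fmt"

    // ActionType mirrors the flag-style action type used by the scheduler framework;
    // the concrete values here are made up for illustration.
    type ActionType uint64

    const (
    	Add ActionType = 1 << iota
    	Delete
    	UpdateNodeLabel
    	UpdateNodeTaint
    )

    func main() {
    	// One event registration can combine several actions with OR ...
    	registered := Add | UpdateNodeLabel | UpdateNodeTaint

    	// ... and individual actions are checked with AND, exactly like the
    	// registerNodeAdded / registerNodeTaintUpdated checks above.
    	fmt.Println(registered&Add != 0)             // true
    	fmt.Println(registered&UpdateNodeTaint != 0) // true
    	fmt.Println(registered&Delete != 0)          // false
    }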
@@ -415,7 +445,7 @@ func (sched *Scheduler) Run(ctx context.Context) {
 // If there are no new pods to schedule, it will be hanging there
 // and if done in this goroutine it will be blocking closing
 // SchedulingQueue, in effect causing a deadlock on shutdown.
-go wait.UntilWithContext(ctx, sched.scheduleOne, 0)
+go wait.UntilWithContext(ctx, sched.ScheduleOne, 0)
 
 <-ctx.Done()
 sched.SchedulingQueue.Close()
@@ -662,6 +662,9 @@ func Test_buildQueueingHintMap(t *testing.T) {
 {Resource: framework.Node, ActionType: framework.Add}: {
 {PluginName: fakeNode, QueueingHintFn: fakeNodePluginQueueingFn},
 },
+{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}: {
+{PluginName: fakeNode, QueueingHintFn: defaultQueueingHintFn}, // When Node/Add is registered, Node/UpdateNodeTaint is automatically registered.
+},
 },
 },
 {
@@ -675,6 +678,9 @@ func Test_buildQueueingHintMap(t *testing.T) {
 {Resource: framework.Node, ActionType: framework.Add}: {
 {PluginName: fakeNode, QueueingHintFn: defaultQueueingHintFn}, // default queueing hint due to disabled feature gate.
 },
+{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}: {
+{PluginName: fakeNode, QueueingHintFn: defaultQueueingHintFn}, // When Node/Add is registered, Node/UpdateNodeTaint is automatically registered.
+},
 },
 },
 {
@@ -692,6 +698,9 @@ func Test_buildQueueingHintMap(t *testing.T) {
 {Resource: framework.Node, ActionType: framework.Add}: {
 {PluginName: fakeNode, QueueingHintFn: fakeNodePluginQueueingFn},
 },
+{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}: {
+{PluginName: fakeNode, QueueingHintFn: defaultQueueingHintFn}, // When Node/Add is registered, Node/UpdateNodeTaint is automatically registered.
+},
 },
 },
 }
@@ -800,7 +809,7 @@ func Test_UnionedGVKs(t *testing.T) {
 Disabled: []schedulerapi.Plugin{{Name: "*"}}, // disable default plugins
 },
 want: map[framework.GVK]framework.ActionType{
-framework.Node: framework.Add,
+framework.Node: framework.Add | framework.UpdateNodeTaint, // When Node/Add is registered, Node/UpdateNodeTaint is automatically registered.
 },
 },
 {
@@ -830,7 +839,7 @@ func Test_UnionedGVKs(t *testing.T) {
 },
 want: map[framework.GVK]framework.ActionType{
 framework.Pod: framework.Add,
-framework.Node: framework.Add,
+framework.Node: framework.Add | framework.UpdateNodeTaint, // When Node/Add is registered, Node/UpdateNodeTaint is automatically registered.
 },
 },
 {
@@ -31,6 +31,7 @@ import (
 "k8s.io/apimachinery/pkg/runtime"
 "k8s.io/apimachinery/pkg/runtime/schema"
 "k8s.io/apimachinery/pkg/types"
+"k8s.io/apimachinery/pkg/util/sets"
 "k8s.io/apimachinery/pkg/util/uuid"
 "k8s.io/apimachinery/pkg/util/wait"
 utilfeature "k8s.io/apiserver/pkg/util/feature"
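The new k8s.io/apimachinery/pkg/util/sets import is the generic set helper that the rewritten test uses to compare the requeued Pods against the expected set. A quick sketch of the three operations the test relies on (sets.New, Insert, Equal):

    package main

    import (
    	"fmt"

    	"k8s.io/apimachinery/pkg/util/sets"
    )

    func main() {
    	// wantRequeuedPods in the test is built the same way.
    	want := sets.New("pod2")

    	// The test rebuilds this set on every poll from PodsInActiveQ().
    	got := sets.Set[string]{}
    	got.Insert("pod2")

    	fmt.Println(got.Equal(want)) // true
    }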
@@ -154,6 +155,77 @@ func TestSchedulingGates(t *testing.T) {
 // TestCoreResourceEnqueue verify Pods failed by in-tree default plugins can be
 // moved properly upon their registered events.
 func TestCoreResourceEnqueue(t *testing.T) {
+tests := []struct {
+name string
+// initialNode is the Node to be created at first.
+initialNode *v1.Node
+// initialPod is the Pod to be created at first if it's not empty.
+initialPod *v1.Pod
+// pods are the list of Pods to be created.
+// All of them are expected to be unschedulable at first.
+pods []*v1.Pod
+// triggerFn is the function that triggers the event to move Pods.
+triggerFn func(testCtx *testutils.TestContext) error
+// wantRequeuedPods is the map of Pods that are expected to be requeued after triggerFn.
+wantRequeuedPods sets.Set[string]
+}{
+{
+name: "Pod without a required toleration to a node isn't requeued to activeQ",
+initialNode: st.MakeNode().Name("fake-node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj(),
+pods: []*v1.Pod{
+// - Pod1 doesn't have the required toleration and will be rejected by the TaintToleration plugin.
+// (TaintToleration plugin is evaluated before NodeResourcesFit plugin.)
+// - Pod2 has the required toleration, but requests a large amount of CPU - will be rejected by the NodeResourcesFit plugin.
+st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(),
+st.MakePod().Name("pod2").Toleration(v1.TaintNodeNotReady).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(),
+},
+triggerFn: func(testCtx *testutils.TestContext) error {
+// Trigger a NodeChange event by increasing CPU capacity.
+// It makes Pod2 schedulable.
+// Pod1 is not requeued because the Node is still unready and it doesn't have the required toleration.
+if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj(), metav1.UpdateOptions{}); err != nil {
+return fmt.Errorf("failed to update the node: %w", err)
+}
+return nil
+},
+wantRequeuedPods: sets.New("pod2"),
+},
+{
+name: "Pod rejected by the PodAffinity plugin is requeued when a new Node is created and turned to ready",
+initialNode: st.MakeNode().Name("fake-node").Label("node", "fake-node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
+initialPod: st.MakePod().Label("anti", "anti").Name("pod1").PodAntiAffinityExists("anti", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Node("fake-node").Obj(),
+pods: []*v1.Pod{
+// - Pod2 will be rejected by the PodAffinity plugin.
+st.MakePod().Label("anti", "anti").Name("pod2").PodAntiAffinityExists("anti", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+},
+triggerFn: func(testCtx *testutils.TestContext) error {
+// Trigger a NodeCreated event.
+// Note that this Node has an un-ready taint, and pod2 should ideally be requeued because the only unschedulable plugin registered for pod2 is PodAffinity.
+// However, due to preCheck, it's not requeueing pod2 to activeQ.
+// It'll be fixed by the removal of preCheck in the future.
+// https://github.com/kubernetes/kubernetes/issues/110175
+node := st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()
+if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Taints([]v1.Taint{{Key: "foo", Effect: v1.TaintEffectNoSchedule}}).Obj(), metav1.CreateOptions{}); err != nil {
+return fmt.Errorf("failed to create a new node: %w", err)
+}
+
+// As a mitigation of the issue described above, all plugins subscribing to the Node/Add event register UpdateNodeTaint too.
+// So, this removal of taint moves pod2 to activeQ.
+node.Spec.Taints = nil
+if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, node, metav1.UpdateOptions{}); err != nil {
+return fmt.Errorf("failed to remove taints off the node: %w", err)
+}
+return nil
+},
+wantRequeuedPods: sets.New("pod2"),
+},
+}
+
+for _, featureEnabled := range []bool{false, true} {
+for _, tt := range tests {
+t.Run(fmt.Sprintf("%s [SchedulerQueueingHints enabled: %v]", tt.name, featureEnabled), func(t *testing.T) {
+defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, featureEnabled)()
+
 // Use zero backoff seconds to bypass backoffQ.
 // It's intended to not start the scheduler's queue, and hence to
 // not start any flushing logic. We will pop and schedule the Pods manually later.
@@ -170,72 +242,67 @@ func TestCoreResourceEnqueue(t *testing.T) {
 
 cs, ns, ctx := testCtx.ClientSet, testCtx.NS.Name, testCtx.Ctx
 // Create one Node with a taint.
-node := st.MakeNode().Name("fake-node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj()
-node.Spec.Taints = []v1.Taint{{Key: "foo", Effect: v1.TaintEffectNoSchedule}}
-if _, err := cs.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}); err != nil {
-t.Fatalf("Failed to create Node %q: %v", node.Name, err)
+if _, err := cs.CoreV1().Nodes().Create(ctx, tt.initialNode, metav1.CreateOptions{}); err != nil {
+t.Fatalf("Failed to create an initial Node %q: %v", tt.initialNode.Name, err)
 }
 
-// Create two Pods that are both unschedulable.
-// - Pod1 is a best-effort Pod, but doesn't have the required toleration.
-// - Pod2 requests a large amount of CPU resource that the node cannot fit.
-// Note: Pod2 will fail the tainttoleration plugin b/c that's ordered prior to noderesources.
-// - Pod3 has the required toleration, but requests a non-existing PVC.
-pod1 := st.MakePod().Namespace(ns).Name("pod1").Container("image").Obj()
-pod2 := st.MakePod().Namespace(ns).Name("pod2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj()
-pod3 := st.MakePod().Namespace(ns).Name("pod3").Toleration("foo").PVC("pvc").Container("image").Obj()
-for _, pod := range []*v1.Pod{pod1, pod2, pod3} {
+if tt.initialPod != nil {
+if _, err := cs.CoreV1().Pods(ns).Create(ctx, tt.initialPod, metav1.CreateOptions{}); err != nil {
+t.Fatalf("Failed to create an initial Pod %q: %v", tt.initialPod.Name, err)
+}
+}
+
+for _, pod := range tt.pods {
 if _, err := cs.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
 t.Fatalf("Failed to create Pod %q: %v", pod.Name, err)
 }
 }
 
-// Wait for the three pods to be present in the scheduling queue.
+// Wait for the tt.pods to be present in the scheduling queue.
 if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
 pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods()
-return len(pendingPods) == 3, nil
+return len(pendingPods) == len(tt.pods), nil
 }); err != nil {
 t.Fatal(err)
 }
 
-// Pop the three pods out. They should be unschedulable.
-for i := 0; i < 3; i++ {
-podInfo := testutils.NextPodOrDie(t, testCtx)
-fwk, ok := testCtx.Scheduler.Profiles[podInfo.Pod.Spec.SchedulerName]
-if !ok {
-t.Fatalf("Cannot find the profile for Pod %v", podInfo.Pod.Name)
+t.Log("Confirmed Pods in the scheduling queue, starting to schedule them")
+
+// Pop all pods out. They should be unschedulable.
+for i := 0; i < len(tt.pods); i++ {
+testCtx.Scheduler.ScheduleOne(testCtx.Ctx)
 }
-// Schedule the Pod manually.
-_, fitError := testCtx.Scheduler.SchedulePod(ctx, fwk, framework.NewCycleState(), podInfo.Pod)
-if fitError == nil {
-t.Fatalf("Expect Pod %v to fail at scheduling.", podInfo.Pod.Name)
-}
-testCtx.Scheduler.FailureHandler(ctx, fwk, podInfo, framework.NewStatus(framework.Unschedulable).WithError(fitError), nil, time.Now())
+// Wait for the tt.pods to be still present in the scheduling queue.
+if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods()
+return len(pendingPods) == len(tt.pods), nil
+}); err != nil {
+t.Fatal(err)
 }
 
-// Trigger a NodeTaintChange event.
-// We expect this event to trigger moving the test Pod from unschedulablePods to activeQ.
-node.Spec.Taints = nil
-if _, err := cs.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil {
-t.Fatalf("Failed to remove taints off the node: %v", err)
+t.Log("finished initial schedulings for all Pods, will trigger triggerFn")
+
+err := tt.triggerFn(testCtx)
+if err != nil {
+t.Fatalf("Failed to trigger the event: %v", err)
 }
 
-// Now we should be able to pop the Pod from activeQ again.
-podInfo := testutils.NextPodOrDie(t, testCtx)
-if podInfo.Attempts != 2 {
-t.Fatalf("Expected the Pod to be attempted 2 times, but got %v", podInfo.Attempts)
-}
-if got := podInfo.Pod.Name; got != "pod1" {
-t.Fatalf("Expected pod1 to be popped, but got %v", got)
+t.Log("triggered tt.triggerFn, will check if tt.requeuedPods are requeued")
+
+// Wait for the tt.pods to be still present in the scheduling queue.
+var requeuedPods sets.Set[string]
+if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+requeuedPods = sets.Set[string]{} // reset
+for _, requeuedPod := range testCtx.Scheduler.SchedulingQueue.PodsInActiveQ() {
+requeuedPods.Insert(requeuedPod.Name)
 }
 
-// Pod2 and Pod3 are not expected to be popped out.
-// - Although the failure reason has been lifted, Pod2 still won't be moved to active due to
-// the node event's preCheckForNode().
-// - Regarding Pod3, the NodeTaintChange event is irrelevant with its scheduling failure.
-podInfo = testutils.NextPod(t, testCtx)
-if podInfo != nil {
-t.Fatalf("Unexpected pod %v get popped out", podInfo.Pod.Name)
+return requeuedPods.Equal(tt.wantRequeuedPods), nil
+}); err != nil {
+t.Fatalf("Expect Pods %v to be requeued, but %v are requeued actually", tt.wantRequeuedPods, requeuedPods)
+}
+})
+}
 }
 }
 
@@ -1159,6 +1159,8 @@ func NextPodOrDie(t *testing.T, testCtx *TestContext) *schedulerframework.Queued
 }
 
 // NextPod returns the next Pod in the scheduler queue, with a 5 seconds timeout.
+// Note that this function leaks goroutines in the case of timeout; even after this function returns on timeout,
+// the goroutine made by this function keeps waiting to pop a pod from the queue.
 func NextPod(t *testing.T, testCtx *TestContext) *schedulerframework.QueuedPodInfo {
 t.Helper()
 