diff --git a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go index 7cc7988061b..5c309fc9360 100644 --- a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go +++ b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go @@ -51,12 +51,56 @@ const ( // EventsToRegister returns the possible events that may make a Pod // failed by this plugin schedulable. func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) { - // preCheck is not used when QHint is enabled. + if !pl.enableSchedulingQueueHint { + return []framework.ClusterEventWithHint{ + // A note about UpdateNodeLabel event: + // Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin. + // But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add. + // See: https://github.com/kubernetes/kubernetes/issues/109437 + {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange}, + }, nil + } + return []framework.ClusterEventWithHint{ + // When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event. {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange}, + // When the QueueingHint feature is enabled, + // the scheduling queue uses Pod/Update Queueing Hint + // to determine whether a Pod's update makes the Pod schedulable or not. + // https://github.com/kubernetes/kubernetes/pull/122234 + {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange}, }, nil } +// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed. +func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { + _, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj) + if err != nil { + return framework.Queue, err + } + + if pod.UID == modifiedPod.UID { + // Note: we don't need to check oldPod tolerations the taint because: + // - Taint can be added, but can't be modified nor removed. + // - If the Pod already has the toleration, it shouldn't have rejected by this plugin in the first place. + // Meaning, here this Pod has been rejected by this plugin, and hence it shouldn't have the toleration yet. + if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{ + Key: v1.TaintNodeUnschedulable, + Effect: v1.TaintEffectNoSchedule, + }) { + // This update makes the pod tolerate the unschedulable taint. + logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod)) + return framework.Queue, nil + } + logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod)) + return framework.QueueSkip, nil + } + + logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod)) + + return framework.QueueSkip, nil +} + // isSchedulableAfterNodeChange is invoked for all node events reported by // an informer. It checks whether that change made a previously unschedulable // pod schedulable. diff --git a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go index d9659d6c66c..862190af452 100644 --- a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go +++ b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go @@ -23,6 +23,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" + st "k8s.io/kubernetes/pkg/scheduler/testing" "k8s.io/kubernetes/test/utils/ktesting" ) @@ -96,14 +97,14 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { expectedErr bool }{ { - name: "backoff-wrong-new-object", + name: "error-wrong-new-object", pod: &v1.Pod{}, newObj: "not-a-node", expectedHint: framework.Queue, expectedErr: true, }, { - name: "backoff-wrong-old-object", + name: "error-wrong-old-object", pod: &v1.Pod{}, newObj: &v1.Node{ Spec: v1.NodeSpec{ @@ -186,3 +187,64 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { }) } } + +func TestIsSchedulableAfterPodTolerationChange(t *testing.T) { + testCases := []struct { + name string + pod *v1.Pod + oldObj, newObj interface{} + expectedHint framework.QueueingHint + expectedErr bool + }{ + { + name: "error-wrong-new-object", + pod: &v1.Pod{}, + newObj: "not-a-pod", + expectedHint: framework.Queue, + expectedErr: true, + }, + { + name: "error-wrong-old-object", + pod: &v1.Pod{}, + newObj: &v1.Pod{}, + oldObj: "not-a-pod", + expectedHint: framework.Queue, + expectedErr: true, + }, + { + name: "skip-different-pod-update", + pod: st.MakePod().UID("uid").Obj(), + newObj: st.MakePod().UID("different-uid").Toleration(v1.TaintNodeUnschedulable).Obj(), + oldObj: st.MakePod().UID("different-uid").Obj(), + expectedHint: framework.QueueSkip, + }, + { + name: "queue-when-the-unsched-pod-gets-toleration", + pod: st.MakePod().UID("uid").Obj(), + newObj: st.MakePod().UID("uid").Toleration(v1.TaintNodeUnschedulable).Obj(), + oldObj: st.MakePod().UID("uid").Obj(), + expectedHint: framework.Queue, + }, + { + name: "skip-when-the-unsched-pod-gets-unrelated-toleration", + pod: st.MakePod().UID("uid").Obj(), + newObj: st.MakePod().UID("uid").Toleration("unrelated-key").Obj(), + oldObj: st.MakePod().UID("uid").Obj(), + expectedHint: framework.QueueSkip, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + logger, _ := ktesting.NewTestContext(t) + pl := &NodeUnschedulable{} + got, err := pl.isSchedulableAfterPodTolerationChange(logger, testCase.pod, testCase.oldObj, testCase.newObj) + if err != nil && !testCase.expectedErr { + t.Errorf("unexpected error: %v", err) + } + if got != testCase.expectedHint { + t.Errorf("isSchedulableAfterNodeChange() = %v, want %v", got, testCase.expectedHint) + } + }) + } +} diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 90427b73237..90397f9087b 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -69,7 +69,8 @@ const ( UpdatePodLabel // UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced). UpdatePodScaleDown - // UpdatePodTolerations is an update for pod's tolerations. + // UpdatePodTolerations is an addition for pod's tolerations. + // (Due to API validation, we can add, but cannot modify or remove tolerations.) UpdatePodTolerations // UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod. UpdatePodSchedulingGatesEliminated