diff --git a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go index d785372dd04..0a8c4f3561f 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go @@ -104,7 +104,7 @@ func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.Cluster // isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether // that change made a previously unschedulable pod schedulable. func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - _, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) + originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) if err != nil { return framework.Queue, err } @@ -119,16 +119,27 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1 if err != nil { return framework.Queue, err } - if isMatched { - logger.V(4).Info("node was created or updated, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + if !isMatched { + logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.QueueSkip, nil + } + // Since the node was added and it matches the pod's affinity criteria, we can unblock it. + if originalNode == nil { + logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) return framework.Queue, nil } - - // TODO: also check if the original node meets the pod's requestments once preCheck is completely removed. 
- // See: https://github.com/kubernetes/kubernetes/issues/110175 - - logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) - return framework.QueueSkip, nil + // At this point we know the operation is an update, so we can narrow the criteria down to unmatch -> match changes only + // (the necessary affinity label was added to the node in this case). + wasMatched, err := requiredNodeAffinity.Match(originalNode) + if err != nil { + return framework.Queue, err + } + if wasMatched { + logger.V(5).Info("node was updated, but it matched the pod's NodeAffinity even before the update", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.QueueSkip, nil + } + logger.V(5).Info("node was updated and now matches the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.Queue, nil } // PreFilter builds and writes cycle state used by Filter. diff --git a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go index 5f668e7ef04..39a0ece58ac 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go @@ -1297,6 +1297,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { newObj: st.MakeNode().Label("foo", "bar").Obj(), expectedHint: framework.Queue, }, + "skip-unrelated-change-that-keeps-pod-schedulable": { + args: &config.NodeAffinityArgs{}, + pod: podWithNodeAffinity.Obj(), + oldObj: st.MakeNode().Label("foo", "bar").Obj(), + newObj: st.MakeNode().Capacity(nil).Label("foo", "bar").Obj(), + expectedHint: framework.QueueSkip, + }, "skip-queue-on-add-scheduler-enforced-node-affinity": { args: &config.NodeAffinityArgs{ AddedAffinity: &v1.NodeAffinity{ diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 90397f9087b..5a303bcfd6b 100644 --- 
a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -146,7 +146,7 @@ const ( type ClusterEventWithHint struct { Event ClusterEvent - // QueueingHintFn is executed for the plugin rejected by this plugin when the above Event happens, + // QueueingHintFn is executed for the Pod rejected by this plugin when the above Event happens, // and filters out events to reduce useless retry of Pod's scheduling. // It's an optional field. If not set, // the scheduling of Pods will be always retried with backoff when this Event happens. diff --git a/test/integration/scheduler/queue_test.go b/test/integration/scheduler/queue_test.go index 3ac0647d87d..735b81d0028 100644 --- a/test/integration/scheduler/queue_test.go +++ b/test/integration/scheduler/queue_test.go @@ -271,7 +271,48 @@ func TestCoreResourceEnqueue(t *testing.T) { // It causes pod1 to be requeued. // It causes pod2 not to be requeued. if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node1").Label("group", "b").Obj(), metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("failed to remove taints off the node: %w", err) + return fmt.Errorf("failed to update the node: %w", err) + } + return nil + }, + wantRequeuedPods: sets.New("pod1"), + }, + { + name: "Pod rejected by the NodeAffinity plugin is not requeued when an updated Node hasn't changed the 'match' verdict", + initialNodes: []*v1.Node{ + st.MakeNode().Name("node1").Label("group", "a").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(), + st.MakeNode().Name("node2").Label("group", "b").Obj()}, + pods: []*v1.Pod{ + // - The initial pod would be accepted by the NodeAffinity plugin for node1, but will be blocked by the NodeResources plugin. + // - The pod will be blocked by the NodeAffinity plugin for node2, therefore we know NodeAffinity will be queried for qhint for both testing nodes. 
+ st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"a"}, st.NodeSelectorTypeMatchExpressions).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(), + }, + triggerFn: func(testCtx *testutils.TestContext) error { + // Trigger a NodeUpdate event to add a new label. + // It won't cause the pod to be requeued, because there was a match already before the update, meaning this plugin wasn't blocking the scheduling. + if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("node1").Label("group", "a").Label("node", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update the node: %w", err) + } + return nil + }, + wantRequeuedPods: sets.Set[string]{}, + enableSchedulingQueueHint: []bool{true}, + }, + { + name: "Pod rejected by the NodeAffinity plugin is requeued when a Node is added", + initialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("group", "a").Obj()}, + pods: []*v1.Pod{ + // - Pod1 will be rejected by the NodeAffinity plugin. + st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(), + // - Pod2 will be rejected by the NodeAffinity plugin. + st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"c"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(), + }, + triggerFn: func(testCtx *testutils.TestContext) error { + // Trigger a NodeAdd event with the awaited label. + // It causes pod1 to be requeued. + // It causes pod2 not to be requeued. + if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("group", "b").Obj(), metav1.CreateOptions{}); err != nil { + return fmt.Errorf("failed to create the node: %w", err) } return nil },