From c7db4bb450636e91183c3c68898978c2952626b9 Mon Sep 17 00:00:00 2001 From: dom4ha Date: Wed, 18 Sep 2024 13:36:52 +0000 Subject: [PATCH] Fine grain QueueHints for nodeaffinity plugin. Skip queue on unrelated change that keeps pod schedulable when QueueHints are enabled. Split add from QHints disabled case Remove case when QHints are disabled Remove two GHint alternatives in unit tests more fine-grained Node QHint for NodeResourceFit plugin Return early when updated Node causes unmatch Revert "more fine-grained Node QHint for NodeResourceFit plugin" This reverts commit dfbceb60e0c1c4e47748c12722d9ed6dba1a8366. Add integration test for requeue of a pod previously rejected by NodeAffinity plugin when a suitable Node is added Add integratin test for a Node update operation that does not trigger requeue in NodeAffinity plugin Remove innacurrate comment Apply review comments --- .../plugins/nodeaffinity/node_affinity.go | 29 +++++++++---- .../nodeaffinity/node_affinity_test.go | 7 +++ pkg/scheduler/framework/types.go | 2 +- test/integration/scheduler/queue_test.go | 43 ++++++++++++++++++- 4 files changed, 70 insertions(+), 11 deletions(-) diff --git a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go index d785372dd04..0a8c4f3561f 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go @@ -104,7 +104,7 @@ func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.Cluster // isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether // that change made a previously unschedulable pod schedulable. func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - _, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) + originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) if err != nil { return framework.Queue, err } @@ -119,16 +119,27 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1 if err != nil { return framework.Queue, err } - if isMatched { - logger.V(4).Info("node was created or updated, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + if !isMatched { + logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.QueueSkip, nil + } + // Since the node was added and it matches the pod's affinity criteria, we can unblock it. + if originalNode == nil { + logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) return framework.Queue, nil } - - // TODO: also check if the original node meets the pod's requestments once preCheck is completely removed. - // See: https://github.com/kubernetes/kubernetes/issues/110175 - - logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) - return framework.QueueSkip, nil + // At this point we know the operation is update so we can narrow down the criteria to unmatch -> match changes only + // (necessary affinity label was added to the node in this case). + wasMatched, err := requiredNodeAffinity.Match(originalNode) + if err != nil { + return framework.Queue, err + } + if wasMatched { + logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.QueueSkip, nil + } + logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.Queue, nil } // PreFilter builds and writes cycle state used by Filter. diff --git a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go index 5f668e7ef04..39a0ece58ac 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go @@ -1297,6 +1297,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { newObj: st.MakeNode().Label("foo", "bar").Obj(), expectedHint: framework.Queue, }, + "skip-unrelated-change-that-keeps-pod-schedulable": { + args: &config.NodeAffinityArgs{}, + pod: podWithNodeAffinity.Obj(), + oldObj: st.MakeNode().Label("foo", "bar").Obj(), + newObj: st.MakeNode().Capacity(nil).Label("foo", "bar").Obj(), + expectedHint: framework.QueueSkip, + }, "skip-queue-on-add-scheduler-enforced-node-affinity": { args: &config.NodeAffinityArgs{ AddedAffinity: &v1.NodeAffinity{ diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 90397f9087b..5a303bcfd6b 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -146,7 +146,7 @@ const ( type ClusterEventWithHint struct { Event ClusterEvent - // QueueingHintFn is executed for the plugin rejected by this plugin when the above Event happens, + // QueueingHintFn is executed for the Pod rejected by this plugin when the above Event happens, // and filters out events to reduce useless retry of Pod's scheduling. // It's an optional field. If not set, // the scheduling of Pods will be always retried with backoff when this Event happens. diff --git a/test/integration/scheduler/queue_test.go b/test/integration/scheduler/queue_test.go index 9877d7f2625..d0b64d8d99e 100644 --- a/test/integration/scheduler/queue_test.go +++ b/test/integration/scheduler/queue_test.go @@ -271,7 +271,48 @@ func TestCoreResourceEnqueue(t *testing.T) { // It causes pod1 to be requeued. // It causes pod2 not to be requeued. if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node1").Label("group", "b").Obj(), metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("failed to remove taints off the node: %w", err) + return fmt.Errorf("failed to update the pod: %w", err) + } + return nil + }, + wantRequeuedPods: sets.New("pod1"), + }, + { + name: "Pod rejected by the NodeAffinity plugin is not requeued when an updated Node haven't changed the 'match' verdict", + initialNodes: []*v1.Node{ + st.MakeNode().Name("node1").Label("group", "a").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(), + st.MakeNode().Name("node2").Label("group", "b").Obj()}, + pods: []*v1.Pod{ + // - The initial pod would be accepted by the NodeAffinity plugin for node1, but will be blocked by the NodeResources plugin. + // - The pod will be blocked by the NodeAffinity plugin for node2, therefore we know NodeAffinity will be queried for qhint for both testing nodes. + st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"a"}, st.NodeSelectorTypeMatchExpressions).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(), + }, + triggerFn: func(testCtx *testutils.TestContext) error { + // Trigger a NodeUpdate event to add new label. + // It won't cause pod to be requeued, because there was a match already before the update, meaning this plugin wasn't blocking the scheduling. + if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("node1").Label("group", "a").Label("node", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update the pod: %w", err) + } + return nil + }, + wantRequeuedPods: sets.Set[string]{}, + enableSchedulingQueueHint: []bool{true}, + }, + { + name: "Pod rejected by the NodeAffinity plugin is requeued when a Node is added", + initialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("group", "a").Obj()}, + pods: []*v1.Pod{ + // - Pod1 will be rejected by the NodeAffinity plugin. + st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(), + // - Pod2 will be rejected by the NodeAffinity plugin. + st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"c"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(), + }, + triggerFn: func(testCtx *testutils.TestContext) error { + // Trigger a NodeAdd event with the awaited label. + // It causes pod1 to be requeued. + // It causes pod2 not to be requeued. + if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("group", "b").Obj(), metav1.CreateOptions{}); err != nil { + return fmt.Errorf("failed to update the pod: %w", err) } return nil },