From c725e18e079d5208b77dad5f94893feea3e6dcb0 Mon Sep 17 00:00:00 2001
From: googs1025
Date: Thu, 19 Sep 2024 18:22:56 +0800
Subject: [PATCH] feature(scheduler): more fine-grained QHints for
 interpodaffinity plugin

---
 .../plugins/interpodaffinity/plugin.go        | 66 ++++++++++++--
 .../plugins/interpodaffinity/plugin_test.go   | 68 ++++++++++++--
 test/integration/scheduler/queueing/queue.go  | 91 +++++++++++++++++--
 3 files changed, 199 insertions(+), 26 deletions(-)

diff --git a/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go b/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
index f29994d3dd3..78d6d65e824 100644
--- a/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
+++ b/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
@@ -211,7 +211,7 @@ func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod
 }
 
 func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
-	_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
+	originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
 	if err != nil {
 		return framework.Queue, err
 	}
@@ -221,11 +221,35 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
 		return framework.Queue, err
 	}
 
+	// The Pod may become schedulable in any of the following cases:
+	// - 1. A new node is added with the pod affinity topologyKey.
+	// - 2. The original node does not have the pod affinity topologyKey, but the modified node does.
+	// - 3. Both the original and modified nodes have the pod affinity topologyKey, but their values differ.
 	for _, term := range terms {
-		if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
-			logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
+		if originalNode == nil {
+			if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
+				// Case 1: A new node is added with the pod affinity topologyKey.
+				logger.V(5).Info("A node with a matched pod affinity topologyKey was added and it may make the pod schedulable",
+					"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+				return framework.Queue, nil
+			}
+			continue
+		}
+		originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
+		modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
+
+		if !originalHasKey && modifiedHasKey {
+			// Case 2: The original node does not have the pod affinity topologyKey, but the modified node does.
+			logger.V(5).Info("A node got updated to have the topology key of pod affinity, which may make the pod schedulable",
 				"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
-			return framework.Queue, err
+			return framework.Queue, nil
+		}
+
+		if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
+			// Case 3: Both nodes have the pod affinity topologyKey, but the values differ.
+			logger.V(5).Info("A node is moved to a different domain of pod affinity, which may make the pod schedulable",
+				"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+			return framework.Queue, nil
 		}
 	}
 
@@ -234,11 +258,39 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
 		return framework.Queue, err
 	}
 
+	// The Pod may become schedulable in any of the following cases:
+	// - 1. A new node is added.
+	// - 2. The original node has the pod anti-affinity topologyKey, but the modified node does not.
+	// - 3. Both the original and modified nodes have the pod anti-affinity topologyKey, but their values differ.
 	for _, term := range antiTerms {
-		if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
-			logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
+		if originalNode == nil {
+			// Case 1: A new node is added.
+			// We always requeue the Pod with anti-affinity because:
+			// - a node without the topology key is always allowed to have a Pod with anti-affinity.
+			// - the addition of a node with the topology key makes Pods schedulable only when the topology it joins doesn't have any Pods that the Pod hates.
+			// But it's out of scope for this QHint to check which Pods are in the topology this Node is in.
+			logger.V(5).Info("A node was added and it may make the pod schedulable",
 				"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
-			return framework.Queue, err
+			return framework.Queue, nil
+		}
+
+		originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
+		modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
+
+		if originalHasKey && !modifiedHasKey {
+			// Case 2: The original node has the pod anti-affinity topologyKey, but the modified node does not.
+			// Note that we don't need to check the opposite case (!originalHasKey && modifiedHasKey)
+			// because a node without the topology label can always accept pods with pod anti-affinity.
+			logger.V(5).Info("A node got updated to not have the topology key of pod anti-affinity, which may make the pod schedulable",
+				"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+			return framework.Queue, nil
+		}
+
+		if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
+			// Case 3: Both nodes have the pod anti-affinity topologyKey, but the values differ.
+ logger.V(5).Info("A node is moved to a different domain of pod anti-affinity, which may make the pod schedulable", + "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) + return framework.Queue, nil } } logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity", diff --git a/pkg/scheduler/framework/plugins/interpodaffinity/plugin_test.go b/pkg/scheduler/framework/plugins/interpodaffinity/plugin_test.go index ca39b23156d..3c8be6b4d50 100644 --- a/pkg/scheduler/framework/plugins/interpodaffinity/plugin_test.go +++ b/pkg/scheduler/framework/plugins/interpodaffinity/plugin_test.go @@ -157,18 +157,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { oldNode, newNode *v1.Node expectedHint framework.QueueingHint }{ + // affinity { name: "add a new node with matched pod affinity topologyKey", pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(), expectedHint: framework.Queue, }, - { - name: "add a new node with matched pod anti-affinity topologyKey", - pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(), - newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(), - expectedHint: framework.Queue, - }, { name: "add a new node without matched topologyKey", pod: st.MakePod().Name("p").PodAffinityIn("service", "region", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), @@ -176,19 +171,74 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { expectedHint: framework.QueueSkip, }, { - name: "update node topologyKey", + name: "update node label but not topologyKey", + pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), + oldNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(), + newNode: st.MakeNode().Name("node-a").Label("aaa", "b").Obj(), + expectedHint: framework.QueueSkip, + }, + { + name: "update node label that isn't related to the pod affinity", + pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), + oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(), + newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Label("unrelated-label", "unrelated").Obj(), + expectedHint: framework.QueueSkip, + }, + { + name: "update node with different affinity topologyKey value", pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(), newNode: st.MakeNode().Name("node-a").Label("zone", "zone2").Obj(), expectedHint: framework.Queue, }, { - name: "update node lable but not topologyKey", + name: "update node to have the affinity topology label", pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(), oldNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(), - newNode: st.MakeNode().Name("node-a").Label("aaa", "b").Obj(), + newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(), + expectedHint: framework.Queue, + }, + // anti-affinity + { + name: "add a new node with matched pod anti-affinity topologyKey", + pod: 
st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			newNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
+			expectedHint: framework.Queue,
+		},
+		{
+			name:         "add a new node without matched pod anti-affinity topologyKey",
+			pod:          st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			newNode:      st.MakeNode().Name("node-a").Obj(),
+			expectedHint: framework.Queue,
+		},
+		{
+			name:         "update node label that isn't related to the pod anti-affinity",
+			pod:          st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			oldNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
+			newNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Label("unrelated-label", "unrelated").Obj(),
+			expectedHint: framework.QueueSkip,
+		},
+		{
+			name:         "update node with different anti-affinity topologyKey value",
+			pod:          st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			oldNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
+			newNode:      st.MakeNode().Name("node-a").Label("zone", "zone2").Obj(),
+			expectedHint: framework.Queue,
+		},
+		{
+			name:         "update node to have the anti-affinity topology label",
+			pod:          st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			oldNode:      st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
+			newNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
+			expectedHint: framework.QueueSkip,
+		},
+		{
+			name:         "update node to not have the anti-affinity topology label",
+			pod:          st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
+			oldNode:      st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
+			newNode:      st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
+			expectedHint: framework.Queue,
+		},
 	}
 	for _, tc := range testcases {
 		t.Run(tc.name, func(t *testing.T) {
diff --git a/test/integration/scheduler/queueing/queue.go b/test/integration/scheduler/queueing/queue.go
index e5d99cc9033..d4797988a05 100644
--- a/test/integration/scheduler/queueing/queue.go
+++ b/test/integration/scheduler/queueing/queue.go
@@ -456,7 +456,7 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
 		},
 	},
 	{
-		Name:         "Pod rejected by the PodAffinity plugin is requeued when deleting the existed pod's label to make it match the podAntiAffinity",
+		Name:         "Pod rejected by the PodAffinity plugin is requeued when deleting the existing pod's label that matches the podAntiAffinity",
 		InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()},
 		InitialPods: []*v1.Pod{
 			st.MakePod().Name("pod1").Label("anti1", "anti1").Label("anti2", "anti2").Container("image").Node("fake-node").Obj(),
@@ -490,7 +490,7 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
 		},
 		TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
-			// Add label to the pod which will make it match pod2's nodeAffinity.
+			// Add a label to the pod which will make it match pod2's podAffinity.
 			if _, err := testCtx.ClientSet.CoreV1().Pods(testCtx.NS.Name).Update(testCtx.Ctx, st.MakePod().Name("pod1").Label("aaa", "bbb").Container("image").Node("fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
 				return nil, fmt.Errorf("failed to update pod1: %w", err)
 			}
@@ -501,8 +501,8 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
 	},
 	{
-		Name:         "Pod rejected by the PodAffinity plugin is requeued when updating the label of the node to make it match the pod affinity",
-		InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()},
+		Name:         "Pod rejected by the interPodAffinity plugin is requeued when creating the node with the topologyKey label of the pod affinity",
+		InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("aaa", "fake-node").Obj()},
 		Pods: []*v1.Pod{
 			// - pod1 and pod2 will be rejected by the PodAffinity plugin.
 			st.MakePod().Name("pod1").PodAffinityExists("bbb", "zone", st.PodAffinityWithRequiredReq).Container("image").Obj(),
@@ -510,15 +510,86 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
 		},
 		TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
-			// Add label to the node which will make it match pod1's podAffinity.
-			if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("zone", "zone1").Obj(), metav1.UpdateOptions{}); err != nil {
-				return nil, fmt.Errorf("failed to update pod1: %w", err)
+			// Create a node that has the topologyKey label of pod1's podAffinity.
+			if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("zone", "zone1").Obj(), metav1.CreateOptions{}); err != nil {
+				return nil, fmt.Errorf("failed to create fake-node2: %w", err)
 			}
-			return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
+			return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.Add}: 1}, nil
 		},
 		WantRequeuedPods:          sets.New("pod1"),
 		EnableSchedulingQueueHint: sets.New(true),
 	},
+	{
+		Name: "Pod rejected by the interPodAffinity plugin is requeued when updating the node with the topologyKey label of the pod affinity",
+		InitialNodes: []*v1.Node{
+			st.MakeNode().Name("fake-node").Label("region", "region-1").Obj(),
+			st.MakeNode().Name("fake-node2").Label("region", "region-2").Label("group", "b").Obj(),
+		},
+		InitialPods: []*v1.Pod{
+			st.MakePod().Name("pod1").Label("affinity1", "affinity1").Container("image").Node("fake-node").Obj(),
+		},
+		Pods: []*v1.Pod{
+			// - pod2, pod3, pod4 will be rejected by the PodAffinity plugin.
+			st.MakePod().Name("pod2").Label("affinity1", "affinity1").PodAffinityExists("affinity1", "node", st.PodAffinityWithRequiredReq).Container("image").Obj(),
+			st.MakePod().Name("pod3").Label("affinity1", "affinity1").PodAffinityExists("affinity1", "other", st.PodAffinityWithRequiredReq).Container("image").Obj(),
+			// This Pod doesn't have any labels, so its PodAffinity can't match the Pod itself.
+			st.MakePod().Name("pod4").PodAffinityExists("affinity1", "region", st.PodAffinityWithRequiredReq).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+		},
+		TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
+			// Add the "node" label to the node, which may make pod2 schedulable.
+			// Update the "region" label on the node, which may make pod4 schedulable.
+			if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("node", "fake-node").Label("region", "region-2").Obj(), metav1.UpdateOptions{}); err != nil {
+				return nil, fmt.Errorf("failed to update fake-node: %w", err)
+			}
+			return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
+		},
+		WantRequeuedPods:          sets.New("pod2", "pod4"),
+		EnableSchedulingQueueHint: sets.New(true),
+	},
+	{
+		Name:         "Pod rejected by the interPodAffinity plugin is requeued when updating the node with the topologyKey label of the pod anti-affinity",
+		InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Label("service", "service-a").Label("other", "fake-node").Obj()},
+		InitialPods: []*v1.Pod{
+			st.MakePod().Name("pod1").Label("anti1", "anti1").Container("image").Node("fake-node").Obj(),
+		},
+		Pods: []*v1.Pod{
+			// - pod2, pod3, pod4 will be rejected by the PodAffinity plugin.
+			st.MakePod().Name("pod2").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+			st.MakePod().Name("pod3").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "service", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+			st.MakePod().Name("pod4").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "other", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+		},
+		TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
+			// Remove the "node" label from the node, which will make it no longer match pod2's podAntiAffinity.
+			// Change the "service" label from service-a to service-b, which may make pod3 schedulable.
+			if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("service", "service-b").Label("region", "fake-node").Label("other", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
+				return nil, fmt.Errorf("failed to update fake-node: %w", err)
+			}
+			return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
+		},
+		WantRequeuedPods:          sets.New("pod2", "pod3"),
+		EnableSchedulingQueueHint: sets.New(true),
+	},
+	{
+		Name:         "Pod rejected by the interPodAffinity plugin is requeued when creating the node with the topologyKey label of the pod anti-affinity",
+		InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("zone", "fake-node").Label("node", "fake-node").Obj()},
+		InitialPods: []*v1.Pod{
+			st.MakePod().Name("pod1").Label("anti1", "anti1").Container("image").Node("fake-node1").Obj(),
+		},
+		Pods: []*v1.Pod{
+			// - pod2, pod3 will be rejected by the PodAffinity plugin.
+			st.MakePod().Name("pod2").PodAntiAffinityExists("anti1", "zone", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+			st.MakePod().Name("pod3").PodAntiAffinityExists("anti1", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
+		},
+		TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
+			// Create a node. A node addition always requeues Pods rejected by pod anti-affinity, so pod2 and pod3 are requeued.
+ if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("zone", "fake-node").Obj(), metav1.CreateOptions{}); err != nil { + return nil, fmt.Errorf("failed to create fake-node2: %w", err) + } + return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.Add}: 1}, nil + }, + WantRequeuedPods: sets.New("pod2", "pod3"), + EnableSchedulingQueueHint: sets.New(true, false), + }, { Name: "Pod rejected with hostport by the NodePorts plugin is requeued when pod with common hostport is deleted", InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()}, @@ -2154,7 +2225,7 @@ func RunTestCoreResourceEnqueue(t *testing.T, tt *CoreResourceEnqueueTestCase) { pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods() return len(pendingPods) == len(tt.Pods) && len(testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()) == len(tt.Pods), nil }); err != nil { - t.Fatal(err) + t.Fatalf("Failed to wait for all pods to be present in the scheduling queue: %v", err) } t.Log("Confirmed Pods in the scheduling queue, starting to schedule them") @@ -2173,7 +2244,7 @@ func RunTestCoreResourceEnqueue(t *testing.T, tt *CoreResourceEnqueueTestCase) { pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods() return len(pendingPods) == len(tt.Pods), nil }); err != nil { - t.Fatal(err) + t.Fatalf("Failed to wait for all pods to remain in the scheduling queue after scheduling attempts: %v", err) } t.Log("finished initial schedulings for all Pods, will trigger triggerFn")
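Reviewer note (not part of the patch): the hint functions changed above only take effect because the plugin registers them against Node events. For context, here is a minimal sketch of that wiring, assuming the EnqueueExtensions signature in the tree this patch targets; treat the exact signature as illustrative rather than authoritative:

	// EventsToRegister maps each cluster event to the QueueingHintFn that
	// decides whether an unschedulable Pod should be requeued for that event.
	func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
		return []framework.ClusterEventWithHint{
			// Pod add/update/delete can change which existing Pods match the
			// incoming Pod's affinity/anti-affinity terms.
			{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}, QueueingHintFn: pl.isSchedulableAfterPodChange},
			// Node add and node-label update can change the set of topology
			// domains; isSchedulableAfterNodeChange (refined by this patch)
			// filters those events.
			{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
		}, nil
	}

With this wiring, the Add/UpdateNodeLabel distinction exercised by the new integration cases maps onto the originalNode == nil branch (Add, where util.As yields a nil old object) and the label-comparison branches (UpdateNodeLabel) introduced above.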