feature(scheduler): more fine-grained QHints for interpodaffinity plugin

googs1025 2024-09-19 18:22:56 +08:00
parent 475ee33f69
commit c725e18e07
3 changed files with 199 additions and 26 deletions


@@ -211,7 +211,7 @@ func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod
}
func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
@@ -221,11 +221,35 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
return framework.Queue, err
}
// When queuing this Pod:
// - 1. A new node is added with the pod affinity topologyKey; the pod may become schedulable.
// - 2. The original node does not have the pod affinity topologyKey but the modified node does; the pod may become schedulable.
// - 3. Both the original and modified nodes have the pod affinity topologyKey and the values differ; the pod may become schedulable.
for _, term := range terms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
if originalNode == nil {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
// Case 1: A new node is added with the pod affinity topologyKey.
logger.V(5).Info("A node with a matched pod affinity topologyKey was added and it may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
continue
}
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
if !originalHasKey && modifiedHasKey {
// Case 2: Original node does not have the pod affinity topologyKey, but the modified node does.
logger.V(5).Info("A node got updated to have the topology key of pod affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
return framework.Queue, nil
}
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
// Case 3: Both nodes have the pod affinity topologyKey, but the values differ.
logger.V(5).Info("A node is moved to a different domain of pod affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
}
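For readers tracing the affinity branch, here is a minimal standalone sketch of the three-case decision above, using plain label maps instead of *v1.Node objects; the helper name affinityHint and its signature are illustrative, not part of the plugin:

// affinityHint mirrors the three affinity cases for a single topology key.
// originalLabels is nil when the node was just added.
func affinityHint(originalLabels, modifiedLabels map[string]string, topologyKey string) bool {
	modifiedValue, modifiedHasKey := modifiedLabels[topologyKey]
	if originalLabels == nil {
		// Case 1: a new node was added; queue only if it carries the topology key.
		return modifiedHasKey
	}
	originalValue, originalHasKey := originalLabels[topologyKey]
	// Case 2: the node gained the topology key.
	if !originalHasKey && modifiedHasKey {
		return true
	}
	// Case 3: the node moved to a different topology domain.
	return originalHasKey && modifiedHasKey && originalValue != modifiedValue
}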
@@ -234,11 +258,39 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
return framework.Queue, err
}
// When queuing this Pod:
// - 1. A new node is added; the pod may become schedulable.
// - 2. The original node has the pod anti-affinity topologyKey but the modified node does not; the pod may become schedulable.
// - 3. Both the original and modified nodes have the pod anti-affinity topologyKey and the values differ; the pod may become schedulable.
for _, term := range antiTerms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
if originalNode == nil {
// Case 1: A new node is added.
// We always requeue the Pod with anti-affinity because:
// - the node without the topology key is always allowed to have a Pod with anti-affinity.
// - the addition of a node with the topology key makes Pods schedulable only when the topology it joins doesn't have any Pods that the Pod hates.
// But it's out of scope for this QHint to check which Pods are in the topology this Node is in.
logger.V(5).Info("A node was added and it may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
return framework.Queue, nil
}
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
if originalHasKey && !modifiedHasKey {
// Case 2: The original node has the pod anti-affinity topologyKey but the modified node does not.
// Note that we don't need to check the opposite case (!originalHasKey && modifiedHasKey)
// because the node without the topology label can always accept pods with pod anti-affinity.
logger.V(5).Info("A node got updated to not have the topology key of pod anti-affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
// Case 3: Both nodes have the pod anti-affinity topologyKey, but the values differ.
logger.V(5).Info("A node is moved to a different domain of pod anti-affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
}
logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",


@@ -157,18 +157,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
oldNode, newNode *v1.Node
expectedHint framework.QueueingHint
}{
// affinity
{
name: "add a new node with matched pod affinity topologyKey",
pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
expectedHint: framework.Queue,
},
{
name: "add a new node with matched pod anti-affinity topologyKey",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
expectedHint: framework.Queue,
},
{
name: "add a new node without matched topologyKey",
pod: st.MakePod().Name("p").PodAffinityIn("service", "region", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
@@ -176,19 +171,74 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
expectedHint: framework.QueueSkip,
},
{
name: "update node topologyKey",
name: "update node label but not topologyKey",
pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
newNode: st.MakeNode().Name("node-a").Label("aaa", "b").Obj(),
expectedHint: framework.QueueSkip,
},
{
name: "update node label that isn't related to the pod affinity",
pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Label("unrelated-label", "unrelated").Obj(),
expectedHint: framework.QueueSkip,
},
{
name: "update node with different affinity topologyKey value",
pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone2").Obj(),
expectedHint: framework.Queue,
},
{
name: "update node lable but not topologyKey",
name: "update node to have the affinity topology label",
pod: st.MakePod().Name("p").PodAffinityIn("service", "zone", []string{"securityscan", "value2"}, st.PodAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
newNode: st.MakeNode().Name("node-a").Label("aaa", "b").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
expectedHint: framework.Queue,
},
// anti-affinity
{
name: "add a new node with matched pod anti-affinity topologyKey",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
expectedHint: framework.Queue,
},
{
name: "add a new node without matched pod anti-affinity topologyKey",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
newNode: st.MakeNode().Name("node-a").Obj(),
expectedHint: framework.Queue,
},
{
name: "update node label that isn't related to the pod anti-affinity",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Label("unrelated-label", "unrelated").Obj(),
expectedHint: framework.QueueSkip,
},
{
name: "update node with different anti-affinity topologyKey value",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone2").Obj(),
expectedHint: framework.Queue,
},
{
name: "update node to have the anti-affinity topology label",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
newNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
expectedHint: framework.QueueSkip,
},
{
name: "update node label not to have anti-affinity topology label",
pod: st.MakePod().Name("p").PodAntiAffinity("zone", &metav1.LabelSelector{MatchLabels: map[string]string{"another": "label"}}, st.PodAntiAffinityWithRequiredReq).Obj(),
oldNode: st.MakeNode().Name("node-a").Label("zone", "zone1").Obj(),
newNode: st.MakeNode().Name("node-a").Label("aaa", "a").Obj(),
expectedHint: framework.Queue,
},
}
for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
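The body of the t.Run closure is truncated in this view; a plausible minimal sketch of what each case asserts, assuming a klog.Logger logger and a plugin instance pl constructed earlier in the test (the variable names are illustrative):

actualHint, err := pl.isSchedulableAfterNodeChange(logger, tc.pod, tc.oldNode, tc.newNode)
if err != nil {
	t.Fatalf("unexpected error from isSchedulableAfterNodeChange: %v", err)
}
if actualHint != tc.expectedHint {
	t.Errorf("expected hint %v, got %v", tc.expectedHint, actualHint)
}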


@@ -456,7 +456,7 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
},
},
{
Name: "Pod rejected by the PodAffinity plugin is requeued when deleting the existed pod's label to make it match the podAntiAffinity",
Name: "Pod rejected by the PodAffinity plugin is requeued when deleting the existed pod's label that matches the podAntiAffinity",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()},
InitialPods: []*v1.Pod{
st.MakePod().Name("pod1").Label("anti1", "anti1").Label("anti2", "anti2").Container("image").Node("fake-node").Obj(),
@@ -490,7 +490,7 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
},
TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
// Add label to the pod which will make it match pod2's nodeAffinity.
// Add label to the pod which will make it match pod2's podAffinity.
if _, err := testCtx.ClientSet.CoreV1().Pods(testCtx.NS.Name).Update(testCtx.Ctx, st.MakePod().Name("pod1").Label("aaa", "bbb").Container("image").Node("fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
return nil, fmt.Errorf("failed to update pod1: %w", err)
}
@@ -501,8 +501,8 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
},
{
Name: "Pod rejected by the PodAffinity plugin is requeued when updating the label of the node to make it match the pod affinity",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()},
Name: "Pod rejected by the interPodAffinity plugin is requeued when creating the node with the topologyKey label of the pod affinity",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("aaa", "fake-node").Obj()},
Pods: []*v1.Pod{
// - pod1 and pod2 will be rejected by the PodAffinity plugin.
st.MakePod().Name("pod1").PodAffinityExists("bbb", "zone", st.PodAffinityWithRequiredReq).Container("image").Obj(),
@@ -510,15 +510,86 @@ var CoreResourceEnqueueTestCases = []*CoreResourceEnqueueTestCase{
},
TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
// Add label to the node which will make it match pod1's podAffinity.
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("zone", "zone1").Obj(), metav1.UpdateOptions{}); err != nil {
return nil, fmt.Errorf("failed to update pod1: %w", err)
// Create a node that matches pod1's podAffinity.
if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("zone", "zone1").Obj(), metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("failed to create fake-node2: %w", err)
}
return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.Add}: 1}, nil
},
WantRequeuedPods: sets.New("pod1"),
EnableSchedulingQueueHint: sets.New(true),
},
{
Name: "Pod rejected by the interPodAffinity plugin is requeued when updating the node with the topologyKey label of the pod affinity",
InitialNodes: []*v1.Node{
st.MakeNode().Name("fake-node").Label("region", "region-1").Obj(),
st.MakeNode().Name("fake-node2").Label("region", "region-2").Label("group", "b").Obj(),
},
InitialPods: []*v1.Pod{
st.MakePod().Name("pod1").Label("affinity1", "affinity1").Container("image").Node("fake-node").Obj(),
},
Pods: []*v1.Pod{
// - pod2, pod3, pod4 will be rejected by the PodAffinity plugin.
st.MakePod().Name("pod2").Label("affinity1", "affinity1").PodAffinityExists("affinity1", "node", st.PodAffinityWithRequiredReq).Container("image").Obj(),
st.MakePod().Name("pod3").Label("affinity1", "affinity1").PodAffinityExists("affinity1", "other", st.PodAffinityWithRequiredReq).Container("image").Obj(),
// This Pod doesn't have any label, so its own PodAffinity doesn't match the Pod itself.
st.MakePod().Name("pod4").PodAffinityExists("affinity1", "region", st.PodAffinityWithRequiredReq).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
// Add "node" label to the node which may make pod2 schedulable.
// Update "region" label to the node which may make pod4 schedulable.
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("node", "fake-node").Label("region", "region-2").Obj(), metav1.UpdateOptions{}); err != nil {
return nil, fmt.Errorf("failed to update fake-node: %w", err)
}
return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
},
WantRequeuedPods: sets.New("pod2", "pod4"),
EnableSchedulingQueueHint: sets.New(true),
},
{
Name: "Pod rejected by the interPodAffinity plugin is requeued when updating the node with the topologyKey label of the pod anti affinity",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Label("service", "service-a").Label("other", "fake-node").Obj()},
InitialPods: []*v1.Pod{
st.MakePod().Name("pod1").Label("anti1", "anti1").Container("image").Node("fake-node").Obj(),
},
Pods: []*v1.Pod{
// - pod2, pod3, pod4 will be rejected by the PodAffinity plugin.
st.MakePod().Name("pod2").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
st.MakePod().Name("pod3").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "service", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
st.MakePod().Name("pod4").Label("anti1", "anti1").PodAntiAffinityExists("anti1", "other", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
},
TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
// Remove "node" label from the node which will make it not match pod2's podAntiAffinity.
// Change "service" label from service-a to service-b which may pod3 schedulable.
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node").Label("service", "service-b").Label("region", "fake-node").Label("other", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
return nil, fmt.Errorf("failed to update fake-node: %w", err)
}
return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: 1}, nil
},
WantRequeuedPods: sets.New("pod2", "pod3"),
EnableSchedulingQueueHint: sets.New(true),
},
{
Name: "Pod rejected by the interPodAffinity plugin is requeued when creating the node with the topologyKey label of the pod anti affinity",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("zone", "fake-node").Label("node", "fake-node").Obj()},
InitialPods: []*v1.Pod{
st.MakePod().Name("pod1").Label("anti1", "anti1").Container("image").Node("fake-node1").Obj(),
},
Pods: []*v1.Pod{
// - pod2, pod3 will be rejected by the PodAntiAffinity plugin.
st.MakePod().Name("pod2").PodAntiAffinityExists("anti1", "zone", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
st.MakePod().Name("pod3").PodAntiAffinityExists("anti1", "node", st.PodAntiAffinityWithRequiredReq).Container("image").Obj(),
},
TriggerFn: func(testCtx *testutils.TestContext) (map[framework.ClusterEvent]uint64, error) {
// Create the node which may make pod2 and pod3 schedulable.
if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("zone", "fake-node").Obj(), metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("failed to create fake-node2: %w", err)
}
return map[framework.ClusterEvent]uint64{{Resource: framework.Node, ActionType: framework.Add}: 1}, nil
},
WantRequeuedPods: sets.New("pod2", "pod3"),
EnableSchedulingQueueHint: sets.New(true, false),
},
{
Name: "Pod rejected with hostport by the NodePorts plugin is requeued when pod with common hostport is deleted",
InitialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Label("node", "fake-node").Obj()},
@@ -2154,7 +2225,7 @@ func RunTestCoreResourceEnqueue(t *testing.T, tt *CoreResourceEnqueueTestCase) {
pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods()
return len(pendingPods) == len(tt.Pods) && len(testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()) == len(tt.Pods), nil
}); err != nil {
t.Fatal(err)
t.Fatalf("Failed to wait for all pods to be present in the scheduling queue: %v", err)
}
t.Log("Confirmed Pods in the scheduling queue, starting to schedule them")
@@ -2173,7 +2244,7 @@ func RunTestCoreResourceEnqueue(t *testing.T, tt *CoreResourceEnqueueTestCase) {
pendingPods, _ := testCtx.Scheduler.SchedulingQueue.PendingPods()
return len(pendingPods) == len(tt.Pods), nil
}); err != nil {
t.Fatal(err)
t.Fatalf("Failed to wait for all pods to remain in the scheduling queue after scheduling attempts: %v", err)
}
t.Log("finished initial schedulings for all Pods, will trigger triggerFn")