diff --git a/pkg/scheduler/framework/plugins/noderesources/fit.go b/pkg/scheduler/framework/plugins/noderesources/fit.go
index d601b313572..7605001aa12 100644
--- a/pkg/scheduler/framework/plugins/noderesources/fit.go
+++ b/pkg/scheduler/framework/plugins/noderesources/fit.go
@@ -363,21 +363,68 @@ func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod
 }
 
 // isSchedulableAfterNodeChange is invoked whenever a node added or changed. It checks whether
-// that change made a previously unschedulable pod schedulable.
+// that change could make a previously unschedulable pod schedulable.
 func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
-	_, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
+	originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
 	if err != nil {
 		return framework.Queue, err
 	}
-	// TODO: also check if the original node meets the pod's resource requestments once preCheck is completely removed.
-	// See: https://github.com/kubernetes/kubernetes/issues/110175
-	if isFit(pod, modifiedNode) {
-		logger.V(5).Info("node was updated, and may fit with the pod's resource requestments", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+	// Leave the pod in the queue, since it won't fit into the modified node anyway.
+	if !isFit(pod, modifiedNode) {
+		logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+		return framework.QueueSkip, nil
+	}
+	// The pod could fit, and since this is a node add event, unblock scheduling.
+	if originalNode == nil {
+		logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
 		return framework.Queue, nil
 	}
+	// The pod could fit, but since no resource it requests has increased, the update won't make the pod schedulable.
+	if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode) {
+		logger.V(5).Info("node was updated, but the update didn't increase any resource requested by the pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+		return framework.QueueSkip, nil
+	}
 
-	logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
-	return framework.QueueSkip, nil
+	logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+	return framework.Queue, nil
+}
+
+// haveAnyRequestedResourcesIncreased returns true if any resource requested by the pod, or the allowed pod number, has increased between the original and the modified node.
+func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node) bool {
+	podRequest := computePodResourceRequest(pod)
+	originalNodeInfo := framework.NewNodeInfo()
+	originalNodeInfo.SetNode(originalNode)
+	modifiedNodeInfo := framework.NewNodeInfo()
+	modifiedNodeInfo.SetNode(modifiedNode)
+
+	if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
+		return true
+	}
+
+	if podRequest.MilliCPU == 0 &&
+		podRequest.Memory == 0 &&
+		podRequest.EphemeralStorage == 0 &&
+		len(podRequest.ScalarResources) == 0 {
+		return false
+	}
+
+	if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
+		(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
+		(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
+		return true
+	}
+
+	for rName, rQuant := range podRequest.ScalarResources {
+		// Skip in case request quantity is zero
+		if rQuant == 0 {
+			continue
+		}
+
+		if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
+			return true
+		}
+	}
+	return false
 }
 
 // isFit checks if the pod fits the node. If the node is nil, it returns false.
diff --git a/pkg/scheduler/framework/plugins/noderesources/fit_test.go b/pkg/scheduler/framework/plugins/noderesources/fit_test.go
index 53aae58dfe6..8a10cb3755a 100644
--- a/pkg/scheduler/framework/plugins/noderesources/fit_test.go
+++ b/pkg/scheduler/framework/plugins/noderesources/fit_test.go
@@ -1276,14 +1276,37 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
 			}).Obj(),
 			expectedHint: framework.Queue,
 		},
-		// uncomment this case when the isSchedulableAfterNodeChange also check the
-		// original node's resources.
- // "skip-queue-on-node-unrelated-changes": { - // pod: &v1.Pod{}, - // oldObj: st.MakeNode().Obj(), - // newObj: st.MakeNode().Label("foo", "bar").Obj(), - // expectedHint: framework.QueueSkip, - // }, + "skip-queue-on-node-unrelated-changes": { + pod: newResourcePod(framework.Resource{ + Memory: 2, + ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1}, + }), + oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceMemory: "2", + extendedResourceA: "2", + }).Obj(), + newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceMemory: "2", + extendedResourceA: "1", + extendedResourceB: "2", + }).Obj(), + expectedHint: framework.QueueSkip, + }, + "queue-on-pod-requested-resources-increase": { + pod: newResourcePod(framework.Resource{ + Memory: 2, + ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1}, + }), + oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceMemory: "2", + extendedResourceA: "1", + }).Obj(), + newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceMemory: "2", + extendedResourceA: "2", + }).Obj(), + expectedHint: framework.Queue, + }, "skip-queue-on-node-changes-from-suitable-to-unsuitable": { pod: newResourcePod(framework.Resource{ Memory: 2, @@ -1364,3 +1387,136 @@ func TestIsFit(t *testing.T) { }) } } + +func TestHaveAnyRequestedResourcesIncreased(t *testing.T) { + testCases := map[string]struct { + pod *v1.Pod + originalNode *v1.Node + modifiedNode *v1.Node + expected bool + }{ + "no-requested-resources": { + pod: newResourcePod(framework.Resource{}), + originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourcePods: "1", + v1.ResourceCPU: "1", + v1.ResourceMemory: "1", + v1.ResourceEphemeralStorage: "1", + extendedResourceA: "1", + }).Obj(), + modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourcePods: "1", + v1.ResourceCPU: "2", + v1.ResourceMemory: "2", + v1.ResourceEphemeralStorage: "2", + extendedResourceA: "2", + }).Obj(), + expected: false, + }, + "no-requested-resources-pods-increased": { + pod: newResourcePod(framework.Resource{}), + originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourcePods: "1", + v1.ResourceCPU: "1", + v1.ResourceMemory: "1", + v1.ResourceEphemeralStorage: "1", + extendedResourceA: "1", + }).Obj(), + modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourcePods: "2", + v1.ResourceCPU: "1", + v1.ResourceMemory: "1", + v1.ResourceEphemeralStorage: "1", + extendedResourceA: "1", + }).Obj(), + expected: true, + }, + "requested-resources-decreased": { + pod: newResourcePod(framework.Resource{ + MilliCPU: 2, + Memory: 2, + EphemeralStorage: 2, + ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2}, + }), + originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceCPU: "1", + v1.ResourceMemory: "2", + v1.ResourceEphemeralStorage: "3", + extendedResourceA: "4", + }).Obj(), + modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceCPU: "1", + v1.ResourceMemory: "2", + v1.ResourceEphemeralStorage: "1", + extendedResourceA: "1", + }).Obj(), + expected: false, + }, + "requested-resources-increased": { + pod: newResourcePod(framework.Resource{ + MilliCPU: 2, + Memory: 2, + EphemeralStorage: 2, + ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2}, + }), + originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{ + v1.ResourceCPU: "1", + v1.ResourceMemory: "2", + v1.ResourceEphemeralStorage: "3", + 
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "3",
+				v1.ResourceMemory: "4",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			expected: true,
+		},
+		"non-requested-resources-decreased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			expected: false,
+		},
+		"non-requested-resources-increased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "5",
+				extendedResourceA: "6",
+			}).Obj(),
+			expected: false,
+		},
+	}
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			if got := haveAnyRequestedResourcesIncreased(tc.pod, tc.originalNode, tc.modifiedNode); got != tc.expected {
+				t.Errorf("expected: %v, got: %v", tc.expected, got)
+			}
+		})
+	}
+}
diff --git a/test/integration/scheduler/queue_test.go b/test/integration/scheduler/queue_test.go
index 9877d7f2625..3ac0647d87d 100644
--- a/test/integration/scheduler/queue_test.go
+++ b/test/integration/scheduler/queue_test.go
@@ -370,6 +370,70 @@ func TestCoreResourceEnqueue(t *testing.T) {
 			},
 			wantRequeuedPods: sets.New("pod1"),
 		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin isn't requeued when a Node is updated without an increase in the requested resources",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod1 requests the amount of CPU available on fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases the unrelated (not requested) memory capacity of fake-node1, which should not requeue Pod1.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4", v1.ResourceMemory: "4000"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.Set[string]{},
+			enableSchedulingQueueHint: []bool{true},
+		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with an increase in the requested resources",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod1 requests the amount of CPU available on fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases the requested CPU capacity of fake-node1, which should requeue Pod1.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "5"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.New("pod1"),
+			enableSchedulingQueueHint: []bool{true},
+		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with an increase in the allowed pods number",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "2"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "1"}).Label("group", "b").Obj(),
+			},
+			initialPods: []*v1.Pod{
+				// - Pod1 will be scheduled on fake-node2 because of the affinity label.
+				st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Node("fake-node2").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod2 is unschedulable because Pod1 saturated the ResourcePods limit of fake-node2. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases the allowed Pods number of fake-node1, which should requeue Pod2.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "3"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.New("pod2"),
+			enableSchedulingQueueHint: []bool{true},
+		},
 		{
 			name: "Updating pod condition doesn't retry scheduling if the Pod was rejected by TaintToleration",
 			initialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()},