diff --git a/pkg/scheduler/framework/plugins/noderesources/balanced_allocation.go b/pkg/scheduler/framework/plugins/noderesources/balanced_allocation.go index 9162758da48..30c31f14e8f 100644 --- a/pkg/scheduler/framework/plugins/noderesources/balanced_allocation.go +++ b/pkg/scheduler/framework/plugins/noderesources/balanced_allocation.go @@ -63,8 +63,15 @@ func (s *balancedAllocationPreScoreState) Clone() framework.StateData { // PreScore calculates incoming pod's resource requests and writes them to the cycle state used. func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status { + podRequests := ba.calculatePodResourceRequestList(pod, ba.resources) + if ba.isBestEffortPod(podRequests) { + // Skip BalancedAllocation scoring for best-effort pods to + // prevent a large number of pods from being scheduled to the same node. + // See https://github.com/kubernetes/kubernetes/issues/129138 for details. + return framework.NewStatus(framework.Skip) + } state := &balancedAllocationPreScoreState{ - podRequests: ba.calculatePodResourceRequestList(pod, ba.resources), + podRequests: podRequests, } cycleState.Write(balancedAllocationPreScoreStateKey, state) return nil @@ -93,6 +100,9 @@ func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleS s, err := getBalancedAllocationPreScoreState(state) if err != nil { s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)} + if ba.isBestEffortPod(s.podRequests) { + return 0, nil + } } // ba.score favors nodes with balanced resource usage rate. @@ -154,7 +164,6 @@ func balancedResourceScorer(requested, allocable []int64) int64 { // Otherwise, set the std to zero is enough. 
if len(resourceToFractions) == 2 { std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2) - } else if len(resourceToFractions) > 2 { mean := totalFraction / float64(len(resourceToFractions)) var sum float64 diff --git a/pkg/scheduler/framework/plugins/noderesources/balanced_allocation_test.go b/pkg/scheduler/framework/plugins/noderesources/balanced_allocation_test.go index cc7060fc365..f5e6bbb0746 100644 --- a/pkg/scheduler/framework/plugins/noderesources/balanced_allocation_test.go +++ b/pkg/scheduler/framework/plugins/noderesources/balanced_allocation_test.go @@ -23,7 +23,6 @@ import ( "github.com/google/go-cmp/cmp" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/backend/cache" @@ -57,14 +56,6 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { }, NodeName: "node1", } - labels1 := map[string]string{ - "foo": "bar", - "baz": "blah", - } - labels2 := map[string]string{ - "bar": "foo", - "baz": "blah", - } cpuOnly := v1.PodSpec{ NodeName: "node1", Containers: []v1.Container{ @@ -119,29 +110,23 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { } tests := []struct { - pod *v1.Pod - pods []*v1.Pod - nodes []*v1.Node - expectedList framework.NodeScoreList - name string - args config.NodeResourcesBalancedAllocationArgs - runPreScore bool + pod *v1.Pod + pods []*v1.Pod + nodes []*v1.Node + expectedList framework.NodeScoreList + name string + args config.NodeResourcesBalancedAllocationArgs + runPreScore bool + wantPreScoreStatusCode framework.Code }{ { - // Node1 scores (remaining resources) on 0-MaxNodeScore scale - // CPU Fraction: 0 / 4000 = 0% - // Memory Fraction: 0 / 10000 = 0% - // Node1 Score: (1-0) * MaxNodeScore = MaxNodeScore - // Node2 scores (remaining resources) on 0-MaxNodeScore scale - // CPU Fraction: 0 / 4000 = 0 % - // Memory Fraction: 0 / 10000 = 0% - // 
Node2 Score: (1-0) * MaxNodeScore = MaxNodeScore - pod: st.MakePod().Obj(), - nodes: []*v1.Node{makeNode("node1", 4000, 10000, nil), makeNode("node2", 4000, 10000, nil)}, - expectedList: []framework.NodeScore{{Name: "node1", Score: framework.MaxNodeScore}, {Name: "node2", Score: framework.MaxNodeScore}}, - name: "nothing scheduled, nothing requested", - args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, - runPreScore: true, + // bestEffort pods, skip in PreScore + pod: st.MakePod().Obj(), + nodes: []*v1.Node{makeNode("node1", 4000, 10000, nil), makeNode("node2", 4000, 10000, nil)}, + name: "nothing scheduled, nothing requested, skip in PreScore", + args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, + runPreScore: true, + wantPreScoreStatusCode: framework.Skip, }, { // Node1 scores on 0-MaxNodeScore scale @@ -161,76 +146,6 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, runPreScore: true, }, - { - // Node1 scores on 0-MaxNodeScore scale - // CPU Fraction: 0 / 4000= 0% - // Memory Fraction: 0 / 10000 = 0% - // Node1 std: 0 - // Node1 Score: (1-0) * MaxNodeScore = MaxNodeScore - // Node2 scores on 0-MaxNodeScore scale - // CPU Fraction: 0 / 4000= 0% - // Memory Fraction: 0 / 10000 = 0% - // Node2 std: 0 - // Node2 Score: (1-0) * MaxNodeScore = MaxNodeScore - pod: st.MakePod().Obj(), - nodes: []*v1.Node{makeNode("node1", 4000, 10000, nil), makeNode("node2", 4000, 10000, nil)}, - expectedList: []framework.NodeScore{{Name: "node2", Score: framework.MaxNodeScore}, {Name: "node2", Score: framework.MaxNodeScore}}, - name: "no resources requested, pods without container scheduled", - pods: []*v1.Pod{ - st.MakePod().Node("node1").Labels(labels2).Obj(), - st.MakePod().Node("node1").Labels(labels1).Obj(), - st.MakePod().Node("node2").Labels(labels1).Obj(), - 
st.MakePod().Node("node2").Labels(labels1).Obj(), - }, - args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, - runPreScore: true, - }, - { - // Node1 scores on 0-MaxNodeScore scale - // CPU Fraction: 0 / 250 = 0% - // Memory Fraction: 0 / 1000 = 0% - // Node1 std: (0 - 0) / 2 = 0 - // Node1 Score: (1 - 0)*MaxNodeScore = 100 - // Node2 scores on 0-MaxNodeScore scale - // CPU Fraction: 0 / 250 = 0% - // Memory Fraction: 0 / 1000 = 0% - // Node2 std: (0 - 0) / 2 = 0 - // Node2 Score: (1 - 0)*MaxNodeScore = 100 - pod: st.MakePod().Obj(), - nodes: []*v1.Node{makeNode("node1", 250, 1000*1024*1024, nil), makeNode("node2", 250, 1000*1024*1024, nil)}, - expectedList: []framework.NodeScore{{Name: "node1", Score: 100}, {Name: "node2", Score: 100}}, - name: "no resources requested, pods with container scheduled", - pods: []*v1.Pod{ - st.MakePod().Node("node1").Obj(), - st.MakePod().Node("node1").Obj(), - }, - args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, - runPreScore: true, - }, - { - // Node1 scores on 0-MaxNodeScore scale - // CPU Fraction: 6000 / 10000 = 60% - // Memory Fraction: 0 / 20000 = 0% - // Node1 std: (0.6 - 0) / 2 = 0.3 - // Node1 Score: (1 - 0.3)*MaxNodeScore = 70 - // Node2 scores on 0-MaxNodeScore scale - // CPU Fraction: 6000 / 10000 = 60% - // Memory Fraction: 5000 / 20000 = 25% - // Node2 std: (0.6 - 0.25) / 2 = 0.175 - // Node2 Score: (1 - 0.175)*MaxNodeScore = 82 - pod: st.MakePod().Obj(), - nodes: []*v1.Node{makeNode("node1", 10000, 20000, nil), makeNode("node2", 10000, 20000, nil)}, - expectedList: []framework.NodeScore{{Name: "node1", Score: 70}, {Name: "node2", Score: 82}}, - name: "no resources requested, pods scheduled with resources", - pods: []*v1.Pod{ - {Spec: cpuOnly, ObjectMeta: metav1.ObjectMeta{Labels: labels2}}, - {Spec: cpuOnly, ObjectMeta: metav1.ObjectMeta{Labels: labels1}}, - {Spec: cpuOnly2, ObjectMeta: metav1.ObjectMeta{Labels: 
labels1}}, - {Spec: cpuAndMemory, ObjectMeta: metav1.ObjectMeta{Labels: labels1}}, - }, - args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, - runPreScore: true, - }, { // Node1 scores on 0-MaxNodeScore scale // CPU Fraction: 6000 / 10000 = 60% @@ -298,18 +213,6 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, runPreScore: true, }, - { - pod: st.MakePod().Obj(), - nodes: []*v1.Node{makeNode("node1", 0, 0, nil), makeNode("node2", 0, 0, nil)}, - expectedList: []framework.NodeScore{{Name: "node1", Score: 100}, {Name: "node2", Score: 100}}, - name: "zero node resources, pods scheduled with resources", - pods: []*v1.Pod{ - {Spec: cpuOnly}, - {Spec: cpuAndMemory}, - }, - args: config.NodeResourcesBalancedAllocationArgs{Resources: defaultResourceBalancedAllocationSet}, - runPreScore: true, - }, // Node1 scores on 0-MaxNodeScore scale // CPU Fraction: 3000 / 3500 = 85.71% // Memory Fraction: 5000 / 40000 = 12.5% @@ -342,19 +245,25 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { runPreScore: true, }, // Only one node (node1) has the scalar resource, pod doesn't request the scalar resource and the scalar resource should be skipped for consideration. 
- // Node1: std = 0, score = 100 - // Node2: std = 0, score = 100 + // Node1 scores on 0-MaxNodeScore scale + // CPU Fraction: 3000 / 3500 = 85.71% + // Memory Fraction: 5000 / 40000 = 12.5% + // Node1 std: (0.8571 - 0.125) / 2 = 0.36605 + // Node1 Score: (1 - 0.36605)*MaxNodeScore = 63 + // Node2 scores on 0-MaxNodeScore scale + // CPU Fraction: 3000 / 3500 = 85.71% + // Memory Fraction: 5000 / 40000 = 12.5% + // Node2 std: (0.8571 - 0.125) / 2 = 0.36605 + // Node2 Score: (1 - 0.36605)*MaxNodeScore = 63 { - pod: st.MakePod().Obj(), + pod: &v1.Pod{Spec: cpuAndMemory}, nodes: []*v1.Node{makeNode("node1", 3500, 40000, scalarResource), makeNode("node2", 3500, 40000, nil)}, - expectedList: []framework.NodeScore{{Name: "node1", Score: 100}, {Name: "node2", Score: 100}}, - name: "node without the scalar resource results to a higher score", - pods: []*v1.Pod{ - {Spec: cpuOnly}, - {Spec: cpuOnly2}, - }, + expectedList: []framework.NodeScore{{Name: "node1", Score: 63}, {Name: "node2", Score: 63}}, + name: "node without the scalar resource should skip the scalar resource", + pods: []*v1.Pod{}, args: config.NodeResourcesBalancedAllocationArgs{Resources: []config.ResourceSpec{ {Name: string(v1.ResourceCPU), Weight: 1}, + {Name: string(v1.ResourceMemory), Weight: 1}, {Name: "nvidia.com/gpu", Weight: 1}, }}, runPreScore: true, @@ -392,13 +301,17 @@ func TestNodeResourcesBalancedAllocation(t *testing.T) { fh, _ := runtime.NewFramework(ctx, nil, nil, runtime.WithSnapshotSharedLister(snapshot)) p, _ := NewBalancedAllocation(ctx, &test.args, fh, feature.Features{}) state := framework.NewCycleState() - for i := range test.nodes { - if test.runPreScore { - status := p.(framework.PreScorePlugin).PreScore(ctx, state, test.pod, tf.BuildNodeInfos(test.nodes)) - if !status.IsSuccess() { - t.Errorf("PreScore is expected to return success, but didn't. 
Got status: %v", status) - } + if test.runPreScore { + status := p.(framework.PreScorePlugin).PreScore(ctx, state, test.pod, tf.BuildNodeInfos(test.nodes)) + if status.Code() != test.wantPreScoreStatusCode { + t.Errorf("unexpected status code, want: %v, got: %v", test.wantPreScoreStatusCode, status.Code()) } + if status.Code() == framework.Skip { + t.Log("skipping score test as PreScore returned skip") + return + } + } + for i := range test.nodes { nodeInfo, err := snapshot.Get(test.nodes[i].Name) if err != nil { t.Errorf("failed to get node %q from snapshot: %v", test.nodes[i].Name, err) diff --git a/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go b/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go index ad4970f2aaa..71eb9138bd4 100644 --- a/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go +++ b/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go @@ -146,3 +146,12 @@ func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, } return podRequests } + +func (r *resourceAllocationScorer) isBestEffortPod(podRequests []int64) bool { + for _, request := range podRequests { + if request != 0 { + return false + } + } + return true +} diff --git a/pkg/scheduler/schedule_one_test.go b/pkg/scheduler/schedule_one_test.go index 8ed4d7bf098..403fc88cc36 100644 --- a/pkg/scheduler/schedule_one_test.go +++ b/pkg/scheduler/schedule_one_test.go @@ -2939,7 +2939,7 @@ func TestZeroRequest(t *testing.T) { {Spec: large1}, {Spec: noResources1}, {Spec: large2}, {Spec: small2}, }, - expectedScore: 150, + expectedScore: 50, }, { pod: &v1.Pod{Spec: small}, @@ -3105,10 +3105,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, - TotalScore: 110, + TotalScore: 10, }, { Name: "node2", @@ -3119,10 +3119,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, 
- TotalScore: 200, + TotalScore: 100, }, }, }, @@ -3172,10 +3172,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, - TotalScore: 420, + TotalScore: 320, }, { Name: "node2", @@ -3190,10 +3190,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, - TotalScore: 330, + TotalScore: 230, }, }, }, @@ -3222,10 +3222,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, - TotalScore: 110, + TotalScore: 10, }, { Name: "node2", @@ -3236,10 +3236,10 @@ func Test_prioritizeNodes(t *testing.T) { }, { Name: "NodeResourcesBalancedAllocation", - Score: 100, + Score: 0, }, }, - TotalScore: 200, + TotalScore: 100, }, }, },