Implement the resource limits priority function. This function checks whether the input pod's
resource limits are satisfied by the input node's allocatable resources. If they are, the node is assigned a score of 1; otherwise the node's score is left unchanged.
parent 02a7c12cbd
commit b571001999
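For illustration, with values that mirror the test cases added below: a pod whose container limits sum to 3000 millicores of cpu and 5000 bytes of memory scores 1 on a node that advertises 4000 millicores and 4000 bytes allocatable (the cpu limit fits, and one satisfied dimension is enough), and scores 0 on a node that advertises no allocatable cpu or memory at all.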
@@ -206,6 +206,12 @@ const (
	// alpha: v1.9
	// Postpone deletion of a persistent volume claim in case it is used by a pod
	PVCProtection utilfeature.Feature = "PVCProtection"

	// owner: @aveshagarwal
	// alpha: v1.9
	//
	// Enable resource limits priority function
	ResourceLimitsPriorityFunction utilfeature.Feature = "ResourceLimitsPriorityFunction"
)

func init() {
@@ -244,6 +250,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
	CustomPodDNS:                   {Default: false, PreRelease: utilfeature.Alpha},
	BlockVolume:                    {Default: false, PreRelease: utilfeature.Alpha},
	PVCProtection:                  {Default: false, PreRelease: utilfeature.Alpha},
	ResourceLimitsPriorityFunction: {Default: false, PreRelease: utilfeature.Alpha},

	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
	// unintentionally on either side:
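Both gates above default to false. Assuming the standard Kubernetes feature-gate flag (not part of this diff), the new priority would be switched on by starting the kube-scheduler with something like --feature-gates=ResourceLimitsPriorityFunction=true.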
@@ -19,6 +19,7 @@ go_library(
        "node_label.go",
        "node_prefer_avoid_pods.go",
        "reduce.go",
        "resource_limits.go",
        "selector_spreading.go",
        "taint_toleration.go",
        "test_util.go",
@@ -54,6 +55,7 @@ go_test(
        "node_affinity_test.go",
        "node_label_test.go",
        "node_prefer_avoid_pods_test.go",
        "resource_limits_test.go",
        "selector_spreading_test.go",
        "taint_toleration_test.go",
    ],
plugin/pkg/scheduler/algorithm/priorities/resource_limits.go (new file, 128 lines)
@@ -0,0 +1,128 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package priorities

import (
	"fmt"

	"k8s.io/api/core/v1"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"

	"github.com/golang/glog"
)

// ResourceLimitsPriorityMap is a priority function that increases the score of the input node by 1 if the node
// satisfies the input pod's resource limits. In detail, this priority function works as follows: if a node
// publishes neither allocatable cpu nor allocatable memory, the node score is not affected. If a pod specifies
// neither a cpu nor a memory limit, the node score is not affected. If one or both of the pod's cpu and memory
// limits are satisfied, the node is assigned a score of 1.
// The rationale for choosing the lowest non-zero score of 1 is that this function is mainly intended to break ties
// between nodes that have the same score assigned by one of the least- and most-requested priority functions.
func ResourceLimitsPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
	node := nodeInfo.Node()
	if node == nil {
		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
	}

	allocatableResources := nodeInfo.AllocatableResource()

	// compute pod limits
	podLimits := getResourceLimits(pod)

	cpuScore := computeScore(podLimits.MilliCPU, allocatableResources.MilliCPU)
	memScore := computeScore(podLimits.Memory, allocatableResources.Memory)

	score := int(0)
	if cpuScore == 1 || memScore == 1 {
		score = 1
	}

	if glog.V(10) {
		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
		// not logged. There is visible performance gain from it.
		glog.Infof(
			"%v -> %v: Resource Limits Priority, allocatable %d millicores %d memory bytes, pod limits %d millicores %d memory bytes, score %d",
			pod.Name, node.Name,
			allocatableResources.MilliCPU, allocatableResources.Memory,
			podLimits.MilliCPU, podLimits.Memory,
			score,
		)
	}

	return schedulerapi.HostPriority{
		Host:  node.Name,
		Score: score,
	}, nil
}

// computeScore returns 1 if the limit value is less than or equal to the
// allocatable value, otherwise it returns 0.
func computeScore(limit, allocatable int64) int64 {
	if limit != 0 && allocatable != 0 && limit <= allocatable {
		return 1
	}
	return 0
}

// getResourceLimits computes resource limits for the input pod.
// The reason to create this new function is to be consistent with other
// priority functions, because most, or perhaps all, priority functions work
// with schedulercache.Resource.
// TODO: cache it as part of metadata passed to priority functions.
func getResourceLimits(pod *v1.Pod) *schedulercache.Resource {
	result := &schedulercache.Resource{}
	for _, container := range pod.Spec.Containers {
		result.Add(container.Resources.Limits)
	}

	// take max_resource(sum_pod, any_init_container)
	for _, container := range pod.Spec.InitContainers {
		for rName, rQuantity := range container.Resources.Limits {
			switch rName {
			case v1.ResourceMemory:
				if mem := rQuantity.Value(); mem > result.Memory {
					result.Memory = mem
				}
			case v1.ResourceCPU:
				if cpu := rQuantity.MilliValue(); cpu > result.MilliCPU {
					result.MilliCPU = cpu
				}
			// We keep track of these resources even though the score computation in this and
			// other priority functions is based only on cpu and memory.
			case v1.ResourceEphemeralStorage:
				if ephemeralStorage := rQuantity.Value(); ephemeralStorage > result.EphemeralStorage {
					result.EphemeralStorage = ephemeralStorage
				}
			case v1.ResourceNvidiaGPU:
				if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
					result.NvidiaGPU = gpu
				}
			default:
				if v1helper.IsScalarResourceName(rName) {
					value := rQuantity.Value()
					if value > result.ScalarResources[rName] {
						result.SetScalar(rName, value)
					}
				}
			}
		}
	}

	return result
}
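As a reading aid rather than part of the commit, the following is a minimal, self-contained Go sketch of the two rules implemented above: computeScore only rewards a limit that is non-zero and fits within a non-zero allocatable value, and getResourceLimits takes the larger of the summed app-container limits and any single init-container limit. The helper names and plain int64 millicore/byte values are illustrative; the real code works with v1.ResourceList quantities and schedulercache.Resource.

package main

import "fmt"

// scoreDimension mirrors computeScore above: a dimension contributes only when the pod
// sets a non-zero limit, the node advertises a non-zero allocatable value, and the limit fits.
func scoreDimension(limit, allocatable int64) int64 {
	if limit != 0 && allocatable != 0 && limit <= allocatable {
		return 1
	}
	return 0
}

// effectiveLimit mirrors the max(sum of app-container limits, any single init-container limit)
// rule from getResourceLimits, for one resource dimension expressed as a plain int64.
func effectiveLimit(appLimits, initLimits []int64) int64 {
	var sum int64
	for _, l := range appLimits {
		sum += l // app containers run concurrently, so their limits add up
	}
	best := sum
	for _, l := range initLimits {
		if l > best { // init containers run one at a time, so only the largest matters
			best = l
		}
	}
	return best
}

func main() {
	// A pod with two app containers (1000m and 2000m cpu) and a 2500m init container:
	// the summed app-container limits (3000m) dominate the single init container.
	cpuLimit := effectiveLimit([]int64{1000, 2000}, []int64{2500}) // 3000
	memLimit := effectiveLimit([]int64{2000, 3000}, nil)           // 5000

	// On a node advertising 4000 millicores and 4000 memory bytes allocatable, cpu fits but
	// memory does not; one fitting dimension is enough for a final score of 1, as in the map function.
	cpuScore := scoreDimension(cpuLimit, 4000) // 1
	memScore := scoreDimension(memLimit, 4000) // 0
	score := 0
	if cpuScore == 1 || memScore == 1 {
		score = 1
	}
	fmt.Println(score) // prints 1
}

The numbers reproduce the cpuAndMemory test case below, where the cpu limit fits on the 4000-millicore node but the memory limit does not.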
plugin/pkg/scheduler/algorithm/priorities/resource_limits_test.go (new file, 151 lines)
@@ -0,0 +1,151 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package priorities

import (
	"reflect"
	"testing"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)

func TestResourceLimitsPriority(t *testing.T) {
	noResources := v1.PodSpec{
		Containers: []v1.Container{},
	}

	cpuOnly := v1.PodSpec{
		NodeName: "machine1",
		Containers: []v1.Container{
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("1000m"),
						v1.ResourceMemory: resource.MustParse("0"),
					},
				},
			},
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("2000m"),
						v1.ResourceMemory: resource.MustParse("0"),
					},
				},
			},
		},
	}

	memOnly := v1.PodSpec{
		NodeName: "machine2",
		Containers: []v1.Container{
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("0"),
						v1.ResourceMemory: resource.MustParse("2000"),
					},
				},
			},
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("0"),
						v1.ResourceMemory: resource.MustParse("3000"),
					},
				},
			},
		},
	}

	cpuAndMemory := v1.PodSpec{
		NodeName: "machine2",
		Containers: []v1.Container{
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("1000m"),
						v1.ResourceMemory: resource.MustParse("2000"),
					},
				},
			},
			{
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("2000m"),
						v1.ResourceMemory: resource.MustParse("3000"),
					},
				},
			},
		},
	}

	tests := []struct {
		// input pod
		pod          *v1.Pod
		nodes        []*v1.Node
		expectedList schedulerapi.HostPriorityList
		test         string
	}{
		{
			pod:          &v1.Pod{Spec: noResources},
			nodes:        []*v1.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 4000, 0), makeNode("machine3", 0, 10000), makeNode("machine4", 0, 0)},
			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}, {Host: "machine2", Score: 0}, {Host: "machine3", Score: 0}, {Host: "machine4", Score: 0}},
			test:         "pod does not specify its resource limits",
		},
		{
			pod:          &v1.Pod{Spec: cpuOnly},
			nodes:        []*v1.Node{makeNode("machine1", 3000, 10000), makeNode("machine2", 2000, 10000)},
			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 1}, {Host: "machine2", Score: 0}},
			test:         "pod only specifies cpu limits",
		},
		{
			pod:          &v1.Pod{Spec: memOnly},
			nodes:        []*v1.Node{makeNode("machine1", 4000, 4000), makeNode("machine2", 5000, 10000)},
			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}, {Host: "machine2", Score: 1}},
			test:         "pod only specifies mem limits",
		},
		{
			pod:          &v1.Pod{Spec: cpuAndMemory},
			nodes:        []*v1.Node{makeNode("machine1", 4000, 4000), makeNode("machine2", 5000, 10000)},
			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 1}, {Host: "machine2", Score: 1}},
			test:         "pod specifies both cpu and mem limits",
		},
		{
			pod:          &v1.Pod{Spec: cpuAndMemory},
			nodes:        []*v1.Node{makeNode("machine1", 0, 0)},
			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}},
			test:         "node does not advertise its allocatables",
		},
	}

	for _, test := range tests {
		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(nil, test.nodes)
		list, err := priorityFunction(ResourceLimitsPriorityMap, nil, nil)(test.pod, nodeNameToInfo, test.nodes)
		if err != nil {
			t.Errorf("unexpected error: %v", err)
		}
		if !reflect.DeepEqual(test.expectedList, list) {
			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
		}
	}
}
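To exercise just this test from a checkout of the tree, the usual Go tooling applies; an invocation along the lines of go test k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities -run TestResourceLimitsPriority -v should do, where the package path is the one used in the imports above (the exact command is illustrative and not part of this change).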
@@ -106,6 +106,10 @@ func init() {
	factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1)
	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
	factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
	// Prioritizes nodes that satisfy the pod's resource limits.
	if utilfeature.DefaultFeatureGate.Enabled(features.ResourceLimitsPriorityFunction) {
		factory.RegisterPriorityFunction2("ResourceLimitsPriority", priorities.ResourceLimitsPriorityMap, nil, 1)
	}
}

func defaultPredicates() sets.String {
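Note that, assuming the final argument to RegisterPriorityFunction2 is the priority's weight (as it is for the neighboring registrations), the new function is registered with weight 1, so it can add at most one point to a node's aggregate score, which matches the tie-breaking rationale in the ResourceLimitsPriorityMap doc comment.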