diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index 1ed30a62fda..5002f5e41ba 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -206,6 +206,12 @@ const (
 	// alpha: v1.9
 	// Postpone deletion of a persistent volume claim in case it is used by a pod
 	PVCProtection utilfeature.Feature = "PVCProtection"
+
+	// owner: @aveshagarwal
+	// alpha: v1.9
+	//
+	// Enable resource limits priority function
+	ResourceLimitsPriorityFunction utilfeature.Feature = "ResourceLimitsPriorityFunction"
 )
 
 func init() {
@@ -244,6 +250,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
 	CustomPodDNS:                   {Default: false, PreRelease: utilfeature.Alpha},
 	BlockVolume:                    {Default: false, PreRelease: utilfeature.Alpha},
 	PVCProtection:                  {Default: false, PreRelease: utilfeature.Alpha},
+	ResourceLimitsPriorityFunction: {Default: false, PreRelease: utilfeature.Alpha},
 
 	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
 	// unintentionally on either side:
diff --git a/plugin/pkg/scheduler/algorithm/priorities/BUILD b/plugin/pkg/scheduler/algorithm/priorities/BUILD
index 62913a89bd4..f473a44b4c2 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/BUILD
+++ b/plugin/pkg/scheduler/algorithm/priorities/BUILD
@@ -19,6 +19,7 @@ go_library(
         "node_label.go",
         "node_prefer_avoid_pods.go",
         "reduce.go",
+        "resource_limits.go",
         "selector_spreading.go",
         "taint_toleration.go",
         "test_util.go",
@@ -54,6 +55,7 @@ go_test(
         "node_affinity_test.go",
         "node_label_test.go",
         "node_prefer_avoid_pods_test.go",
+        "resource_limits_test.go",
         "selector_spreading_test.go",
         "taint_toleration_test.go",
     ],
diff --git a/plugin/pkg/scheduler/algorithm/priorities/resource_limits.go b/plugin/pkg/scheduler/algorithm/priorities/resource_limits.go
new file mode 100644
index 00000000000..77ae0dca923
--- /dev/null
+++ b/plugin/pkg/scheduler/algorithm/priorities/resource_limits.go
@@ -0,0 +1,128 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package priorities
+
+import (
+	"fmt"
+
+	"k8s.io/api/core/v1"
+	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
+	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
+
+	"github.com/golang/glog"
+)
+
+// ResourceLimitsPriorityMap is a priority function that increases the score of the input node by 1 if the node
+// satisfies the input pod's resource limits. In detail, it works as follows: if a node does not publish its
+// allocatable resources (both cpu and memory), the node's score is not affected. If a pod specifies neither its
+// cpu nor its memory limits, the node's score is not affected. If one or both of the pod's cpu and memory limits
+// can be satisfied by the node's allocatable resources, the node is assigned a score of 1.
+// The rationale for choosing the lowest possible score of 1 is that it is mainly meant to break ties between
+// nodes that have the same scores assigned by one of the least and most requested priority functions.
+func ResourceLimitsPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
+	node := nodeInfo.Node()
+	if node == nil {
+		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
+	}
+
+	allocatableResources := nodeInfo.AllocatableResource()
+
+	// compute pod limits
+	podLimits := getResourceLimits(pod)
+
+	cpuScore := computeScore(podLimits.MilliCPU, allocatableResources.MilliCPU)
+	memScore := computeScore(podLimits.Memory, allocatableResources.Memory)
+
+	score := int(0)
+	if cpuScore == 1 || memScore == 1 {
+		score = 1
+	}
+
+	if glog.V(10) {
+		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+		// not logged. There is visible performance gain from it.
+		glog.Infof(
+			"%v -> %v: Resource Limits Priority, allocatable %d millicores %d memory bytes, pod limits %d millicores %d memory bytes, score %d",
+			pod.Name, node.Name,
+			allocatableResources.MilliCPU, allocatableResources.Memory,
+			podLimits.MilliCPU, podLimits.Memory,
+			score,
+		)
+	}
+
+	return schedulerapi.HostPriority{
+		Host:  node.Name,
+		Score: score,
+	}, nil
+}
+
+// computeScore returns 1 if both the limit and the allocatable values are non-zero and the limit is
+// less than or equal to the allocatable value; otherwise it returns 0.
+func computeScore(limit, allocatable int64) int64 {
+	if limit != 0 && allocatable != 0 && limit <= allocatable {
+		return 1
+	}
+	return 0
+}
+
+// getResourceLimits computes the resource limits for the input pod.
+// The reason to create this new function is to be consistent with other
+// priority functions, because most, or perhaps all, priority functions work
+// with schedulercache.Resource.
+// TODO: cache it as part of metadata passed to priority functions.
+func getResourceLimits(pod *v1.Pod) *schedulercache.Resource {
+	result := &schedulercache.Resource{}
+	for _, container := range pod.Spec.Containers {
+		result.Add(container.Resources.Limits)
+	}
+
+	// take max_resource(sum_pod, any_init_container)
+	for _, container := range pod.Spec.InitContainers {
+		for rName, rQuantity := range container.Resources.Limits {
+			switch rName {
+			case v1.ResourceMemory:
+				if mem := rQuantity.Value(); mem > result.Memory {
+					result.Memory = mem
+				}
+			case v1.ResourceCPU:
+				if cpu := rQuantity.MilliValue(); cpu > result.MilliCPU {
+					result.MilliCPU = cpu
+				}
+			// We keep the resources below even though the score in this and other priority functions
+			// is computed from cpu and memory only.
+			case v1.ResourceEphemeralStorage:
+				if ephemeralStorage := rQuantity.Value(); ephemeralStorage > result.EphemeralStorage {
+					result.EphemeralStorage = ephemeralStorage
+				}
+			case v1.ResourceNvidiaGPU:
+				if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
+					result.NvidiaGPU = gpu
+				}
+			default:
+				if v1helper.IsScalarResourceName(rName) {
+					value := rQuantity.Value()
+					if value > result.ScalarResources[rName] {
+						result.SetScalar(rName, value)
+					}
+				}
+			}
+		}
+	}
+
+	return result
+}
diff --git a/plugin/pkg/scheduler/algorithm/priorities/resource_limits_test.go b/plugin/pkg/scheduler/algorithm/priorities/resource_limits_test.go
new file mode 100644
index 00000000000..0a48cc73308
--- /dev/null
+++ b/plugin/pkg/scheduler/algorithm/priorities/resource_limits_test.go
@@ -0,0 +1,151 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package priorities
+
+import (
+	"reflect"
+	"testing"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	//metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
+)
+
+func TestResourceLimitsPriority(t *testing.T) {
+	noResources := v1.PodSpec{
+		Containers: []v1.Container{},
+	}
+
+	cpuOnly := v1.PodSpec{
+		NodeName: "machine1",
+		Containers: []v1.Container{
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("1000m"),
+						v1.ResourceMemory: resource.MustParse("0"),
+					},
+				},
+			},
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("2000m"),
+						v1.ResourceMemory: resource.MustParse("0"),
+					},
+				},
+			},
+		},
+	}
+
+	memOnly := v1.PodSpec{
+		NodeName: "machine2",
+		Containers: []v1.Container{
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("0"),
+						v1.ResourceMemory: resource.MustParse("2000"),
+					},
+				},
+			},
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("0"),
+						v1.ResourceMemory: resource.MustParse("3000"),
+					},
+				},
+			},
+		},
+	}
+
+	cpuAndMemory := v1.PodSpec{
+		NodeName: "machine2",
+		Containers: []v1.Container{
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("1000m"),
+						v1.ResourceMemory: resource.MustParse("2000"),
+					},
+				},
+			},
+			{
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("2000m"),
+						v1.ResourceMemory: resource.MustParse("3000"),
+					},
+				},
+			},
+		},
+	}
+
+	tests := []struct {
+		// input pod
+		pod          *v1.Pod
+		nodes        []*v1.Node
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			pod:          &v1.Pod{Spec: noResources},
+			nodes:        []*v1.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 4000, 0), makeNode("machine3", 0, 10000), makeNode("machine4", 0, 0)},
+			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}, {Host: "machine2", Score: 0}, {Host: "machine3", Score: 0}, {Host: "machine4", Score: 0}},
+			test:         "pod does not specify its resource limits",
+		},
+		{
+			pod:          &v1.Pod{Spec: cpuOnly},
+			nodes:        []*v1.Node{makeNode("machine1", 3000, 10000), makeNode("machine2", 2000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 1}, {Host: "machine2", Score: 0}},
+			test:         "pod only specifies cpu limits",
+		},
+		{
+			pod:          &v1.Pod{Spec: memOnly},
+			nodes:        []*v1.Node{makeNode("machine1", 4000, 4000), makeNode("machine2", 5000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}, {Host: "machine2", Score: 1}},
+			test:         "pod only specifies mem limits",
+		},
+		{
+			pod:          &v1.Pod{Spec: cpuAndMemory},
+			nodes:        []*v1.Node{makeNode("machine1", 4000, 4000), makeNode("machine2", 5000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 1}, {Host: "machine2", Score: 1}},
+			test:         "pod specifies both cpu and mem limits",
+		},
+		{
+			pod:          &v1.Pod{Spec: cpuAndMemory},
+			nodes:        []*v1.Node{makeNode("machine1", 0, 0)},
+			expectedList: []schedulerapi.HostPriority{{Host: "machine1", Score: 0}},
+			test:         "node does not advertise its allocatables",
+		},
+	}
+
+	for _, test := range tests {
+		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(nil, test.nodes)
+		list, err := priorityFunction(ResourceLimitsPriorityMap, nil, nil)(test.pod, nodeNameToInfo, test.nodes)
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
index 5bb5f136192..1b7b34d96bd 100644
--- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
+++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -106,6 +106,10 @@ func init() {
 	factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1)
 	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
 	factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
+	// Prioritizes nodes that satisfy the pod's resource limits.
+	if utilfeature.DefaultFeatureGate.Enabled(features.ResourceLimitsPriorityFunction) {
+		factory.RegisterPriorityFunction2("ResourceLimitsPriority", priorities.ResourceLimitsPriorityMap, nil, 1)
+	}
 }
 
 func defaultPredicates() sets.String {
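
Note on enabling the new priority: because ResourceLimitsPriorityFunction defaults to false and the registration in defaults.go is gated on it, this patch changes scheduling behavior only when the alpha gate is explicitly turned on. A minimal sketch, assuming a stock kube-scheduler binary that accepts the standard --feature-gates flag:

    kube-scheduler --feature-gates=ResourceLimitsPriorityFunction=true

With the gate left at its default, ResourceLimitsPriority is never registered and the existing set of default priority functions is unchanged.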