diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities.go b/plugin/pkg/scheduler/algorithm/priorities/priorities.go
index 4862453aecc..46afb6617fb 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/priorities.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/priorities.go
@@ -39,9 +39,10 @@ func getNonZeroRequests(pod *api.Pod) *schedulercache.Resource {
 	return result
 }
 
-// the unused capacity is calculated on a scale of 0-10
-// 0 being the lowest priority and 10 being the highest
-func calculateScore(requested int64, capacity int64, node string) int64 {
+// The unused capacity is calculated on a scale of 0-10
+// 0 being the lowest priority and 10 being the highest.
+// The more unused resources, the higher the score is.
+func calculateUnusedScore(requested int64, capacity int64, node string) int64 {
 	if capacity == 0 {
 		return 0
 	}
@@ -53,17 +54,37 @@ func calculateScore(requested int64, capacity int64, node string) int64 {
 	return ((capacity - requested) * 10) / capacity
 }
 
-// Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
+// The used capacity is calculated on a scale of 0-10
+// 0 being the lowest priority and 10 being the highest.
+// The more resources are used, the higher the score is. This function
+// is almost a reversed version of calculateUnusedScore (10 - calculateUnusedScore).
+// The main difference is in rounding. It was added to keep the
+// final formula clean and not to modify the widely used (by users
+// in their default scheduling policies) calculateUnusedScore.
+func calculateUsedScore(requested int64, capacity int64, node string) int64 {
+	if capacity == 0 {
+		return 0
+	}
+	if requested > capacity {
+		glog.V(2).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
+			requested, capacity, node)
+		return 0
+	}
+	return (requested * 10) / capacity
+}
+
+// Calculates host priority based on the amount of unused resources.
+// 'node' has information about the resources on the node.
 // 'pods' is a list of pods currently scheduled on the node.
 // TODO: Use Node() from nodeInfo instead of passing it.
-func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
+func calculateUnusedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
 	allocatableResources := nodeInfo.AllocatableResource()
 	totalResources := *podRequests
 	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
 	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
 
-	cpuScore := calculateScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-	memoryScore := calculateScore(totalResources.Memory, allocatableResources.Memory, node.Name)
+	cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
+	memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
 	if glog.V(10) {
 		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
 		// not logged. There is visible performance gain from it.
@@ -82,6 +103,35 @@ func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resour
 	}
 }
 
+// Calculates host priority based on the amount of used resources. 'node' has information about the resources on the node.
+// 'pods' is a list of pods currently scheduled on the node.
+// TODO: Use Node() from nodeInfo instead of passing it.
+func calculateUsedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
+	allocatableResources := nodeInfo.AllocatableResource()
+	totalResources := *podRequests
+	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
+	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
+
+	cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
+	memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
+	if glog.V(10) {
+		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+		// not logged. There is visible performance gain from it.
+		glog.V(10).Infof(
+			"%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
+			pod.Name, node.Name,
+			allocatableResources.MilliCPU, allocatableResources.Memory,
+			totalResources.MilliCPU, totalResources.Memory,
+			cpuScore, memoryScore,
+		)
+	}
+
+	return schedulerapi.HostPriority{
+		Host:  node.Name,
+		Score: int((cpuScore + memoryScore) / 2),
+	}
+}
+
 // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
 // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
 // based on the minimum of the average of the fraction of requested to capacity.
@@ -90,7 +140,20 @@ func LeastRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulerca
 	podResources := getNonZeroRequests(pod)
 	list := make(schedulerapi.HostPriorityList, 0, len(nodes))
 	for _, node := range nodes {
-		list = append(list, calculateResourceOccupancy(pod, podResources, node, nodeNameToInfo[node.Name]))
+		list = append(list, calculateUnusedPriority(pod, podResources, node, nodeNameToInfo[node.Name]))
+	}
+	return list, nil
+}
+
+// MostRequestedPriority is a priority function that favors nodes with most requested resources.
+// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
+// based on the maximum of the average of the fraction of requested to capacity.
+// Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
+func MostRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
+	podResources := getNonZeroRequests(pod)
+	list := make(schedulerapi.HostPriorityList, 0, len(nodes))
+	for _, node := range nodes {
+		list = append(list, calculateUsedPriority(pod, podResources, node, nodeNameToInfo[node.Name]))
 	}
 	return list, nil
 }
diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
index 931e66a990f..1b2512c4eff 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -411,6 +411,161 @@ func TestLeastRequested(t *testing.T) {
 	}
 }
 
+func TestMostRequested(t *testing.T) {
+	labels1 := map[string]string{
+		"foo": "bar",
+		"baz": "blah",
+	}
+	labels2 := map[string]string{
+		"bar": "foo",
+		"baz": "blah",
+	}
+	noResources := api.PodSpec{
+		Containers: []api.Container{},
+	}
+	cpuOnly := api.PodSpec{
+		NodeName: "machine1",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("1000m"),
+						"memory": resource.MustParse("0"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("2000m"),
+						"memory": resource.MustParse("0"),
+					},
+				},
+			},
+		},
+	}
+	cpuOnly2 := cpuOnly
+	cpuOnly2.NodeName = "machine2"
+	cpuAndMemory := api.PodSpec{
+		NodeName: "machine2",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("1000m"),
+						"memory": resource.MustParse("2000"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("2000m"),
+						"memory": resource.MustParse("3000"),
+					},
+				},
+			},
+		},
+	}
+	tests := []struct {
+		pod          *api.Pod
+		pods         []*api.Pod
+		nodes        []*api.Node
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			/*
+				Node1 scores (used resources) on 0-10 scale
+				CPU Score: (0 * 10) / 4000 = 0
+				Memory Score: (0 * 10) / 10000 = 0
+				Node1 Score: (0 + 0) / 2 = 0
+
+				Node2 scores (used resources) on 0-10 scale
+				CPU Score: (0 * 10) / 4000 = 0
+				Memory Score: (0 * 10) / 10000 = 0
+				Node2 Score: (0 + 0) / 2 = 0
+			*/
+			pod:          &api.Pod{Spec: noResources},
+			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 4000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 0}, {"machine2", 0}},
+			test:         "nothing scheduled, nothing requested",
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (3000 * 10) / 4000 = 7.5
+				Memory Score: (5000 * 10) / 10000 = 5
+				Node1 Score: (7.5 + 5) / 2 = 6
+
+				Node2 scores on 0-10 scale
+				CPU Score: (3000 * 10) / 6000 = 5
+				Memory Score: (5000 * 10) / 10000 = 5
+				Node2 Score: (5 + 5) / 2 = 5
+			*/
+			pod:          &api.Pod{Spec: cpuAndMemory},
+			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 6000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 6}, {"machine2", 5}},
+			test:         "nothing scheduled, resources requested, differently sized machines",
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (0 * 10) / 20000 = 0
+				Node1 Score: (6 + 0) / 2 = 3
+
+				Node2 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (5000 * 10) / 20000 = 2.5
+				Node2 Score: (6 + 2.5) / 2 = 4
+			*/
+			pod:          &api.Pod{Spec: noResources},
+			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 3}, {"machine2", 4}},
+			test:         "no resources requested, pods scheduled with resources",
+			pods: []*api.Pod{
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels2}},
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuOnly2, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuAndMemory, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+			},
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (5000 * 10) / 20000 = 2.5
+				Node1 Score: (6 + 2.5) / 2 = 4
+
+				Node2 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (10000 * 10) / 20000 = 5
+				Node2 Score: (6 + 5) / 2 = 5
+			*/
+			pod:          &api.Pod{Spec: cpuAndMemory},
+			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 4}, {"machine2", 5}},
+			test:         "resources requested, pods scheduled with resources",
+			pods: []*api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, test.nodes)
+		list, err := MostRequestedPriority(test.pod, nodeNameToInfo, test.nodes)
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
+
 func TestNewNodeLabelPriority(t *testing.T) {
 	label1 := map[string]string{"foo": "bar"}
 	label2 := map[string]string{"bar": "foo"}
diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
index 7ff6f75e303..15a1f34dc42 100644
--- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
+++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -93,6 +93,8 @@ func init() {
 	factory.RegisterFitPredicate("HostName", predicates.PodFitsHost)
 	// Fit is determined by node selector query.
 	factory.RegisterFitPredicate("MatchNodeSelector", predicates.PodSelectorMatches)
+	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
+	factory.RegisterPriorityFunction("MostRequestedPriority", priorities.MostRequestedPriority, 1)
 }
 
 func defaultPredicates() sets.String {
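The rounding remark in the calculateUsedScore comment is easiest to see with concrete numbers: both helpers use integer division, so calculateUsedScore(requested, capacity, node) is not always equal to 10 - calculateUnusedScore(requested, capacity, node). Below is a minimal standalone sketch of the two formulas; the helper names and values are illustrative only, and the capacity and over-request checks from the patch are collapsed into a single guard with the logging omitted.

package main

import "fmt"

// usedScore mirrors the formula in calculateUsedScore: the requested share of
// capacity, scaled to 0-10 and floored by integer division.
func usedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (requested * 10) / capacity
}

// unusedScore mirrors the formula in calculateUnusedScore: the free share of
// capacity, also scaled to 0-10 and floored by integer division.
func unusedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return ((capacity - requested) * 10) / capacity
}

func main() {
	// 3000 millicores requested on a 4000-millicore node, as in the second test case.
	requested, capacity := int64(3000), int64(4000)
	fmt.Println(usedScore(requested, capacity))        // 7 (7.5 floored)
	fmt.Println(10 - unusedScore(requested, capacity)) // 8 (2.5 floors to 2, so 10 - 2)
}

The two results differ only because of where the flooring happens, which is why the patch defines calculateUsedScore directly instead of reusing 10 - calculateUnusedScore. Note also that this diff only registers MostRequestedPriority with the factory and does not add it to the default priority set, so it takes effect only when a scheduler policy references it by name.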