diff --git a/plugin/pkg/scheduler/algorithm/priorities/BUILD b/plugin/pkg/scheduler/algorithm/priorities/BUILD
index a7a10dc50da..b6e8d618cd2 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/BUILD
+++ b/plugin/pkg/scheduler/algorithm/priorities/BUILD
@@ -19,6 +19,7 @@ go_library(
         "node_label.go",
         "node_prefer_avoid_pods.go",
         "reduce.go",
+        "resource_allocation.go",
         "resource_limits.go",
         "selector_spreading.go",
         "taint_toleration.go",
diff --git a/plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go b/plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go
index 791e28ee1a0..c57bcbfc68c 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go
@@ -17,76 +17,40 @@ limitations under the License.
 package priorities
 
 import (
-	"fmt"
 	"math"
 
-	"k8s.io/api/core/v1"
-	priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-	"github.com/golang/glog"
 )
 
-// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
-const (
-	mb         int64 = 1024 * 1024
-	minImgSize int64 = 23 * mb
-	maxImgSize int64 = 1000 * mb
+var (
+	balancedResourcePriority = &ResourceAllocationPriority{"BalancedResourceAllocation", balancedResourceScorer}
+
+	// BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
+	// BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together
+	// with LeastRequestedPriority. It calculates the difference between the cpu and memory fraction
+	// of capacity, and prioritizes the host based on how close the two metrics are to each other.
+	// Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
+	// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced
+	// Resource Utilization"
+	BalancedResourceAllocationMap = balancedResourcePriority.PriorityMap
 )
 
-// Also used in most/least_requested nad metadata.
-// TODO: despaghettify it
-func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
-	result := &schedulercache.Resource{}
-	for i := range pod.Spec.Containers {
-		container := &pod.Spec.Containers[i]
-		cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
-		result.MilliCPU += cpu
-		result.Memory += memory
-	}
-	return result
-}
+func balancedResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+	cpuFraction := fractionOfCapacity(requested.MilliCPU, allocable.MilliCPU)
+	memoryFraction := fractionOfCapacity(requested.Memory, allocable.Memory)
 
-func calculateBalancedResourceAllocation(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-	node := nodeInfo.Node()
-	if node == nil {
-		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-	}
-
-	allocatableResources := nodeInfo.AllocatableResource()
-	totalResources := *podRequests
-	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-	cpuFraction := fractionOfCapacity(totalResources.MilliCPU, allocatableResources.MilliCPU)
-	memoryFraction := fractionOfCapacity(totalResources.Memory, allocatableResources.Memory)
-	score := int(0)
 	if cpuFraction >= 1 || memoryFraction >= 1 {
 		// if requested >= capacity, the corresponding host should never be preferred.
-		score = 0
-	} else {
-		// Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
-		// respectively. Multilying the absolute value of the difference by 10 scales the value to
-		// 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
-		// 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
-		diff := math.Abs(cpuFraction - memoryFraction)
-		score = int((1 - diff) * float64(schedulerapi.MaxPriority))
-	}
-	if glog.V(10) {
-		glog.Infof(
-			"%v -> %v: Balanced Resource Allocation, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
-			pod.Name, node.Name,
-			allocatableResources.MilliCPU, allocatableResources.Memory,
-			totalResources.MilliCPU, totalResources.Memory,
-			score,
-		)
+		return 0
 	}
-	return schedulerapi.HostPriority{
-		Host:  node.Name,
-		Score: score,
-	}, nil
+	// Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
+	// respectively. Multiplying the absolute value of the difference by 10 scales the value to
+	// 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
+	// 10 leads to the score which also scales from 0 to 10, with 10 representing well balanced.
+	diff := math.Abs(cpuFraction - memoryFraction)
+	return int64((1 - diff) * float64(schedulerapi.MaxPriority))
 }
 
 func fractionOfCapacity(requested, capacity int64) float64 {
@@ -95,20 +59,3 @@ func fractionOfCapacity(requested, capacity int64) float64 {
 	}
 	return float64(requested) / float64(capacity)
 }
-
-// BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
-// BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority.
-// It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how
-// close the two metrics are to each other.
-// Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
-// "Wei Huang et al.
An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization" -func BalancedResourceAllocationMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) { - var nonZeroRequest *schedulercache.Resource - if priorityMeta, ok := meta.(*priorityMetadata); ok { - nonZeroRequest = priorityMeta.nonZeroRequest - } else { - // We couldn't parse metadatat - fallback to computing it. - nonZeroRequest = getNonZeroRequests(pod) - } - return calculateBalancedResourceAllocation(pod, nonZeroRequest, nodeInfo) -} diff --git a/plugin/pkg/scheduler/algorithm/priorities/image_locality.go b/plugin/pkg/scheduler/algorithm/priorities/image_locality.go index 1629dee4fdf..86b3bb6c54f 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/image_locality.go +++ b/plugin/pkg/scheduler/algorithm/priorities/image_locality.go @@ -24,6 +24,13 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) +// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range. +const ( + mb int64 = 1024 * 1024 + minImgSize int64 = 23 * mb + maxImgSize int64 = 1000 * mb +) + // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images. // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10 // based on the total size of those images. diff --git a/plugin/pkg/scheduler/algorithm/priorities/least_requested.go b/plugin/pkg/scheduler/algorithm/priorities/least_requested.go index 73d5db676e9..39d3208f6fa 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/least_requested.go +++ b/plugin/pkg/scheduler/algorithm/priorities/least_requested.go @@ -17,73 +17,37 @@ limitations under the License. package priorities import ( - "fmt" - - "k8s.io/api/core/v1" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" - - "github.com/golang/glog" ) -// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. -// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes -// based on the minimum of the average of the fraction of requested to capacity. -// Details: cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity) / 2 -func LeastRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) { - var nonZeroRequest *schedulercache.Resource - if priorityMeta, ok := meta.(*priorityMetadata); ok { - nonZeroRequest = priorityMeta.nonZeroRequest - } else { - // We couldn't parse metadata - fallback to computing it. - nonZeroRequest = getNonZeroRequests(pod) - } - return calculateUnusedPriority(pod, nonZeroRequest, nodeInfo) +var ( + leastResourcePriority = &ResourceAllocationPriority{"LeastResourceAllocation", leastResourceScorer} + + // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. + // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and + // prioritizes based on the minimum of the average of the fraction of requested to capacity. 
+	//
+	// Details:
+	// (cpu((capacity-sum(requested))*10/capacity) + memory((capacity-sum(requested))*10/capacity)) / 2
+	LeastRequestedPriorityMap = leastResourcePriority.PriorityMap
+)
+
+func leastResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+	return (leastRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
+		leastRequestedScore(requested.Memory, allocable.Memory)) / 2
 }
 
 // The unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest.
 // The more unused resources the higher the score is.
-func calculateUnusedScore(requested int64, capacity int64, node string) int64 {
+func leastRequestedScore(requested, capacity int64) int64 {
 	if capacity == 0 {
 		return 0
 	}
 	if requested > capacity {
-		glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
-			requested, capacity, node)
 		return 0
 	}
+
 	return ((capacity - requested) * int64(schedulerapi.MaxPriority)) / capacity
 }
-
-// Calculates host priority based on the amount of unused resources.
-// 'node' has information about the resources on the node.
-// 'pods' is a list of pods currently scheduled on the node.
-func calculateUnusedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-	node := nodeInfo.Node()
-	if node == nil {
-		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-	}
-
-	allocatableResources := nodeInfo.AllocatableResource()
-	totalResources := *podRequests
-	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-	cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-	memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-	if glog.V(10) {
-		glog.Infof(
-			"%v -> %v: Least Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-			pod.Name, node.Name,
-			allocatableResources.MilliCPU, allocatableResources.Memory,
-			totalResources.MilliCPU, totalResources.Memory,
-			cpuScore, memoryScore,
-		)
-	}
-
-	return schedulerapi.HostPriority{
-		Host:  node.Name,
-		Score: int((cpuScore + memoryScore) / 2),
-	}, nil
-}
diff --git a/plugin/pkg/scheduler/algorithm/priorities/most_requested.go b/plugin/pkg/scheduler/algorithm/priorities/most_requested.go
index 9d1697db93e..9cba1a32ee5 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/most_requested.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/most_requested.go
@@ -17,28 +17,23 @@ limitations under the License.
 package priorities
 
 import (
-	"fmt"
-
-	"k8s.io/api/core/v1"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-	"github.com/golang/glog"
 )
 
-// MostRequestedPriority is a priority function that favors nodes with most requested resources.
-// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
-// based on the maximum of the average of the fraction of requested to capacity.
-// Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2 -func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) { - var nonZeroRequest *schedulercache.Resource - if priorityMeta, ok := meta.(*priorityMetadata); ok { - nonZeroRequest = priorityMeta.nonZeroRequest - } else { - // We couldn't parse metadatat - fallback to computing it. - nonZeroRequest = getNonZeroRequests(pod) - } - return calculateUsedPriority(pod, nonZeroRequest, nodeInfo) +var ( + mostResourcePriority = &ResourceAllocationPriority{"MostResourceAllocation", mostResourceScorer} + + // MostRequestedPriority is a priority function that favors nodes with most requested resources. + // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes + // based on the maximum of the average of the fraction of requested to capacity. + // Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2 + MostRequestedPriorityMap = mostResourcePriority.PriorityMap +) + +func mostResourceScorer(requested, allocable *schedulercache.Resource) int64 { + return (mostRequestedScore(requested.MilliCPU, allocable.MilliCPU) + + mostRequestedScore(requested.Memory, allocable.Memory)) / 2 } // The used capacity is calculated on a scale of 0-10 @@ -48,45 +43,13 @@ func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *scheduler // (10 - calculateUnusedScore). The main difference is in rounding. It was added to // keep the final formula clean and not to modify the widely used (by users // in their default scheduling policies) calculateUSedScore. -func calculateUsedScore(requested int64, capacity int64, node string) int64 { +func mostRequestedScore(requested, capacity int64) int64 { if capacity == 0 { return 0 } if requested > capacity { - glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s", - requested, capacity, node) return 0 } + return (requested * schedulerapi.MaxPriority) / capacity } - -// Calculate the resource used on a node. 'node' has information about the resources on the node. -// 'pods' is a list of pods currently scheduled on the node. 
-func calculateUsedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-	node := nodeInfo.Node()
-	if node == nil {
-		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-	}
-
-	allocatableResources := nodeInfo.AllocatableResource()
-	totalResources := *podRequests
-	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-	cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-	memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-	if glog.V(10) {
-		glog.Infof(
-			"%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-			pod.Name, node.Name,
-			allocatableResources.MilliCPU, allocatableResources.Memory,
-			totalResources.MilliCPU, totalResources.Memory,
-			cpuScore, memoryScore,
-		)
-	}
-
-	return schedulerapi.HostPriority{
-		Host:  node.Name,
-		Score: int((cpuScore + memoryScore) / 2),
-	}, nil
-}
diff --git a/plugin/pkg/scheduler/algorithm/priorities/resource_allocation.go b/plugin/pkg/scheduler/algorithm/priorities/resource_allocation.go
new file mode 100644
index 00000000000..9723eff142e
--- /dev/null
+++ b/plugin/pkg/scheduler/algorithm/priorities/resource_allocation.go
@@ -0,0 +1,82 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package priorities
+
+import (
+	"fmt"
+
+	"github.com/golang/glog"
+	"k8s.io/api/core/v1"
+	priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
+	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
+)
+
+type ResourceAllocationPriority struct {
+	Name   string
+	scorer func(requested, allocable *schedulercache.Resource) int64
+}
+
+func (r *ResourceAllocationPriority) PriorityMap(
+	pod *v1.Pod,
+	meta interface{},
+	nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
+	node := nodeInfo.Node()
+	if node == nil {
+		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
+	}
+	allocatable := nodeInfo.AllocatableResource()
+
+	var requested schedulercache.Resource
+	if priorityMeta, ok := meta.(*priorityMetadata); ok {
+		requested = *priorityMeta.nonZeroRequest
+	} else {
+		// We couldn't parse metadata - fallback to computing it.
+		requested = *getNonZeroRequests(pod)
+	}
+
+	requested.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
+	requested.Memory += nodeInfo.NonZeroRequest().Memory
+
+	score := r.scorer(&requested, &allocatable)
+
+	if glog.V(10) {
+		glog.Infof(
+			"%v -> %v: %v, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
+			pod.Name, node.Name, r.Name,
+			allocatable.MilliCPU, allocatable.Memory,
+			requested.MilliCPU, requested.Memory,
+			score,
+		)
+	}
+
+	return schedulerapi.HostPriority{
+		Host:  node.Name,
+		Score: int(score),
+	}, nil
+}
+
+func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
+	result := &schedulercache.Resource{}
+	for i := range pod.Spec.Containers {
+		container := &pod.Spec.Containers[i]
+		cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
+		result.MilliCPU += cpu
+		result.Memory += memory
+	}
+	return result
+}
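The refactoring above collapses three near-identical `PriorityMap` implementations into a single `ResourceAllocationPriority` that owns the shared plumbing (node lookup, metadata fallback, logging) and delegates only the scoring math to an injected function. Below is a minimal standalone sketch of that pattern; `resource`, `maxPriority`, and the sample numbers are illustrative stand-ins for `schedulercache.Resource`, `schedulerapi.MaxPriority`, and real node data, not part of this change:

```go
package main

import (
	"fmt"
	"math"
)

// maxPriority stands in for schedulerapi.MaxPriority.
const maxPriority = 10

// resource mirrors the two fields the scorers actually read from
// schedulercache.Resource.
type resource struct {
	milliCPU int64
	memory   int64
}

// resourceAllocationPriority mimics the struct introduced in
// resource_allocation.go: a name for logging plus a pluggable scorer.
type resourceAllocationPriority struct {
	name   string
	scorer func(requested, allocable *resource) int64
}

func fractionOfCapacity(requested, capacity int64) float64 {
	if capacity == 0 {
		return 1
	}
	return float64(requested) / float64(capacity)
}

// balancedResourceScorer follows the formula from the diff:
// score = (1 - abs(cpuFraction - memoryFraction)) * 10.
func balancedResourceScorer(requested, allocable *resource) int64 {
	cpuFraction := fractionOfCapacity(requested.milliCPU, allocable.milliCPU)
	memoryFraction := fractionOfCapacity(requested.memory, allocable.memory)
	if cpuFraction >= 1 || memoryFraction >= 1 {
		return 0 // requested >= capacity: never prefer this node
	}
	return int64((1 - math.Abs(cpuFraction-memoryFraction)) * maxPriority)
}

func main() {
	p := &resourceAllocationPriority{"BalancedResourceAllocation", balancedResourceScorer}
	requested := &resource{milliCPU: 1000, memory: 2 << 30}  // 1 core, 2 GiB
	allocable := &resource{milliCPU: 4000, memory: 16 << 30} // 4 cores, 16 GiB
	// cpuFraction = 0.25, memoryFraction = 0.125,
	// so score = int64((1 - 0.125) * 10) = 8.
	fmt.Printf("%s score: %d\n", p.name, p.scorer(requested, allocable))
}
```

Keeping the scorers as plain `func(requested, allocable *schedulercache.Resource) int64` values means a new priority only has to supply the math; the node-not-found error path, the metadata fallback, and the `V(10)` logging are written exactly once.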
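To sanity-check the two requested-resource formulas on the 0-10 scale, here is a worked example with assumed numbers (4000 millicores / 8 GiB allocatable; 1500 millicores / 2 GiB of non-zero requests). The functions are simplified standalone copies of `leastRequestedScore` and `mostRequestedScore` from the diff:

```go
package main

import "fmt"

const maxPriority int64 = 10 // stands in for schedulerapi.MaxPriority

func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return ((capacity - requested) * maxPriority) / capacity
}

func mostRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (requested * maxPriority) / capacity
}

func main() {
	cpuCap, memCap := int64(4000), int64(8<<30) // assumed node allocatable
	cpuReq, memReq := int64(1500), int64(2<<30) // assumed total non-zero requests

	least := (leastRequestedScore(cpuReq, cpuCap) + leastRequestedScore(memReq, memCap)) / 2
	most := (mostRequestedScore(cpuReq, cpuCap) + mostRequestedScore(memReq, memCap)) / 2

	// least: cpu (4000-1500)*10/4000 = 6, memory (6GiB*10)/8GiB = 7 -> (6+7)/2 = 6
	// most:  cpu 1500*10/4000 = 3,       memory (2GiB*10)/8GiB = 2 -> (3+2)/2 = 2
	fmt.Println(least, most)
}
```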
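The pre-existing comment in most_requested.go says the function is almost `10 - calculateUnusedScore`, with the difference being rounding. With Go's truncating integer division the two really can diverge, which is why the diff keeps two separate formulas rather than deriving one from the other. A minimal demonstration:

```go
package main

import "fmt"

func main() {
	const maxPriority int64 = 10
	requested, capacity := int64(1), int64(3)

	least := ((capacity - requested) * maxPriority) / capacity // 20/3 truncates to 6
	most := (requested * maxPriority) / capacity               // 10/3 truncates to 3

	// 10-least = 4 but most = 3: the two fractions truncate in opposite directions.
	fmt.Println(10-least, most)
}
```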
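Finally, the `mb`/`minImgSize`/`maxImgSize` constants now live next to their only consumer in image_locality.go. The scoring function itself is not part of this diff, so the sketch below is only an illustrative normalization of a node's summed image size onto the 0-10 scale using that clamping range; `normalizeImgSize` is hypothetical, not the upstream implementation:

```go
package main

import "fmt"

// Constants copied from image_locality.go: per the comment there, the
// 90%ile of images on dockerhub falls between 23 MB and 1000 MB.
const (
	mb         int64 = 1024 * 1024
	minImgSize int64 = 23 * mb
	maxImgSize int64 = 1000 * mb
)

// normalizeImgSize (hypothetical) maps the total size of requested images
// already present on a node onto 0-10, clamped to the range above.
func normalizeImgSize(sumSize int64) int64 {
	if sumSize < minImgSize {
		return 0 // too small to be a meaningful locality signal
	}
	if sumSize > maxImgSize {
		sumSize = maxImgSize
	}
	return 10 * (sumSize - minImgSize) / (maxImgSize - minImgSize)
}

func main() {
	// (500-23)*10/(1000-23) truncates to 4.
	fmt.Println(normalizeImgSize(500 * mb))
}
```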