Merge pull request #55442 from anfernee/priority_resource
Automatic merge from submit-queue (batch tested with PRs 57257, 55442). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Merge 3 resource allocation priority functions

**What this PR does / why we need it**: These 3 priority functions are closely related and share a lot of the same logic, so this PR merges them into one place.

**Release note**:
```release-note
None
```
This commit is contained in commit 754bb1350f.
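The core of the change is a small strategy pattern: each priority keeps its own scoring formula, but the per-node bookkeeping moves into one shared map function. A minimal, self-contained sketch of the idea, with simplified types standing in for the scheduler's real API and values chosen purely for illustration:

```go
package main

import "fmt"

// Resource is a simplified stand-in for schedulercache.Resource.
type Resource struct{ MilliCPU, Memory int64 }

// resourceAllocationPriority mirrors the type this PR introduces: the shared
// per-node bookkeeping lives in one method, and only the scorer differs.
type resourceAllocationPriority struct {
	name   string
	scorer func(requested, allocable *Resource) int64
}

// score stands in for PriorityMap: everything before the scorer call
// (node lookup, request aggregation, logging) would be shared code.
func (p *resourceAllocationPriority) score(requested, allocable *Resource) int64 {
	return p.scorer(requested, allocable)
}

func main() {
	requested := &Resource{MilliCPU: 1000, Memory: 2 << 30}
	allocatable := &Resource{MilliCPU: 4000, Memory: 8 << 30}

	// Two policies, one mechanism; only the injected closure differs.
	// (Both assume allocatable CPU is non-zero, as in this toy setup.)
	emptiest := &resourceAllocationPriority{"PreferEmpty", func(r, a *Resource) int64 {
		return (a.MilliCPU - r.MilliCPU) * 10 / a.MilliCPU
	}}
	fullest := &resourceAllocationPriority{"PreferFull", func(r, a *Resource) int64 {
		return r.MilliCPU * 10 / a.MilliCPU
	}}
	fmt.Println(emptiest.score(requested, allocatable)) // 7
	fmt.Println(fullest.score(requested, allocatable))  // 2
}
```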
plugin/pkg/scheduler/algorithm/priorities/BUILD
@@ -19,6 +19,7 @@ go_library(
         "node_label.go",
         "node_prefer_avoid_pods.go",
         "reduce.go",
+        "resource_allocation.go",
         "resource_limits.go",
         "selector_spreading.go",
         "taint_toleration.go",
plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go
@@ -17,76 +17,40 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
     "math"
 
-    "k8s.io/api/core/v1"
-    priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
-const (
-    mb         int64 = 1024 * 1024
-    minImgSize int64 = 23 * mb
-    maxImgSize int64 = 1000 * mb
-)
-
-// Also used in most/least_requested and metadata.
-// TODO: despaghettify it
-func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
-    result := &schedulercache.Resource{}
-    for i := range pod.Spec.Containers {
-        container := &pod.Spec.Containers[i]
-        cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
-        result.MilliCPU += cpu
-        result.Memory += memory
-    }
-    return result
-}
-
-func calculateBalancedResourceAllocation(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuFraction := fractionOfCapacity(totalResources.MilliCPU, allocatableResources.MilliCPU)
-    memoryFraction := fractionOfCapacity(totalResources.Memory, allocatableResources.Memory)
-    score := int(0)
-    if cpuFraction >= 1 || memoryFraction >= 1 {
-        // if requested >= capacity, the corresponding host should never be preferred.
-        score = 0
-    } else {
-        // Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
-        // respectively. Multiplying the absolute value of the difference by 10 scales the value to
-        // 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
-        // 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
-        diff := math.Abs(cpuFraction - memoryFraction)
-        score = int((1 - diff) * float64(schedulerapi.MaxPriority))
-    }
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Balanced Resource Allocation, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            score,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: score,
-    }, nil
-}
+var (
+    balanceResourcePriority = &ResourceAllocationPriority{"BalanceResourceAllocation", balancedResourceScorer}
+
+    // BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
+    // BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together
+    // with LeastRequestedPriority. It calculates the difference between the cpu and memory fraction
+    // of capacity, and prioritizes the host based on how close the two metrics are to each other.
+    // Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
+    // "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced
+    // Resource Utilization"
+    BalancedResourceAllocationMap = balanceResourcePriority.PriorityMap
+)
+
+func balancedResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    cpuFraction := fractionOfCapacity(requested.MilliCPU, allocable.MilliCPU)
+    memoryFraction := fractionOfCapacity(requested.Memory, allocable.Memory)
+    if cpuFraction >= 1 || memoryFraction >= 1 {
+        // if requested >= capacity, the corresponding host should never be preferred.
+        return 0
+    }
+
+    // Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
+    // respectively. Multiplying the absolute value of the difference by 10 scales the value to
+    // 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
+    // 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
+    diff := math.Abs(cpuFraction - memoryFraction)
+    return int64((1 - diff) * float64(schedulerapi.MaxPriority))
+}
 
 func fractionOfCapacity(requested, capacity int64) float64 {
@@ -95,20 +59,3 @@ func fractionOfCapacity(requested, capacity int64) float64 {
     }
     return float64(requested) / float64(capacity)
 }
-
-// BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
-// BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority.
-// It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how
-// close the two metrics are to each other.
-// Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
-// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
-func BalancedResourceAllocationMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateBalancedResourceAllocation(pod, nonZeroRequest, nodeInfo)
-}
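For a concrete reading of the `score = 10 - abs(cpuFraction-memoryFraction)*10` comment, here is a standalone transcription of the scorer's arithmetic. This is a sketch: `balancedScore` is not part of the diff, and `schedulerapi.MaxPriority` is written as its literal value 10.

```go
package main

import (
	"fmt"
	"math"
)

// balancedScore transcribes balancedResourceScorer's arithmetic with
// schedulerapi.MaxPriority fixed at 10.
func balancedScore(cpuFraction, memoryFraction float64) int64 {
	if cpuFraction >= 1 || memoryFraction >= 1 {
		return 0 // requested >= capacity: never prefer this node
	}
	diff := math.Abs(cpuFraction - memoryFraction)
	return int64((1 - diff) * 10)
}

func main() {
	fmt.Println(balancedScore(0.5, 0.7))   // 8: cpu and memory usage close together
	fmt.Println(balancedScore(0.25, 0.75)) // 5: noticeably skewed
	fmt.Println(balancedScore(1.2, 0.3))   // 0: CPU over capacity
}
```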
plugin/pkg/scheduler/algorithm/priorities/image_locality.go
@@ -24,6 +24,13 @@ import (
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
+// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
+const (
+    mb         int64 = 1024 * 1024
+    minImgSize int64 = 23 * mb
+    maxImgSize int64 = 1000 * mb
+)
+
 // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images.
 // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10
 // based on the total size of those images.
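The constants moved here only bound the range; the scoring step in this file (not shown in this hunk) maps a node's total size of already-present images onto 0 to 10. A hedged sketch of that clamp-and-scale step, where `normalizeImgScore` is an illustrative name and not necessarily the file's actual helper:

```go
package main

import "fmt"

const (
	mb         int64 = 1024 * 1024
	minImgSize int64 = 23 * mb
	maxImgSize int64 = 1000 * mb
)

// normalizeImgScore clamps a summed image size into [minImgSize, maxImgSize]
// so outliers do not dominate, then maps it linearly onto 0..10.
func normalizeImgScore(sumSize int64) int64 {
	if sumSize < minImgSize {
		sumSize = minImgSize
	}
	if sumSize > maxImgSize {
		sumSize = maxImgSize
	}
	return 10 * (sumSize - minImgSize) / (maxImgSize - minImgSize)
}

func main() {
	fmt.Println(normalizeImgScore(500 * mb)) // 4: mid-sized images already cached
	fmt.Println(normalizeImgScore(0))        // 0: nothing cached locally
}
```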
plugin/pkg/scheduler/algorithm/priorities/least_requested.go
@@ -17,73 +17,37 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
-
-    "k8s.io/api/core/v1"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
-// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
-// based on the minimum of the average of the fraction of requested to capacity.
-// Details: cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity) / 2
-func LeastRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateUnusedPriority(pod, nonZeroRequest, nodeInfo)
-}
+var (
+    leastResourcePriority = &ResourceAllocationPriority{"LeastResourceAllocation", leastResourceScorer}
+
+    // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
+    // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and
+    // prioritizes based on the minimum of the average of the fraction of requested to capacity.
+    //
+    // Details:
+    // cpu((capacity-sum(requested))*10/capacity) + memory((capacity-sum(requested))*10/capacity)/2
+    LeastRequestedPriorityMap = leastResourcePriority.PriorityMap
+)
+
+func leastResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    return (leastRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
+        leastRequestedScore(requested.Memory, allocable.Memory)) / 2
+}
 
 // The unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest.
 // The more unused resources the higher the score is.
-func calculateUnusedScore(requested int64, capacity int64, node string) int64 {
+func leastRequestedScore(requested, capacity int64) int64 {
     if capacity == 0 {
         return 0
     }
     if requested > capacity {
-        glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
-            requested, capacity, node)
         return 0
     }
 
     return ((capacity - requested) * int64(schedulerapi.MaxPriority)) / capacity
 }
-
-// Calculates host priority based on the amount of unused resources.
-// 'node' has information about the resources on the node.
-// 'pods' is a list of pods currently scheduled on the node.
-func calculateUnusedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-    memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Least Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            cpuScore, memoryScore,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: int((cpuScore + memoryScore) / 2),
-    }, nil
-}
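The `Details:` formula reads more easily with numbers plugged in. A standalone transcription of `leastRequestedScore` (with `schedulerapi.MaxPriority` written as 10), averaged over CPU and memory the way `leastResourceScorer` does:

```go
package main

import "fmt"

// leastRequestedScore transcribed from the diff above, with MaxPriority = 10.
func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (capacity - requested) * 10 / capacity
}

func main() {
	cpu := leastRequestedScore(1000, 4000)   // (4000-1000)*10/4000 = 7 (truncated from 7.5)
	mem := leastRequestedScore(2<<30, 8<<30) // 2GiB requested of 8GiB -> 7
	fmt.Println((cpu + mem) / 2)             // 7: the value leastResourceScorer averages to
}
```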
plugin/pkg/scheduler/algorithm/priorities/most_requested.go
@@ -17,28 +17,23 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
-
-    "k8s.io/api/core/v1"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// MostRequestedPriority is a priority function that favors nodes with most requested resources.
-// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
-// based on the maximum of the average of the fraction of requested to capacity.
-// Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
-func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateUsedPriority(pod, nonZeroRequest, nodeInfo)
-}
+var (
+    mostResourcePriority = &ResourceAllocationPriority{"MostResourceAllocation", mostResourceScorer}
+
+    // MostRequestedPriority is a priority function that favors nodes with most requested resources.
+    // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
+    // based on the maximum of the average of the fraction of requested to capacity.
+    // Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
+    MostRequestedPriorityMap = mostResourcePriority.PriorityMap
+)
+
+func mostResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    return (mostRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
+        mostRequestedScore(requested.Memory, allocable.Memory)) / 2
+}
 
 // The used capacity is calculated on a scale of 0-10
@@ -48,45 +43,13 @@ func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *scheduler
 // (10 - calculateUnusedScore). The main difference is in rounding. It was added to
 // keep the final formula clean and not to modify the widely used (by users
 // in their default scheduling policies) calculateUsedScore.
-func calculateUsedScore(requested int64, capacity int64, node string) int64 {
+func mostRequestedScore(requested, capacity int64) int64 {
     if capacity == 0 {
         return 0
     }
     if requested > capacity {
-        glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
-            requested, capacity, node)
         return 0
     }
 
     return (requested * schedulerapi.MaxPriority) / capacity
 }
-
-// Calculate the resource used on a node. 'node' has information about the resources on the node.
-// 'pods' is a list of pods currently scheduled on the node.
-func calculateUsedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-    memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            cpuScore, memoryScore,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: int((cpuScore + memoryScore) / 2),
-    }, nil
-}
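The comment above notes that this scorer is *almost* `10 - calculateUnusedScore`, differing only in rounding. A quick check with the same 1000-of-4000 millicores used in the least-requested example shows the difference: truncating integer division gives 2 here, not 10 - 7 = 3. A standalone transcription (again writing `schedulerapi.MaxPriority` as 10):

```go
package main

import "fmt"

// mostRequestedScore transcribed from the diff above, with MaxPriority = 10.
func mostRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return requested * 10 / capacity
}

func main() {
	fmt.Println(mostRequestedScore(1000, 4000)) // 2 (truncated from 2.5), not 10-7=3
	fmt.Println(mostRequestedScore(4000, 4000)) // 10: a fully packed node scores highest
}
```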
plugin/pkg/scheduler/algorithm/priorities/resource_allocation.go (new file)
@@ -0,0 +1,82 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package priorities
+
+import (
+    "fmt"
+
+    "github.com/golang/glog"
+    "k8s.io/api/core/v1"
+    priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
+    schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+    "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
+)
+
+type ResourceAllocationPriority struct {
+    Name   string
+    scorer func(requested, allocable *schedulercache.Resource) int64
+}
+
+func (r *ResourceAllocationPriority) PriorityMap(
+    pod *v1.Pod,
+    meta interface{},
+    nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
+    node := nodeInfo.Node()
+    if node == nil {
+        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
+    }
+    allocatable := nodeInfo.AllocatableResource()
+
+    var requested schedulercache.Resource
+    if priorityMeta, ok := meta.(*priorityMetadata); ok {
+        requested = *priorityMeta.nonZeroRequest
+    } else {
+        // We couldn't parse metadata - fallback to computing it.
+        requested = *getNonZeroRequests(pod)
+    }
+
+    requested.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
+    requested.Memory += nodeInfo.NonZeroRequest().Memory
+
+    score := r.scorer(&requested, &allocatable)
+
+    if glog.V(10) {
+        glog.Infof(
+            "%v -> %v: %v, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
+            pod.Name, node.Name, r.Name,
+            allocatable.MilliCPU, allocatable.Memory,
+            requested.MilliCPU+allocatable.MilliCPU, requested.Memory+allocatable.Memory,
+            score,
+        )
+    }
+
+    return schedulerapi.HostPriority{
+        Host:  node.Name,
+        Score: int(score),
+    }, nil
+}
+
+func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
+    result := &schedulercache.Resource{}
+    for i := range pod.Spec.Containers {
+        container := &pod.Spec.Containers[i]
+        cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
+        result.MilliCPU += cpu
+        result.Memory += memory
+    }
+    return result
+}
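Beyond the scorer injection, `PriorityMap` also keeps the metadata fast path the three functions previously duplicated: type-assert the opaque `meta` argument, and recompute only when the assertion fails. The pattern in isolation, with toy types, where `expensiveCompute` is a stand-in for `getNonZeroRequests`:

```go
package main

import "fmt"

// priorityMetadata is a toy stand-in for the scheduler's precomputed per-pod cache.
type priorityMetadata struct{ nonZeroRequest int64 }

// expensiveCompute stands in for recomputing the pod's non-zero requests.
func expensiveCompute() int64 { return 42 }

func requestFromMeta(meta interface{}) int64 {
	if m, ok := meta.(*priorityMetadata); ok {
		return m.nonZeroRequest // fast path: computed once per pod
	}
	return expensiveCompute() // fallback: recompute for this call
}

func main() {
	fmt.Println(requestFromMeta(&priorityMetadata{nonZeroRequest: 7})) // 7
	fmt.Println(requestFromMeta(nil))                                  // 42
}
```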