ClusterAutoscaler-friendly scheduler priority function that promotes well-used nodes.
This commit is contained in:
parent ec4d645da4
commit de2fea95ca
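In essence, the change scores a node by how much of its capacity is already requested, so that new pods pack onto busy nodes and lightly used ones stay drainable for the ClusterAutoscaler. A minimal standalone restatement of that arithmetic (plain int64 math and our own helper name, not the committed code):

package main

import "fmt"

// usedScore restates the commit's calculateUsedScore arithmetic: the share
// of node capacity already requested, on a truncated 0-10 integer scale.
// (The committed version also logs when requested exceeds capacity.)
func usedScore(requested, capacity int64) int64 {
    if capacity == 0 || requested > capacity {
        return 0
    }
    return (requested * 10) / capacity
}

func main() {
    // 3000 of 4000 millicores requested scores 7, ranking the node high.
    fmt.Println(usedScore(3000, 4000)) // 7
}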
@@ -39,9 +39,10 @@ func getNonZeroRequests(pod *api.Pod) *schedulercache.Resource {
 	return result
 }
 
-// the unused capacity is calculated on a scale of 0-10
-// 0 being the lowest priority and 10 being the highest
-func calculateScore(requested int64, capacity int64, node string) int64 {
+// The unused capacity is calculated on a scale of 0-10
+// 0 being the lowest priority and 10 being the highest.
+// The more unused resources the higher the score is.
+func calculateUnusedScore(requested int64, capacity int64, node string) int64 {
 	if capacity == 0 {
 		return 0
 	}
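Worked through with values from the tests later in this diff: on a 4000-millicore node with 3000m requested, the unused-capacity return value (shown at the top of the next hunk) is ((4000 - 3000) * 10) / 4000 = 10000 / 4000 = 2 in int64 arithmetic. The exact value 2.5 is truncated, which is the rounding behavior the next hunk's comment refers to.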
@@ -53,17 +54,37 @@ func calculateScore(requested int64, capacity int64, node string) int64 {
 	return ((capacity - requested) * 10) / capacity
 }
 
-// Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
+// The used capacity is calculated on a scale of 0-10
+// 0 being the lowest priority and 10 being the highest.
+// The more resources are used the higher the score is. This function
+// is almost a reversed version of calculateUnusedScore (10 - calculateUnusedScore).
+// The main difference is in rounding. It was added to keep the
+// final formula clean and not to modify the widely used (by users
+// in their default scheduling policies) calculateUnusedScore.
+func calculateUsedScore(requested int64, capacity int64, node string) int64 {
+	if capacity == 0 {
+		return 0
+	}
+	if requested > capacity {
+		glog.V(2).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
+			requested, capacity, node)
+		return 0
+	}
+	return (requested * 10) / capacity
+}
+
+// Calculates host priority based on the amount of unused resources.
+// 'node' has information about the resources on the node.
 // 'pods' is a list of pods currently scheduled on the node.
 // TODO: Use Node() from nodeInfo instead of passing it.
-func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
+func calculateUnusedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
 	allocatableResources := nodeInfo.AllocatableResource()
 	totalResources := *podRequests
 	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
 	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
 
-	cpuScore := calculateScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-	memoryScore := calculateScore(totalResources.Memory, allocatableResources.Memory, node.Name)
+	cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
+	memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
 	if glog.V(10) {
 		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
 		// not logged. There is visible performance gain from it.
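The rounding remark in the comment above is easy to verify: both formulas truncate toward zero, so calculateUsedScore and 10 - calculateUnusedScore can disagree by a point. A runnable sketch with the same arithmetic (helper names are ours, not the diff's):

package main

import "fmt"

func unused(requested, capacity int64) int64 { return ((capacity - requested) * 10) / capacity }
func used(requested, capacity int64) int64   { return (requested * 10) / capacity }

func main() {
    // 3000m requested of 4000m capacity: exact scores are 7.5 used, 2.5 unused.
    fmt.Println(used(3000, 4000))        // 7: 7.5 truncated down
    fmt.Println(10 - unused(3000, 4000)) // 8: 2.5 truncates to 2 before the subtraction
}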
@@ -82,6 +103,35 @@ func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resour
 	}
 }
 
+// Calculate the resource used on a node. 'node' has information about the resources on the node.
+// 'pods' is a list of pods currently scheduled on the node.
+// TODO: Use Node() from nodeInfo instead of passing it.
+func calculateUsedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority {
+	allocatableResources := nodeInfo.AllocatableResource()
+	totalResources := *podRequests
+	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
+	totalResources.Memory += nodeInfo.NonZeroRequest().Memory
+
+	cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
+	memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
+	if glog.V(10) {
+		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+		// not logged. There is visible performance gain from it.
+		glog.V(10).Infof(
+			"%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
+			pod.Name, node.Name,
+			allocatableResources.MilliCPU, allocatableResources.Memory,
+			totalResources.MilliCPU, totalResources.Memory,
+			cpuScore, memoryScore,
+		)
+	}
+
+	return schedulerapi.HostPriority{
+		Host:  node.Name,
+		Score: int((cpuScore + memoryScore) / 2),
+	}
+}
+
 // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
 // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
 // based on the minimum of the average of the fraction of requested to capacity.
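Note that calculateUsedPriority, like its unused counterpart, scores the node as it would look with the pod placed: totalResources starts from the pod's own non-zero requests and adds the node's accumulated NonZeroRequest. The host score is then the truncated mean of the two resource scores; for example, cpuScore 7 and memoryScore 5 give int((7 + 5) / 2) = 6, the {"machine1", 6} entry in the tests below.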
@@ -90,7 +140,20 @@ func LeastRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulerca
 	podResources := getNonZeroRequests(pod)
 	list := make(schedulerapi.HostPriorityList, 0, len(nodes))
 	for _, node := range nodes {
-		list = append(list, calculateResourceOccupancy(pod, podResources, node, nodeNameToInfo[node.Name]))
+		list = append(list, calculateUnusedPriority(pod, podResources, node, nodeNameToInfo[node.Name]))
+	}
+	return list, nil
+}
+
+// MostRequestedPriority is a priority function that favors nodes with most requested resources.
+// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
+// based on the maximum of the average of the fraction of requested to capacity.
+// Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
+func MostRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) {
+	podResources := getNonZeroRequests(pod)
+	list := make(schedulerapi.HostPriorityList, 0, len(nodes))
+	for _, node := range nodes {
+		list = append(list, calculateUsedPriority(pod, podResources, node, nodeNameToInfo[node.Name]))
 	}
 	return list, nil
 }
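Reading the Details formula against the second test case below: machine2 offers 6000 millicores and 10000 memory bytes, and the cpuAndMemory pod requests 3000m and 5000 bytes, so cpu(10 * 3000 / 6000) = 5 and memory(10 * 5000 / 10000) = 5, giving (5 + 5) / 2 = 5, the expected {"machine2", 5}.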
@@ -411,6 +411,161 @@ func TestLeastRequested(t *testing.T) {
 	}
 }
 
+func TestMostRequested(t *testing.T) {
+	labels1 := map[string]string{
+		"foo": "bar",
+		"baz": "blah",
+	}
+	labels2 := map[string]string{
+		"bar": "foo",
+		"baz": "blah",
+	}
+	noResources := api.PodSpec{
+		Containers: []api.Container{},
+	}
+	cpuOnly := api.PodSpec{
+		NodeName: "machine1",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("1000m"),
+						"memory": resource.MustParse("0"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("2000m"),
+						"memory": resource.MustParse("0"),
+					},
+				},
+			},
+		},
+	}
+	cpuOnly2 := cpuOnly
+	cpuOnly2.NodeName = "machine2"
+	cpuAndMemory := api.PodSpec{
+		NodeName: "machine2",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("1000m"),
+						"memory": resource.MustParse("2000"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Requests: api.ResourceList{
+						"cpu":    resource.MustParse("2000m"),
+						"memory": resource.MustParse("3000"),
+					},
+				},
+			},
+		},
+	}
+	tests := []struct {
+		pod          *api.Pod
+		pods         []*api.Pod
+		nodes        []*api.Node
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			/*
+				Node1 scores (used resources) on 0-10 scale
+				CPU Score: (0 * 10) / 4000 = 0
+				Memory Score: (0 * 10) / 10000 = 0
+				Node1 Score: (0 + 0) / 2 = 0
+
+				Node2 scores (used resources) on 0-10 scale
+				CPU Score: (0 * 10) / 4000 = 0
+				Memory Score: (0 * 10) / 10000 = 0
+				Node2 Score: (0 + 0) / 2 = 0
+			*/
+			pod:          &api.Pod{Spec: noResources},
+			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 4000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 0}, {"machine2", 0}},
+			test:         "nothing scheduled, nothing requested",
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (3000 * 10) / 4000 = 7.5
+				Memory Score: (5000 * 10) / 10000 = 5
+				Node1 Score: (7.5 + 5) / 2 = 6
+
+				Node2 scores on 0-10 scale
+				CPU Score: (3000 * 10) / 6000 = 5
+				Memory Score: (5000 * 10) / 10000 = 5
+				Node2 Score: (5 + 5) / 2 = 5
+			*/
+			pod:          &api.Pod{Spec: cpuAndMemory},
+			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 6000, 10000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 6}, {"machine2", 5}},
+			test:         "nothing scheduled, resources requested, differently sized machines",
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (0 * 10) / 20000 = 0
+				Node1 Score: (6 + 0) / 2 = 3
+
+				Node2 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (5000 * 10) / 20000 = 2.5
+				Node2 Score: (6 + 2.5) / 2 = 4
+			*/
+			pod:          &api.Pod{Spec: noResources},
+			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 3}, {"machine2", 4}},
+			test:         "no resources requested, pods scheduled with resources",
+			pods: []*api.Pod{
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels2}},
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuOnly2, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuAndMemory, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+			},
+		},
+		{
+			/*
+				Node1 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (5000 * 10) / 20000 = 2.5
+				Node1 Score: (6 + 2.5) / 2 = 4
+
+				Node2 scores on 0-10 scale
+				CPU Score: (6000 * 10) / 10000 = 6
+				Memory Score: (10000 * 10) / 20000 = 5
+				Node2 Score: (6 + 5) / 2 = 5
+			*/
+			pod:          &api.Pod{Spec: cpuAndMemory},
+			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)},
+			expectedList: []schedulerapi.HostPriority{{"machine1", 4}, {"machine2", 5}},
+			test:         "resources requested, pods scheduled with resources",
+			pods: []*api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, test.nodes)
+		list, err := MostRequestedPriority(test.pod, nodeNameToInfo, test.nodes)
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
+
 func TestNewNodeLabelPriority(t *testing.T) {
 	label1 := map[string]string{"foo": "bar"}
 	label2 := map[string]string{"bar": "foo"}
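Behind the third case's totals: CreateNodeNameToInfoMap groups the pods by NodeName, so machine1 carries cpuOnly twice (2 x 3000m = 6000m, no memory) while machine2 carries cpuOnly2 plus cpuAndMemory (6000m and 5000 bytes). That asymmetry is why two identically sized machines score 3 and 4.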
@@ -93,6 +93,8 @@ func init() {
 	factory.RegisterFitPredicate("HostName", predicates.PodFitsHost)
 	// Fit is determined by node selector query.
 	factory.RegisterFitPredicate("MatchNodeSelector", predicates.PodSelectorMatches)
+	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
+	factory.RegisterPriorityFunction("MostRequestedPriority", priorities.MostRequestedPriority, 1)
 }
 
 func defaultPredicates() sets.String {
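Registering the function here makes MostRequestedPriority selectable by name in a user's scheduler policy while defaultPriorities stays unchanged, so default scheduling behavior is unaffected. When it is selected, the scheduler folds each priority function's 0-10 node score into one ranking by a weight-multiplied sum; a simplified restatement of that combination, under our own types rather than the scheduler's actual internals:

package main

import "fmt"

// result is one priority function's 0-10 score for a node, paired with the
// weight that function was registered or configured with.
type result struct {
    score, weight int
}

// combine sums weight-multiplied scores, in the spirit of how the generic
// scheduler merges multiple priority functions into a final node ranking.
func combine(results []result) int {
    total := 0
    for _, r := range results {
        total += r.score * r.weight
    }
    return total
}

func main() {
    // MostRequestedPriority scoring 6 at weight 1, plus another priority
    // scoring 4 at weight 1, ranks the node at 10 overall.
    fmt.Println(combine([]result{{6, 1}, {4, 1}}))
}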