Merge pull request #55442 from anfernee/priority_resource
Automatic merge from submit-queue (batch tested with PRs 57257, 55442). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Merge 3 resource allocation priority functions

**What this PR does / why we need it**: These 3 priority functions are closely related and share a lot of the same logic, so this PR merges them into one place.

**Release note**:
```release-note
None
```
This commit is contained in commit 754bb1350f.
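The core of the change is a small strategy pattern: each priority keeps its own scoring formula, but the per-node bookkeeping moves into one shared map function. A minimal, self-contained sketch of the idea, with simplified types standing in for the scheduler's real API and values chosen purely for illustration:

```go
package main

import "fmt"

// Resource is a simplified stand-in for schedulercache.Resource.
type Resource struct{ MilliCPU, Memory int64 }

// resourceAllocationPriority mirrors the type this PR introduces: the shared
// per-node bookkeeping lives in one method, and only the scorer differs.
type resourceAllocationPriority struct {
	name   string
	scorer func(requested, allocable *Resource) int64
}

// score stands in for PriorityMap: everything before the scorer call
// (node lookup, request aggregation, logging) would be shared code.
func (p *resourceAllocationPriority) score(requested, allocable *Resource) int64 {
	return p.scorer(requested, allocable)
}

func main() {
	requested := &Resource{MilliCPU: 1000, Memory: 2 << 30}
	allocatable := &Resource{MilliCPU: 4000, Memory: 8 << 30}

	// Two policies, one mechanism; only the injected closure differs.
	// (Both assume allocatable CPU is non-zero, as in this toy setup.)
	emptiest := &resourceAllocationPriority{"PreferEmpty", func(r, a *Resource) int64 {
		return (a.MilliCPU - r.MilliCPU) * 10 / a.MilliCPU
	}}
	fullest := &resourceAllocationPriority{"PreferFull", func(r, a *Resource) int64 {
		return r.MilliCPU * 10 / a.MilliCPU
	}}
	fmt.Println(emptiest.score(requested, allocatable)) // 7
	fmt.Println(fullest.score(requested, allocatable))  // 2
}
```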
plugin/pkg/scheduler/algorithm/priorities/BUILD
@@ -19,6 +19,7 @@ go_library(
         "node_label.go",
         "node_prefer_avoid_pods.go",
         "reduce.go",
+        "resource_allocation.go",
         "resource_limits.go",
         "selector_spreading.go",
         "taint_toleration.go",
plugin/pkg/scheduler/algorithm/priorities/balanced_resource_allocation.go
@@ -17,76 +17,40 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
     "math"
 
-    "k8s.io/api/core/v1"
-    priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
-const (
-    mb         int64 = 1024 * 1024
-    minImgSize int64 = 23 * mb
-    maxImgSize int64 = 1000 * mb
-)
-
-// Also used in most/least_requested and metadata.
-// TODO: despaghettify it
-func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
-    result := &schedulercache.Resource{}
-    for i := range pod.Spec.Containers {
-        container := &pod.Spec.Containers[i]
-        cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
-        result.MilliCPU += cpu
-        result.Memory += memory
-    }
-    return result
-}
-
-func calculateBalancedResourceAllocation(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuFraction := fractionOfCapacity(totalResources.MilliCPU, allocatableResources.MilliCPU)
-    memoryFraction := fractionOfCapacity(totalResources.Memory, allocatableResources.Memory)
-    score := int(0)
-    if cpuFraction >= 1 || memoryFraction >= 1 {
-        // if requested >= capacity, the corresponding host should never be preferred.
-        score = 0
-    } else {
-        // Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
-        // respectively. Multiplying the absolute value of the difference by 10 scales the value to
-        // 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
-        // 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
-        diff := math.Abs(cpuFraction - memoryFraction)
-        score = int((1 - diff) * float64(schedulerapi.MaxPriority))
-    }
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Balanced Resource Allocation, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            score,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: score,
-    }, nil
-}
+var (
+    balanceResourcePriority = &ResourceAllocationPriority{"BalanceResourceAllocation", balancedResourceScorer}
+
+    // BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
+    // BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together
+    // with LeastRequestedPriority. It calculates the difference between the cpu and memory fraction
+    // of capacity, and prioritizes the host based on how close the two metrics are to each other.
+    // Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
+    // "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced
+    // Resource Utilization"
+    BalancedResourceAllocationMap = balanceResourcePriority.PriorityMap
+)
+
+func balancedResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    cpuFraction := fractionOfCapacity(requested.MilliCPU, allocable.MilliCPU)
+    memoryFraction := fractionOfCapacity(requested.Memory, allocable.Memory)
+    if cpuFraction >= 1 || memoryFraction >= 1 {
+        // if requested >= capacity, the corresponding host should never be preferred.
+        return 0
+    }
+
+    // Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
+    // respectively. Multiplying the absolute value of the difference by 10 scales the value to
+    // 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
+    // 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
+    diff := math.Abs(cpuFraction - memoryFraction)
+    return int64((1 - diff) * float64(schedulerapi.MaxPriority))
+}
 
 func fractionOfCapacity(requested, capacity int64) float64 {
@@ -95,20 +59,3 @@ func fractionOfCapacity(requested, capacity int64) float64 {
     }
     return float64(requested) / float64(capacity)
 }
-
-// BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
-// BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority.
-// It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how
-// close the two metrics are to each other.
-// Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
-// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
-func BalancedResourceAllocationMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateBalancedResourceAllocation(pod, nonZeroRequest, nodeInfo)
-}
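For a concrete reading of the `score = 10 - abs(cpuFraction-memoryFraction)*10` comment, here is a standalone transcription of the scorer's arithmetic. This is a sketch: `balancedScore` is not part of the diff, and `schedulerapi.MaxPriority` is written as its literal value 10.

```go
package main

import (
	"fmt"
	"math"
)

// balancedScore transcribes balancedResourceScorer's arithmetic with
// schedulerapi.MaxPriority fixed at 10.
func balancedScore(cpuFraction, memoryFraction float64) int64 {
	if cpuFraction >= 1 || memoryFraction >= 1 {
		return 0 // requested >= capacity: never prefer this node
	}
	diff := math.Abs(cpuFraction - memoryFraction)
	return int64((1 - diff) * 10)
}

func main() {
	fmt.Println(balancedScore(0.5, 0.7))   // 8: cpu and memory usage close together
	fmt.Println(balancedScore(0.25, 0.75)) // 5: noticeably skewed
	fmt.Println(balancedScore(1.2, 0.3))   // 0: CPU over capacity
}
```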
plugin/pkg/scheduler/algorithm/priorities/image_locality.go
@@ -24,6 +24,13 @@ import (
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
+// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
+const (
+    mb         int64 = 1024 * 1024
+    minImgSize int64 = 23 * mb
+    maxImgSize int64 = 1000 * mb
+)
+
 // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images.
 // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10
 // based on the total size of those images.
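The constants moved here only bound the range; the scoring step in this file (not shown in this hunk) maps a node's total size of already-present images onto 0 to 10. A hedged sketch of that clamp-and-scale step, where `normalizeImgScore` is an illustrative name and not necessarily the file's actual helper:

```go
package main

import "fmt"

const (
	mb         int64 = 1024 * 1024
	minImgSize int64 = 23 * mb
	maxImgSize int64 = 1000 * mb
)

// normalizeImgScore clamps a summed image size into [minImgSize, maxImgSize]
// so outliers do not dominate, then maps it linearly onto 0..10.
func normalizeImgScore(sumSize int64) int64 {
	if sumSize < minImgSize {
		sumSize = minImgSize
	}
	if sumSize > maxImgSize {
		sumSize = maxImgSize
	}
	return 10 * (sumSize - minImgSize) / (maxImgSize - minImgSize)
}

func main() {
	fmt.Println(normalizeImgScore(500 * mb)) // 4: mid-sized images already cached
	fmt.Println(normalizeImgScore(0))        // 0: nothing cached locally
}
```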
plugin/pkg/scheduler/algorithm/priorities/least_requested.go
@@ -17,73 +17,37 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
-
-    "k8s.io/api/core/v1"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
-// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
-// based on the minimum of the average of the fraction of requested to capacity.
-// Details: cpu((capacity - sum(requested)) * 10 / capacity) + memory((capacity - sum(requested)) * 10 / capacity) / 2
-func LeastRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateUnusedPriority(pod, nonZeroRequest, nodeInfo)
-}
+var (
+    leastResourcePriority = &ResourceAllocationPriority{"LeastResourceAllocation", leastResourceScorer}
+
+    // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources.
+    // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and
+    // prioritizes based on the minimum of the average of the fraction of requested to capacity.
+    //
+    // Details:
+    // cpu((capacity-sum(requested))*10/capacity) + memory((capacity-sum(requested))*10/capacity)/2
+    LeastRequestedPriorityMap = leastResourcePriority.PriorityMap
+)
+
+func leastResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    return (leastRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
+        leastRequestedScore(requested.Memory, allocable.Memory)) / 2
+}
 
 // The unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest.
 // The more unused resources the higher the score is.
-func calculateUnusedScore(requested int64, capacity int64, node string) int64 {
+func leastRequestedScore(requested, capacity int64) int64 {
     if capacity == 0 {
         return 0
     }
     if requested > capacity {
-        glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
-            requested, capacity, node)
         return 0
     }
 
     return ((capacity - requested) * int64(schedulerapi.MaxPriority)) / capacity
 }
-
-// Calculates host priority based on the amount of unused resources.
-// 'node' has information about the resources on the node.
-// 'pods' is a list of pods currently scheduled on the node.
-func calculateUnusedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-    memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Least Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            cpuScore, memoryScore,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: int((cpuScore + memoryScore) / 2),
-    }, nil
-}
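The `Details:` formula reads more easily with numbers plugged in. A standalone transcription of `leastRequestedScore` (with `schedulerapi.MaxPriority` written as 10), averaged over CPU and memory the way `leastResourceScorer` does:

```go
package main

import "fmt"

// leastRequestedScore transcribed from the diff above, with MaxPriority = 10.
func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (capacity - requested) * 10 / capacity
}

func main() {
	cpu := leastRequestedScore(1000, 4000)   // (4000-1000)*10/4000 = 7 (truncated from 7.5)
	mem := leastRequestedScore(2<<30, 8<<30) // 2GiB requested of 8GiB -> 7
	fmt.Println((cpu + mem) / 2)             // 7: the value leastResourceScorer averages to
}
```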
plugin/pkg/scheduler/algorithm/priorities/most_requested.go
@@ -17,28 +17,23 @@ limitations under the License.
 package priorities
 
 import (
-    "fmt"
-
-    "k8s.io/api/core/v1"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
 )
 
-// MostRequestedPriority is a priority function that favors nodes with most requested resources.
-// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
-// based on the maximum of the average of the fraction of requested to capacity.
-// Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
-func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    var nonZeroRequest *schedulercache.Resource
-    if priorityMeta, ok := meta.(*priorityMetadata); ok {
-        nonZeroRequest = priorityMeta.nonZeroRequest
-    } else {
-        // We couldn't parse metadata - fallback to computing it.
-        nonZeroRequest = getNonZeroRequests(pod)
-    }
-    return calculateUsedPriority(pod, nonZeroRequest, nodeInfo)
-}
+var (
+    mostResourcePriority = &ResourceAllocationPriority{"MostResourceAllocation", mostResourceScorer}
+
+    // MostRequestedPriority is a priority function that favors nodes with most requested resources.
+    // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
+    // based on the maximum of the average of the fraction of requested to capacity.
+    // Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2
+    MostRequestedPriorityMap = mostResourcePriority.PriorityMap
+)
+
+func mostResourceScorer(requested, allocable *schedulercache.Resource) int64 {
+    return (mostRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
+        mostRequestedScore(requested.Memory, allocable.Memory)) / 2
+}
 
 // The used capacity is calculated on a scale of 0-10
@@ -48,45 +43,13 @@ func MostRequestedPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *scheduler
 // (10 - calculateUnusedScore). The main difference is in rounding. It was added to
 // keep the final formula clean and not to modify the widely used (by users
 // in their default scheduling policies) calculateUsedScore.
-func calculateUsedScore(requested int64, capacity int64, node string) int64 {
+func mostRequestedScore(requested, capacity int64) int64 {
     if capacity == 0 {
         return 0
     }
     if requested > capacity {
-        glog.V(10).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
-            requested, capacity, node)
         return 0
     }
 
     return (requested * schedulerapi.MaxPriority) / capacity
 }
-
-// Calculate the resource used on a node. 'node' has information about the resources on the node.
-// 'pods' is a list of pods currently scheduled on the node.
-func calculateUsedPriority(pod *v1.Pod, podRequests *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
-    node := nodeInfo.Node()
-    if node == nil {
-        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
-    }
-
-    allocatableResources := nodeInfo.AllocatableResource()
-    totalResources := *podRequests
-    totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
-    totalResources.Memory += nodeInfo.NonZeroRequest().Memory
-
-    cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name)
-    memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name)
-    if glog.V(10) {
-        glog.Infof(
-            "%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory",
-            pod.Name, node.Name,
-            allocatableResources.MilliCPU, allocatableResources.Memory,
-            totalResources.MilliCPU, totalResources.Memory,
-            cpuScore, memoryScore,
-        )
-    }
-
-    return schedulerapi.HostPriority{
-        Host:  node.Name,
-        Score: int((cpuScore + memoryScore) / 2),
-    }, nil
-}
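The comment above notes that this scorer is *almost* `10 - calculateUnusedScore`, differing only in rounding. A quick check with the same 1000-of-4000 millicores used in the least-requested example shows the difference: truncating integer division gives 2 here, not 10 - 7 = 3. A standalone transcription (again writing `schedulerapi.MaxPriority` as 10):

```go
package main

import "fmt"

// mostRequestedScore transcribed from the diff above, with MaxPriority = 10.
func mostRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return requested * 10 / capacity
}

func main() {
	fmt.Println(mostRequestedScore(1000, 4000)) // 2 (truncated from 2.5), not 10-7=3
	fmt.Println(mostRequestedScore(4000, 4000)) // 10: a fully packed node scores highest
}
```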
plugin/pkg/scheduler/algorithm/priorities/resource_allocation.go (new file)
@@ -0,0 +1,82 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package priorities
+
+import (
+    "fmt"
+
+    "github.com/golang/glog"
+    "k8s.io/api/core/v1"
+    priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
+    schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+    "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
+)
+
+type ResourceAllocationPriority struct {
+    Name   string
+    scorer func(requested, allocable *schedulercache.Resource) int64
+}
+
+func (r *ResourceAllocationPriority) PriorityMap(
+    pod *v1.Pod,
+    meta interface{},
+    nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
+    node := nodeInfo.Node()
+    if node == nil {
+        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
+    }
+    allocatable := nodeInfo.AllocatableResource()
+
+    var requested schedulercache.Resource
+    if priorityMeta, ok := meta.(*priorityMetadata); ok {
+        requested = *priorityMeta.nonZeroRequest
+    } else {
+        // We couldn't parse metadata - fallback to computing it.
+        requested = *getNonZeroRequests(pod)
+    }
+
+    requested.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU
+    requested.Memory += nodeInfo.NonZeroRequest().Memory
+
+    score := r.scorer(&requested, &allocatable)
+
+    if glog.V(10) {
+        glog.Infof(
+            "%v -> %v: %v, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d",
+            pod.Name, node.Name, r.Name,
+            allocatable.MilliCPU, allocatable.Memory,
+            requested.MilliCPU+allocatable.MilliCPU, requested.Memory+allocatable.Memory,
+            score,
+        )
+    }
+
+    return schedulerapi.HostPriority{
+        Host:  node.Name,
+        Score: int(score),
+    }, nil
+}
+
+func getNonZeroRequests(pod *v1.Pod) *schedulercache.Resource {
+    result := &schedulercache.Resource{}
+    for i := range pod.Spec.Containers {
+        container := &pod.Spec.Containers[i]
+        cpu, memory := priorityutil.GetNonzeroRequests(&container.Resources.Requests)
+        result.MilliCPU += cpu
+        result.Memory += memory
+    }
+    return result
+}
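Beyond the scorer injection, `PriorityMap` also keeps the metadata fast path the three functions previously duplicated: type-assert the opaque `meta` argument, and recompute only when the assertion fails. The pattern in isolation, with toy types, where `expensiveCompute` is a stand-in for `getNonZeroRequests`:

```go
package main

import "fmt"

// priorityMetadata is a toy stand-in for the scheduler's precomputed per-pod cache.
type priorityMetadata struct{ nonZeroRequest int64 }

// expensiveCompute stands in for recomputing the pod's non-zero requests.
func expensiveCompute() int64 { return 42 }

func requestFromMeta(meta interface{}) int64 {
	if m, ok := meta.(*priorityMetadata); ok {
		return m.nonZeroRequest // fast path: computed once per pod
	}
	return expensiveCompute() // fallback: recompute for this call
}

func main() {
	fmt.Println(requestFromMeta(&priorityMetadata{nonZeroRequest: 7})) // 7
	fmt.Println(requestFromMeta(nil))                                  // 42
}
```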