Rework image locality with spread-based scoring
@@ -26,11 +26,12 @@ import (
 	"k8s.io/kubernetes/pkg/util/parsers"
 )
 
-// This is a reasonable size range of all container images. 90%ile of images on dockerhub drops into this range.
+// The two thresholds are used as bounds for the image score range. They correspond to a reasonable size range for
+// container images compressed and stored in registries; 90%ile of images on dockerhub drops into this range.
 const (
-	mb         int64 = 1024 * 1024
-	minImgSize int64 = 23 * mb
-	maxImgSize int64 = 1000 * mb
+	mb           int64 = 1024 * 1024
+	minThreshold int64 = 23 * mb
+	maxThreshold int64 = 1000 * mb
 )
 
 // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images.
@@ -44,44 +45,55 @@ func ImageLocalityPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *scheduler
 		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
 	}
 
-	sumSize := totalImageSize(nodeInfo, pod.Spec.Containers)
+	var score int
+	if priorityMeta, ok := meta.(*priorityMetadata); ok {
+		score = calculatePriority(sumImageScores(nodeInfo, pod.Spec.Containers, priorityMeta.totalNumNodes))
+	} else {
+		// if we are not able to parse priority meta data, skip this priority
+		score = 0
+	}
 
 	return schedulerapi.HostPriority{
 		Host:  node.Name,
-		Score: calculateScoreFromSize(sumSize),
+		Score: score,
 	}, nil
 }
 
-// calculateScoreFromSize calculates the priority of a node. sumSize is sum size of requested images on this node.
-// 1. Split image size range into 10 buckets.
-// 2. Decide the priority of a given sumSize based on which bucket it belongs to.
-func calculateScoreFromSize(sumSize int64) int {
-	switch {
-	case sumSize == 0 || sumSize < minImgSize:
-		// 0 means none of the images required by this pod are present on this
-		// node or the total size of the images present are too small to be taken into further consideration.
-		return 0
-
-	case sumSize >= maxImgSize:
-		// If existing images' total size is larger than max, just make it highest priority.
-		return schedulerapi.MaxPriority
+// calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's
+// priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores.
+func calculatePriority(sumScores int64) int {
+	if sumScores < minThreshold {
+		sumScores = minThreshold
+	} else if sumScores > maxThreshold {
+		sumScores = maxThreshold
 	}
 
-	return int((int64(schedulerapi.MaxPriority) * (sumSize - minImgSize) / (maxImgSize - minImgSize)) + 1)
+	return int(int64(schedulerapi.MaxPriority) * (sumScores - minThreshold) / (maxThreshold - minThreshold))
 }
 
-// totalImageSize returns the total image size of all the containers that are already on the node.
-func totalImageSize(nodeInfo *schedulercache.NodeInfo, containers []v1.Container) int64 {
-	var total int64
+// sumImageScores returns the sum of image scores of all the containers that are already on the node.
+// Each image receives a raw score of its size, scaled by scaledImageScore. The raw scores are later used to calculate
+// the final score. Note that init containers are not considered, because it's rare for users to deploy huge init containers.
+func sumImageScores(nodeInfo *schedulercache.NodeInfo, containers []v1.Container, totalNumNodes int) int64 {
+	var sum int64
+	imageStates := nodeInfo.ImageStates()
 
-	imageSizes := nodeInfo.ImageSizes()
 	for _, container := range containers {
-		if size, ok := imageSizes[normalizedImageName(container.Image)]; ok {
-			total += size
+		if state, ok := imageStates[normalizedImageName(container.Image)]; ok {
+			sum += scaledImageScore(state, totalNumNodes)
 		}
 	}
 
-	return total
+	return sum
 }
 
+// scaledImageScore returns an adaptively scaled score for the given state of an image.
+// The size of the image is used as the base score, scaled by a factor which considers how many nodes the image has "spread" to.
+// This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or
+// a few nodes due to image locality.
+func scaledImageScore(imageState *schedulercache.ImageStateSummary, totalNumNodes int) int64 {
+	spread := float64(imageState.NumNodes) / float64(totalNumNodes)
+	return int64(float64(imageState.Size) * spread)
+}
+
 // normalizedImageName returns the CRI compliant name for a given image.
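
For readers following the math, here is a minimal, self-contained Go sketch of the new scaling step. It is not part of the commit: the scheduler constants are replaced with local stand-ins, and maxPriority assumes the value of schedulerapi.MaxPriority (10) at the time of this change.

// imagelocality_scaling_sketch.go illustrates calculatePriority from the diff
// above with local constants instead of the scheduler packages.
package main

import "fmt"

const (
    maxPriority  int64 = 10 // assumed value of schedulerapi.MaxPriority
    mb           int64 = 1024 * 1024
    minThreshold int64 = 23 * mb
    maxThreshold int64 = 1000 * mb
)

// calculatePriority clamps sumScores to [minThreshold, maxThreshold] and then
// maps it linearly onto [0, maxPriority], matching the new return expression.
func calculatePriority(sumScores int64) int {
    if sumScores < minThreshold {
        sumScores = minThreshold
    } else if sumScores > maxThreshold {
        sumScores = maxThreshold
    }
    return int(maxPriority * (sumScores - minThreshold) / (maxThreshold - minThreshold))
}

func main() {
    fmt.Println(calculatePriority(10 * mb))   // 0: below minThreshold, clamped down
    fmt.Println(calculatePriority(500 * mb))  // 4: 10*(500-23)/(1000-23) with integer division
    fmt.Println(calculatePriority(2000 * mb)) // 10: above maxThreshold, clamped up
}

Compared with the removed calculateScoreFromSize, the +1 offset is gone: a summed image score at or below minThreshold now maps to 0 rather than 1, and anything above maxThreshold is simply clamped to the maximum priority.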
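And a similar sketch of the per-image spread factor. The struct here is a stand-in for schedulercache.ImageStateSummary, reduced to the two fields the new code reads (Size and NumNodes).

// imagelocality_spread_sketch.go illustrates scaledImageScore from the diff
// above with a trimmed-down struct in place of ImageStateSummary.
package main

import "fmt"

const mb int64 = 1024 * 1024

// imageStateSummary mirrors the two fields the new scoring uses: the image's
// size and the number of cluster nodes that already have the image.
type imageStateSummary struct {
    Size     int64
    NumNodes int
}

// scaledImageScore multiplies the image size by the fraction of cluster nodes
// that already hold the image, as in the added function.
func scaledImageScore(state imageStateSummary, totalNumNodes int) int64 {
    spread := float64(state.NumNodes) / float64(totalNumNodes)
    return int64(float64(state.Size) * spread)
}

func main() {
    // The same 500MB image contributes far less to a node's sum when it lives on
    // only 2 of 10 nodes than when it lives on 8 of 10. Dampening the score of a
    // rarely-spread image is what keeps pods from piling onto the few nodes that
    // happen to have it (the "node heating problem" named in the comments).
    rare := imageStateSummary{Size: 500 * mb, NumNodes: 2}
    common := imageStateSummary{Size: 500 * mb, NumNodes: 8}
    fmt.Println(scaledImageScore(rare, 10) / mb)   // 100
    fmt.Println(scaledImageScore(common, 10) / mb) // 400
}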