Zone-scheduler: Fixes per code-review
parent 6aa16c744b, commit cd433c974f
@@ -25,6 +25,14 @@ import (
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
 
+// The maximum priority value to give to a node
+// Priority values range from 0-maxPriority
+const maxPriority = 10
+
+// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+// TODO: Any way to justify this weighting?
+const zoneWeighting = 2.0 / 3.0
+
 type SelectorSpread struct {
 	serviceLister    algorithm.ServiceLister
 	controllerLister algorithm.ControllerLister
@@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string {
 		return ""
 	}
 
-	return region + ":" + failureDomain
+	// We include the null character just in case region or failureDomain has a colon
+	// (We do assume there's no null characters in a region or failureDomain)
+	// As a nice side-benefit, the null character is not printed by fmt.Print or glog
+	return region + ":\x00:" + failureDomain
 }
 
-// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
-// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
-// pods which match the same selectors of Services and RCs as current pod.
+// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
+// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
+// It favors nodes that have fewer existing matching pods.
+// i.e. it pushes the scheduler towards a node where there's the smallest number of
+// pods which match the same service selectors or RC selectors as the pod being scheduled.
+// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	var nsPods []*api.Pod
 
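Note: the "\x00" separator above exists to keep zone keys unambiguous when a region or failureDomain itself contains a colon. A minimal standalone sketch of the collision it prevents (the region/failureDomain values here are made up for illustration and are not from the patch):

package main

import "fmt"

func main() {
	// With a plain ":" join, two distinct (region, failureDomain) pairs can collide.
	plain := func(region, failureDomain string) string { return region + ":" + failureDomain }
	fmt.Println(plain("us-east:1", "a") == plain("us-east", "1:a")) // true: both become "us-east:1:a"

	// With the null-byte separator used by the patched getZoneKey, they stay distinct.
	safe := func(region, failureDomain string) string { return region + ":\x00:" + failureDomain }
	fmt.Println(safe("us-east:1", "a") == safe("us-east", "1:a")) // false
}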
@@ -94,36 +108,40 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 		return nil, err
 	}
 
-	maxCountByNodeName := 0
+	// Count similar pods by node
 	countsByNodeName := map[string]int{}
-	if len(nsPods) > 0 {
-		for _, pod := range nsPods {
-			// When we are replacing a failed pod, we often see the previous deleted version
-			// while scheduling the replacement. Ignore the previous deleted version for spreading
-			// purposes (it can still be considered for resource restrictions etc.)
-			if pod.DeletionTimestamp != nil {
-				glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
-				continue
-			}
-			matches := false
-			for _, selector := range selectors {
-				if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
-					matches = true
-					break
-				}
-			}
-			if matches {
-				countsByNodeName[pod.Spec.NodeName]++
-				// Compute the maximum number of pods hosted on any node
-				if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
-					maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
-				}
-			}
+	for _, pod := range nsPods {
+		// When we are replacing a failed pod, we often see the previous deleted version
+		// while scheduling the replacement. Ignore the previous deleted version for spreading
+		// purposes (it can still be considered for resource restrictions etc.)
+		if pod.DeletionTimestamp != nil {
+			glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+			continue
+		}
+		matches := false
+		for _, selector := range selectors {
+			if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
+				matches = true
+				break
+			}
+		}
+		if !matches {
+			continue
+		}
+
+		countsByNodeName[pod.Spec.NodeName]++
+	}
+
+	// Aggregate by-node information
+	// Compute the maximum number of pods hosted on any node
+	maxCountByNodeName := 0
+	for _, count := range countsByNodeName {
+		if count > maxCountByNodeName {
+			maxCountByNodeName = count
 		}
 	}
 
-	maxCountByZone := 0
-	haveZones := false
+	// Count similar pods by zone, if zone information is present
 	countsByZone := map[string]int{}
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
@@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 			continue
 		}
 
-		haveZones = true
 		countsByZone[zoneId] += count
-		// Compute the maximum number of pods hosted in any zone
-		if countsByZone[zoneId] > maxCountByZone {
-			maxCountByZone = countsByZone[zoneId]
-		}
 	}
 
+	// Aggregate by-zone information
+	// Compute the maximum number of pods hosted in any zone
+	haveZones := len(countsByZone) != 0
+	maxCountByZone := 0
+	for _, count := range countsByZone {
+		if count > maxCountByZone {
+			maxCountByZone = count
+		}
+	}
+
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
 		if maxCountByNodeName > 0 {
-			fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
+			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
 		}
 
 		// If there is zone information present, incorporate it
 		if haveZones {
 			zoneId := getZoneKey(node)
 			if zoneId != "" {
-				fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
 			}
-
-			// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
-			// TODO: Any way to justify this weighting?
-			fScore /= 3.0
 		}
 
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
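For readability, the per-node arithmetic in the loop above can be read as the standalone sketch below. scoreNode and its main are illustrative only (they are not part of the patch); the two constants are copied from the first hunk.

package main

import "fmt"

const maxPriority = 10
const zoneWeighting = 2.0 / 3.0

// scoreNode mirrors the per-node scoring of the patched loop: a node-spreading
// score, blended with a zone-spreading score when zone information exists.
func scoreNode(countOnNode, maxCountByNodeName, countInZone, maxCountByZone int, haveZones bool) int {
	fScore := float32(maxPriority)
	if maxCountByNodeName > 0 {
		fScore = maxPriority * (float32(maxCountByNodeName-countOnNode) / float32(maxCountByNodeName))
	}
	if haveZones && maxCountByZone > 0 {
		zoneScore := maxPriority * (float32(maxCountByZone-countInZone) / float32(maxCountByZone))
		fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
	}
	return int(fScore)
}

func main() {
	// A node with no matching pod of its own, in the zone that holds the single
	// matching pod: node score 10, zone score 0, blended 10*(1/3), truncated to 3.
	fmt.Println(scoreNode(0, 1, 1, 1, true)) // 3
}

The truncation to int at the end is why several of the test expectations below land on values like 3 and 6 rather than 3.3 and 6.7.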
@@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
 
 	numServicePods := len(nsServicePods)
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
 	for node := range labeledNodes {
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
 		if numServicePods > 0 {
-			fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
+			fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
 		}
 		result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
 	}
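The anti-affinity score stays on the same 0-maxPriority scale; with hypothetical numbers, numServicePods = 4 and one matching pod already under a node's label value gives maxPriority * (4-1)/4 = 7.5, which int() truncates to 7.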
@@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
 		"label2": "l2",
 		"baz": "blah",
 	}
-	nodeLabelsZone1 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone1",
-	}
-	nodeLabelsZone2 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone2",
-	}
-	nodeLabelsZone3 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone3",
-	}
+
+	const nodeMachine1Zone1 = "machine1.zone1"
+	const nodeMachine1Zone2 = "machine1.zone2"
+	const nodeMachine2Zone2 = "machine2.zone2"
+	const nodeMachine1Zone3 = "machine1.zone3"
+	const nodeMachine2Zone3 = "machine2.zone3"
+	const nodeMachine3Zone3 = "machine3.zone3"
+
+	buildNodeLabels := func(failureDomain string) map[string]string {
+		labels := map[string]string{
+			wellknownlabels.LabelZoneFailureDomain: failureDomain,
+		}
+		return labels
+	}
 	labeledNodes := map[string]map[string]string{
-		"machine1.zone1": nodeLabelsZone1,
-		"machine1.zone2": nodeLabelsZone2,
-		"machine2.zone2": nodeLabelsZone2,
-		"machine1.zone3": nodeLabelsZone3,
-		"machine2.zone3": nodeLabelsZone3,
-		"machine3.zone3": nodeLabelsZone3,
+		nodeMachine1Zone1: buildNodeLabels("zone1"),
+		nodeMachine1Zone2: buildNodeLabels("zone2"),
+		nodeMachine2Zone2: buildNodeLabels("zone2"),
+		nodeMachine1Zone3: buildNodeLabels("zone3"),
+		nodeMachine2Zone3: buildNodeLabels("zone3"),
+		nodeMachine3Zone3: buildNodeLabels("zone3"),
 	}
 
 	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
@@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
 		{
 			pod: new(api.Pod),
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "nothing scheduled",
 		},
 		{
 			pod: buildPod("", labels1),
-			pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+			pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "no services",
 		},
 		{
 			pod: buildPod("", labels1),
-			pods: []*api.Pod{buildPod("machine1.zone1", labels2)},
+			pods: []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "different services",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels2),
-				buildPod("machine1.zone2", labels1),
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 0}, // Already have pod on machine
-				{"machine2.zone2", 3}, // Already have pod in zone
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0}, // Already have pod on machine
+				{nodeMachine2Zone2, 3}, // Already have pod in zone
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "two pods, 1 matching (in z2)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels2),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine2.zone2", labels1),
-				buildPod("machine1.zone3", labels2),
-				buildPod("machine2.zone3", labels1),
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels2),
+				buildPod(nodeMachine2Zone3, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 0}, // Pod on node
-				{"machine1.zone3", 6}, // Pod in zone
-				{"machine2.zone3", 3}, // Pod on node
-				{"machine3.zone3", 6}, // Pod in zone
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 0}, // Pod on node
+				{nodeMachine1Zone3, 6}, // Pod in zone
+				{nodeMachine2Zone3, 3}, // Pod on node
+				{nodeMachine3Zone3, 6}, // Pod in zone
 			},
 			test: "five pods, 3 matching (z2=2, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine2.zone2", labels2),
-				buildPod("machine1.zone3", labels1),
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
+				buildPod(nodeMachine1Zone3, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 0}, // Pod on node
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 3}, // Pod in zone
-				{"machine1.zone3", 0}, // Pod on node
-				{"machine2.zone3", 3}, // Pod in zone
-				{"machine3.zone3", 3}, // Pod in zone
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
 			},
 			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine1.zone3", labels1),
-				buildPod("machine2.zone2", labels2),
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 0}, // Pod on node
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 3}, // Pod in zone
-				{"machine1.zone3", 0}, // Pod on node
-				{"machine2.zone3", 3}, // Pod in zone
-				{"machine3.zone3", 3}, // Pod in zone
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
 			},
 			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone3", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine1.zone3", labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
 			},
 			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				// Note that because we put two pods on the same node (machine1.zone3),
-				// the values here are questionable for zone2, in particular for machine1.zone2.
+				// Note that because we put two pods on the same node (nodeMachine1Zone3),
+				// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
 				// However they kind of make sense; zone1 is still most-highly favored.
 				// zone3 is in general least favored, and m1.z3 particularly low priority.
 				// We would probably prefer to see a bigger gap between putting a second
 				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
 				// This is also consistent with what we have already.
-				{"machine1.zone1", 10}, // No pods in zone
-				{"machine1.zone2", 5}, // Pod on node
-				{"machine2.zone2", 6}, // Pod in zone
-				{"machine1.zone3", 0}, // Two pods on node
-				{"machine2.zone3", 3}, // Pod in zone
-				{"machine3.zone3", 3}, // Pod in zone
+				{nodeMachine1Zone1, 10}, // No pods in zone
+				{nodeMachine1Zone2, 5}, // Pod on node
+				{nodeMachine2Zone2, 6}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Two pods on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
 			},
 			test: "Replication controller spreading (z1=0, z2=1, z3=2)",
 		},
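Working the zone-weighted formula through the last case above (the one the comment calls questionable): the matching pods are two on nodeMachine1Zone3 and one on nodeMachine1Zone2, so maxCountByNodeName = 2 and maxCountByZone = 2. nodeMachine1Zone2 gets node score 10 * (2-1)/2 = 5 and zone score 5, which blend to 5; nodeMachine2Zone2 gets node score 10 and zone score 5, blending to 10/3 + 10/3 ≈ 6.7, truncated to 6; nodeMachine2Zone3 gets node score 10 and zone score 0, blending to ≈ 3.3, truncated to 3; nodeMachine1Zone1 keeps 10 because both its node and zone counts are zero.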