Zone-scheduler: Fixes per code-review

Justin Santa Barbara 2015-12-05 22:26:41 -05:00
parent 6aa16c744b
commit cd433c974f
2 changed files with 157 additions and 132 deletions


@@ -25,6 +25,14 @@ import (
	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)

// The maximum priority value to give to a node
// Priority values range from 0-maxPriority
const maxPriority = 10

// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
const zoneWeighting = 2.0 / 3.0

type SelectorSpread struct {
	serviceLister    algorithm.ServiceLister
	controllerLister algorithm.ControllerLister
@@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string {
		return ""
	}

	// We include the null character just in case region or failureDomain has a colon
	// (We do assume there's no null characters in a region or failureDomain)
	// As a nice side-benefit, the null character is not printed by fmt.Print or glog
	return region + ":\x00:" + failureDomain
}
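The reason for the odd-looking separator is collision avoidance. The standalone sketch below (not part of the commit; the region/failureDomain values are made up for illustration) shows what a plain ":" separator would get wrong:

package main

import "fmt"

// zoneKeyFor mirrors the separator logic added above: a "\x00" between the two
// ":" characters, so values which themselves contain ":" cannot produce the same key.
func zoneKeyFor(region, failureDomain string) string {
	return region + ":\x00:" + failureDomain
}

func main() {
	// With a plain ":" separator, both of these (hypothetical) pairs would
	// flatten to the same key "us-east:1:a" and be counted as one zone.
	fmt.Printf("%q\n", zoneKeyFor("us-east", "1:a")) // "us-east:\x00:1:a"
	fmt.Printf("%q\n", zoneKeyFor("us-east:1", "a")) // "us-east:1:\x00:a"
}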
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
	var nsPods []*api.Pod
@@ -94,9 +108,8 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
		return nil, err
	}

	// Count similar pods by node
	countsByNodeName := map[string]int{}
	for _, pod := range nsPods {
		// When we are replacing a failed pod, we often see the previous deleted version
		// while scheduling the replacement. Ignore the previous deleted version for spreading
@@ -112,18 +125,23 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
					break
				}
			}
		if !matches {
			continue
		}
		countsByNodeName[pod.Spec.NodeName]++
	}

	// Aggregate by-node information
	// Compute the maximum number of pods hosted on any node
	maxCountByNodeName := 0
	for _, count := range countsByNodeName {
		if count > maxCountByNodeName {
			maxCountByNodeName = count
		}
	}

	// Count similar pods by zone, if zone information is present
	countsByZone := map[string]int{}
	for i := range nodes.Items {
		node := &nodes.Items[i]
@@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
			continue
		}
		countsByZone[zoneId] += count
	}

	// Aggregate by-zone information
	// Compute the maximum number of pods hosted in any zone
	haveZones := len(countsByZone) != 0
	maxCountByZone := 0
	for _, count := range countsByZone {
		if count > maxCountByZone {
			maxCountByZone = count
		}
	}

	result := []schedulerapi.HostPriority{}
	//score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for i := range nodes.Items {
		node := &nodes.Items[i]
		// initializing to the default/max node score of maxPriority
		fScore := float32(maxPriority)
		if maxCountByNodeName > 0 {
			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
		}

		// If there is zone information present, incorporate it
		if haveZones {
			zoneId := getZoneKey(node)
			if zoneId != "" {
				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
			}
		}
		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
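To make the weighting concrete, here is a small standalone sketch (not part of the commit) of the combined scoring rule above. The helper name and parameters are hypothetical; it assumes the maxPriority and zoneWeighting constants introduced earlier in this file.

// combinedScore is an illustrative re-statement of the scoring loop above:
// node spreading and zone spreading each produce a 0..maxPriority score, and
// the final score blends them with zoneWeighting when zone data exists.
func combinedScore(nodeCount, maxNodeCount, zoneCount, maxZoneCount int, haveZones bool) float32 {
	fScore := float32(maxPriority)
	if maxNodeCount > 0 {
		fScore = maxPriority * (float32(maxNodeCount-nodeCount) / float32(maxNodeCount))
	}
	if haveZones && maxZoneCount > 0 {
		zoneScore := maxPriority * (float32(maxZoneCount-zoneCount) / float32(maxZoneCount))
		fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
	}
	return fScore
}

For example, combinedScore(0, 1, 1, 2, true) blends a node score of 10 with a zone score of 5 into 10/3 + 5*2/3 ≈ 6.7, which truncates to 6 — the value seen in the zone tests below.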
@@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
	numServicePods := len(nsServicePods)
	result := []schedulerapi.HostPriority{}
	//score int - scale of 0-maxPriority
	// 0 being the lowest priority and maxPriority being the highest
	for node := range labeledNodes {
		// initializing to the default/max node score of maxPriority
		fScore := float32(maxPriority)
		if numServicePods > 0 {
			fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
		}
		result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
	}
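As a quick worked example of the anti-affinity formula (using made-up numbers for illustration): with numServicePods = 3, a label value already hosting one of those pods scores maxPriority * (3-1)/3 ≈ 6.7, truncated to 6, while a label value hosting none of them keeps the full score of 10.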


@@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
		"label2": "l2",
		"baz":    "blah",
	}

	const nodeMachine1Zone1 = "machine1.zone1"
	const nodeMachine1Zone2 = "machine1.zone2"
	const nodeMachine2Zone2 = "machine2.zone2"
	const nodeMachine1Zone3 = "machine1.zone3"
	const nodeMachine2Zone3 = "machine2.zone3"
	const nodeMachine3Zone3 = "machine3.zone3"

	buildNodeLabels := func(failureDomain string) map[string]string {
		labels := map[string]string{
			wellknownlabels.LabelZoneFailureDomain: failureDomain,
		}
		return labels
	}

	labeledNodes := map[string]map[string]string{
		nodeMachine1Zone1: buildNodeLabels("zone1"),
		nodeMachine1Zone2: buildNodeLabels("zone2"),
		nodeMachine2Zone2: buildNodeLabels("zone2"),
		nodeMachine1Zone3: buildNodeLabels("zone3"),
		nodeMachine2Zone3: buildNodeLabels("zone3"),
		nodeMachine3Zone3: buildNodeLabels("zone3"),
	}

	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
@@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
		{
			pod: new(api.Pod),
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 10},
				{nodeMachine1Zone2, 10},
				{nodeMachine2Zone2, 10},
				{nodeMachine1Zone3, 10},
				{nodeMachine2Zone3, 10},
				{nodeMachine3Zone3, 10},
			},
			test: "nothing scheduled",
		},
		{
			pod:  buildPod("", labels1),
			pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 10},
				{nodeMachine1Zone2, 10},
				{nodeMachine2Zone2, 10},
				{nodeMachine1Zone3, 10},
				{nodeMachine2Zone3, 10},
				{nodeMachine3Zone3, 10},
			},
			test: "no services",
		},
		{
			pod:      buildPod("", labels1),
			pods:     []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 10},
				{nodeMachine1Zone2, 10},
				{nodeMachine2Zone2, 10},
				{nodeMachine1Zone3, 10},
				{nodeMachine2Zone3, 10},
				{nodeMachine3Zone3, 10},
			},
			test: "different services",
		},
		{
			pod: buildPod("", labels1),
			pods: []*api.Pod{
				buildPod(nodeMachine1Zone1, labels2),
				buildPod(nodeMachine1Zone2, labels1),
			},
			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 10},
				{nodeMachine1Zone2, 0}, // Already have pod on machine
				{nodeMachine2Zone2, 3}, // Already have pod in zone
				{nodeMachine1Zone3, 10},
				{nodeMachine2Zone3, 10},
				{nodeMachine3Zone3, 10},
			},
			test: "two pods, 1 matching (in z2)",
		},
		{
			pod: buildPod("", labels1),
			pods: []*api.Pod{
				buildPod(nodeMachine1Zone1, labels2),
				buildPod(nodeMachine1Zone2, labels1),
				buildPod(nodeMachine2Zone2, labels1),
				buildPod(nodeMachine1Zone3, labels2),
				buildPod(nodeMachine2Zone3, labels1),
			},
			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 10},
				{nodeMachine1Zone2, 0}, // Pod on node
				{nodeMachine2Zone2, 0}, // Pod on node
				{nodeMachine1Zone3, 6}, // Pod in zone
				{nodeMachine2Zone3, 3}, // Pod on node
				{nodeMachine3Zone3, 6}, // Pod in zone
			},
			test: "five pods, 3 matching (z2=2, z3=1)",
		},
		{
			pod: buildPod("", labels1),
			pods: []*api.Pod{
				buildPod(nodeMachine1Zone1, labels1),
				buildPod(nodeMachine1Zone2, labels1),
				buildPod(nodeMachine2Zone2, labels2),
				buildPod(nodeMachine1Zone3, labels1),
			},
			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 0}, // Pod on node
				{nodeMachine1Zone2, 0}, // Pod on node
				{nodeMachine2Zone2, 3}, // Pod in zone
				{nodeMachine1Zone3, 0}, // Pod on node
				{nodeMachine2Zone3, 3}, // Pod in zone
				{nodeMachine3Zone3, 3}, // Pod in zone
			},
			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
		},
		{
			pod: buildPod("", labels1),
			pods: []*api.Pod{
				buildPod(nodeMachine1Zone1, labels1),
				buildPod(nodeMachine1Zone2, labels1),
				buildPod(nodeMachine1Zone3, labels1),
				buildPod(nodeMachine2Zone2, labels2),
			},
			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
			expectedList: []schedulerapi.HostPriority{
				{nodeMachine1Zone1, 0}, // Pod on node
				{nodeMachine1Zone2, 0}, // Pod on node
				{nodeMachine2Zone2, 3}, // Pod in zone
				{nodeMachine1Zone3, 0}, // Pod on node
				{nodeMachine2Zone3, 3}, // Pod in zone
				{nodeMachine3Zone3, 3}, // Pod in zone
			},
			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
		},
		{
			pod: buildPod("", labels1),
			pods: []*api.Pod{
				buildPod(nodeMachine1Zone3, labels1),
				buildPod(nodeMachine1Zone2, labels1),
				buildPod(nodeMachine1Zone3, labels1),
			},
			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
			expectedList: []schedulerapi.HostPriority{
				// Note that because we put two pods on the same node (nodeMachine1Zone3),
				// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
				// However they kind of make sense; zone1 is still most-highly favored.
				// zone3 is in general least favored, and m1.z3 particularly low priority.
				// We would probably prefer to see a bigger gap between putting a second
				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
				// This is also consistent with what we have already.
				{nodeMachine1Zone1, 10}, // No pods in zone
				{nodeMachine1Zone2, 5},  // Pod on node
				{nodeMachine2Zone2, 6},  // Pod in zone
				{nodeMachine1Zone3, 0},  // Two pods on node
				{nodeMachine2Zone3, 3},  // Pod in zone
				{nodeMachine3Zone3, 3},  // Pod in zone
			},
			test: "Replication controller spreading (z1=0, z2=1, z3=2)",
		},