Zone-scheduler: Fixes per code-review

Justin Santa Barbara 2015-12-05 22:26:41 -05:00
parent 6aa16c744b
commit cd433c974f
2 changed files with 157 additions and 132 deletions

View File

@@ -25,6 +25,14 @@ import (
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)
// The maximum priority value to give to a node
// Priority values range from 0-maxPriority
const maxPriority = 10
// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
const zoneWeighting = 2.0 / 3.0
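// Illustrative sketch, not part of this commit: the constant above is applied as a
// simple linear blend of the per-node and per-zone scores, along the lines of
//
//	func blendScores(nodeScore, zoneScore float32) float32 { // hypothetical helper
//		return nodeScore*(1.0-zoneWeighting) + zoneScore*zoneWeighting
//	}
//
// so hypothetical scores nodeScore=10 and zoneScore=4 would combine to 10/3 + 8/3 = 6.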
type SelectorSpread struct {
serviceLister algorithm.ServiceLister
controllerLister algorithm.ControllerLister
@@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string {
return ""
}
return region + ":" + failureDomain
// We include the null character just in case region or failureDomain has a colon
// (We do assume there are no null characters in a region or failureDomain)
// As a nice side-benefit, the null character is not printed by fmt.Print or glog
return region + ":\x00:" + failureDomain
}
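// Illustrative example, not part of this commit, of why the "\x00" separator matters:
// with a plain ":" separator the two hypothetical inputs below would both map to the
// key "us-east:1:a", whereas with ":\x00:" they stay distinct (assuming, as noted
// above, that neither value contains a null character):
//
//	region "us-east:1", failureDomain "a"   -> "us-east:1:\x00:a"
//	region "us-east",   failureDomain "1:a" -> "us-east:\x00:1:a"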
// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
// pods which match the same selectors of Services and RCs as current pod.
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
var nsPods []*api.Pod
@@ -94,36 +108,40 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
return nil, err
}
maxCountByNodeName := 0
// Count similar pods by node
countsByNodeName := map[string]int{}
if len(nsPods) > 0 {
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if matches {
countsByNodeName[pod.Spec.NodeName]++
// Compute the maximum number of pods hosted on any node
if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
}
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if !matches {
continue
}
countsByNodeName[pod.Spec.NodeName]++
}
// Aggregate by-node information
// Compute the maximum number of pods hosted on any node
maxCountByNodeName := 0
for _, count := range countsByNodeName {
if count > maxCountByNodeName {
maxCountByNodeName = count
}
}
maxCountByZone := 0
haveZones := false
// Count similar pods by zone, if zone information is present
countsByZone := map[string]int{}
for i := range nodes.Items {
node := &nodes.Items[i]
@@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
continue
}
haveZones = true
countsByZone[zoneId] += count
// Compute the maximum number of pods hosted in any zone
if countsByZone[zoneId] > maxCountByZone {
maxCountByZone = countsByZone[zoneId]
}
// Aggregate by-zone information
// Compute the maximum number of pods hosted in any zone
haveZones := len(countsByZone) != 0
maxCountByZone := 0
for _, count := range countsByZone {
if count > maxCountByZone {
maxCountByZone = count
}
}
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for i := range nodes.Items {
node := &nodes.Items[i]
// initializing to the default/max node score of 10
fScore := float32(10)
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if maxCountByNodeName > 0 {
fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
}
// If there is zone information present, incorporate it
if haveZones {
zoneId := getZoneKey(node)
if zoneId != "" {
fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
}
// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
fScore /= 3.0
}
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
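// Worked example, illustrative only and not part of this commit, with hypothetical
// counts maxCountByNodeName=1 and maxCountByZone=2:
//
//	node hosting 0 matching pods, in a zone holding 1 matching pod:
//	  nodeScore = 10 * (1-0)/1 = 10
//	  zoneScore = 10 * (2-1)/2 = 5
//	  fScore    = 10*(1/3) + 5*(2/3) ≈ 6.67, truncated to 6 by int(fScore)
//
//	node hosting that 1 matching pod, in the same zone:
//	  nodeScore = 10 * (1-1)/1 = 0
//	  fScore    = 0*(1/3) + 5*(2/3) ≈ 3.33, truncated to 3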
@@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
numServicePods := len(nsServicePods)
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for node := range labeledNodes {
// initializing to the default/max node score of 10
fScore := float32(10)
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if numServicePods > 0 {
fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
}
result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
}
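// Illustrative example, not part of this commit, for the anti-affinity score above:
// with numServicePods=4 and a label value whose nodes already host 1 of those pods,
// fScore = maxPriority * (4-1)/4 = 7.5, which int(fScore) stores as 7.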

View File

@@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
"label2": "l2",
"baz": "blah",
}
nodeLabelsZone1 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone1",
}
nodeLabelsZone2 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone2",
}
nodeLabelsZone3 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone3",
const nodeMachine1Zone1 = "machine1.zone1"
const nodeMachine1Zone2 = "machine1.zone2"
const nodeMachine2Zone2 = "machine2.zone2"
const nodeMachine1Zone3 = "machine1.zone3"
const nodeMachine2Zone3 = "machine2.zone3"
const nodeMachine3Zone3 = "machine3.zone3"
buildNodeLabels := func(failureDomain string) map[string]string {
labels := map[string]string{
wellknownlabels.LabelZoneFailureDomain: failureDomain,
}
return labels
}
labeledNodes := map[string]map[string]string{
"machine1.zone1": nodeLabelsZone1,
"machine1.zone2": nodeLabelsZone2,
"machine2.zone2": nodeLabelsZone2,
"machine1.zone3": nodeLabelsZone3,
"machine2.zone3": nodeLabelsZone3,
"machine3.zone3": nodeLabelsZone3,
nodeMachine1Zone1: buildNodeLabels("zone1"),
nodeMachine1Zone2: buildNodeLabels("zone2"),
nodeMachine2Zone2: buildNodeLabels("zone2"),
nodeMachine1Zone3: buildNodeLabels("zone3"),
nodeMachine2Zone3: buildNodeLabels("zone3"),
nodeMachine3Zone3: buildNodeLabels("zone3"),
}
buildPod := func(nodeName string, labels map[string]string) *api.Pod {
@@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
{
pod: new(api.Pod),
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "nothing scheduled",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", nil)},
pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "no services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", labels2)},
pods: []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "different services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Already have pod on machine
{"machine2.zone2", 3}, // Already have pod in zone
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Already have pod on machine
{nodeMachine2Zone2, 3}, // Already have pod in zone
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "two pods, 1 matching (in z2)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels1),
buildPod("machine1.zone3", labels2),
buildPod("machine2.zone3", labels1),
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels1),
buildPod(nodeMachine1Zone3, labels2),
buildPod(nodeMachine2Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 0}, // Pod on node
{"machine1.zone3", 6}, // Pod in zone
{"machine2.zone3", 3}, // Pod on node
{"machine3.zone3", 6}, // Pod in zone
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 0}, // Pod on node
{nodeMachine1Zone3, 6}, // Pod in zone
{nodeMachine2Zone3, 3}, // Pod on node
{nodeMachine3Zone3, 6}, // Pod in zone
},
test: "five pods, 3 matching (z2=2, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels2),
buildPod("machine1.zone3", labels1),
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels2),
buildPod(nodeMachine1Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
buildPod("machine2.zone2", labels2),
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine2Zone2, labels2),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone3", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
},
rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
// Note that because we put two pods on the same node (machine1.zone3),
// the values here are questionable for zone2, in particular for machine1.zone2.
// Note that because we put two pods on the same node (nodeMachine1Zone3),
// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
// However they kind of make sense; zone1 is still most-highly favored.
// zone3 is in general the least favored, and m1.z3 gets a particularly low priority.
// We would probably prefer to see a bigger gap between putting a second
// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
// This is also consistent with what we have already.
{"machine1.zone1", 10}, // No pods in zone
{"machine1.zone2", 5}, // Pod on node
{"machine2.zone2", 6}, // Pod in zone
{"machine1.zone3", 0}, // Two pods on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 10}, // No pods in zone
{nodeMachine1Zone2, 5}, // Pod on node
{nodeMachine2Zone2, 6}, // Pod in zone
{nodeMachine1Zone3, 0}, // Two pods on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "Replication controller spreading (z1=0, z2=1, z3=2)",
},
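// Worked arithmetic for the case above, illustrative only: the three matching pods
// give maxCountByNodeName=2 (two on nodeMachine1Zone3) and maxCountByZone=2
// (zone3=2, zone2=1), so:
//
//	nodeMachine1Zone2: nodeScore = 10*(2-1)/2 = 5, zoneScore = 10*(2-1)/2 = 5
//	                   fScore = 5*(1/3) + 5*(2/3) = 5
//	nodeMachine2Zone2: nodeScore = 10*(2-0)/2 = 10, zoneScore = 5
//	                   fScore = 10*(1/3) + 5*(2/3) ≈ 6.67, truncated to 6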