diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
index 3a643496ae2..f1202e50055 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
@@ -25,6 +25,14 @@ import (
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
 
+// The maximum priority value to give to a node
+// Priority values range from 0-maxPriority
+const maxPriority = 10
+
+// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+// TODO: Any way to justify this weighting?
+const zoneWeighting = 2.0 / 3.0
+
 type SelectorSpread struct {
 	serviceLister    algorithm.ServiceLister
 	controllerLister algorithm.ControllerLister
@@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string {
 		return ""
 	}
 
-	return region + ":" + failureDomain
+	// We include the null character just in case region or failureDomain has a colon
+	// (We do assume there are no null characters in a region or failureDomain)
+	// As a nice side-benefit, the null character is not printed by fmt.Print or glog
+	return region + ":\x00:" + failureDomain
 }
 
-// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
-// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
-// pods which match the same selectors of Services and RCs as current pod.
+// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
+// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
+// It favors nodes that have fewer existing matching pods.
+// i.e. it pushes the scheduler towards a node where there's the smallest number of
+// pods which match the same service selectors or RC selectors as the pod being scheduled.
+// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	var nsPods []*api.Pod
 
@@ -94,36 +108,40 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 		return nil, err
 	}
 
-	maxCountByNodeName := 0
+	// Count similar pods by node
 	countsByNodeName := map[string]int{}
-	if len(nsPods) > 0 {
-		for _, pod := range nsPods {
-			// When we are replacing a failed pod, we often see the previous deleted version
-			// while scheduling the replacement. Ignore the previous deleted version for spreading
-			// purposes (it can still be considered for resource restrictions etc.)
-			if pod.DeletionTimestamp != nil {
-				glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
-				continue
-			}
-			matches := false
-			for _, selector := range selectors {
-				if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
-					matches = true
-					break
-				}
-			}
-			if matches {
-				countsByNodeName[pod.Spec.NodeName]++
-				// Compute the maximum number of pods hosted on any node
-				if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
-					maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
-				}
+	for _, pod := range nsPods {
+		// When we are replacing a failed pod, we often see the previous deleted version
+		// while scheduling the replacement. Ignore the previous deleted version for spreading
+		// purposes (it can still be considered for resource restrictions etc.)
+		if pod.DeletionTimestamp != nil {
+			glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+			continue
+		}
+		matches := false
+		for _, selector := range selectors {
+			if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
+				matches = true
+				break
 			}
 		}
+		if !matches {
+			continue
+		}
+
+		countsByNodeName[pod.Spec.NodeName]++
+	}
+
+	// Aggregate by-node information
+	// Compute the maximum number of pods hosted on any node
+	maxCountByNodeName := 0
+	for _, count := range countsByNodeName {
+		if count > maxCountByNodeName {
+			maxCountByNodeName = count
+		}
 	}
 
-	maxCountByZone := 0
-	haveZones := false
+	// Count similar pods by zone, if zone information is present
 	countsByZone := map[string]int{}
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
@@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 			continue
 		}
 
-		haveZones = true
 		countsByZone[zoneId] += count
-		// Compute the maximum number of pods hosted in any zone
-		if countsByZone[zoneId] > maxCountByZone {
-			maxCountByZone = countsByZone[zoneId]
+	}
+
+	// Aggregate by-zone information
+	// Compute the maximum number of pods hosted in any zone
+	haveZones := len(countsByZone) != 0
+	maxCountByZone := 0
+	for _, count := range countsByZone {
+		if count > maxCountByZone {
+			maxCountByZone = count
 		}
 	}
 
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
 		if maxCountByNodeName > 0 {
-			fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
+			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
 		}
 
 		// If there is zone information present, incorporate it
 		if haveZones {
 			zoneId := getZoneKey(node)
 			if zoneId != "" {
-				fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
 			}
-
-			// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
-			// TODO: Any way to justify this weighting?
-			fScore /= 3.0
 		}
 
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
@@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
 	numServicePods := len(nsServicePods)
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
 	for node := range labeledNodes {
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
 		if numServicePods > 0 {
-			fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
+			fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
 		}
 		result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
 	}
diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
index fc62b752e50..a9ee18c87a2 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
@@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
 		"label2": "l2",
 		"baz":    "blah",
 	}
-	nodeLabelsZone1 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone1",
-	}
-	nodeLabelsZone2 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone2",
-	}
-	nodeLabelsZone3 := map[string]string{
-		wellknownlabels.LabelZoneFailureDomain: "zone3",
+
+	const nodeMachine1Zone1 = "machine1.zone1"
+	const nodeMachine1Zone2 = "machine1.zone2"
+	const nodeMachine2Zone2 = "machine2.zone2"
+	const nodeMachine1Zone3 = "machine1.zone3"
+	const nodeMachine2Zone3 = "machine2.zone3"
+	const nodeMachine3Zone3 = "machine3.zone3"
+
+	buildNodeLabels := func(failureDomain string) map[string]string {
+		labels := map[string]string{
+			wellknownlabels.LabelZoneFailureDomain: failureDomain,
+		}
+		return labels
 	}
 	labeledNodes := map[string]map[string]string{
-		"machine1.zone1": nodeLabelsZone1,
-		"machine1.zone2": nodeLabelsZone2,
-		"machine2.zone2": nodeLabelsZone2,
-		"machine1.zone3": nodeLabelsZone3,
-		"machine2.zone3": nodeLabelsZone3,
-		"machine3.zone3": nodeLabelsZone3,
+		nodeMachine1Zone1: buildNodeLabels("zone1"),
+		nodeMachine1Zone2: buildNodeLabels("zone2"),
+		nodeMachine2Zone2: buildNodeLabels("zone2"),
+		nodeMachine1Zone3: buildNodeLabels("zone3"),
+		nodeMachine2Zone3: buildNodeLabels("zone3"),
+		nodeMachine3Zone3: buildNodeLabels("zone3"),
 	}
 
 	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
@@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
 		{
 			pod: new(api.Pod),
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "nothing scheduled",
 		},
 		{
 			pod:  buildPod("", labels1),
-			pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+			pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "no services",
 		},
 		{
 			pod:      buildPod("", labels1),
-			pods:     []*api.Pod{buildPod("machine1.zone1", labels2)},
+			pods:     []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 10},
-				{"machine2.zone2", 10},
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "different services",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels2),
-				buildPod("machine1.zone2", labels1),
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 0},  // Already have pod on machine
-				{"machine2.zone2", 3},  // Already have pod in zone
-				{"machine1.zone3", 10},
-				{"machine2.zone3", 10},
-				{"machine3.zone3", 10},
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0},  // Already have pod on machine
+				{nodeMachine2Zone2, 3},  // Already have pod in zone
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
 			},
 			test: "two pods, 1 matching (in z2)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels2),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine2.zone2", labels1),
-				buildPod("machine1.zone3", labels2),
-				buildPod("machine2.zone3", labels1),
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels2),
+				buildPod(nodeMachine2Zone3, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 10},
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 0}, // Pod on node
-				{"machine1.zone3", 6}, // Pod in zone
-				{"machine2.zone3", 3}, // Pod on node
-				{"machine3.zone3", 6}, // Pod in zone
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 0}, // Pod on node
+				{nodeMachine1Zone3, 6}, // Pod in zone
+				{nodeMachine2Zone3, 3}, // Pod on node
+				{nodeMachine3Zone3, 6}, // Pod in zone
 			},
 			test: "five pods, 3 matching (z2=2, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine2.zone2", labels2),
-				buildPod("machine1.zone3", labels1),
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
+				buildPod(nodeMachine1Zone3, labels1),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 0}, // Pod on node
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 3}, // Pod in zone
-				{"machine1.zone3", 0}, // Pod on node
-				{"machine2.zone3", 3}, // Pod in zone
-				{"machine3.zone3", 3}, // Pod in zone
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
 			},
 			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone1", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine1.zone3", labels1),
-				buildPod("machine2.zone2", labels2),
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
 			},
 			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				{"machine1.zone1", 0}, // Pod on node
-				{"machine1.zone2", 0}, // Pod on node
-				{"machine2.zone2", 3}, // Pod in zone
-				{"machine1.zone3", 0}, // Pod on node
-				{"machine2.zone3", 3}, // Pod in zone
-				{"machine3.zone3", 3}, // Pod in zone
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
 			},
 			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
 		},
 		{
 			pod: buildPod("", labels1),
 			pods: []*api.Pod{
-				buildPod("machine1.zone3", labels1),
-				buildPod("machine1.zone2", labels1),
-				buildPod("machine1.zone3", labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
 			},
 			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
 			expectedList: []schedulerapi.HostPriority{
-				// Note that because we put two pods on the same node (machine1.zone3),
-				// the values here are questionable for zone2, in particular for machine1.zone2.
+				// Note that because we put two pods on the same node (nodeMachine1Zone3),
+				// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
 				// However they kind of make sense; zone1 is still most-highly favored.
 				// zone3 is in general least favored, and m1.z3 particularly low priority.
 				// We would probably prefer to see a bigger gap between putting a second
 				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
 				// This is also consistent with what we have already.
-				{"machine1.zone1", 10}, // No pods in zone
-				{"machine1.zone2", 5},  // Pod on node
-				{"machine2.zone2", 6},  // Pod in zone
-				{"machine1.zone3", 0},  // Two pods on node
-				{"machine2.zone3", 3},  // Pod in zone
-				{"machine3.zone3", 3},  // Pod in zone
+				{nodeMachine1Zone1, 10}, // No pods in zone
+				{nodeMachine1Zone2, 5},  // Pod on node
+				{nodeMachine2Zone2, 6},  // Pod in zone
+				{nodeMachine1Zone3, 0},  // Two pods on node
+				{nodeMachine2Zone3, 3},  // Pod in zone
+				{nodeMachine3Zone3, 3},  // Pod in zone
 			},
 			test: "Replication controller spreading (z1=0, z2=1, z3=2)",
 		},
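
For reviewers: the expected scores in TestZoneSelectorSpreadPriority follow directly from the blended formula introduced in this change. Each node gets a node-level score of maxPriority * (maxCountByNodeName-count)/maxCountByNodeName, a zone-level score computed the same way from the per-zone counts, and the final score is nodeScore*(1-zoneWeighting) + zoneScore*zoneWeighting, truncated to an int. The standalone sketch below (the blendedScore helper is illustrative only, not part of this patch) reproduces the 0/3/6/10 expectations of the "five pods, 3 matching (z2=2, z3=1)" case.

```go
package main

import "fmt"

const (
	maxPriority   = 10
	zoneWeighting = 2.0 / 3.0
)

// blendedScore mirrors the scoring arithmetic in CalculateSpreadPriority:
// spread by node first, then blend in the zone-level spread when zone
// information is available. It is a standalone illustration, not the
// scheduler's API.
func blendedScore(nodeCount, maxNodeCount, zoneCount, maxZoneCount int, haveZones bool) int {
	fScore := float32(maxPriority)
	if maxNodeCount > 0 {
		fScore = maxPriority * (float32(maxNodeCount-nodeCount) / float32(maxNodeCount))
	}
	if haveZones && maxZoneCount > 0 {
		zoneScore := maxPriority * (float32(maxZoneCount-zoneCount) / float32(maxZoneCount))
		fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
	}
	return int(fScore)
}

func main() {
	// "five pods, 3 matching (z2=2, z3=1)": maxCountByNodeName=1, maxCountByZone=2.
	fmt.Println(blendedScore(0, 1, 0, 2, true)) // machine1.zone1: empty node, empty zone: (1/3)*10 + (2/3)*10 = 10
	fmt.Println(blendedScore(1, 1, 2, 2, true)) // machine1.zone2: pod on node, busy zone: (1/3)*0  + (2/3)*0  = 0
	fmt.Println(blendedScore(1, 1, 1, 2, true)) // machine2.zone3: pod on node, 1 in zone: (1/3)*0  + (2/3)*5  -> 3
	fmt.Println(blendedScore(0, 1, 1, 2, true)) // machine3.zone3: empty node, 1 in zone:  (1/3)*10 + (2/3)*5  -> 6
}
```

With zoneWeighting = 2/3, a node that already runs a matching pod but sits in the emptier zone (m2.z3) still scores 3, while an empty node in the busier zone-3 (m3.z3) scores 6, which is exactly the ordering the test asserts.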