From 6aa16c744b599d371cb6f5db1a981c982f7c8c63 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Sun, 29 Nov 2015 10:02:40 -0500
Subject: [PATCH] When scheduling, spread between zones if labeled

We already spread across nodes; we modify this spreading preference to
spread across zones when nodes are labeled with zone information.
---
 .../priorities/selector_spreading.go      |  80 ++++++-
 .../priorities/selector_spreading_test.go | 198 ++++++++++++++++++
 2 files changed, 270 insertions(+), 8 deletions(-)

diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
index fcc66eb0f48..3a643496ae2 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
@@ -19,6 +19,7 @@ package priorities
 import (
 	"github.com/golang/glog"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/labels"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
 	return selectorSpread.CalculateSpreadPriority
 }
 
+// getZoneKey is a helper function that builds a string identifier that is unique per failure zone.
+// It returns the empty string if the node has no zone information.
+func getZoneKey(node *api.Node) string {
+	labels := node.Labels
+	if labels == nil {
+		return ""
+	}
+
+	region, _ := labels[unversioned.LabelZoneRegion]
+	failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+	if region == "" && failureDomain == "" {
+		return ""
+	}
+
+	return region + ":" + failureDomain
+}
+
 // CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
 // Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
 // pods which match the same selectors of Services and RCs as current pod.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	var maxCount int
 	var nsPods []*api.Pod
 
 	selectors := make([]labels.Selector, 0)
@@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 		return nil, err
 	}
 
-	counts := map[string]int{}
+	maxCountByNodeName := 0
+	countsByNodeName := map[string]int{}
 	if len(nsPods) > 0 {
 		for _, pod := range nsPods {
+			// When we are replacing a failed pod, we often see the previous deleted version
+			// while scheduling the replacement. Ignore the previous deleted version for spreading
+			// purposes (it can still be considered for resource restrictions etc.)
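+			// (A non-nil DeletionTimestamp means deletion has already been requested and the
+			// pod is only waiting out graceful termination, so it will not be running for long.)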
+			if pod.DeletionTimestamp != nil {
+				glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+				continue
+			}
 			matches := false
 			for _, selector := range selectors {
 				if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
@@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 				}
 			}
 			if matches {
-				counts[pod.Spec.NodeName]++
+				countsByNodeName[pod.Spec.NodeName]++
 				// Compute the maximum number of pods hosted on any node
-				if counts[pod.Spec.NodeName] > maxCount {
-					maxCount = counts[pod.Spec.NodeName]
+				if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
+					maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
 				}
 			}
 		}
 	}
 
+	maxCountByZone := 0
+	haveZones := false
+	countsByZone := map[string]int{}
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+
+		count, found := countsByNodeName[node.Name]
+		if !found {
+			continue
+		}
+
+		zoneId := getZoneKey(node)
+		if zoneId == "" {
+			continue
+		}
+
+		haveZones = true
+		countsByZone[zoneId] += count
+		// Compute the maximum number of pods hosted in any zone
+		if countsByZone[zoneId] > maxCountByZone {
+			maxCountByZone = countsByZone[zoneId]
+		}
+	}
+
 	result := []schedulerapi.HostPriority{}
 	//score int - scale of 0-10
 	// 0 being the lowest priority and 10 being the highest
-	for _, node := range nodes.Items {
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
 		// initializing to the default/max node score of 10
 		fScore := float32(10)
-		if maxCount > 0 {
-			fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+		if maxCountByNodeName > 0 {
+			fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
 		}
+
+		// If there is zone information present, incorporate it
+		if haveZones {
+			zoneId := getZoneKey(node)
+			if zoneId != "" {
+				fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+			}
+
+			// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+			// TODO: Any way to justify this weighting?
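+			// Worked example of the arithmetic (illustrative only): the node term above
+			// lies in [0, 10] and the zone term in [0, 20], so dividing the sum by 3
+			// rescales the result to [0, 10] while weighting zone spreading twice as
+			// heavily as node spreading. With maxCountByNodeName=1 and maxCountByZone=2,
+			// an empty node in a zone that already holds one matching pod scores
+			// (10*(1-0)/1 + 20*(2-1)/2) / 3 = 20/3, which truncates to 6.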
+			fScore /= 3.0
+		}
+
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
 		glog.V(10).Infof(
 			"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
index 93084308554..fc62b752e50 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go
@@ -22,6 +22,7 @@ import (
 	"testing"
 
 	"k8s.io/kubernetes/pkg/api"
+	wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
@@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) {
 	}
 }
 
+func TestZoneSelectorSpreadPriority(t *testing.T) {
+	labels1 := map[string]string{
+		"label1": "l1",
+		"baz":    "blah",
+	}
+	labels2 := map[string]string{
+		"label2": "l2",
+		"baz":    "blah",
+	}
+	nodeLabelsZone1 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone1",
+	}
+	nodeLabelsZone2 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone2",
+	}
+	nodeLabelsZone3 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone3",
+	}
+	labeledNodes := map[string]map[string]string{
+		"machine1.zone1": nodeLabelsZone1,
+		"machine1.zone2": nodeLabelsZone2,
+		"machine2.zone2": nodeLabelsZone2,
+		"machine1.zone3": nodeLabelsZone3,
+		"machine2.zone3": nodeLabelsZone3,
+		"machine3.zone3": nodeLabelsZone3,
+	}
+
+	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
+		pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
+		return pod
+	}
+
+	tests := []struct {
+		pod          *api.Pod
+		pods         []*api.Pod
+		nodes        []string
+		rcs          []api.ReplicationController
+		services     []api.Service
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			pod: new(api.Pod),
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "nothing scheduled",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "no services",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{buildPod("machine1.zone1", labels2)},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "different services",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels2),
+				buildPod("machine1.zone2", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 0},  // Already have pod on machine
+				{"machine2.zone2", 3},  // Already have pod in zone
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "two pods, 1 matching (in z2)",
+		},
+		{
+			pod: buildPod("", labels1),
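+			// In this case (sketch of the expected arithmetic): zone2 holds two matching
+			// pods and zone3 one, so maxCountByZone=2 and maxCountByNodeName=1; the
+			// zone-weighted scores below follow from the formula in selector_spreading.go.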
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels2),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine2.zone2", labels1),
+				buildPod("machine1.zone3", labels2),
+				buildPod("machine2.zone3", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 0}, // Pod on node
+				{"machine1.zone3", 6}, // Pod in zone
+				{"machine2.zone3", 3}, // Pod on node
+				{"machine3.zone3", 6}, // Pod in zone
+			},
+			test: "five pods, 3 matching (z2=2, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine2.zone2", labels2),
+				buildPod("machine1.zone3", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 0}, // Pod on node
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 3}, // Pod in zone
+				{"machine1.zone3", 0}, // Pod on node
+				{"machine2.zone3", 3}, // Pod in zone
+				{"machine3.zone3", 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine1.zone3", labels1),
+				buildPod("machine2.zone2", labels2),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 0}, // Pod on node
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 3}, // Pod in zone
+				{"machine1.zone3", 0}, // Pod on node
+				{"machine2.zone3", 3}, // Pod in zone
+				{"machine3.zone3", 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone3", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine1.zone3", labels1),
+			},
+			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				// Note that because we put two pods on the same node (machine1.zone3),
+				// the values here are questionable for zone2, in particular for machine1.zone2.
+				// However they kind of make sense; zone1 is still most-highly favored.
+				// zone3 is in general least favored, and m1.z3 particularly low priority.
+				// We would probably prefer to see a bigger gap between putting a second
+				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
+				// This is also consistent with what we have already.
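+				// Sketch of the arithmetic behind these values: machine1.zone3 hosts two
+				// matching pods and machine1.zone2 one, so maxCountByNodeName=2 and
+				// maxCountByZone=2 (zone3=2, zone2=1, zone1=0). For example,
+				// machine1.zone2: (10*(2-1)/2 + 20*(2-1)/2) / 3 = 15/3 = 5, while
+				// machine2.zone2: (10*(2-0)/2 + 20*(2-1)/2) / 3 = 20/3, truncated to 6.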
+ {"machine1.zone1", 10}, // No pods in zone + {"machine1.zone2", 5}, // Pod on node + {"machine2.zone2", 6}, // Pod in zone + {"machine1.zone3", 0}, // Two pods on node + {"machine2.zone3", 3}, // Pod in zone + {"machine3.zone3", 3}, // Pod in zone + }, + test: "Replication controller spreading (z1=0, z2=1, z3=2)", + }, + } + + for _, test := range tests { + selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)} + list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes))) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + // sort the two lists to avoid failures on account of different ordering + sort.Sort(test.expectedList) + sort.Sort(list) + if !reflect.DeepEqual(test.expectedList, list) { + t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list) + } + } +} + func TestZoneSpreadPriority(t *testing.T) { labels1 := map[string]string{ "foo": "bar",