When scheduling, spread between zones if labeled
We already spread across nodes; we modify this spreading preference to spread across zones when nodes are labeled with zone information.
commit 6aa16c744b
parent c458cd7bb7
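For orientation before the diff: the new zone-aware spreading only applies to nodes that carry the well-known zone labels read by the code below (unversioned.LabelZoneRegion and unversioned.LabelZoneFailureDomain). A minimal sketch of such a node, with made-up region/zone values, assuming the api and unversioned packages that the diff imports:

node := &api.Node{
    ObjectMeta: api.ObjectMeta{
        Name: "machine1", // illustrative node name
        Labels: map[string]string{
            unversioned.LabelZoneRegion:        "us-central1",   // illustrative value
            unversioned.LabelZoneFailureDomain: "us-central1-a", // illustrative value
        },
    },
}
// Nodes without these labels keep the existing node-only spreading behavior.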
@@ -19,6 +19,7 @@ package priorities
 import (
     "github.com/golang/glog"
     "k8s.io/kubernetes/pkg/api"
+    "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/pkg/labels"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
     return selectorSpread.CalculateSpreadPriority
 }
 
+// Helper function that builds a string identifier that is unique per failure-zone
+// Returns empty-string for no zone
+func getZoneKey(node *api.Node) string {
+    labels := node.Labels
+    if labels == nil {
+        return ""
+    }
+
+    region, _ := labels[unversioned.LabelZoneRegion]
+    failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+    if region == "" && failureDomain == "" {
+        return ""
+    }
+
+    return region + ":" + failureDomain
+}
+
 // CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
 // Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
 // pods which match the same selectors of Services and RCs as current pod.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-    var maxCount int
     var nsPods []*api.Pod
 
     selectors := make([]labels.Selector, 0)
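As a rough sketch of getZoneKey's behavior (label values are illustrative; the helper name exampleZoneKeys and the fmt import are not part of the commit):

func exampleZoneKeys() {
    labeled := &api.Node{ObjectMeta: api.ObjectMeta{Labels: map[string]string{
        unversioned.LabelZoneRegion:        "us-central1",
        unversioned.LabelZoneFailureDomain: "us-central1-a",
    }}}
    unlabeled := &api.Node{}

    fmt.Println(getZoneKey(labeled))   // "us-central1:us-central1-a"
    fmt.Println(getZoneKey(unlabeled)) // "" (nodes without zone labels are ignored for zone spreading)
}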
@@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
         return nil, err
     }
 
-    counts := map[string]int{}
+    maxCountByNodeName := 0
+    countsByNodeName := map[string]int{}
     if len(nsPods) > 0 {
         for _, pod := range nsPods {
+            // When we are replacing a failed pod, we often see the previous deleted version
+            // while scheduling the replacement. Ignore the previous deleted version for spreading
+            // purposes (it can still be considered for resource restrictions etc.)
+            if pod.DeletionTimestamp != nil {
+                glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+                continue
+            }
             matches := false
             for _, selector := range selectors {
                 if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
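The DeletionTimestamp check matters when a controller replaces a failed pod: the old pod can still appear in the lister while it is being torn down, and counting it would unfairly penalize its node and zone. A hypothetical sketch (pod name and namespace are made up):

deletionTime := unversioned.Now()
oldPod := &api.Pod{ObjectMeta: api.ObjectMeta{
    Name:              "frontend-abc12", // hypothetical pod being replaced
    Namespace:         "default",
    DeletionTimestamp: &deletionTime,
}}
// oldPod.DeletionTimestamp != nil, so the loop above skips it and it does not
// inflate countsByNodeName (it can still count toward resource restrictions elsewhere).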
@@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
                 }
             }
             if matches {
-                counts[pod.Spec.NodeName]++
+                countsByNodeName[pod.Spec.NodeName]++
                 // Compute the maximum number of pods hosted on any node
-                if counts[pod.Spec.NodeName] > maxCount {
-                    maxCount = counts[pod.Spec.NodeName]
+                if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
+                    maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
                 }
             }
         }
     }
 
+    maxCountByZone := 0
+    haveZones := false
+    countsByZone := map[string]int{}
+    for i := range nodes.Items {
+        node := &nodes.Items[i]
+
+        count, found := countsByNodeName[node.Name]
+        if !found {
+            continue
+        }
+
+        zoneId := getZoneKey(node)
+        if zoneId == "" {
+            continue
+        }
+
+        haveZones = true
+        countsByZone[zoneId] += count
+        // Compute the maximum number of pods hosted in any zone
+        if countsByZone[zoneId] > maxCountByZone {
+            maxCountByZone = countsByZone[zoneId]
+        }
+    }
+
     result := []schedulerapi.HostPriority{}
     //score int - scale of 0-10
     // 0 being the lowest priority and 10 being the highest
-    for _, node := range nodes.Items {
+    for i := range nodes.Items {
+        node := &nodes.Items[i]
         // initializing to the default/max node score of 10
         fScore := float32(10)
-        if maxCount > 0 {
-            fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+        if maxCountByNodeName > 0 {
+            fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
         }
 
+        // If there is zone information present, incorporate it
+        if haveZones {
+            zoneId := getZoneKey(node)
+            if zoneId != "" {
+                fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+            }
+
+            // Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+            // TODO: Any way to justify this weighting?
+            fScore /= 3.0
+        }
+
         result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
         glog.V(10).Infof(
             "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
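To make the 1/3-node, 2/3-zone weighting concrete, here is the arithmetic for the "five pods, 3 matching (z2=2, z3=1)" case added to the tests below, written out as a sketch; the counts come from that test (maxCountByNodeName = 1, maxCountByZone = 2, with both matching zone2 pods on separate nodes):

// machine1.zone3: no matching pod on the node, one matching pod in its zone
fScore := 10 * float32(1-0) / float32(1) // 10 from node spreading
fScore += 20 * float32(2-1) / float32(2) // +10 from zone spreading
fScore /= 3.0                            // 20/3 ≈ 6.7 → int score 6
_ = fScore

// machine2.zone3 (matching pod on the node itself): (0 + 10) / 3 → 3
// machine1.zone1 (empty node and empty zone):       (10 + 20) / 3 → 10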
@@ -22,6 +22,7 @@ import (
     "testing"
 
     "k8s.io/kubernetes/pkg/api"
+    wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
@@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) {
     }
 }
 
+func TestZoneSelectorSpreadPriority(t *testing.T) {
+    labels1 := map[string]string{
+        "label1": "l1",
+        "baz":    "blah",
+    }
+    labels2 := map[string]string{
+        "label2": "l2",
+        "baz":    "blah",
+    }
+    nodeLabelsZone1 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone1",
+    }
+    nodeLabelsZone2 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone2",
+    }
+    nodeLabelsZone3 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone3",
+    }
+    labeledNodes := map[string]map[string]string{
+        "machine1.zone1": nodeLabelsZone1,
+        "machine1.zone2": nodeLabelsZone2,
+        "machine2.zone2": nodeLabelsZone2,
+        "machine1.zone3": nodeLabelsZone3,
+        "machine2.zone3": nodeLabelsZone3,
+        "machine3.zone3": nodeLabelsZone3,
+    }
+
+    buildPod := func(nodeName string, labels map[string]string) *api.Pod {
+        pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
+        return pod
+    }
+
+    tests := []struct {
+        pod          *api.Pod
+        pods         []*api.Pod
+        nodes        []string
+        rcs          []api.ReplicationController
+        services     []api.Service
+        expectedList schedulerapi.HostPriorityList
+        test         string
+    }{
+        {
+            pod: new(api.Pod),
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "nothing scheduled",
+        },
+        {
+            pod:  buildPod("", labels1),
+            pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "no services",
+        },
+        {
+            pod:      buildPod("", labels1),
+            pods:     []*api.Pod{buildPod("machine1.zone1", labels2)},
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "different services",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels2),
+                buildPod("machine1.zone2", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 0}, // Already have pod on machine
+                {"machine2.zone2", 3}, // Already have pod in zone
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "two pods, 1 matching (in z2)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels2),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine2.zone2", labels1),
+                buildPod("machine1.zone3", labels2),
+                buildPod("machine2.zone3", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 0}, // Pod on node
+                {"machine1.zone3", 6}, // Pod in zone
+                {"machine2.zone3", 3}, // Pod on node
+                {"machine3.zone3", 6}, // Pod in zone
+            },
+            test: "five pods, 3 matching (z2=2, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine2.zone2", labels2),
+                buildPod("machine1.zone3", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 0}, // Pod on node
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 3}, // Pod in zone
+                {"machine1.zone3", 0}, // Pod on node
+                {"machine2.zone3", 3}, // Pod in zone
+                {"machine3.zone3", 3}, // Pod in zone
+            },
+            test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine1.zone3", labels1),
+                buildPod("machine2.zone2", labels2),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 0}, // Pod on node
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 3}, // Pod in zone
+                {"machine1.zone3", 0}, // Pod on node
+                {"machine2.zone3", 3}, // Pod in zone
+                {"machine3.zone3", 3}, // Pod in zone
+            },
+            test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone3", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine1.zone3", labels1),
+            },
+            rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                // Note that because we put two pods on the same node (machine1.zone3),
+                // the values here are questionable for zone2, in particular for machine1.zone2.
+                // However they kind of make sense; zone1 is still most-highly favored.
+                // zone3 is in general least favored, and m1.z3 particularly low priority.
+                // We would probably prefer to see a bigger gap between putting a second
+                // pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
+                // This is also consistent with what we have already.
+                {"machine1.zone1", 10}, // No pods in zone
+                {"machine1.zone2", 5},  // Pod on node
+                {"machine2.zone2", 6},  // Pod in zone
+                {"machine1.zone3", 0},  // Two pods on node
+                {"machine2.zone3", 3},  // Pod in zone
+                {"machine3.zone3", 3},  // Pod in zone
+            },
+            test: "Replication controller spreading (z1=0, z2=1, z3=2)",
+        },
+    }
+
+    for _, test := range tests {
+        selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
+        list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+        }
+        // sort the two lists to avoid failures on account of different ordering
+        sort.Sort(test.expectedList)
+        sort.Sort(list)
+        if !reflect.DeepEqual(test.expectedList, list) {
+            t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+        }
+    }
+}
+
 func TestZoneSpreadPriority(t *testing.T) {
     labels1 := map[string]string{
         "foo": "bar",
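For reference, the replication-controller expectations above fall out of the same formula: with two matching pods on machine1.zone3 and one on machine1.zone2, maxCountByNodeName = 2 and maxCountByZone = 2, so for example:

// machine1.zone2: (10*(2-1)/2 + 20*(2-1)/2) / 3 = (5 + 10) / 3 → 5
// machine2.zone2: (10*(2-0)/2 + 20*(2-1)/2) / 3 = (10 + 10) / 3 → 6
// machine2.zone3: (10*(2-0)/2 + 20*(2-2)/2) / 3 = (10 + 0) / 3 → 3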