Zone-scheduler: Fixes per code-review

Justin Santa Barbara 2015-12-05 22:26:41 -05:00
parent 6aa16c744b
commit cd433c974f
2 changed files with 157 additions and 132 deletions

View File

@@ -25,6 +25,14 @@ import (
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)
// The maximum priority value to give to a node
// Priority values range from 0-maxPriority
const maxPriority = 10
// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
const zoneWeighting = 2.0 / 3.0
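// Illustrative sketch, not part of this commit: the constant above is applied as a
// simple linear blend of the per-node and per-zone scores, along the lines of
//
//	func blendScores(nodeScore, zoneScore float32) float32 { // hypothetical helper
//		return nodeScore*(1.0-zoneWeighting) + zoneScore*zoneWeighting
//	}
//
// so hypothetical scores nodeScore=10 and zoneScore=4 would combine to 10/3 + 8/3 = 6.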
type SelectorSpread struct {
serviceLister algorithm.ServiceLister
controllerLister algorithm.ControllerLister
@@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string {
return ""
}
return region + ":" + failureDomain
// We include the null character just in case region or failureDomain has a colon
// (We do assume there are no null characters in a region or failureDomain)
// As a nice side-benefit, the null character is not printed by fmt.Print or glog
return region + ":\x00:" + failureDomain
}
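// Illustrative example, not part of this commit, of why the "\x00" separator matters:
// with a plain ":" separator the two hypothetical inputs below would both map to the
// key "us-east:1:a", whereas with ":\x00:" they stay distinct (assuming, as noted
// above, that neither value contains a null character):
//
//	region "us-east:1", failureDomain "a"   -> "us-east:1:\x00:a"
//	region "us-east",   failureDomain "1:a" -> "us-east:\x00:1:a"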
// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
// pods which match the same selectors of Services and RCs as current pod.
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
var nsPods []*api.Pod
@@ -94,36 +108,40 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
return nil, err
}
maxCountByNodeName := 0
// Count similar pods by node
countsByNodeName := map[string]int{}
if len(nsPods) > 0 {
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if matches {
countsByNodeName[pod.Spec.NodeName]++
// Compute the maximum number of pods hosted on any node
if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
}
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if !matches {
continue
}
countsByNodeName[pod.Spec.NodeName]++
}
// Aggregate by-node information
// Compute the maximum number of pods hosted on any node
maxCountByNodeName := 0
for _, count := range countsByNodeName {
if count > maxCountByNodeName {
maxCountByNodeName = count
}
}
maxCountByZone := 0
haveZones := false
// Count similar pods by zone, if zone information is present
countsByZone := map[string]int{}
for i := range nodes.Items {
node := &nodes.Items[i]
@@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
continue
}
haveZones = true
countsByZone[zoneId] += count
// Compute the maximum number of pods hosted in any zone
if countsByZone[zoneId] > maxCountByZone {
maxCountByZone = countsByZone[zoneId]
}
// Aggregate by-zone information
// Compute the maximum number of pods hosted in any zone
haveZones := len(countsByZone) != 0
maxCountByZone := 0
for _, count := range countsByZone {
if count > maxCountByZone {
maxCountByZone = count
}
}
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for i := range nodes.Items {
node := &nodes.Items[i]
// initializing to the default/max node score of 10
fScore := float32(10)
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if maxCountByNodeName > 0 {
fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
}
// If there is zone information present, incorporate it
if haveZones {
zoneId := getZoneKey(node)
if zoneId != "" {
fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
}
// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
fScore /= 3.0
}
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
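// Worked example, illustrative only and not part of this commit, with hypothetical
// counts maxCountByNodeName=1 and maxCountByZone=2:
//
//	node hosting 0 matching pods, in a zone holding 1 matching pod:
//	  nodeScore = 10 * (1-0)/1 = 10
//	  zoneScore = 10 * (2-1)/2 = 5
//	  fScore    = 10*(1/3) + 5*(2/3) ≈ 6.67, truncated to 6 by int(fScore)
//
//	node hosting that 1 matching pod, in the same zone:
//	  nodeScore = 10 * (1-1)/1 = 0
//	  fScore    = 0*(1/3) + 5*(2/3) ≈ 3.33, truncated to 3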
@@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
numServicePods := len(nsServicePods)
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for node := range labeledNodes {
// initializing to the default/max node score of 10
fScore := float32(10)
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if numServicePods > 0 {
fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
}
result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
}
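// Illustrative example, not part of this commit, for the anti-affinity score above:
// with numServicePods=4 and a label value whose nodes already host 1 of those pods,
// fScore = maxPriority * (4-1)/4 = 7.5, which int(fScore) stores as 7.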

View File

@@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
"label2": "l2",
"baz": "blah",
}
nodeLabelsZone1 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone1",
}
nodeLabelsZone2 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone2",
}
nodeLabelsZone3 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone3",
const nodeMachine1Zone1 = "machine1.zone1"
const nodeMachine1Zone2 = "machine1.zone2"
const nodeMachine2Zone2 = "machine2.zone2"
const nodeMachine1Zone3 = "machine1.zone3"
const nodeMachine2Zone3 = "machine2.zone3"
const nodeMachine3Zone3 = "machine3.zone3"
buildNodeLabels := func(failureDomain string) map[string]string {
labels := map[string]string{
wellknownlabels.LabelZoneFailureDomain: failureDomain,
}
return labels
}
labeledNodes := map[string]map[string]string{
"machine1.zone1": nodeLabelsZone1,
"machine1.zone2": nodeLabelsZone2,
"machine2.zone2": nodeLabelsZone2,
"machine1.zone3": nodeLabelsZone3,
"machine2.zone3": nodeLabelsZone3,
"machine3.zone3": nodeLabelsZone3,
nodeMachine1Zone1: buildNodeLabels("zone1"),
nodeMachine1Zone2: buildNodeLabels("zone2"),
nodeMachine2Zone2: buildNodeLabels("zone2"),
nodeMachine1Zone3: buildNodeLabels("zone3"),
nodeMachine2Zone3: buildNodeLabels("zone3"),
nodeMachine3Zone3: buildNodeLabels("zone3"),
}
buildPod := func(nodeName string, labels map[string]string) *api.Pod {
@@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) {
{
pod: new(api.Pod),
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "nothing scheduled",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", nil)},
pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "no services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", labels2)},
pods: []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "different services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Already have pod on machine
{"machine2.zone2", 3}, // Already have pod in zone
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Already have pod on machine
{nodeMachine2Zone2, 3}, // Already have pod in zone
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "two pods, 1 matching (in z2)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels1),
buildPod("machine1.zone3", labels2),
buildPod("machine2.zone3", labels1),
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels1),
buildPod(nodeMachine1Zone3, labels2),
buildPod(nodeMachine2Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 0}, // Pod on node
{"machine1.zone3", 6}, // Pod in zone
{"machine2.zone3", 3}, // Pod on node
{"machine3.zone3", 6}, // Pod in zone
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 0}, // Pod on node
{nodeMachine1Zone3, 6}, // Pod in zone
{nodeMachine2Zone3, 3}, // Pod on node
{nodeMachine3Zone3, 6}, // Pod in zone
},
test: "five pods, 3 matching (z2=2, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels2),
buildPod("machine1.zone3", labels1),
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels2),
buildPod(nodeMachine1Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
buildPod("machine2.zone2", labels2),
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine2Zone2, labels2),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone3", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
},
rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
// Note that because we put two pods on the same node (machine1.zone3),
// the values here are questionable for zone2, in particular for machine1.zone2.
// Note that because we put two pods on the same node (nodeMachine1Zone3),
// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
// However they kind of make sense; zone1 is still most-highly favored.
// zone3 is in general the least favored, and m1.z3 gets a particularly low priority.
// We would probably prefer to see a bigger gap between putting a second
// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
// This is also consistent with what we have already.
{"machine1.zone1", 10}, // No pods in zone
{"machine1.zone2", 5}, // Pod on node
{"machine2.zone2", 6}, // Pod in zone
{"machine1.zone3", 0}, // Two pods on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
{nodeMachine1Zone1, 10}, // No pods in zone
{nodeMachine1Zone2, 5}, // Pod on node
{nodeMachine2Zone2, 6}, // Pod in zone
{nodeMachine1Zone3, 0}, // Two pods on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "Replication controller spreading (z1=0, z2=1, z3=2)",
},
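// Worked arithmetic for the case above, illustrative only: the three matching pods
// give maxCountByNodeName=2 (two on nodeMachine1Zone3) and maxCountByZone=2
// (zone3=2, zone2=1), so:
//
//	nodeMachine1Zone2: nodeScore = 10*(2-1)/2 = 5, zoneScore = 10*(2-1)/2 = 5
//	                   fScore = 5*(1/3) + 5*(2/3) = 5
//	nodeMachine2Zone2: nodeScore = 10*(2-0)/2 = 10, zoneScore = 5
//	                   fScore = 10*(1/3) + 5*(2/3) ≈ 6.67, truncated to 6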