From 6aa16c744b599d371cb6f5db1a981c982f7c8c63 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Sun, 29 Nov 2015 10:02:40 -0500 Subject: [PATCH 1/3] When scheduling, spread between zones if labeled We already spread across nodes; we modify this spreading preference to spread across zones when nodes are labeled with zone information. --- .../priorities/selector_spreading.go | 80 ++++++- .../priorities/selector_spreading_test.go | 198 ++++++++++++++++++ 2 files changed, 270 insertions(+), 8 deletions(-) diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go index fcc66eb0f48..3a643496ae2 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go +++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go @@ -19,6 +19,7 @@ package priorities import ( "github.com/golang/glog" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/unversioned" "k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" @@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller return selectorSpread.CalculateSpreadPriority } +// Helper function that builds a string identifier that is unique per failure-zone +// Returns empty-string for no zone +func getZoneKey(node *api.Node) string { + labels := node.Labels + if labels == nil { + return "" + } + + region, _ := labels[unversioned.LabelZoneRegion] + failureDomain, _ := labels[unversioned.LabelZoneFailureDomain] + + if region == "" && failureDomain == "" { + return "" + } + + return region + ":" + failureDomain +} + // CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under // Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of // pods which match the same selectors of Services and RCs as current pod. func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { - var maxCount int var nsPods []*api.Pod selectors := make([]labels.Selector, 0) @@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit return nil, err } - counts := map[string]int{} + maxCountByNodeName := 0 + countsByNodeName := map[string]int{} if len(nsPods) > 0 { for _, pod := range nsPods { + // When we are replacing a failed pod, we often see the previous deleted version + // while scheduling the replacement. Ignore the previous deleted version for spreading + // purposes (it can still be considered for resource restrictions etc.) 
+ if pod.DeletionTimestamp != nil { + glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name) + continue + } matches := false for _, selector := range selectors { if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) { @@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit } } if matches { - counts[pod.Spec.NodeName]++ + countsByNodeName[pod.Spec.NodeName]++ // Compute the maximum number of pods hosted on any node - if counts[pod.Spec.NodeName] > maxCount { - maxCount = counts[pod.Spec.NodeName] + if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName { + maxCountByNodeName = countsByNodeName[pod.Spec.NodeName] } } } } + maxCountByZone := 0 + haveZones := false + countsByZone := map[string]int{} + for i := range nodes.Items { + node := &nodes.Items[i] + + count, found := countsByNodeName[node.Name] + if !found { + continue + } + + zoneId := getZoneKey(node) + if zoneId == "" { + continue + } + + haveZones = true + countsByZone[zoneId] += count + // Compute the maximum number of pods hosted in any zone + if countsByZone[zoneId] > maxCountByZone { + maxCountByZone = countsByZone[zoneId] + } + } + result := []schedulerapi.HostPriority{} //score int - scale of 0-10 // 0 being the lowest priority and 10 being the highest - for _, node := range nodes.Items { + for i := range nodes.Items { + node := &nodes.Items[i] // initializing to the default/max node score of 10 fScore := float32(10) - if maxCount > 0 { - fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount)) + if maxCountByNodeName > 0 { + fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName)) } + + // If there is zone information present, incorporate it + if haveZones { + zoneId := getZoneKey(node) + if zoneId != "" { + fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone)) + } + + // Give 2/3 of the weighting to zone spreading, 1/3 to node spreading + // TODO: Any way to justify this weighting? 
+ fScore /= 3.0 + } + result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) glog.V(10).Infof( "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore), diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go index 93084308554..fc62b752e50 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go +++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go @@ -22,6 +22,7 @@ import ( "testing" "k8s.io/kubernetes/pkg/api" + wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" ) @@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) { } } +func TestZoneSelectorSpreadPriority(t *testing.T) { + labels1 := map[string]string{ + "label1": "l1", + "baz": "blah", + } + labels2 := map[string]string{ + "label2": "l2", + "baz": "blah", + } + nodeLabelsZone1 := map[string]string{ + wellknownlabels.LabelZoneFailureDomain: "zone1", + } + nodeLabelsZone2 := map[string]string{ + wellknownlabels.LabelZoneFailureDomain: "zone2", + } + nodeLabelsZone3 := map[string]string{ + wellknownlabels.LabelZoneFailureDomain: "zone3", + } + labeledNodes := map[string]map[string]string{ + "machine1.zone1": nodeLabelsZone1, + "machine1.zone2": nodeLabelsZone2, + "machine2.zone2": nodeLabelsZone2, + "machine1.zone3": nodeLabelsZone3, + "machine2.zone3": nodeLabelsZone3, + "machine3.zone3": nodeLabelsZone3, + } + + buildPod := func(nodeName string, labels map[string]string) *api.Pod { + pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}} + return pod + } + + tests := []struct { + pod *api.Pod + pods []*api.Pod + nodes []string + rcs []api.ReplicationController + services []api.Service + expectedList schedulerapi.HostPriorityList + test string + }{ + { + pod: new(api.Pod), + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 10}, + {"machine1.zone2", 10}, + {"machine2.zone2", 10}, + {"machine1.zone3", 10}, + {"machine2.zone3", 10}, + {"machine3.zone3", 10}, + }, + test: "nothing scheduled", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{buildPod("machine1.zone1", nil)}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 10}, + {"machine1.zone2", 10}, + {"machine2.zone2", 10}, + {"machine1.zone3", 10}, + {"machine2.zone3", 10}, + {"machine3.zone3", 10}, + }, + test: "no services", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{buildPod("machine1.zone1", labels2)}, + services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 10}, + {"machine1.zone2", 10}, + {"machine2.zone2", 10}, + {"machine1.zone3", 10}, + {"machine2.zone3", 10}, + {"machine3.zone3", 10}, + }, + test: "different services", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{ + buildPod("machine1.zone1", labels2), + buildPod("machine1.zone2", labels1), + }, + services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 10}, + {"machine1.zone2", 0}, // Already have pod on machine + {"machine2.zone2", 3}, // Already have pod in zone + {"machine1.zone3", 10}, + {"machine2.zone3", 10}, + {"machine3.zone3", 10}, + }, + test: "two pods, 1 matching (in z2)", + }, + { + pod: 
buildPod("", labels1), + pods: []*api.Pod{ + buildPod("machine1.zone1", labels2), + buildPod("machine1.zone2", labels1), + buildPod("machine2.zone2", labels1), + buildPod("machine1.zone3", labels2), + buildPod("machine2.zone3", labels1), + }, + services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 10}, + {"machine1.zone2", 0}, // Pod on node + {"machine2.zone2", 0}, // Pod on node + {"machine1.zone3", 6}, // Pod in zone + {"machine2.zone3", 3}, // Pod on node + {"machine3.zone3", 6}, // Pod in zone + }, + test: "five pods, 3 matching (z2=2, z3=1)", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{ + buildPod("machine1.zone1", labels1), + buildPod("machine1.zone2", labels1), + buildPod("machine2.zone2", labels2), + buildPod("machine1.zone3", labels1), + }, + services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 0}, // Pod on node + {"machine1.zone2", 0}, // Pod on node + {"machine2.zone2", 3}, // Pod in zone + {"machine1.zone3", 0}, // Pod on node + {"machine2.zone3", 3}, // Pod in zone + {"machine3.zone3", 3}, // Pod in zone + }, + test: "four pods, 3 matching (z1=1, z2=1, z3=1)", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{ + buildPod("machine1.zone1", labels1), + buildPod("machine1.zone2", labels1), + buildPod("machine1.zone3", labels1), + buildPod("machine2.zone2", labels2), + }, + services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, + expectedList: []schedulerapi.HostPriority{ + {"machine1.zone1", 0}, // Pod on node + {"machine1.zone2", 0}, // Pod on node + {"machine2.zone2", 3}, // Pod in zone + {"machine1.zone3", 0}, // Pod on node + {"machine2.zone3", 3}, // Pod in zone + {"machine3.zone3", 3}, // Pod in zone + }, + test: "four pods, 3 matching (z1=1, z2=1, z3=1)", + }, + { + pod: buildPod("", labels1), + pods: []*api.Pod{ + buildPod("machine1.zone3", labels1), + buildPod("machine1.zone2", labels1), + buildPod("machine1.zone3", labels1), + }, + rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}}, + expectedList: []schedulerapi.HostPriority{ + // Note that because we put two pods on the same node (machine1.zone3), + // the values here are questionable for zone2, in particular for machine1.zone2. + // However they kind of make sense; zone1 is still most-highly favored. + // zone3 is in general least favored, and m1.z3 particularly low priority. + // We would probably prefer to see a bigger gap between putting a second + // pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct. + // This is also consistent with what we have already. 
+ {"machine1.zone1", 10}, // No pods in zone + {"machine1.zone2", 5}, // Pod on node + {"machine2.zone2", 6}, // Pod in zone + {"machine1.zone3", 0}, // Two pods on node + {"machine2.zone3", 3}, // Pod in zone + {"machine3.zone3", 3}, // Pod in zone + }, + test: "Replication controller spreading (z1=0, z2=1, z3=2)", + }, + } + + for _, test := range tests { + selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)} + list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes))) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + // sort the two lists to avoid failures on account of different ordering + sort.Sort(test.expectedList) + sort.Sort(list) + if !reflect.DeepEqual(test.expectedList, list) { + t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list) + } + } +} + func TestZoneSpreadPriority(t *testing.T) { labels1 := map[string]string{ "foo": "bar", From cd433c974f2981901ea93ad5f64e0507275b0824 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Sat, 5 Dec 2015 22:26:41 -0500 Subject: [PATCH 2/3] Zone-scheduler: Fixes per code-review --- .../priorities/selector_spreading.go | 116 +++++++----- .../priorities/selector_spreading_test.go | 173 +++++++++--------- 2 files changed, 157 insertions(+), 132 deletions(-) diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go index 3a643496ae2..f1202e50055 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go +++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go @@ -25,6 +25,14 @@ import ( schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" ) +// The maximum priority value to give to a node +// Prioritiy values range from 0-maxPriority +const maxPriority = 10 + +// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading +// TODO: Any way to justify this weighting? +const zoneWeighting = 2.0 / 3.0 + type SelectorSpread struct { serviceLister algorithm.ServiceLister controllerLister algorithm.ControllerLister @@ -53,12 +61,18 @@ func getZoneKey(node *api.Node) string { return "" } - return region + ":" + failureDomain + // We include the null character just in case region or failureDomain has a colon + // (We do assume there's no null characters in a region or failureDomain) + // As a nice side-benefit, the null character is not printed by fmt.Print or glog + return region + ":\x00:" + failureDomain } -// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under -// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of -// pods which match the same selectors of Services and RCs as current pod. +// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller. +// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors. +// It favors nodes that have fewer existing matching pods. +// i.e. 
it pushes the scheduler towards a node where there's the smallest number of +// pods which match the same service selectors or RC selectors as the pod being scheduled. +// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods. func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) { var nsPods []*api.Pod @@ -94,36 +108,40 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit return nil, err } - maxCountByNodeName := 0 + // Count similar pods by node countsByNodeName := map[string]int{} - if len(nsPods) > 0 { - for _, pod := range nsPods { - // When we are replacing a failed pod, we often see the previous deleted version - // while scheduling the replacement. Ignore the previous deleted version for spreading - // purposes (it can still be considered for resource restrictions etc.) - if pod.DeletionTimestamp != nil { - glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name) - continue - } - matches := false - for _, selector := range selectors { - if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) { - matches = true - break - } - } - if matches { - countsByNodeName[pod.Spec.NodeName]++ - // Compute the maximum number of pods hosted on any node - if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName { - maxCountByNodeName = countsByNodeName[pod.Spec.NodeName] - } + for _, pod := range nsPods { + // When we are replacing a failed pod, we often see the previous deleted version + // while scheduling the replacement. Ignore the previous deleted version for spreading + // purposes (it can still be considered for resource restrictions etc.) 
+ if pod.DeletionTimestamp != nil { + glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name) + continue + } + matches := false + for _, selector := range selectors { + if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) { + matches = true + break } } + if !matches { + continue + } + + countsByNodeName[pod.Spec.NodeName]++ + } + + // Aggregate by-node information + // Compute the maximum number of pods hosted on any node + maxCountByNodeName := 0 + for _, count := range countsByNodeName { + if count > maxCountByNodeName { + maxCountByNodeName = count + } } - maxCountByZone := 0 - haveZones := false + // Count similar pods by zone, if zone information is present countsByZone := map[string]int{} for i := range nodes.Items { node := &nodes.Items[i] @@ -138,35 +156,37 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit continue } - haveZones = true countsByZone[zoneId] += count - // Compute the maximum number of pods hosted in any zone - if countsByZone[zoneId] > maxCountByZone { - maxCountByZone = countsByZone[zoneId] + } + + // Aggregate by-zone information + // Compute the maximum number of pods hosted in any zone + haveZones := len(countsByZone) != 0 + maxCountByZone := 0 + for _, count := range countsByZone { + if count > maxCountByZone { + maxCountByZone = count } } result := []schedulerapi.HostPriority{} - //score int - scale of 0-10 - // 0 being the lowest priority and 10 being the highest + //score int - scale of 0-maxPriority + // 0 being the lowest priority and maxPriority being the highest for i := range nodes.Items { node := &nodes.Items[i] - // initializing to the default/max node score of 10 - fScore := float32(10) + // initializing to the default/max node score of maxPriority + fScore := float32(maxPriority) if maxCountByNodeName > 0 { - fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName)) + fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName)) } // If there is zone information present, incorporate it if haveZones { zoneId := getZoneKey(node) if zoneId != "" { - fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone)) + zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone)) + fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore) } - - // Give 2/3 of the weighting to zone spreading, 1/3 to node spreading - // TODO: Any way to justify this weighting? 
- fScore /= 3.0 } result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)}) @@ -241,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis numServicePods := len(nsServicePods) result := []schedulerapi.HostPriority{} - //score int - scale of 0-10 - // 0 being the lowest priority and 10 being the highest + //score int - scale of 0-maxPriority + // 0 being the lowest priority and maxPriority being the highest for node := range labeledNodes { - // initializing to the default/max node score of 10 - fScore := float32(10) + // initializing to the default/max node score of maxPriority + fScore := float32(maxPriority) if numServicePods > 0 { - fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods)) + fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods)) } result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)}) } diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go index fc62b752e50..a9ee18c87a2 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go +++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading_test.go @@ -238,22 +238,27 @@ func TestZoneSelectorSpreadPriority(t *testing.T) { "label2": "l2", "baz": "blah", } - nodeLabelsZone1 := map[string]string{ - wellknownlabels.LabelZoneFailureDomain: "zone1", - } - nodeLabelsZone2 := map[string]string{ - wellknownlabels.LabelZoneFailureDomain: "zone2", - } - nodeLabelsZone3 := map[string]string{ - wellknownlabels.LabelZoneFailureDomain: "zone3", + + const nodeMachine1Zone1 = "machine1.zone1" + const nodeMachine1Zone2 = "machine1.zone2" + const nodeMachine2Zone2 = "machine2.zone2" + const nodeMachine1Zone3 = "machine1.zone3" + const nodeMachine2Zone3 = "machine2.zone3" + const nodeMachine3Zone3 = "machine3.zone3" + + buildNodeLabels := func(failureDomain string) map[string]string { + labels := map[string]string{ + wellknownlabels.LabelZoneFailureDomain: failureDomain, + } + return labels } labeledNodes := map[string]map[string]string{ - "machine1.zone1": nodeLabelsZone1, - "machine1.zone2": nodeLabelsZone2, - "machine2.zone2": nodeLabelsZone2, - "machine1.zone3": nodeLabelsZone3, - "machine2.zone3": nodeLabelsZone3, - "machine3.zone3": nodeLabelsZone3, + nodeMachine1Zone1: buildNodeLabels("zone1"), + nodeMachine1Zone2: buildNodeLabels("zone2"), + nodeMachine2Zone2: buildNodeLabels("zone2"), + nodeMachine1Zone3: buildNodeLabels("zone3"), + nodeMachine2Zone3: buildNodeLabels("zone3"), + nodeMachine3Zone3: buildNodeLabels("zone3"), } buildPod := func(nodeName string, labels map[string]string) *api.Pod { @@ -273,139 +278,139 @@ func TestZoneSelectorSpreadPriority(t *testing.T) { { pod: new(api.Pod), expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 10}, - {"machine1.zone2", 10}, - {"machine2.zone2", 10}, - {"machine1.zone3", 10}, - {"machine2.zone3", 10}, - {"machine3.zone3", 10}, + {nodeMachine1Zone1, 10}, + {nodeMachine1Zone2, 10}, + {nodeMachine2Zone2, 10}, + {nodeMachine1Zone3, 10}, + {nodeMachine2Zone3, 10}, + {nodeMachine3Zone3, 10}, }, test: "nothing scheduled", }, { pod: buildPod("", labels1), - pods: []*api.Pod{buildPod("machine1.zone1", nil)}, + pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 10}, - {"machine1.zone2", 10}, - {"machine2.zone2", 
10}, - {"machine1.zone3", 10}, - {"machine2.zone3", 10}, - {"machine3.zone3", 10}, + {nodeMachine1Zone1, 10}, + {nodeMachine1Zone2, 10}, + {nodeMachine2Zone2, 10}, + {nodeMachine1Zone3, 10}, + {nodeMachine2Zone3, 10}, + {nodeMachine3Zone3, 10}, }, test: "no services", }, { pod: buildPod("", labels1), - pods: []*api.Pod{buildPod("machine1.zone1", labels2)}, + pods: []*api.Pod{buildPod(nodeMachine1Zone1, labels2)}, services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 10}, - {"machine1.zone2", 10}, - {"machine2.zone2", 10}, - {"machine1.zone3", 10}, - {"machine2.zone3", 10}, - {"machine3.zone3", 10}, + {nodeMachine1Zone1, 10}, + {nodeMachine1Zone2, 10}, + {nodeMachine2Zone2, 10}, + {nodeMachine1Zone3, 10}, + {nodeMachine2Zone3, 10}, + {nodeMachine3Zone3, 10}, }, test: "different services", }, { pod: buildPod("", labels1), pods: []*api.Pod{ - buildPod("machine1.zone1", labels2), - buildPod("machine1.zone2", labels1), + buildPod(nodeMachine1Zone1, labels2), + buildPod(nodeMachine1Zone2, labels1), }, services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 10}, - {"machine1.zone2", 0}, // Already have pod on machine - {"machine2.zone2", 3}, // Already have pod in zone - {"machine1.zone3", 10}, - {"machine2.zone3", 10}, - {"machine3.zone3", 10}, + {nodeMachine1Zone1, 10}, + {nodeMachine1Zone2, 0}, // Already have pod on machine + {nodeMachine2Zone2, 3}, // Already have pod in zone + {nodeMachine1Zone3, 10}, + {nodeMachine2Zone3, 10}, + {nodeMachine3Zone3, 10}, }, test: "two pods, 1 matching (in z2)", }, { pod: buildPod("", labels1), pods: []*api.Pod{ - buildPod("machine1.zone1", labels2), - buildPod("machine1.zone2", labels1), - buildPod("machine2.zone2", labels1), - buildPod("machine1.zone3", labels2), - buildPod("machine2.zone3", labels1), + buildPod(nodeMachine1Zone1, labels2), + buildPod(nodeMachine1Zone2, labels1), + buildPod(nodeMachine2Zone2, labels1), + buildPod(nodeMachine1Zone3, labels2), + buildPod(nodeMachine2Zone3, labels1), }, services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 10}, - {"machine1.zone2", 0}, // Pod on node - {"machine2.zone2", 0}, // Pod on node - {"machine1.zone3", 6}, // Pod in zone - {"machine2.zone3", 3}, // Pod on node - {"machine3.zone3", 6}, // Pod in zone + {nodeMachine1Zone1, 10}, + {nodeMachine1Zone2, 0}, // Pod on node + {nodeMachine2Zone2, 0}, // Pod on node + {nodeMachine1Zone3, 6}, // Pod in zone + {nodeMachine2Zone3, 3}, // Pod on node + {nodeMachine3Zone3, 6}, // Pod in zone }, test: "five pods, 3 matching (z2=2, z3=1)", }, { pod: buildPod("", labels1), pods: []*api.Pod{ - buildPod("machine1.zone1", labels1), - buildPod("machine1.zone2", labels1), - buildPod("machine2.zone2", labels2), - buildPod("machine1.zone3", labels1), + buildPod(nodeMachine1Zone1, labels1), + buildPod(nodeMachine1Zone2, labels1), + buildPod(nodeMachine2Zone2, labels2), + buildPod(nodeMachine1Zone3, labels1), }, services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 0}, // Pod on node - {"machine1.zone2", 0}, // Pod on node - {"machine2.zone2", 3}, // Pod in zone - {"machine1.zone3", 0}, // Pod on node - {"machine2.zone3", 3}, // Pod in zone - {"machine3.zone3", 3}, // Pod in zone + {nodeMachine1Zone1, 0}, // Pod on node + {nodeMachine1Zone2, 
0}, // Pod on node + {nodeMachine2Zone2, 3}, // Pod in zone + {nodeMachine1Zone3, 0}, // Pod on node + {nodeMachine2Zone3, 3}, // Pod in zone + {nodeMachine3Zone3, 3}, // Pod in zone }, test: "four pods, 3 matching (z1=1, z2=1, z3=1)", }, { pod: buildPod("", labels1), pods: []*api.Pod{ - buildPod("machine1.zone1", labels1), - buildPod("machine1.zone2", labels1), - buildPod("machine1.zone3", labels1), - buildPod("machine2.zone2", labels2), + buildPod(nodeMachine1Zone1, labels1), + buildPod(nodeMachine1Zone2, labels1), + buildPod(nodeMachine1Zone3, labels1), + buildPod(nodeMachine2Zone2, labels2), }, services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}}, expectedList: []schedulerapi.HostPriority{ - {"machine1.zone1", 0}, // Pod on node - {"machine1.zone2", 0}, // Pod on node - {"machine2.zone2", 3}, // Pod in zone - {"machine1.zone3", 0}, // Pod on node - {"machine2.zone3", 3}, // Pod in zone - {"machine3.zone3", 3}, // Pod in zone + {nodeMachine1Zone1, 0}, // Pod on node + {nodeMachine1Zone2, 0}, // Pod on node + {nodeMachine2Zone2, 3}, // Pod in zone + {nodeMachine1Zone3, 0}, // Pod on node + {nodeMachine2Zone3, 3}, // Pod in zone + {nodeMachine3Zone3, 3}, // Pod in zone }, test: "four pods, 3 matching (z1=1, z2=1, z3=1)", }, { pod: buildPod("", labels1), pods: []*api.Pod{ - buildPod("machine1.zone3", labels1), - buildPod("machine1.zone2", labels1), - buildPod("machine1.zone3", labels1), + buildPod(nodeMachine1Zone3, labels1), + buildPod(nodeMachine1Zone2, labels1), + buildPod(nodeMachine1Zone3, labels1), }, rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}}, expectedList: []schedulerapi.HostPriority{ - // Note that because we put two pods on the same node (machine1.zone3), - // the values here are questionable for zone2, in particular for machine1.zone2. + // Note that because we put two pods on the same node (nodeMachine1Zone3), + // the values here are questionable for zone2, in particular for nodeMachine1Zone2. // However they kind of make sense; zone1 is still most-highly favored. // zone3 is in general least favored, and m1.z3 particularly low priority. // We would probably prefer to see a bigger gap between putting a second // pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct. // This is also consistent with what we have already. - {"machine1.zone1", 10}, // No pods in zone - {"machine1.zone2", 5}, // Pod on node - {"machine2.zone2", 6}, // Pod in zone - {"machine1.zone3", 0}, // Two pods on node - {"machine2.zone3", 3}, // Pod in zone - {"machine3.zone3", 3}, // Pod in zone + {nodeMachine1Zone1, 10}, // No pods in zone + {nodeMachine1Zone2, 5}, // Pod on node + {nodeMachine2Zone2, 6}, // Pod in zone + {nodeMachine1Zone3, 0}, // Two pods on node + {nodeMachine2Zone3, 3}, // Pod in zone + {nodeMachine3Zone3, 3}, // Pod in zone }, test: "Replication controller spreading (z1=0, z2=1, z3=2)", }, From 541ff002c03eaec8c5f768e0d5c2826568025f48 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Sat, 5 Dec 2015 22:30:46 -0500 Subject: [PATCH 3/3] Zone scheduler: Update scheduler docs There's not a huge amount of detail in the docs as to how the scheduler actually works, which is probably a good thing both for readability and because it makes it easier to tweak the zone-spreading approach in the future, but we should include some information that we do spread across zones if zone information is present on the nodes. 
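For reference, "zone information on the nodes" means the well-known labels referenced in these patches as unversioned.LabelZoneRegion and unversioned.LabelZoneFailureDomain. The sketch below is illustrative only: the label values are made up, and the key derivation is restated inline (mirroring getZoneKey, with the ":\x00:" separator from patch 2/3) rather than calling the unexported helper:

    package main

    import (
        "fmt"

        "k8s.io/kubernetes/pkg/api"
        "k8s.io/kubernetes/pkg/api/unversioned"
    )

    func main() {
        // A node carrying zone information via the well-known labels (values are illustrative).
        node := &api.Node{
            ObjectMeta: api.ObjectMeta{
                Name: "machine1",
                Labels: map[string]string{
                    unversioned.LabelZoneRegion:        "region-a",
                    unversioned.LabelZoneFailureDomain: "zone-1",
                },
            },
        }

        // Mirrors getZoneKey: a node with neither label yields an empty key and is
        // scored purely on node-level spreading.
        region := node.Labels[unversioned.LabelZoneRegion]
        failureDomain := node.Labels[unversioned.LabelZoneFailureDomain]
        zoneKey := ""
        if region != "" || failureDomain != "" {
            zoneKey = region + ":\x00:" + failureDomain
        }
        fmt.Printf("node %s -> zone key %q\n", node.Name, zoneKey)
    }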
--- docs/devel/scheduler.md | 2 +- docs/devel/scheduler_algorithm.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/devel/scheduler.md b/docs/devel/scheduler.md index ffc73ca1c6c..2bdb4c1671d 100755 --- a/docs/devel/scheduler.md +++ b/docs/devel/scheduler.md @@ -47,7 +47,7 @@ will filter out nodes that don't have at least that much resources available (co as the capacity of the node minus the sum of the resource requests of the containers that are already running on the node). Second, it applies a set of "priority functions" that rank the nodes that weren't filtered out by the predicate check. For example, -it tries to spread Pods across nodes while at the same time favoring the least-loaded +it tries to spread Pods across nodes and zones while at the same time favoring the least-loaded nodes (where "load" here is sum of the resource requests of the containers running on the node, divided by the node's capacity). Finally, the node with the highest priority is chosen diff --git a/docs/devel/scheduler_algorithm.md b/docs/devel/scheduler_algorithm.md index c8790af9e90..3888786c7a3 100755 --- a/docs/devel/scheduler_algorithm.md +++ b/docs/devel/scheduler_algorithm.md @@ -61,7 +61,7 @@ Currently, Kubernetes scheduler provides some practical priority functions, incl - `LeastRequestedPriority`: The node is prioritized based on the fraction of the node that would be free if the new Pod were scheduled onto the node. (In other words, (capacity - sum of requests of all Pods already on the node - request of Pod that is being scheduled) / capacity). CPU and memory are equally weighted. The node with the highest free fraction is the most preferred. Note that this priority function has the effect of spreading Pods across the nodes with respect to resource consumption. - `CalculateNodeLabelPriority`: Prefer nodes that have the specified label. - `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed. -- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node. +- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes. - `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label. The details of the above priority functions can be found in [plugin/pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go). Similar as predicates, you can combine the above priority functions and assign weight factors (positive number) to them as you want (check [scheduler.md](scheduler.md) for how to customize).
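For reference, a minimal standalone sketch (not taken from the patches) of how the blended node/zone score behaves. The constants match those introduced in patch 2/3; blendedScore is a hypothetical helper, and the counts in main are made-up inputs chosen to line up with the "five pods, 3 matching (z2=2, z3=1)" test case:

    package main

    import "fmt"

    const (
        maxPriority   = 10        // scores range from 0 (worst) to maxPriority (best)
        zoneWeighting = 2.0 / 3.0 // 2/3 of the weight to zone spreading, 1/3 to node spreading
    )

    // blendedScore mirrors the per-node scoring in CalculateSpreadPriority: score the node
    // from its count of matching pods, then, when zone information is present, blend in a
    // zone score computed the same way.
    func blendedScore(nodeCount, maxNodeCount, zoneCount, maxZoneCount int, haveZone bool) float32 {
        fScore := float32(maxPriority)
        if maxNodeCount > 0 {
            fScore = maxPriority * (float32(maxNodeCount-nodeCount) / float32(maxNodeCount))
        }
        if haveZone && maxZoneCount > 0 {
            zoneScore := maxPriority * (float32(maxZoneCount-zoneCount) / float32(maxZoneCount))
            fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
        }
        return fScore
    }

    func main() {
        // max matching pods on any one node is 1; max in any one zone is 2
        fmt.Println(int(blendedScore(1, 1, 2, 2, true))) // pod on node, busiest zone -> 0
        fmt.Println(int(blendedScore(0, 1, 2, 2, true))) // free node, busiest zone   -> 3
        fmt.Println(int(blendedScore(0, 1, 1, 2, true))) // free node, half-busy zone -> 6
        fmt.Println(int(blendedScore(0, 1, 0, 2, true))) // free node, empty zone     -> 10
    }

With this blend, zone imbalance dominates the score while per-node counts still break ties between nodes in the same zone, which is why machine1.zone3 and machine3.zone3 score 6 but machine2.zone3 scores 3 in that test case.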