When scheduling, spread between zones if labeled

We already spread across nodes; we modify this spreading preference to
spread across zones when nodes are labeled with zone information.
Justin Santa Barbara 2015-11-29 10:02:40 -05:00
parent c458cd7bb7
commit 6aa16c744b
2 changed files with 270 additions and 8 deletions
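For context, the zone information comes from the node's region and failure-domain labels; the new getZoneKey helper in the diff below joins them into a single per-zone key. Here is a minimal standalone sketch of that mapping (the literal label-key strings are an assumption standing in for unversioned.LabelZoneRegion and unversioned.LabelZoneFailureDomain; the logic itself mirrors the diff):

package main

import "fmt"

// getZoneKey mirrors the helper added in this commit: it joins a node's region
// and failure-domain labels into a single "region:failure-domain" identifier
// and returns "" when the node carries no zone information.
// The literal label keys below are assumed values of unversioned.LabelZoneRegion
// and unversioned.LabelZoneFailureDomain.
func getZoneKey(nodeLabels map[string]string) string {
	region := nodeLabels["failure-domain.beta.kubernetes.io/region"]
	failureDomain := nodeLabels["failure-domain.beta.kubernetes.io/zone"]
	if region == "" && failureDomain == "" {
		return ""
	}
	return region + ":" + failureDomain
}

func main() {
	fmt.Println(getZoneKey(map[string]string{
		"failure-domain.beta.kubernetes.io/region": "us-east-1",
		"failure-domain.beta.kubernetes.io/zone":   "us-east-1a",
	})) // us-east-1:us-east-1a
	fmt.Println(getZoneKey(map[string]string{})) // "" (no zone labels)
}

When the key comes back empty the zone term is skipped for that node, so clusters without zone labels keep the existing node-only spreading behavior.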


@@ -19,6 +19,7 @@ package priorities
import (
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
return selectorSpread.CalculateSpreadPriority
}
// Helper function that builds a string identifier that is unique per failure-zone
// Returns empty-string for no zone
func getZoneKey(node *api.Node) string {
labels := node.Labels
if labels == nil {
return ""
}
region, _ := labels[unversioned.LabelZoneRegion]
failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
if region == "" && failureDomain == "" {
return ""
}
return region + ":" + failureDomain
}
// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts the number of pods that run under
// the same Services or RCs as the pod being scheduled and tries to minimize the number of conflicts, i.e. it pushes the scheduler towards a node with the smallest number of
// pods matching the same selectors of Services and RCs as the current pod.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
- var maxCount int
var nsPods []*api.Pod
selectors := make([]labels.Selector, 0)
@@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
return nil, err
}
- counts := map[string]int{}
+ maxCountByNodeName := 0
+ countsByNodeName := map[string]int{}
if len(nsPods) > 0 {
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
@@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
}
}
if matches {
- counts[pod.Spec.NodeName]++
+ countsByNodeName[pod.Spec.NodeName]++
// Compute the maximum number of pods hosted on any node
- if counts[pod.Spec.NodeName] > maxCount {
- maxCount = counts[pod.Spec.NodeName]
+ if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
+ maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
}
}
}
}
maxCountByZone := 0
haveZones := false
countsByZone := map[string]int{}
for i := range nodes.Items {
node := &nodes.Items[i]
count, found := countsByNodeName[node.Name]
if !found {
continue
}
zoneId := getZoneKey(node)
if zoneId == "" {
continue
}
haveZones = true
countsByZone[zoneId] += count
// Compute the maximum number of pods hosted in any zone
if countsByZone[zoneId] > maxCountByZone {
maxCountByZone = countsByZone[zoneId]
}
}
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
- for _, node := range nodes.Items {
+ for i := range nodes.Items {
+ node := &nodes.Items[i]
// initializing to the default/max node score of 10
fScore := float32(10)
- if maxCount > 0 {
- fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+ if maxCountByNodeName > 0 {
+ fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
}
// If there is zone information present, incorporate it
if haveZones {
zoneId := getZoneKey(node)
if zoneId != "" {
fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
}
// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
fScore /= 3.0
}
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
glog.V(10).Infof(
"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),

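Before the test changes, here is a minimal standalone sketch of the blended scoring introduced above (blendedScore and its parameter names are illustrative, not part of the change). Node spreading contributes up to 10 points, zone spreading up to 20, and when zone information is present the sum is divided by 3, so zone spreading carries roughly 2/3 of the weight; the example values reproduce the expected scores of the "five pods, 3 matching (z2=2, z3=1)" test case below:

package main

import "fmt"

// blendedScore mirrors the scoring above: up to 10 points for node spreading,
// up to 20 points for zone spreading, with the sum divided by 3 when zone
// information is present so that zone spreading carries 2/3 of the weight.
func blendedScore(nodeCount, maxNodeCount, zoneCount, maxZoneCount int) int {
	fScore := float32(10)
	if maxNodeCount > 0 {
		fScore = 10 * float32(maxNodeCount-nodeCount) / float32(maxNodeCount)
	}
	if maxZoneCount > 0 { // only when some zone hosts matching pods
		fScore += 20 * float32(maxZoneCount-zoneCount) / float32(maxZoneCount)
		fScore /= 3.0
	}
	return int(fScore)
}

func main() {
	// "five pods, 3 matching (z2=2, z3=1)": maxNodeCount=1, maxZoneCount=2.
	fmt.Println(blendedScore(0, 1, 0, 2)) // machine1.zone1: (10+20)/3 = 10
	fmt.Println(blendedScore(1, 1, 2, 2)) // machine1.zone2: (0+0)/3   = 0
	fmt.Println(blendedScore(0, 1, 1, 2)) // machine1.zone3: (10+10)/3 = 6
	fmt.Println(blendedScore(1, 1, 1, 2)) // machine2.zone3: (0+10)/3  = 3
}

This 10/20/divide-by-3 split is the weighting flagged by the TODO in the diff above.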

@@ -22,6 +22,7 @@ import (
"testing"
"k8s.io/kubernetes/pkg/api"
wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)
@@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) {
}
}
func TestZoneSelectorSpreadPriority(t *testing.T) {
labels1 := map[string]string{
"label1": "l1",
"baz": "blah",
}
labels2 := map[string]string{
"label2": "l2",
"baz": "blah",
}
nodeLabelsZone1 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone1",
}
nodeLabelsZone2 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone2",
}
nodeLabelsZone3 := map[string]string{
wellknownlabels.LabelZoneFailureDomain: "zone3",
}
labeledNodes := map[string]map[string]string{
"machine1.zone1": nodeLabelsZone1,
"machine1.zone2": nodeLabelsZone2,
"machine2.zone2": nodeLabelsZone2,
"machine1.zone3": nodeLabelsZone3,
"machine2.zone3": nodeLabelsZone3,
"machine3.zone3": nodeLabelsZone3,
}
buildPod := func(nodeName string, labels map[string]string) *api.Pod {
pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
return pod
}
tests := []struct {
pod *api.Pod
pods []*api.Pod
nodes []string
rcs []api.ReplicationController
services []api.Service
expectedList schedulerapi.HostPriorityList
test string
}{
{
pod: new(api.Pod),
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
},
test: "nothing scheduled",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", nil)},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
},
test: "no services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod("machine1.zone1", labels2)},
services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 10},
{"machine2.zone2", 10},
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
},
test: "different services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Already have pod on machine
{"machine2.zone2", 3}, // Already have pod in zone
{"machine1.zone3", 10},
{"machine2.zone3", 10},
{"machine3.zone3", 10},
},
test: "two pods, 1 matching (in z2)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels2),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels1),
buildPod("machine1.zone3", labels2),
buildPod("machine2.zone3", labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 10},
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 0}, // Pod on node
{"machine1.zone3", 6}, // Pod in zone
{"machine2.zone3", 3}, // Pod on node
{"machine3.zone3", 6}, // Pod in zone
},
test: "five pods, 3 matching (z2=2, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine2.zone2", labels2),
buildPod("machine1.zone3", labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone1", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
buildPod("machine2.zone2", labels2),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{"machine1.zone1", 0}, // Pod on node
{"machine1.zone2", 0}, // Pod on node
{"machine2.zone2", 3}, // Pod in zone
{"machine1.zone3", 0}, // Pod on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod("machine1.zone3", labels1),
buildPod("machine1.zone2", labels1),
buildPod("machine1.zone3", labels1),
},
rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
// Note that because we put two pods on the same node (machine1.zone3),
// the values here are questionable for zone2, in particular for machine1.zone2.
// However they kind of make sense; zone1 is still most-highly favored.
// zone3 is in general least favored, and m1.z3 particularly low priority.
// We would probably prefer to see a bigger gap between putting a second
// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
// This is also consistent with what we have already.
{"machine1.zone1", 10}, // No pods in zone
{"machine1.zone2", 5}, // Pod on node
{"machine2.zone2", 6}, // Pod in zone
{"machine1.zone3", 0}, // Two pods on node
{"machine2.zone3", 3}, // Pod in zone
{"machine3.zone3", 3}, // Pod in zone
},
test: "Replication controller spreading (z1=0, z2=1, z3=2)",
},
}
for _, test := range tests {
selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
if err != nil {
t.Errorf("unexpected error: %v", err)
}
// sort the two lists to avoid failures on account of different ordering
sort.Sort(test.expectedList)
sort.Sort(list)
if !reflect.DeepEqual(test.expectedList, list) {
t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
}
}
}
func TestZoneSpreadPriority(t *testing.T) {
labels1 := map[string]string{
"foo": "bar",