When scheduling, spread between zones if labeled
We already spread across nodes; we modify this spreading preference to spread across zones when nodes are labeled with zone information.
commit 6aa16c744b
parent c458cd7bb7
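For orientation before the diff: the new zone-aware spreading only applies to nodes that carry the well-known zone labels read by the code below (unversioned.LabelZoneRegion and unversioned.LabelZoneFailureDomain). A minimal sketch of such a node, with made-up region/zone values, assuming the api and unversioned packages that the diff imports:

node := &api.Node{
    ObjectMeta: api.ObjectMeta{
        Name: "machine1", // illustrative node name
        Labels: map[string]string{
            unversioned.LabelZoneRegion:        "us-central1",   // illustrative value
            unversioned.LabelZoneFailureDomain: "us-central1-a", // illustrative value
        },
    },
}
// Nodes without these labels keep the existing node-only spreading behavior.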
@@ -19,6 +19,7 @@ package priorities
 import (
     "github.com/golang/glog"
     "k8s.io/kubernetes/pkg/api"
+    "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/pkg/labels"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
     return selectorSpread.CalculateSpreadPriority
 }
 
+// Helper function that builds a string identifier that is unique per failure-zone
+// Returns empty-string for no zone
+func getZoneKey(node *api.Node) string {
+    labels := node.Labels
+    if labels == nil {
+        return ""
+    }
+
+    region, _ := labels[unversioned.LabelZoneRegion]
+    failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+    if region == "" && failureDomain == "" {
+        return ""
+    }
+
+    return region + ":" + failureDomain
+}
+
 // CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
 // Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
 // pods which match the same selectors of Services and RCs as current pod.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-    var maxCount int
     var nsPods []*api.Pod
 
     selectors := make([]labels.Selector, 0)
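As a rough sketch of getZoneKey's behavior (label values are illustrative; the helper name exampleZoneKeys and the fmt import are not part of the commit):

func exampleZoneKeys() {
    labeled := &api.Node{ObjectMeta: api.ObjectMeta{Labels: map[string]string{
        unversioned.LabelZoneRegion:        "us-central1",
        unversioned.LabelZoneFailureDomain: "us-central1-a",
    }}}
    unlabeled := &api.Node{}

    fmt.Println(getZoneKey(labeled))   // "us-central1:us-central1-a"
    fmt.Println(getZoneKey(unlabeled)) // "" (nodes without zone labels are ignored for zone spreading)
}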
@@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
         return nil, err
     }
 
-    counts := map[string]int{}
+    maxCountByNodeName := 0
+    countsByNodeName := map[string]int{}
     if len(nsPods) > 0 {
         for _, pod := range nsPods {
+            // When we are replacing a failed pod, we often see the previous deleted version
+            // while scheduling the replacement. Ignore the previous deleted version for spreading
+            // purposes (it can still be considered for resource restrictions etc.)
+            if pod.DeletionTimestamp != nil {
+                glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+                continue
+            }
             matches := false
             for _, selector := range selectors {
                 if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
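The DeletionTimestamp check matters when a controller replaces a failed pod: the old pod can still appear in the lister while it is being torn down, and counting it would unfairly penalize its node and zone. A hypothetical sketch (pod name and namespace are made up):

deletionTime := unversioned.Now()
oldPod := &api.Pod{ObjectMeta: api.ObjectMeta{
    Name:              "frontend-abc12", // hypothetical pod being replaced
    Namespace:         "default",
    DeletionTimestamp: &deletionTime,
}}
// oldPod.DeletionTimestamp != nil, so the loop above skips it and it does not
// inflate countsByNodeName (it can still count toward resource restrictions elsewhere).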
@@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
                 }
             }
             if matches {
-                counts[pod.Spec.NodeName]++
+                countsByNodeName[pod.Spec.NodeName]++
                 // Compute the maximum number of pods hosted on any node
-                if counts[pod.Spec.NodeName] > maxCount {
-                    maxCount = counts[pod.Spec.NodeName]
+                if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
+                    maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
                 }
             }
         }
     }
 
+    maxCountByZone := 0
+    haveZones := false
+    countsByZone := map[string]int{}
+    for i := range nodes.Items {
+        node := &nodes.Items[i]
+
+        count, found := countsByNodeName[node.Name]
+        if !found {
+            continue
+        }
+
+        zoneId := getZoneKey(node)
+        if zoneId == "" {
+            continue
+        }
+
+        haveZones = true
+        countsByZone[zoneId] += count
+        // Compute the maximum number of pods hosted in any zone
+        if countsByZone[zoneId] > maxCountByZone {
+            maxCountByZone = countsByZone[zoneId]
+        }
+    }
+
     result := []schedulerapi.HostPriority{}
     //score int - scale of 0-10
     // 0 being the lowest priority and 10 being the highest
-    for _, node := range nodes.Items {
+    for i := range nodes.Items {
+        node := &nodes.Items[i]
         // initializing to the default/max node score of 10
         fScore := float32(10)
-        if maxCount > 0 {
-            fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+        if maxCountByNodeName > 0 {
+            fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
         }
 
+        // If there is zone information present, incorporate it
+        if haveZones {
+            zoneId := getZoneKey(node)
+            if zoneId != "" {
+                fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+            }
+
+            // Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+            // TODO: Any way to justify this weighting?
+            fScore /= 3.0
+        }
+
         result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
         glog.V(10).Infof(
             "%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
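To make the 1/3-node, 2/3-zone weighting concrete, here is the arithmetic for the "five pods, 3 matching (z2=2, z3=1)" case added to the tests below, written out as a sketch; the counts come from that test (maxCountByNodeName = 1, maxCountByZone = 2, with both matching zone2 pods on separate nodes):

// machine1.zone3: no matching pod on the node, one matching pod in its zone
fScore := 10 * float32(1-0) / float32(1) // 10 from node spreading
fScore += 20 * float32(2-1) / float32(2) // +10 from zone spreading
fScore /= 3.0                            // 20/3 ≈ 6.7 → int score 6
_ = fScore

// machine2.zone3 (matching pod on the node itself): (0 + 10) / 3 → 3
// machine1.zone1 (empty node and empty zone):       (10 + 20) / 3 → 10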
@@ -22,6 +22,7 @@ import (
     "testing"
 
     "k8s.io/kubernetes/pkg/api"
+    wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
@@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) {
     }
 }
 
+func TestZoneSelectorSpreadPriority(t *testing.T) {
+    labels1 := map[string]string{
+        "label1": "l1",
+        "baz":    "blah",
+    }
+    labels2 := map[string]string{
+        "label2": "l2",
+        "baz":    "blah",
+    }
+    nodeLabelsZone1 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone1",
+    }
+    nodeLabelsZone2 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone2",
+    }
+    nodeLabelsZone3 := map[string]string{
+        wellknownlabels.LabelZoneFailureDomain: "zone3",
+    }
+    labeledNodes := map[string]map[string]string{
+        "machine1.zone1": nodeLabelsZone1,
+        "machine1.zone2": nodeLabelsZone2,
+        "machine2.zone2": nodeLabelsZone2,
+        "machine1.zone3": nodeLabelsZone3,
+        "machine2.zone3": nodeLabelsZone3,
+        "machine3.zone3": nodeLabelsZone3,
+    }
+
+    buildPod := func(nodeName string, labels map[string]string) *api.Pod {
+        pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
+        return pod
+    }
+
+    tests := []struct {
+        pod          *api.Pod
+        pods         []*api.Pod
+        nodes        []string
+        rcs          []api.ReplicationController
+        services     []api.Service
+        expectedList schedulerapi.HostPriorityList
+        test         string
+    }{
+        {
+            pod: new(api.Pod),
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "nothing scheduled",
+        },
+        {
+            pod:  buildPod("", labels1),
+            pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "no services",
+        },
+        {
+            pod:      buildPod("", labels1),
+            pods:     []*api.Pod{buildPod("machine1.zone1", labels2)},
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 10},
+                {"machine2.zone2", 10},
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "different services",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels2),
+                buildPod("machine1.zone2", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 0}, // Already have pod on machine
+                {"machine2.zone2", 3}, // Already have pod in zone
+                {"machine1.zone3", 10},
+                {"machine2.zone3", 10},
+                {"machine3.zone3", 10},
+            },
+            test: "two pods, 1 matching (in z2)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels2),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine2.zone2", labels1),
+                buildPod("machine1.zone3", labels2),
+                buildPod("machine2.zone3", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 10},
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 0}, // Pod on node
+                {"machine1.zone3", 6}, // Pod in zone
+                {"machine2.zone3", 3}, // Pod on node
+                {"machine3.zone3", 6}, // Pod in zone
+            },
+            test: "five pods, 3 matching (z2=2, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine2.zone2", labels2),
+                buildPod("machine1.zone3", labels1),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 0}, // Pod on node
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 3}, // Pod in zone
+                {"machine1.zone3", 0}, // Pod on node
+                {"machine2.zone3", 3}, // Pod in zone
+                {"machine3.zone3", 3}, // Pod in zone
+            },
+            test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone1", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine1.zone3", labels1),
+                buildPod("machine2.zone2", labels2),
+            },
+            services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                {"machine1.zone1", 0}, // Pod on node
+                {"machine1.zone2", 0}, // Pod on node
+                {"machine2.zone2", 3}, // Pod in zone
+                {"machine1.zone3", 0}, // Pod on node
+                {"machine2.zone3", 3}, // Pod in zone
+                {"machine3.zone3", 3}, // Pod in zone
+            },
+            test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+        },
+        {
+            pod: buildPod("", labels1),
+            pods: []*api.Pod{
+                buildPod("machine1.zone3", labels1),
+                buildPod("machine1.zone2", labels1),
+                buildPod("machine1.zone3", labels1),
+            },
+            rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
+            expectedList: []schedulerapi.HostPriority{
+                // Note that because we put two pods on the same node (machine1.zone3),
+                // the values here are questionable for zone2, in particular for machine1.zone2.
+                // However they kind of make sense; zone1 is still most-highly favored.
+                // zone3 is in general least favored, and m1.z3 particularly low priority.
+                // We would probably prefer to see a bigger gap between putting a second
+                // pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
+                // This is also consistent with what we have already.
+                {"machine1.zone1", 10}, // No pods in zone
+                {"machine1.zone2", 5},  // Pod on node
+                {"machine2.zone2", 6},  // Pod in zone
+                {"machine1.zone3", 0},  // Two pods on node
+                {"machine2.zone3", 3},  // Pod in zone
+                {"machine3.zone3", 3},  // Pod in zone
+            },
+            test: "Replication controller spreading (z1=0, z2=1, z3=2)",
+        },
+    }
+
+    for _, test := range tests {
+        selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
+        list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+        }
+        // sort the two lists to avoid failures on account of different ordering
+        sort.Sort(test.expectedList)
+        sort.Sort(list)
+        if !reflect.DeepEqual(test.expectedList, list) {
+            t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+        }
+    }
+}
+
 func TestZoneSpreadPriority(t *testing.T) {
     labels1 := map[string]string{
         "foo": "bar",
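For reference, the replication-controller expectations above fall out of the same formula: with two matching pods on machine1.zone3 and one on machine1.zone2, maxCountByNodeName = 2 and maxCountByZone = 2, so for example:

// machine1.zone2: (10*(2-1)/2 + 20*(2-1)/2) / 3 = (5 + 10) / 3 → 5
// machine2.zone2: (10*(2-0)/2 + 20*(2-1)/2) / 3 = (10 + 10) / 3 → 6
// machine2.zone3: (10*(2-0)/2 + 20*(2-2)/2) / 3 = (10 + 0) / 3 → 3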