When scheduling, spread between zones if labeled
We already spread across nodes; we modify this spreading preference to spread across zones when nodes are labeled with zone information.
commit 6aa16c744b (parent c458cd7bb7)
@@ -19,6 +19,7 @@ package priorities
 import (
 	"github.com/golang/glog"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/labels"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -37,11 +38,28 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
 	return selectorSpread.CalculateSpreadPriority
 }
 
+// Helper function that builds a string identifier that is unique per failure-zone
+// Returns empty-string for no zone
+func getZoneKey(node *api.Node) string {
+	labels := node.Labels
+	if labels == nil {
+		return ""
+	}
+
+	region, _ := labels[unversioned.LabelZoneRegion]
+	failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+	if region == "" && failureDomain == "" {
+		return ""
+	}
+
+	return region + ":" + failureDomain
+}
+
 // CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
 // Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
 // pods which match the same selectors of Services and RCs as current pod.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	var maxCount int
 	var nsPods []*api.Pod
 
 	selectors := make([]labels.Selector, 0)
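For context (an editorial sketch, not part of the diff): getZoneKey keys each node by its well-known region and failure-domain labels, so nodes in the same zone share a key and unlabeled nodes fall back to node-only spreading. A minimal illustration, assuming the priorities package plus "fmt"; the node names and the region/zone values here are invented:

// Illustrative only: shows what getZoneKey returns for a labeled and an
// unlabeled node. The label keys are the constants read above; the concrete
// region/zone values are made up for the example.
func exampleZoneKeys() {
	labeled := &api.Node{ObjectMeta: api.ObjectMeta{
		Name: "machine1",
		Labels: map[string]string{
			unversioned.LabelZoneRegion:        "region-a",
			unversioned.LabelZoneFailureDomain: "zone-1",
		},
	}}
	unlabeled := &api.Node{ObjectMeta: api.ObjectMeta{Name: "machine2"}}

	fmt.Println(getZoneKey(labeled))   // "region-a:zone-1": pods here are counted per zone as well as per node
	fmt.Println(getZoneKey(unlabeled)) // "": this node only participates in node-level spreading
}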
@@ -76,9 +94,17 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 		return nil, err
 	}
 
-	counts := map[string]int{}
+	maxCountByNodeName := 0
+	countsByNodeName := map[string]int{}
 	if len(nsPods) > 0 {
 		for _, pod := range nsPods {
+			// When we are replacing a failed pod, we often see the previous deleted version
+			// while scheduling the replacement. Ignore the previous deleted version for spreading
+			// purposes (it can still be considered for resource restrictions etc.)
+			if pod.DeletionTimestamp != nil {
+				glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+				continue
+			}
 			matches := false
 			for _, selector := range selectors {
 				if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
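An aside on the DeletionTimestamp check added above (editorial, not part of the commit): a pod that is being replaced often still shows up in the pod lister while its replacement is scheduled, so it is ignored for spreading counts even though it may still matter for other checks such as resource fit. A standalone sketch of the same filter, with a hypothetical helper name:

// Illustrative only: count pods per node while skipping pods that are
// already marked for deletion, mirroring the check added in this commit.
func countActivePodsByNode(pods []*api.Pod) map[string]int {
	counts := map[string]int{}
	for _, p := range pods {
		if p.DeletionTimestamp != nil {
			// Pending deletion: ignore for spreading purposes.
			continue
		}
		counts[p.Spec.NodeName]++
	}
	return counts
}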
@@ -87,24 +113,62 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
 				}
 			}
 			if matches {
-				counts[pod.Spec.NodeName]++
+				countsByNodeName[pod.Spec.NodeName]++
 				// Compute the maximum number of pods hosted on any node
-				if counts[pod.Spec.NodeName] > maxCount {
-					maxCount = counts[pod.Spec.NodeName]
+				if countsByNodeName[pod.Spec.NodeName] > maxCountByNodeName {
+					maxCountByNodeName = countsByNodeName[pod.Spec.NodeName]
 				}
 			}
 		}
 	}
 
+	maxCountByZone := 0
+	haveZones := false
+	countsByZone := map[string]int{}
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+
+		count, found := countsByNodeName[node.Name]
+		if !found {
+			continue
+		}
+
+		zoneId := getZoneKey(node)
+		if zoneId == "" {
+			continue
+		}
+
+		haveZones = true
+		countsByZone[zoneId] += count
+		// Compute the maximum number of pods hosted in any zone
+		if countsByZone[zoneId] > maxCountByZone {
+			maxCountByZone = countsByZone[zoneId]
+		}
+	}
+
 	result := []schedulerapi.HostPriority{}
 	//score int - scale of 0-10
 	// 0 being the lowest priority and 10 being the highest
-	for _, node := range nodes.Items {
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
 		// initializing to the default/max node score of 10
 		fScore := float32(10)
-		if maxCount > 0 {
-			fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+		if maxCountByNodeName > 0 {
+			fScore = 10 * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
 		}
+
+		// If there is zone information present, incorporate it
+		if haveZones {
+			zoneId := getZoneKey(node)
+			if zoneId != "" {
+				fScore += 20 * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+			}
+
+			// Give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+			// TODO: Any way to justify this weighting?
+			fScore /= 3.0
+		}
+
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
 		glog.V(10).Infof(
 			"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
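To make the new weighting concrete (an editorial restatement, not the commit's code): the node term contributes at most 10 points, the zone term at most 20, and the sum is divided by 3, so zone spreading carries roughly two thirds of the weight. A sketch with a hypothetical combinedScore helper; the maxZoneCount guard stands in for the scheduler's haveZones flag, and the sample calls reproduce the "five pods, 3 matching (z2=2, z3=1)" test case added below:

// Illustrative only: the per-node score restated outside the scheduler.
func combinedScore(nodeCount, maxNodeCount, zoneCount, maxZoneCount int) int {
	fScore := float32(10)
	if maxNodeCount > 0 {
		fScore = 10 * (float32(maxNodeCount-nodeCount) / float32(maxNodeCount))
	}
	if maxZoneCount > 0 {
		fScore += 20 * (float32(maxZoneCount-zoneCount) / float32(maxZoneCount))
		fScore /= 3.0 // 2/3 of the weight to zone spreading, 1/3 to node spreading
	}
	return int(fScore)
}

// With node counts {m1.z2: 1, m2.z2: 1, m2.z3: 1} (max 1) and zone counts
// {zone2: 2, zone3: 1} (max 2):
//   combinedScore(0, 1, 0, 2) == 10 // machine1.zone1: empty node, empty zone
//   combinedScore(1, 1, 2, 2) == 0  // machine1.zone2: pod on node, fullest zone
//   combinedScore(0, 1, 1, 2) == 6  // machine1.zone3: empty node, half-loaded zone
//   combinedScore(1, 1, 1, 2) == 3  // machine2.zone3: pod on node, half-loaded zone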
@@ -22,6 +22,7 @@ import (
 	"testing"
 
 	"k8s.io/kubernetes/pkg/api"
+	wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
@@ -228,6 +229,203 @@ func TestSelectorSpreadPriority(t *testing.T) {
 	}
 }
 
+func TestZoneSelectorSpreadPriority(t *testing.T) {
+	labels1 := map[string]string{
+		"label1": "l1",
+		"baz":    "blah",
+	}
+	labels2 := map[string]string{
+		"label2": "l2",
+		"baz":    "blah",
+	}
+	nodeLabelsZone1 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone1",
+	}
+	nodeLabelsZone2 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone2",
+	}
+	nodeLabelsZone3 := map[string]string{
+		wellknownlabels.LabelZoneFailureDomain: "zone3",
+	}
+	labeledNodes := map[string]map[string]string{
+		"machine1.zone1": nodeLabelsZone1,
+		"machine1.zone2": nodeLabelsZone2,
+		"machine2.zone2": nodeLabelsZone2,
+		"machine1.zone3": nodeLabelsZone3,
+		"machine2.zone3": nodeLabelsZone3,
+		"machine3.zone3": nodeLabelsZone3,
+	}
+
+	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
+		pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
+		return pod
+	}
+
+	tests := []struct {
+		pod          *api.Pod
+		pods         []*api.Pod
+		nodes        []string
+		rcs          []api.ReplicationController
+		services     []api.Service
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			pod: new(api.Pod),
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "nothing scheduled",
+		},
+		{
+			pod:  buildPod("", labels1),
+			pods: []*api.Pod{buildPod("machine1.zone1", nil)},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "no services",
+		},
+		{
+			pod:      buildPod("", labels1),
+			pods:     []*api.Pod{buildPod("machine1.zone1", labels2)},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 10},
+				{"machine2.zone2", 10},
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "different services",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels2),
+				buildPod("machine1.zone2", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 0}, // Already have pod on machine
+				{"machine2.zone2", 3}, // Already have pod in zone
+				{"machine1.zone3", 10},
+				{"machine2.zone3", 10},
+				{"machine3.zone3", 10},
+			},
+			test: "two pods, 1 matching (in z2)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels2),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine2.zone2", labels1),
+				buildPod("machine1.zone3", labels2),
+				buildPod("machine2.zone3", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 10},
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 0}, // Pod on node
+				{"machine1.zone3", 6}, // Pod in zone
+				{"machine2.zone3", 3}, // Pod on node
+				{"machine3.zone3", 6}, // Pod in zone
+			},
+			test: "five pods, 3 matching (z2=2, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine2.zone2", labels2),
+				buildPod("machine1.zone3", labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 0}, // Pod on node
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 3}, // Pod in zone
+				{"machine1.zone3", 0}, // Pod on node
+				{"machine2.zone3", 3}, // Pod in zone
+				{"machine3.zone3", 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone1", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine1.zone3", labels1),
+				buildPod("machine2.zone2", labels2),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{"machine1.zone1", 0}, // Pod on node
+				{"machine1.zone2", 0}, // Pod on node
+				{"machine2.zone2", 3}, // Pod in zone
+				{"machine1.zone3", 0}, // Pod on node
+				{"machine2.zone3", 3}, // Pod in zone
+				{"machine3.zone3", 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod("machine1.zone3", labels1),
+				buildPod("machine1.zone2", labels1),
+				buildPod("machine1.zone3", labels1),
+			},
+			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				// Note that because we put two pods on the same node (machine1.zone3),
+				// the values here are questionable for zone2, in particular for machine1.zone2.
+				// However they kind of make sense; zone1 is still most-highly favored.
+				// zone3 is in general least favored, and m1.z3 particularly low priority.
+				// We would probably prefer to see a bigger gap between putting a second
+				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
+				// This is also consistent with what we have already.
+				{"machine1.zone1", 10}, // No pods in zone
+				{"machine1.zone2", 5},  // Pod on node
+				{"machine2.zone2", 6},  // Pod in zone
+				{"machine1.zone3", 0},  // Two pods on node
+				{"machine2.zone3", 3},  // Pod in zone
+				{"machine3.zone3", 3},  // Pod in zone
+			},
+			test: "Replication controller spreading (z1=0, z2=1, z3=2)",
+		},
+	}
+
+	for _, test := range tests {
+		selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
+		list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		// sort the two lists to avoid failures on account of different ordering
+		sort.Sort(test.expectedList)
+		sort.Sort(list)
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
+
 func TestZoneSpreadPriority(t *testing.T) {
 	labels1 := map[string]string{
 		"foo": "bar",
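As a closing editorial note on the last test case above: with two matching pods on machine1.zone3 and one on machine1.zone2, both maxCountByNodeName and maxCountByZone are 2, so the new formula gives machine1.zone1 (10 + 20)/3 = 10, machine1.zone2 (5 + 10)/3 = 5, machine2.zone2 (10 + 10)/3 truncated to 6, machine1.zone3 (0 + 0)/3 = 0, and the remaining zone3 machines (10 + 0)/3 truncated to 3, which matches the expected list.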