EvenPodsSpread: Core Priority logic

2026-01-05 15:37:24 +00:00 · 2019-05-10 14:56:25 -07:00
parent eefc18a763
commit f25cc921e1
2 changed files with 495 additions and 0 deletions
--- a/pkg/scheduler/algorithm/priorities/even_pods_spread.go
+++ b/pkg/scheduler/algorithm/priorities/even_pods_spread.go
@@ -15,3 +15,196 @@ limitations under the License.
 */

 package priorities
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/util/workqueue"
+	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
+	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
+	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
+
+	"k8s.io/klog"
+)
+
+type topologyPair struct {
+	key   string
+	value string
+}
+
+type topologySpreadConstrantsMap struct {
+	// The first error that we faced.
+	firstError error
+	sync.Mutex
+
+	// counts store the mapping from node name to so-far computed score of
+	// the node.
+	counts map[string]*int64
+	// total number of matching pods on each qualified <topologyKey:value> pair
+	total int64
+	// topologyPairToNodeNames store the mapping from potential <topologyKey:value>
+	// pair to node names
+	topologyPairToNodeNames map[topologyPair][]string
+}
+
+func newTopologySpreadConstrantsMap(len int) *topologySpreadConstrantsMap {
+	return &topologySpreadConstrantsMap{
+		counts:                  make(map[string]*int64, len),
+		topologyPairToNodeNames: make(map[topologyPair][]string),
+	}
+}
+
+func (t *topologySpreadConstrantsMap) setError(err error) {
+	t.Lock()
+	if t.firstError == nil {
+		t.firstError = err
+	}
+	t.Unlock()
+}
+
+func (t *topologySpreadConstrantsMap) initialize(pod *v1.Pod, nodes []*v1.Node) {
+	constraints := getSoftTopologySpreadConstraints(pod)
+	for _, node := range nodes {
+		labelSet := labels.Set(node.Labels)
+		allMatch := true
+		var pairs []topologyPair
+		for _, constraint := range constraints {
+			tpKey := constraint.TopologyKey
+			if !labelSet.Has(tpKey) {
+				allMatch = false
+				break
+			}
+			pairs = append(pairs, topologyPair{key: tpKey, value: node.Labels[tpKey]})
+		}
+		if allMatch {
+			for _, pair := range pairs {
+				t.topologyPairToNodeNames[pair] = append(t.topologyPairToNodeNames[pair], node.Name)
+			}
+			t.counts[node.Name] = new(int64)
+		}
+		// for those nodes which don't have all required topologyKeys present, it's intentional to
+		// leave counts[nodeName] as nil, so that we're able to score these nodes to 0 afterwards
+	}
+}
+
+// CalculateEvenPodsSpreadPriority computes a score by checking through the topologySpreadConstraints
+// that are with WhenUnsatifiable=ScheduleAnyway (a.k.a soft constraint).
+// For each node (not only "filtered" nodes by Predicates), it adds the number of matching pods
+// (all topologySpreadConstraints must be satified) as a "weight" to any "filtered" node
+// which has the <topologyKey:value> pair present.
+// Then the sumed "weight" are normalized to 0~10, and the node(s) with the highest score are
+// the most preferred.
+// Symmetry is not considered.
+func CalculateEvenPodsSpreadPriority(pod *v1.Pod, nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error) {
+	nodesLen := len(nodes)
+	result := make(schedulerapi.HostPriorityList, nodesLen)
+	// if incoming pod doesn't have soft topology spread constraints, return
+	constraints := getSoftTopologySpreadConstraints(pod)
+	if len(constraints) == 0 {
+		return result, nil
+	}
+
+	t := newTopologySpreadConstrantsMap(len(nodes))
+	t.initialize(pod, nodes)
+
+	allNodeNames := make([]string, 0, len(nodeNameToInfo))
+	for name := range nodeNameToInfo {
+		allNodeNames = append(allNodeNames, name)
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	processNode := func(i int) {
+		nodeInfo := nodeNameToInfo[allNodeNames[i]]
+		if node := nodeInfo.Node(); node != nil {
+			// (1) `node` should satisfy incoming pod's NodeSelector/NodeAffinity
+			// (2) All topologyKeys need to be present in `node`
+			if !predicates.PodMatchesNodeSelectorAndAffinityTerms(pod, node) ||
+				!predicates.NodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
+				return
+			}
+			matchCount := 0
+			for _, existingPod := range nodeInfo.Pods() {
+				match, err := predicates.PodMatchesAllSpreadConstraints(existingPod, pod.Namespace, constraints)
+				if err != nil {
+					t.setError(err)
+					cancel()
+					return
+				}
+				if match {
+					matchCount++
+				}
+			}
+			// add matchCount up to EACH node which is at least in one topology domain
+			// with current node
+			for _, constraint := range constraints {
+				tpKey := constraint.TopologyKey
+				pair := topologyPair{key: tpKey, value: node.Labels[tpKey]}
+				for _, nodeName := range t.topologyPairToNodeNames[pair] {
+					atomic.AddInt64(t.counts[nodeName], int64(matchCount))
+					atomic.AddInt64(&t.total, int64(matchCount))
+				}
+			}
+		}
+	}
+	workqueue.ParallelizeUntil(ctx, 16, len(allNodeNames), processNode)
+	if t.firstError != nil {
+		return nil, t.firstError
+	}
+
+	var maxCount, minCount int64
+	for _, node := range nodes {
+		if t.counts[node.Name] == nil {
+			continue
+		}
+		// reverse
+		count := t.total - *t.counts[node.Name]
+		if count > maxCount {
+			maxCount = count
+		} else if count < minCount {
+			minCount = count
+		}
+		t.counts[node.Name] = &count
+	}
+	// calculate final priority score for each node
+	// TODO(Huang-Wei): in alpha version, we keep the formula as simple as possible.
+	// current version ranks the nodes properly, but it doesn't take MaxSkew into
+	// consideration, we may come up with a better formula in the future.
+	maxMinDiff := maxCount - minCount
+	for i := range nodes {
+		node := nodes[i]
+		result[i].Host = node.Name
+		if t.counts[node.Name] == nil {
+			result[i].Score = 0
+			continue
+		}
+		if maxMinDiff == 0 {
+			result[i].Score = schedulerapi.MaxPriority
+			continue
+		}
+		fScore := float64(schedulerapi.MaxPriority) * (float64(*t.counts[node.Name]-minCount) / float64(maxMinDiff))
+		// need to reverse b/c the more matching pods it has, the less qualified it is
+		// result[i].Score = schedulerapi.MaxPriority - int(fScore)
+		result[i].Score = int(fScore)
+		if klog.V(10) {
+			klog.Infof("%v -> %v: EvenPodsSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
+		}
+	}
+
+	return result, nil
+}
+
+// TODO(Huang-Wei): combine this with getHardTopologySpreadConstraints() in predicates package
+func getSoftTopologySpreadConstraints(pod *v1.Pod) (constraints []v1.TopologySpreadConstraint) {
+	if pod != nil {
+		for _, constraint := range pod.Spec.TopologySpreadConstraints {
+			if constraint.WhenUnsatisfiable == v1.ScheduleAnyway {
+				constraints = append(constraints, constraint)
+			}
+		}
+	}
+	return
+}