Merge pull request #62211 from bsalamat/affinity_performance

Automatic merge from submit-queue (batch tested with PRs 62467, 62482, 62211). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Improve performance of affinity/anti-affinity predicate by 20x in large clusters

**What this PR does / why we need it**:
Improves performance of the affinity/anti-affinity predicate by over 20x in large clusters. The improvement is smaller in small clusters, but it is still very significant, at about 4x. Also, before this PR, the predicate's performance degraded quadratically as the number of nodes and pods grew; as the results below show, the slowdown is now linear in larger clusters.

The affinity/anti-affinity predicate used to check every pod in the cluster, for each node in the cluster, to determine the feasibility of the affinity/anti-affinity terms of the pod being scheduled. With this optimization, all pods in the cluster that match the affinity/anti-affinity terms of the pod being scheduled are found once and stored in the predicate metadata; for each node, only the topology of those matching pods is then checked. This results in a major reduction of the search space per node and improves performance significantly.
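To make the idea concrete, below is a minimal, self-contained Go sketch of the approach. The types and helper names are simplified stand-ins for illustration only, not the scheduler's actual API; in the real change the matches are precomputed into the predicate metadata maps `nodeNameToMatchingAffinityPods` and `nodeNameToMatchingAntiAffinityPods` shown in the diff below.

```go
package main

import "fmt"

// Simplified stand-ins for the scheduler's pod and node objects (illustration
// only; the real code works with *v1.Pod and schedulercache.NodeInfo).
type pod struct {
	name     string
	labels   map[string]string
	nodeName string // node the pod is currently running on
}

type node struct {
	name   string
	labels map[string]string // topology labels, e.g. hostname or region
}

// matches is a stand-in for real label-selector matching.
func matches(p pod, selector map[string]string) bool {
	for k, v := range selector {
		if p.labels[k] != v {
			return false
		}
	}
	return true
}

// precomputeMatchingPods walks all pods exactly once and groups the pods that
// match the incoming pod's (anti-)affinity selector by the node they run on.
func precomputeMatchingPods(allPods []pod, selector map[string]string) map[string][]pod {
	matching := make(map[string][]pod)
	for _, p := range allPods {
		if matches(p, selector) {
			matching[p.nodeName] = append(matching[p.nodeName], p)
		}
	}
	return matching
}

// antiAffinityViolated checks one candidate node against only the precomputed
// matches: is any matching pod in the same topology domain as the candidate?
func antiAffinityViolated(candidate node, nodesByName map[string]node, topologyKey string, matching map[string][]pod) bool {
	for nodeName, pods := range matching {
		other, ok := nodesByName[nodeName]
		if ok && len(pods) > 0 && other.labels[topologyKey] == candidate.labels[topologyKey] {
			return true
		}
	}
	return false
}

func main() {
	nodesByName := map[string]node{
		"n1": {name: "n1", labels: map[string]string{"region": "us-east"}},
		"n2": {name: "n2", labels: map[string]string{"region": "us-west"}},
	}
	allPods := []pod{
		{name: "web-1", labels: map[string]string{"app": "web"}, nodeName: "n1"},
	}

	// One pass over all pods to build the per-node match map...
	matching := precomputeMatchingPods(allPods, map[string]string{"app": "web"})
	// ...then each candidate node only inspects the (usually small) match map.
	for _, name := range []string{"n1", "n2"} {
		fmt.Printf("anti-affinity violated on %s: %v\n", name, antiAffinityViolated(nodesByName[name], nodesByName, "region", matching))
	}
}
```

Instead of scanning every pod once per candidate node (roughly O(nodes × pods) work), the scan over all pods happens once and the per-node check only touches the matching pods, which is what turns the quadratic growth into roughly linear growth.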

The results below were obtained by running the scheduler benchmarks:
```
make test-integration WHAT=./test/integration/scheduler_perf KUBE_TEST_ARGS="-run=xxx -bench=.*BenchmarkSchedulingAntiAffinity"
```
```
AntiAffinity Topology: Hostname
before: BenchmarkSchedulingAntiAffinity/500Nodes/250Pods-12         	     	  37031638 ns/op
after:  BenchmarkSchedulingAntiAffinity/500Nodes/250Pods-12         	     	  10373222 ns/op

before: BenchmarkSchedulingAntiAffinity/500Nodes/5000Pods-12        	     	 134205302 ns/op
after:  BenchmarkSchedulingAntiAffinity/500Nodes/5000Pods-12        	     	  12000580 ns/op

before: BenchmarkSchedulingAntiAffinity/1000Nodes/10000Pods-12         	     	 498439953 ns/op
after:  BenchmarkSchedulingAntiAffinity/1000Nodes/10000Pods-12         	     	  24692552 ns/op


AntiAffinity Topology: Region
before: BenchmarkSchedulingAntiAffinity/500Nodes/250Pods-12         	     	  60003672 ns/op
after:  BenchmarkSchedulingAntiAffinity/500Nodes/250Pods-12         	     	  13346400 ns/op

before: BenchmarkSchedulingAntiAffinity/1000Nodes/10000Pods-12         	     	 600085491 ns/op
after: BenchmarkSchedulingAntiAffinity/1000Nodes/10000Pods-12         	     	  27783333 ns/op
```
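For example, in the largest hostname-topology case (1000 nodes, 10000 pods), per-pod scheduling latency drops from about 498 ms to about 25 ms, roughly 20x; the corresponding region-topology case improves from about 600 ms to about 28 ms.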

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #

ref: #56032 #47318 #25319

**Release note**:

```release-note
Improve performance of the affinity/anti-affinity predicate of the default scheduler significantly.
```

/sig scheduling
Kubernetes Submit Queue (committed by GitHub), 2018-04-13 07:25:21 -07:00
4 changed files with 475 additions and 58 deletions

@@ -1150,7 +1150,7 @@ func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta algorithm
 	if affinity == nil || (affinity.PodAffinity == nil && affinity.PodAntiAffinity == nil) {
 		return true, nil, nil
 	}
-	if failedPredicates, error := c.satisfiesPodsAffinityAntiAffinity(pod, nodeInfo, affinity); failedPredicates != nil {
+	if failedPredicates, error := c.satisfiesPodsAffinityAntiAffinity(pod, meta, nodeInfo, affinity); failedPredicates != nil {
 		failedPredicates := append([]algorithm.PredicateFailureReason{ErrPodAffinityNotMatch}, failedPredicates)
 		return false, failedPredicates, error
 	}
@@ -1380,60 +1380,129 @@ func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta
return nil, nil
}
// anyMatchingPodInTopology checks that any of the given Pods are in the
// topology specified by the affinity term.
func (c *PodAffinityChecker) anyMatchingPodInTopology(pod *v1.Pod, matchingPods map[string][]*v1.Pod, nodeInfo *schedulercache.NodeInfo, term *v1.PodAffinityTerm) (bool, error) {
if len(term.TopologyKey) == 0 {
return false, fmt.Errorf("empty topologyKey is not allowed except for PreferredDuringScheduling pod anti-affinity")
}
if len(matchingPods) == 0 {
return false, nil
}
// Special case: When the topological domain is node, we can limit our
// search to pods on that node without searching the entire cluster.
if term.TopologyKey == kubeletapis.LabelHostname {
if pods, ok := matchingPods[nodeInfo.Node().Name]; ok {
// It may seem odd that we are comparing a node with itself to see if it
// has the same topology key, but it is necessary to check extra conditions
// that the function performs, such as checking that node labels are not nil.
return len(pods) > 0 && priorityutil.NodesHaveSameTopologyKey(nodeInfo.Node(), nodeInfo.Node(), term.TopologyKey), nil
}
return false, nil
}
// Topology key is not "Hostname". Checking all matching pods.
for nodeName, pods := range matchingPods {
matchingPodNodeInfo, err := c.info.GetNodeInfo(nodeName)
if err != nil {
return false, err
}
if len(pods) > 0 && priorityutil.NodesHaveSameTopologyKey(nodeInfo.Node(), matchingPodNodeInfo, term.TopologyKey) {
return true, nil
}
}
return false, nil
}
// Checks if scheduling the pod onto this node would break any rules of this pod.
func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod, nodeInfo *schedulercache.NodeInfo, affinity *v1.Affinity) (algorithm.PredicateFailureReason, error) {
func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod,
meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo,
affinity *v1.Affinity) (algorithm.PredicateFailureReason, error) {
node := nodeInfo.Node()
if node == nil {
return ErrPodAffinityRulesNotMatch, fmt.Errorf("Node is nil")
}
filteredPods, err := c.podLister.FilteredList(nodeInfo.Filter, labels.Everything())
if err != nil {
return ErrPodAffinityRulesNotMatch, err
}
// Check all affinity terms.
for _, term := range GetPodAffinityTerms(affinity.PodAffinity) {
termMatches, matchingPodExists, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
if err != nil {
errMessage := fmt.Sprintf("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v, err: %v", podName(pod), node.Name, term, err)
glog.Error(errMessage)
return ErrPodAffinityRulesNotMatch, errors.New(errMessage)
}
if !termMatches {
// If the requirement matches a pod's own labels and namespace, and there are
// no other such pods, then disregard the requirement. This is necessary to
// not block forever because the first pod of the collection can't be scheduled.
if matchingPodExists {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v",
podName(pod), node.Name, term)
return ErrPodAffinityRulesNotMatch, nil
}
namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(pod, &term)
selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
if predicateMeta, ok := meta.(*predicateMetadata); ok {
// Check all affinity terms.
matchingPods := predicateMeta.nodeNameToMatchingAffinityPods
for _, term := range GetPodAffinityTerms(affinity.PodAffinity) {
termMatches, err := c.anyMatchingPodInTopology(pod, matchingPods, nodeInfo, &term)
if err != nil {
errMessage := fmt.Sprintf("Cannot parse selector on term %v for pod %v. Details %v", term, podName(pod), err)
errMessage := fmt.Sprintf("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v, err: %v", podName(pod), node.Name, term, err)
glog.Errorf(errMessage)
return ErrPodAffinityRulesNotMatch, errors.New(errMessage)
}
if !termMatches {
// This pod may be the first pod in a series that have affinity to themselves. In order
// to not leave such pods in pending state forever, we check that if no other pod
// in the cluster matches the namespace and selector of this pod and the pod matches
// its own terms, then we allow the pod to pass the affinity check.
if !(len(matchingPods) == 0 && targetPodMatchesAffinityOfPod(pod, pod)) {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v",
podName(pod), node.Name, term)
return ErrPodAffinityRulesNotMatch, nil
}
}
}
// Check all anti-affinity terms.
matchingPods = predicateMeta.nodeNameToMatchingAntiAffinityPods
for _, term := range GetPodAntiAffinityTerms(affinity.PodAntiAffinity) {
termMatches, err := c.anyMatchingPodInTopology(pod, matchingPods, nodeInfo, &term)
if err != nil || termMatches {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinityTerm %v, err: %v",
podName(pod), node.Name, term, err)
return ErrPodAntiAffinityRulesNotMatch, nil
}
}
} else { // We don't have precomputed metadata. We have to follow a slow path to check affinity rules.
filteredPods, err := c.podLister.FilteredList(nodeInfo.Filter, labels.Everything())
if err != nil {
return ErrPodAffinityRulesNotMatch, err
}
// Check all affinity terms.
for _, term := range GetPodAffinityTerms(affinity.PodAffinity) {
termMatches, matchingPodExists, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
if err != nil {
errMessage := fmt.Sprintf("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v, err: %v", podName(pod), node.Name, term, err)
glog.Error(errMessage)
return ErrPodAffinityRulesNotMatch, errors.New(errMessage)
}
match := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, namespaces, selector)
if !match {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v",
podName(pod), node.Name, term)
return ErrPodAffinityRulesNotMatch, nil
if !termMatches {
// If the requirement matches a pod's own labels and namespace, and there are
// no other such pods, then disregard the requirement. This is necessary to
// not block forever because the first pod of the collection can't be scheduled.
if matchingPodExists {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v",
podName(pod), node.Name, term)
return ErrPodAffinityRulesNotMatch, nil
}
namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(pod, &term)
selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
if err != nil {
errMessage := fmt.Sprintf("Cannot parse selector on term %v for pod %v. Details %v", term, podName(pod), err)
glog.Error(errMessage)
return ErrPodAffinityRulesNotMatch, errors.New(errMessage)
}
match := priorityutil.PodMatchesTermsNamespaceAndSelector(pod, namespaces, selector)
if !match {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v",
podName(pod), node.Name, term)
return ErrPodAffinityRulesNotMatch, nil
}
}
}
// Check all anti-affinity terms.
for _, term := range GetPodAntiAffinityTerms(affinity.PodAntiAffinity) {
termMatches, _, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
if err != nil || termMatches {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinityTerm %v, err: %v",
podName(pod), node.Name, term, err)
return ErrPodAntiAffinityRulesNotMatch, nil
}
}
}
// Check all anti-affinity terms.
for _, term := range GetPodAntiAffinityTerms(affinity.PodAntiAffinity) {
termMatches, _, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
if err != nil || termMatches {
glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinityTerm %v, err: %v",
podName(pod), node.Name, term, err)
return ErrPodAntiAffinityRulesNotMatch, nil
}
}
if glog.V(10) {
// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
// not logged. There is visible performance gain from it.