Merge pull request #57476 from misterikkit/podAffinityNode
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Restrict pod affinity search when TopologyKey=kubernetes.io/hostname

**What this PR does / why we need it**:
When a PodAffinityTerm uses TopologyKey=kubernetes.io/hostname, we can avoid searching the entire cluster for a match by only listing pods on the given node.

This is a set of 3 PRs targeting affinity predicate performance (#57476, #57477, #57478). The key takeaway is an approximately 2x speedup in the large anti-affinity benchmark (BenchmarkSchedulingAntiAffinity/500Nodes/5000Pods). The unexpected increase in BenchmarkScheduling/1000Nodes/1000Pods appears to be an outlier and did not recur on subsequent runs.

| test | b.N | master | #57476 | #57477 | #57478 | combined |
| ---- | --- | ------ | ------ | ------ | ------ | -------- |
| BenchmarkScheduling/100Nodes/0Pods | 100 | 39629010 ns/op | 36898566 ns/op (-6.89%) | 38461530 ns/op (-2.95%) | 36214136 ns/op (-8.62%) | 43090781 ns/op (+8.74%) |
| BenchmarkScheduling/100Nodes/1000Pods | 100 | 85489577 ns/op | 69538016 ns/op (-18.66%) | 70104254 ns/op (-18.00%) | 75015585 ns/op (-12.25%) | 80986960 ns/op (-5.27%) |
| BenchmarkScheduling/1000Nodes/0Pods | 100 | 219356660 ns/op | 200149051 ns/op (-8.76%) | 192867469 ns/op (-12.08%) | 196896770 ns/op (-10.24%) | 212563662 ns/op (-3.10%) |
| BenchmarkScheduling/1000Nodes/1000Pods | 100 | 380368238 ns/op | 381786369 ns/op (+0.37%) | 387224973 ns/op (+1.80%) | 417974358 ns/op (+9.89%) | 411140230 ns/op (+8.09%) |
| BenchmarkSchedulingAntiAffinity/500Nodes/250Pods | 250 | 124399176 ns/op | 97568988 ns/op (-21.57%) | 112027363 ns/op (-9.95%) | 129134326 ns/op (+3.81%) | 98607941 ns/op (-20.73%) |
| BenchmarkSchedulingAntiAffinity/500Nodes/5000Pods | 250 | 491677096 ns/op | 441562422 ns/op (-10.19%) | 278127757 ns/op (-43.43%) | 447355609 ns/op (-9.01%) | 226310721 ns/op (-53.97%) |

The "combined" column contains all three patches. Percentages are relative to master.

Methodology: I ran the tests on each branch with this command.

```
make test-integration WHAT="./test/integration/scheduler_perf" KUBE_TEST_ARGS="-run=xxxx -bench=."
```

The benchmarks have a fair amount of variance to them, and I did not run them enough times to measure mean and standard deviation.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #
The three PRs in this set should collectively fix #54189.

**Special notes for your reviewer**:

**Release note**:

```release-note
Improve scheduler performance of MatchInterPodAffinity predicate.
```
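To make the fast path concrete, here is a minimal illustrative sketch (not code from this PR); the `app=web` selector and the surrounding `main` are hypothetical:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical term: "avoid nodes already running a pod labeled
	// app=web". Because TopologyKey is the per-node hostname label, the
	// predicate only needs to scan pods on the candidate node
	// (nodeInfo.Pods()) instead of every pod in the cluster.
	term := v1.PodAffinityTerm{
		LabelSelector: &metav1.LabelSelector{
			MatchLabels: map[string]string{"app": "web"},
		},
		TopologyKey: "kubernetes.io/hostname",
	}
	fmt.Printf("fast-path eligible: %v\n", term.TopologyKey == "kubernetes.io/hostname")
}
```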
```diff
@@ -1116,7 +1116,7 @@ func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta algorithm
 // First return value indicates whether a matching pod exists on a node that matches the topology key,
 // while the second return value indicates whether a matching pod exists anywhere.
 // TODO: Do we really need any pod matching, or all pods matching? I think the latter.
-func (c *PodAffinityChecker) anyPodMatchesPodAffinityTerm(pod *v1.Pod, allPods []*v1.Pod, node *v1.Node, term *v1.PodAffinityTerm) (bool, bool, error) {
+func (c *PodAffinityChecker) anyPodMatchesPodAffinityTerm(pod *v1.Pod, pods []*v1.Pod, nodeInfo *schedulercache.NodeInfo, term *v1.PodAffinityTerm) (bool, bool, error) {
 	if len(term.TopologyKey) == 0 {
 		return false, false, fmt.Errorf("empty topologyKey is not allowed except for PreferredDuringScheduling pod anti-affinity")
 	}
@@ -1126,7 +1126,12 @@ func (c *PodAffinityChecker) anyPodMatchesPodAffinityTerm(pod *v1.Pod, allPods [
 	if err != nil {
 		return false, false, err
 	}
-	for _, existingPod := range allPods {
+	// Special case: When the topological domain is node, we can limit our
+	// search to pods on that node without searching the entire cluster.
+	if term.TopologyKey == kubeletapis.LabelHostname {
+		pods = nodeInfo.Pods()
+	}
+	for _, existingPod := range pods {
 		match := priorityutil.PodMatchesTermsNamespaceAndSelector(existingPod, namespaces, selector)
 		if match {
 			matchingPodExists = true
@@ -1134,7 +1139,7 @@ func (c *PodAffinityChecker) anyPodMatchesPodAffinityTerm(pod *v1.Pod, allPods [
 			if err != nil {
 				return false, matchingPodExists, err
 			}
-			if priorityutil.NodesHaveSameTopologyKey(node, existingPodNode, term.TopologyKey) {
+			if priorityutil.NodesHaveSameTopologyKey(nodeInfo.Node(), existingPodNode, term.TopologyKey) {
 				return true, matchingPodExists, nil
 			}
 		}
@@ -1334,7 +1339,7 @@ func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod, node
 
 	// Check all affinity terms.
 	for _, term := range GetPodAffinityTerms(affinity.PodAffinity) {
-		termMatches, matchingPodExists, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, node, &term)
+		termMatches, matchingPodExists, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
 		if err != nil {
 			errMessage := fmt.Sprintf("Cannot schedule pod %+v onto node %v, because of PodAffinityTerm %v, err: %v", podName(pod), node.Name, term, err)
 			glog.Error(errMessage)
@@ -1367,7 +1372,7 @@ func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod, node
 
 	// Check all anti-affinity terms.
 	for _, term := range GetPodAntiAffinityTerms(affinity.PodAntiAffinity) {
-		termMatches, _, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, node, &term)
+		termMatches, _, err := c.anyPodMatchesPodAffinityTerm(pod, filteredPods, nodeInfo, &term)
 		if err != nil || termMatches {
 			glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because of PodAntiAffinityTerm %v, err: %v",
 				podName(pod), node.Name, term, err)
```
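For readers unfamiliar with the helper the diff relies on, `priorityutil.NodesHaveSameTopologyKey` amounts to checking that both nodes carry the topology key label with equal values. A minimal sketch under that assumption (illustrative only, not the real implementation; `nodesHaveSameTopologyKey` and the `mk` constructor are hypothetical):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodesHaveSameTopologyKey is an illustrative stand-in for
// priorityutil.NodesHaveSameTopologyKey: two nodes share a topological
// domain iff both carry the topology key label with the same value.
func nodesHaveSameTopologyKey(nodeA, nodeB *v1.Node, topologyKey string) bool {
	if nodeA == nil || nodeB == nil || len(topologyKey) == 0 {
		return false
	}
	valueA, okA := nodeA.Labels[topologyKey]
	valueB, okB := nodeB.Labels[topologyKey]
	return okA && okB && valueA == valueB
}

func main() {
	mk := func(name string) *v1.Node {
		return &v1.Node{ObjectMeta: metav1.ObjectMeta{
			Name:   name,
			Labels: map[string]string{"kubernetes.io/hostname": name},
		}}
	}
	a, b := mk("node-a"), mk("node-b")
	// The hostname label is unique per node, so the only pods that can
	// share that domain are the ones already on the candidate node --
	// which is why the search can be limited to nodeInfo.Pods().
	fmt.Println(nodesHaveSameTopologyKey(a, a, "kubernetes.io/hostname")) // true
	fmt.Println(nodesHaveSameTopologyKey(a, b, "kubernetes.io/hostname")) // false
}
```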