Merge pull request #103934 from boenn/tainttoleration

De-duplicate predicate (known as filter now) logic shared in kubelet and scheduler
2025-09-21 18:11:22 +00:00 · 2022-02-09 16:53:46 -08:00
parent cfb2219ded cec2aae1e5
commit 0dcd6eaa0d
8 changed files with 294 additions and 188 deletions
--- a/pkg/scheduler/eventhandlers.go
+++ b/pkg/scheduler/eventhandlers.go
@@ -21,20 +21,20 @@ import (
 	"reflect"
 	"strings"

-	"k8s.io/apimachinery/pkg/runtime/schema"
-	"k8s.io/client-go/dynamic/dynamicinformer"
-	"k8s.io/klog/v2"
-
 	v1 "k8s.io/api/core/v1"
 	storagev1 "k8s.io/api/storage/v1"
+	"k8s.io/apimachinery/pkg/runtime/schema"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/client-go/dynamic/dynamicinformer"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/tools/cache"
-	v1helper "k8s.io/component-helpers/scheduling/corev1"
-	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
+	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
+	corev1nodeaffinity "k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
+	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
+	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
@@ -470,37 +470,62 @@ func nodeSpecUnschedulableChanged(newNode *v1.Node, oldNode *v1.Node) bool {
 }

 func preCheckForNode(nodeInfo *framework.NodeInfo) queue.PreEnqueueCheck {
-	// In addition to the checks in kubelet (pkg/kubelet/lifecycle/predicate.go#GeneralPredicates),
-	// the following logic appends a taint/toleration check.
-	// TODO: verify if kubelet should also apply the taint/toleration check, and then unify the
-	// logic with kubelet and move to a shared place.
-	//
 	// Note: the following checks doesn't take preemption into considerations, in very rare
 	// cases (e.g., node resizing), "pod" may still fail a check but preemption helps. We deliberately
 	// chose to ignore those cases as unschedulable pods will be re-queued eventually.
 	return func(pod *v1.Pod) bool {
-		if len(noderesources.Fits(pod, nodeInfo, feature.DefaultFeatureGate.Enabled(features.PodOverhead))) != 0 {
+		admissionResults := AdmissionCheck(pod, nodeInfo, false)
+		if len(admissionResults) != 0 {
 			return false
 		}
-
-		// Ignore parsing errors for backwards compatibility.
-		matches, _ := nodeaffinity.GetRequiredNodeAffinity(pod).Match(nodeInfo.Node())
-		if !matches {
-			return false
-		}
-
-		if !nodename.Fits(pod, nodeInfo) {
-			return false
-		}
-
-		if !nodeports.Fits(pod, nodeInfo) {
-			return false
-		}
-
-		_, isUntolerated := v1helper.FindMatchingUntoleratedTaint(nodeInfo.Node().Spec.Taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
-			// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
-			return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
+		_, isUntolerated := corev1helpers.FindMatchingUntoleratedTaint(nodeInfo.Node().Spec.Taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
+			return t.Effect == v1.TaintEffectNoSchedule
 		})
 		return !isUntolerated
 	}
 }
+
+// AdmissionCheck runs the filtering logic of noderesources/nodeaffinity/nodename/nodeports
+// and returns the failure reasons. It's used in kubelet (pkg/kubelet/lifecycle/predicate.go) and scheduler.
+// If `includeAllFailures` is false, it stops at the first failing check and returns only that
+// check's failures (possibly several insufficient resources); otherwise it returns all failures.
+func AdmissionCheck(pod *v1.Pod, nodeInfo *framework.NodeInfo, includeAllFailures bool) []AdmissionResult {
+	var admissionResults []AdmissionResult
+	insufficientResources := noderesources.Fits(pod, nodeInfo, feature.DefaultFeatureGate.Enabled(features.PodOverhead))
+	if len(insufficientResources) != 0 {
+		for i := range insufficientResources {
+			admissionResults = append(admissionResults, AdmissionResult{InsufficientResource: &insufficientResources[i]})
+		}
+		if !includeAllFailures {
+			return admissionResults
+		}
+	}
+
+	if matches, _ := corev1nodeaffinity.GetRequiredNodeAffinity(pod).Match(nodeInfo.Node()); !matches {
+		admissionResults = append(admissionResults, AdmissionResult{Name: nodeaffinity.Name, Reason: nodeaffinity.ErrReasonPod})
+		if !includeAllFailures {
+			return admissionResults
+		}
+	}
+	if !nodename.Fits(pod, nodeInfo) {
+		admissionResults = append(admissionResults, AdmissionResult{Name: nodename.Name, Reason: nodename.ErrReason})
+		if !includeAllFailures {
+			return admissionResults
+		}
+	}
+	if !nodeports.Fits(pod, nodeInfo) {
+		admissionResults = append(admissionResults, AdmissionResult{Name: nodeports.Name, Reason: nodeports.ErrReason})
+		if !includeAllFailures {
+			return admissionResults
+		}
+	}
+	return admissionResults
+}
+
+// AdmissionResult describes one reason why a pod fails the admission check against a node.
+// For a resource-fit failure only InsufficientResource is set; otherwise Name and Reason are set.
+type AdmissionResult struct {
+	Name                 string
+	Reason               string
+	InsufficientResource *noderesources.InsufficientResource
+}