Merge pull request #50949 from bsalamat/preemption_eviction
Automatic merge from submit-queue
Add pod preemption to the scheduler
**What this PR does / why we need it**:
This is the last of a series of PRs to add priority-based preemption to the scheduler. This PR connects the preemption logic to the scheduler workflow.
**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #48646
**Special notes for your reviewer**:
This PR includes other PRs which are under review (#50805, #50405, #50190). All the new code is located in 43627afdf9.
**Release note**:
```release-note
Add priority-based preemption to the scheduler.
```
ref/ #47604
/assign @davidopp
@kubernetes/sig-scheduling-pr-reviews
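For orientation, here is a rough sketch of the flow this PR wires up, written against the `Schedule` and `Preempt` signatures visible in the diff below; the helper names (`annotateNominatedNode`, `deletePod`, `bind`) are hypothetical stand-ins for the scheduler's own plumbing, and the `v1` import path may differ by tree. When `Schedule` fails with a `FitError`, `Preempt` picks a node and a set of victims, the preemptor is marked with the nominated node, and the victims are evicted so the pod can be retried.

```go
// Sketch only: how a scheduling loop might hand a failed pod to the new
// Preempt logic. All helpers passed in are hypothetical; the v1 import path
// may be "k8s.io/kubernetes/pkg/api/v1" in older trees.
package scheduling

import (
	"k8s.io/api/core/v1"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)

func scheduleOne(
	schedule func(*v1.Pod, algorithm.NodeLister) (string, error),
	preempt func(*v1.Pod, algorithm.NodeLister, error) (*v1.Node, []*v1.Pod, error),
	annotateNominatedNode func(*v1.Pod, string) error, // hypothetical helper
	deletePod func(*v1.Pod) error, // hypothetical helper
	bind func(*v1.Pod, string) error, // hypothetical helper
	pod *v1.Pod,
	nodeLister algorithm.NodeLister,
) error {
	host, err := schedule(pod, nodeLister)
	if err == nil {
		return bind(pod, host)
	}
	// Preempt returns (nil, nil, nil) when the error is not a FitError or when
	// preemption cannot help the pod schedule.
	node, victims, pErr := preempt(pod, nodeLister, err)
	if pErr == nil && node != nil {
		// Record the nominated node on the preemptor and evict the victims; the
		// pod stays pending and is retried once the victims terminate.
		if aErr := annotateNominatedNode(pod, node.Name); aErr != nil {
			return aErr
		}
		for _, victim := range victims {
			if dErr := deletePod(victim); dErr != nil {
				return dErr
			}
		}
	}
	return err
}
```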
```diff
@@ -18,6 +18,7 @@ package core

 import (
 	"fmt"
+	"math"
 	"sort"
 	"strings"
 	"sync"
```
```diff
@@ -32,6 +33,7 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/util"

 	"github.com/golang/glog"
 )
```
```diff
@@ -45,7 +47,14 @@ type FitError struct {

 var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")

-const NoNodeAvailableMsg = "No nodes are available that match all of the following predicates"
+const (
+	NoNodeAvailableMsg = "No nodes are available that match all of the predicates"
+	// NominatedNodeAnnotationKey is used to annotate a pod that has preempted other pods.
+	// The scheduler uses the annotation to find that the pod shouldn't preempt more pods
+	// when it gets to the head of scheduling queue again.
+	// See podEligibleToPreemptOthers() for more information.
+	NominatedNodeAnnotationKey = "NominatedNodeName"
+)

 // Error returns detailed information of why the pod failed to fit on each node
 func (f *FitError) Error() string {
```
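A minimal illustration (hypothetical helper, not code from this PR) of what the new annotation key is for: after a successful preemption the scheduler records the chosen node on the preemptor, and `podEligibleToPreemptOthers` later in this diff reads it back to avoid preempting again while the victims are still terminating.

```go
// Illustration only: marking a preemptor pod with the node it preempted on,
// under the NominatedNodeAnnotationKey introduced above. The real scheduler
// persists this via an API update rather than mutating a local object; the v1
// import path may differ by tree.
package scheduling

import v1 "k8s.io/api/core/v1"

const nominatedNodeAnnotationKey = "NominatedNodeName"

func setNominatedNode(pod *v1.Pod, nodeName string) {
	if pod.Annotations == nil {
		pod.Annotations = map[string]string{}
	}
	pod.Annotations[nominatedNodeAnnotationKey] = nodeName
}
```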
```diff
@@ -73,7 +82,7 @@ type genericScheduler struct {
 	equivalenceCache      *EquivalenceCache
 	predicates            map[string]algorithm.FitPredicate
 	priorityMetaProducer  algorithm.MetadataProducer
-	predicateMetaProducer algorithm.MetadataProducer
+	predicateMetaProducer algorithm.PredicateMetadataProducer
 	prioritizers          []algorithm.PriorityConfig
 	extenders             []algorithm.SchedulerExtender
 	pods                  algorithm.PodLister
```
```diff
@@ -159,6 +168,65 @@ func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList
 	return priorityList[ix].Host, nil
 }

+// preempt finds nodes with pods that can be preempted to make room for "pod" to
+// schedule. It chooses one of the nodes and preempts the pods on the node and
+// returns the node and the list of preempted pods if such a node is found.
+// TODO(bsalamat): Add priority-based scheduling. More info: today one or more
+// pending pods (different from the pod that triggered the preemption(s)) may
+// schedule into some portion of the resources freed up by the preemption(s)
+// before the pod that triggered the preemption(s) has a chance to schedule
+// there, thereby preventing the pod that triggered the preemption(s) from
+// scheduling. Solution is given at:
+// https://github.com/kubernetes/community/blob/master/contributors/design-proposals/pod-preemption.md#preemption-mechanics
+func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, error) {
+	// Scheduler may return various types of errors. Consider preemption only if
+	// the error is of type FitError.
+	fitError, ok := scheduleErr.(*FitError)
+	if !ok || fitError == nil {
+		return nil, nil, nil
+	}
+	err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
+	if err != nil {
+		return nil, nil, err
+	}
+	if !podEligibleToPreemptOthers(pod, g.cachedNodeInfoMap) {
+		glog.V(5).Infof("Pod %v is not eligible for more preemption.", pod.Name)
+		return nil, nil, nil
+	}
+	allNodes, err := nodeLister.List()
+	if err != nil {
+		return nil, nil, err
+	}
+	if len(allNodes) == 0 {
+		return nil, nil, ErrNoNodesAvailable
+	}
+	potentialNodes := nodesWherePreemptionMightHelp(pod, allNodes, fitError.FailedPredicates)
+	if len(potentialNodes) == 0 {
+		glog.V(3).Infof("Preemption will not help schedule pod %v on any node.", pod.Name)
+		return nil, nil, nil
+	}
+	nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer)
+	if err != nil {
+		return nil, nil, err
+	}
+	for len(nodeToPods) > 0 {
+		node := pickOneNodeForPreemption(nodeToPods)
+		if node == nil {
+			return nil, nil, err
+		}
+		passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToPods[node], g.cachedNodeInfoMap, g.extenders)
+		if passes && pErr == nil {
+			return node, nodeToPods[node], err
+		}
+		if pErr != nil {
+			glog.Errorf("Error occurred while checking extenders for preemption on node %v: %v", node, pErr)
+		}
+		// Remove the node from the map and try to pick a different node.
+		delete(nodeToPods, node)
+	}
+	return nil, nil, err
+}
+
 // Filters the nodes to find the ones that fit based on the given predicate functions
 // Each node is passed through the predicate functions to determine if it is a fit
 func findNodesThatFit(
```
```diff
@@ -167,7 +235,7 @@ func findNodesThatFit(
 	nodes []*v1.Node,
 	predicateFuncs map[string]algorithm.FitPredicate,
 	extenders []algorithm.SchedulerExtender,
-	metadataProducer algorithm.MetadataProducer,
+	metadataProducer algorithm.PredicateMetadataProducer,
 	ecache *EquivalenceCache,
 ) ([]*v1.Node, FailedPredicateMap, error) {
 	var filtered []*v1.Node
```
```diff
@@ -232,7 +300,7 @@ func findNodesThatFit(
 }

 // Checks whether node with a given name and NodeInfo satisfies all predicateFuncs.
-func podFitsOnNode(pod *v1.Pod, meta interface{}, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate,
+func podFitsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate,
 	ecache *EquivalenceCache) (bool, []algorithm.PredicateFailureReason, error) {
 	var (
 		equivalenceHash uint64
```
```diff
@@ -422,11 +490,288 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf
 	}, nil
 }

+// pickOneNodeForPreemption chooses one node among the given nodes. It assumes
+// pods in each map entry are ordered by decreasing priority.
+// It picks a node based on the following criteria:
+// 1. A node with minimum highest priority victim is picked.
+// 2. Ties are broken by sum of priorities of all victims.
+// 3. If there are still ties, node with the minimum number of victims is picked.
+// 4. If there are still ties, the first such node is picked (sort of randomly).
+//TODO(bsalamat): Try to reuse the "nodeScore" slices in order to save GC time.
+func pickOneNodeForPreemption(nodesToPods map[*v1.Node][]*v1.Pod) *v1.Node {
+	type nodeScore struct {
+		node            *v1.Node
+		highestPriority int32
+		sumPriorities   int64
+		numPods         int
+	}
+	if len(nodesToPods) == 0 {
+		return nil
+	}
+	minHighestPriority := int32(math.MaxInt32)
+	minPriorityScores := []*nodeScore{}
+	for node, pods := range nodesToPods {
+		if len(pods) == 0 {
+			// We found a node that doesn't need any preemption. Return it!
+			// This should happen rarely when one or more pods are terminated between
+			// the time that scheduler tries to schedule the pod and the time that
+			// preemption logic tries to find nodes for preemption.
+			return node
+		}
+		// highestPodPriority is the highest priority among the victims on this node.
+		highestPodPriority := util.GetPodPriority(pods[0])
+		if highestPodPriority < minHighestPriority {
+			minHighestPriority = highestPodPriority
+			minPriorityScores = nil
+		}
+		if highestPodPriority == minHighestPriority {
+			minPriorityScores = append(minPriorityScores, &nodeScore{node: node, highestPriority: highestPodPriority, numPods: len(pods)})
+		}
+	}
+	if len(minPriorityScores) == 1 {
+		return minPriorityScores[0].node
+	}
+	// There are a few nodes with minimum highest priority victim. Find the
+	// smallest sum of priorities.
+	minSumPriorities := int64(math.MaxInt64)
+	minSumPriorityScores := []*nodeScore{}
+	for _, nodeScore := range minPriorityScores {
+		var sumPriorities int64
+		for _, pod := range nodesToPods[nodeScore.node] {
+			// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
+			// needed so that a node with a few pods with negative priority is not
+			// picked over a node with a smaller number of pods with the same negative
+			// priority (and similar scenarios).
+			sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1)
+		}
+		if sumPriorities < minSumPriorities {
+			minSumPriorities = sumPriorities
+			minSumPriorityScores = nil
+		}
+		nodeScore.sumPriorities = sumPriorities
+		if sumPriorities == minSumPriorities {
+			minSumPriorityScores = append(minSumPriorityScores, nodeScore)
+		}
+	}
+	if len(minSumPriorityScores) == 1 {
+		return minSumPriorityScores[0].node
+	}
+	// There are a few nodes with minimum highest priority victim and sum of priorities.
+	// Find one with the minimum number of pods.
+	minNumPods := math.MaxInt32
+	minNumPodScores := []*nodeScore{}
+	for _, nodeScore := range minSumPriorityScores {
+		if nodeScore.numPods < minNumPods {
+			minNumPods = nodeScore.numPods
+			minNumPodScores = nil
+		}
+		if nodeScore.numPods == minNumPods {
+			minNumPodScores = append(minNumPodScores, nodeScore)
+		}
+	}
+	// At this point, even if there are more than one node with the same score,
+	// return the first one.
+	if len(minNumPodScores) > 0 {
+		return minNumPodScores[0].node
+	}
+	glog.Errorf("Error in logic of node scoring for preemption. We should never reach here!")
+	return nil
+}
```
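To make the tie-breaking rules concrete, a hypothetical test-style snippet (it would have to sit in the same package to reach the unexported function; the pod `Priority` field and the import paths are assumptions about the surrounding tree):

```go
// Hypothetical snippet, e.g. in a _test.go file next to this code: a concrete
// run of the tie-breaking rules in pickOneNodeForPreemption.
package core

import (
	"fmt"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func podWithPriority(name string, prio int32) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec:       v1.PodSpec{Priority: &prio},
	}
}

func demoPickOneNodeForPreemption() {
	nodeA := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}}
	nodeB := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-b"}}
	// Victims must already be sorted by decreasing priority, as the function assumes.
	nodesToPods := map[*v1.Node][]*v1.Pod{
		nodeA: {podWithPriority("a1", 100)},
		nodeB: {podWithPriority("b1", 50), podWithPriority("b2", 40)},
	}
	// Rule 1 decides: node-b's highest-priority victim (50) is lower than
	// node-a's (100), so node-b is picked despite having more victims.
	fmt.Println(pickOneNodeForPreemption(nodesToPods).Name) // prints "node-b"
}
```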
```diff
+// selectNodesForPreemption finds all the nodes with possible victims for
+// preemption in parallel.
+func selectNodesForPreemption(pod *v1.Pod,
+	nodeNameToInfo map[string]*schedulercache.NodeInfo,
+	potentialNodes []*v1.Node,
+	predicates map[string]algorithm.FitPredicate,
+	metadataProducer algorithm.PredicateMetadataProducer,
+) (map[*v1.Node][]*v1.Pod, error) {
+
+	nodeNameToPods := map[*v1.Node][]*v1.Pod{}
+	var resultLock sync.Mutex
+
+	// We can use the same metadata producer for all nodes.
+	meta := metadataProducer(pod, nodeNameToInfo)
+	checkNode := func(i int) {
+		nodeName := potentialNodes[i].Name
+		var metaCopy algorithm.PredicateMetadata
+		if meta != nil {
+			metaCopy = meta.ShallowCopy()
+		}
+		pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates)
+		if fits {
+			resultLock.Lock()
+			nodeNameToPods[potentialNodes[i]] = pods
+			resultLock.Unlock()
+		}
+	}
+	workqueue.Parallelize(16, len(potentialNodes), checkNode)
+	return nodeNameToPods, nil
+}
```
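The fan-out above is the usual `workqueue.Parallelize` shape: index-addressed work pieces plus a mutex around the shared result map. A standalone sketch of that pattern with a toy fitness check, independent of the scheduler types (assuming the parallelizer lives in `k8s.io/client-go/util/workqueue`, as it does in trees of this vintage):

```go
// Standalone sketch of the fan-out pattern used by selectNodesForPreemption:
// check candidates in parallel and collect results under a mutex. The fitness
// check here is a toy stand-in for the real victim selection.
package main

import (
	"fmt"
	"sync"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	candidates := []string{"node-1", "node-2", "node-3", "node-4"}
	fits := map[string]bool{}
	var mu sync.Mutex

	checkOne := func(i int) {
		name := candidates[i]
		ok := name != "node-3" // toy stand-in for the real fit/victim check
		if ok {
			mu.Lock()
			fits[name] = true
			mu.Unlock()
		}
	}
	// 16 workers over len(candidates) work pieces, mirroring the call in the diff.
	workqueue.Parallelize(16, len(candidates), checkOne)
	fmt.Println(fits)
}
```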
```diff
+func nodePassesExtendersForPreemption(
+	pod *v1.Pod,
+	nodeName string,
+	victims []*v1.Pod,
+	nodeNameToInfo map[string]*schedulercache.NodeInfo,
+	extenders []algorithm.SchedulerExtender) (bool, error) {
+	// If there are any extenders, run them and filter the list of candidate nodes.
+	if len(extenders) == 0 {
+		return true, nil
+	}
+	// Remove the victims from the corresponding nodeInfo and send nodes to the
+	// extenders for filtering.
+	originalNodeInfo := nodeNameToInfo[nodeName]
+	nodeInfoCopy := nodeNameToInfo[nodeName].Clone()
+	for _, victim := range victims {
+		nodeInfoCopy.RemovePod(victim)
+	}
+	nodeNameToInfo[nodeName] = nodeInfoCopy
+	defer func() { nodeNameToInfo[nodeName] = originalNodeInfo }()
+	filteredNodes := []*v1.Node{nodeInfoCopy.Node()}
+	for _, extender := range extenders {
+		var err error
+		var failedNodesMap map[string]string
+		filteredNodes, failedNodesMap, err = extender.Filter(pod, filteredNodes, nodeNameToInfo)
+		if err != nil {
+			return false, err
+		}
+		if _, found := failedNodesMap[nodeName]; found || len(filteredNodes) == 0 {
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
+// selectVictimsOnNode finds minimum set of pods on the given node that should
+// be preempted in order to make enough room for "pod" to be scheduled. The
+// minimum set selected is subject to the constraint that a higher-priority pod
+// is never preempted when a lower-priority pod could be (higher/lower relative
+// to one another, not relative to the preemptor "pod").
+// The algorithm first checks if the pod can be scheduled on the node when all the
+// lower priority pods are gone. If so, it sorts all the lower priority pods by
+// their priority and starts from the highest priority one, tries to keep as
+// many of them as possible while checking that the "pod" can still fit on the node.
+// NOTE: This function assumes that it is never called if "pod" cannot be scheduled
+// due to pod affinity, node affinity, or node anti-affinity reasons. None of
+// these predicates can be satisfied by removing more pods from the node.
+// TODO(bsalamat): Add support for PodDisruptionBudget.
+func selectVictimsOnNode(
+	pod *v1.Pod,
+	meta algorithm.PredicateMetadata,
+	nodeInfo *schedulercache.NodeInfo,
+	fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) {
+	potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod}
+	nodeInfoCopy := nodeInfo.Clone()
+
+	removePod := func(rp *v1.Pod) {
+		nodeInfoCopy.RemovePod(rp)
+		if meta != nil {
+			meta.RemovePod(rp)
+		}
+	}
+	addPod := func(ap *v1.Pod) {
+		nodeInfoCopy.AddPod(ap)
+		if meta != nil {
+			meta.AddPod(ap, nodeInfoCopy)
+		}
+	}
+	// As the first step, remove all the lower priority pods from the node and
+	// check if the given pod can be scheduled.
+	podPriority := util.GetPodPriority(pod)
+	for _, p := range nodeInfoCopy.Pods() {
+		if util.GetPodPriority(p) < podPriority {
+			potentialVictims.Items = append(potentialVictims.Items, p)
+			removePod(p)
+		}
+	}
+	potentialVictims.Sort()
+	// If the new pod does not fit after removing all the lower priority pods,
+	// we are almost done and this node is not suitable for preemption. The only condition
+	// that we should check is if the "pod" is failing to schedule due to pod affinity
+	// failure.
+	// TODO(bsalamat): Consider checking affinity to lower priority pods if feasible with reasonable performance.
+	if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+		if err != nil {
+			glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
+		}
+		return nil, false
+	}
+	victims := []*v1.Pod{}
+	// Try to reprieve as many pods as possible starting from the highest priority one.
+	for _, p := range potentialVictims.Items {
+		lpp := p.(*v1.Pod)
+		addPod(lpp)
+		if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+			removePod(lpp)
+			victims = append(victims, lpp)
+			glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", lpp.Name, nodeInfo.Node().Name)
+		}
+	}
+	return victims, true
+}
```
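A toy numeric walkthrough of the two-step shape described in that comment (remove everything of lower priority, then reprieve greedily from the top), with made-up capacities in place of real predicate checks:

```go
// Toy model of selectVictimsOnNode's shape, not the real implementation: a node
// with capacity 10 runs one higher-priority pod using 2 units plus three
// lower-priority pods using 3, 2, and 2; the preemptor needs 3.
package main

import "fmt"

func main() {
	const capacity, preemptorNeeds = 10, 3
	used := 2 // higher-priority pod that can never be a victim

	// Lower-priority pods, already sorted by decreasing priority.
	candidates := []struct {
		name string
		use  int
	}{{"p-high", 3}, {"p-mid", 2}, {"p-low", 2}}

	// Step 1: with all lower-priority pods removed, does the preemptor fit?
	if used+preemptorNeeds > capacity {
		fmt.Println("preemption cannot help on this node")
		return
	}
	// Step 2: reprieve pods from the highest priority down while the preemptor
	// still fits; whatever cannot be re-added becomes a victim.
	victims := []string{}
	for _, c := range candidates {
		if used+c.use+preemptorNeeds <= capacity {
			used += c.use
		} else {
			victims = append(victims, c.name)
		}
	}
	fmt.Println("victims:", victims) // victims: [p-low]
}
```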
```diff
+// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
+// that may be satisfied by removing pods from the node.
+func nodesWherePreemptionMightHelp(pod *v1.Pod, nodes []*v1.Node, failedPredicatesMap FailedPredicateMap) []*v1.Node {
+	potentialNodes := []*v1.Node{}
+	for _, node := range nodes {
+		unresolvableReasonExist := false
+		failedPredicates, found := failedPredicatesMap[node.Name]
+		// If we assume that scheduler looks at all nodes and populates the failedPredicateMap
+		// (which is the case today), the !found case should never happen, but we'd prefer
+		// to rely less on such assumptions in the code when checking does not impose
+		// significant overhead.
+		for _, failedPredicate := range failedPredicates {
+			switch failedPredicate {
+			case
+				predicates.ErrNodeSelectorNotMatch,
+				predicates.ErrPodNotMatchHostName,
+				predicates.ErrTaintsTolerationsNotMatch,
+				predicates.ErrNodeLabelPresenceViolated,
+				predicates.ErrNodeNotReady,
+				predicates.ErrNodeNetworkUnavailable,
+				predicates.ErrNodeUnschedulable,
+				predicates.ErrNodeUnknownCondition:
+				unresolvableReasonExist = true
+				break
+				// TODO(bsalamat): Please add affinity failure cases once we have specific affinity failure errors.
+			}
+		}
+		if !found || !unresolvableReasonExist {
+			glog.V(3).Infof("Node %v is a potential node for preemption.", node.Name)
+			potentialNodes = append(potentialNodes, node)
+		}
+	}
+	return potentialNodes
+}
+
+// podEligibleToPreemptOthers determines whether this pod should be considered
+// for preempting other pods or not. If this pod has already preempted other
+// pods and those are in their graceful termination period, it shouldn't be
+// considered for preemption.
+// We look at the node that is nominated for this pod and as long as there are
+// terminating pods on the node, we don't consider this for preempting more pods.
+// TODO(bsalamat): Revisit this algorithm once scheduling by priority is added.
+func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) bool {
+	if nodeName, found := pod.Annotations[NominatedNodeAnnotationKey]; found {
+		if nodeInfo, found := nodeNameToInfo[nodeName]; found {
+			for _, p := range nodeInfo.Pods() {
+				if p.DeletionTimestamp != nil && util.GetPodPriority(p) < util.GetPodPriority(pod) {
+					// There is a terminating pod on the nominated node.
+					return false
+				}
+			}
+		}
+	}
+	return true
+}
+
 func NewGenericScheduler(
 	cache schedulercache.Cache,
 	eCache *EquivalenceCache,
 	predicates map[string]algorithm.FitPredicate,
-	predicateMetaProducer algorithm.MetadataProducer,
+	predicateMetaProducer algorithm.PredicateMetadataProducer,
 	prioritizers []algorithm.PriorityConfig,
 	priorityMetaProducer algorithm.MetadataProducer,
 	extenders []algorithm.SchedulerExtender) algorithm.ScheduleAlgorithm {
```