Merge pull request #50949 from bsalamat/preemption_eviction
Automatic merge from submit-queue
Add pod preemption to the scheduler
**What this PR does / why we need it**:
This is the last of a series of PRs to add priority-based preemption to the scheduler. This PR connects the preemption logic to the scheduler workflow.
**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #48646
**Special notes for your reviewer**:
This PR includes other PRs which are under review (#50805, #50405, #50190). All the new code is located in 43627afdf9.
**Release note**:
```release-note
Add priority-based preemption to the scheduler.
```
ref/ #47604
/assign @davidopp
@kubernetes/sig-scheduling-pr-reviews
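For orientation, here is a rough sketch of the flow this PR wires up, written against the `Schedule` and `Preempt` signatures visible in the diff below; the helper names (`annotateNominatedNode`, `deletePod`, `bind`) are hypothetical stand-ins for the scheduler's own plumbing, and the `v1` import path may differ by tree. When `Schedule` fails with a `FitError`, `Preempt` picks a node and a set of victims, the preemptor is marked with the nominated node, and the victims are evicted so the pod can be retried.

```go
// Sketch only: how a scheduling loop might hand a failed pod to the new
// Preempt logic. All helpers passed in are hypothetical; the v1 import path
// may be "k8s.io/kubernetes/pkg/api/v1" in older trees.
package scheduling

import (
	"k8s.io/api/core/v1"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)

func scheduleOne(
	schedule func(*v1.Pod, algorithm.NodeLister) (string, error),
	preempt func(*v1.Pod, algorithm.NodeLister, error) (*v1.Node, []*v1.Pod, error),
	annotateNominatedNode func(*v1.Pod, string) error, // hypothetical helper
	deletePod func(*v1.Pod) error, // hypothetical helper
	bind func(*v1.Pod, string) error, // hypothetical helper
	pod *v1.Pod,
	nodeLister algorithm.NodeLister,
) error {
	host, err := schedule(pod, nodeLister)
	if err == nil {
		return bind(pod, host)
	}
	// Preempt returns (nil, nil, nil) when the error is not a FitError or when
	// preemption cannot help the pod schedule.
	node, victims, pErr := preempt(pod, nodeLister, err)
	if pErr == nil && node != nil {
		// Record the nominated node on the preemptor and evict the victims; the
		// pod stays pending and is retried once the victims terminate.
		if aErr := annotateNominatedNode(pod, node.Name); aErr != nil {
			return aErr
		}
		for _, victim := range victims {
			if dErr := deletePod(victim); dErr != nil {
				return dErr
			}
		}
	}
	return err
}
```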
```diff
@@ -18,6 +18,7 @@ package core

 import (
 	"fmt"
+	"math"
 	"sort"
 	"strings"
 	"sync"
```
```diff
@@ -32,6 +33,7 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/util"

 	"github.com/golang/glog"
 )
```
```diff
@@ -45,7 +47,14 @@ type FitError struct {

 var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")

-const NoNodeAvailableMsg = "No nodes are available that match all of the following predicates"
+const (
+	NoNodeAvailableMsg = "No nodes are available that match all of the predicates"
+	// NominatedNodeAnnotationKey is used to annotate a pod that has preempted other pods.
+	// The scheduler uses the annotation to find that the pod shouldn't preempt more pods
+	// when it gets to the head of scheduling queue again.
+	// See podEligibleToPreemptOthers() for more information.
+	NominatedNodeAnnotationKey = "NominatedNodeName"
+)

 // Error returns detailed information of why the pod failed to fit on each node
 func (f *FitError) Error() string {
```
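A minimal illustration (hypothetical helper, not code from this PR) of what the new annotation key is for: after a successful preemption the scheduler records the chosen node on the preemptor, and `podEligibleToPreemptOthers` later in this diff reads it back to avoid preempting again while the victims are still terminating.

```go
// Illustration only: marking a preemptor pod with the node it preempted on,
// under the NominatedNodeAnnotationKey introduced above. The real scheduler
// persists this via an API update rather than mutating a local object; the v1
// import path may differ by tree.
package scheduling

import v1 "k8s.io/api/core/v1"

const nominatedNodeAnnotationKey = "NominatedNodeName"

func setNominatedNode(pod *v1.Pod, nodeName string) {
	if pod.Annotations == nil {
		pod.Annotations = map[string]string{}
	}
	pod.Annotations[nominatedNodeAnnotationKey] = nodeName
}
```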
```diff
@@ -73,7 +82,7 @@ type genericScheduler struct {
 	equivalenceCache      *EquivalenceCache
 	predicates            map[string]algorithm.FitPredicate
 	priorityMetaProducer  algorithm.MetadataProducer
-	predicateMetaProducer algorithm.MetadataProducer
+	predicateMetaProducer algorithm.PredicateMetadataProducer
 	prioritizers          []algorithm.PriorityConfig
 	extenders             []algorithm.SchedulerExtender
 	pods                  algorithm.PodLister
```
```diff
@@ -159,6 +168,65 @@ func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList
 	return priorityList[ix].Host, nil
 }

+// preempt finds nodes with pods that can be preempted to make room for "pod" to
+// schedule. It chooses one of the nodes and preempts the pods on the node and
+// returns the node and the list of preempted pods if such a node is found.
+// TODO(bsalamat): Add priority-based scheduling. More info: today one or more
+// pending pods (different from the pod that triggered the preemption(s)) may
+// schedule into some portion of the resources freed up by the preemption(s)
+// before the pod that triggered the preemption(s) has a chance to schedule
+// there, thereby preventing the pod that triggered the preemption(s) from
+// scheduling. Solution is given at:
+// https://github.com/kubernetes/community/blob/master/contributors/design-proposals/pod-preemption.md#preemption-mechanics
+func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, error) {
+	// Scheduler may return various types of errors. Consider preemption only if
+	// the error is of type FitError.
+	fitError, ok := scheduleErr.(*FitError)
+	if !ok || fitError == nil {
+		return nil, nil, nil
+	}
+	err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
+	if err != nil {
+		return nil, nil, err
+	}
+	if !podEligibleToPreemptOthers(pod, g.cachedNodeInfoMap) {
+		glog.V(5).Infof("Pod %v is not eligible for more preemption.", pod.Name)
+		return nil, nil, nil
+	}
+	allNodes, err := nodeLister.List()
+	if err != nil {
+		return nil, nil, err
+	}
+	if len(allNodes) == 0 {
+		return nil, nil, ErrNoNodesAvailable
+	}
+	potentialNodes := nodesWherePreemptionMightHelp(pod, allNodes, fitError.FailedPredicates)
+	if len(potentialNodes) == 0 {
+		glog.V(3).Infof("Preemption will not help schedule pod %v on any node.", pod.Name)
+		return nil, nil, nil
+	}
+	nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer)
+	if err != nil {
+		return nil, nil, err
+	}
+	for len(nodeToPods) > 0 {
+		node := pickOneNodeForPreemption(nodeToPods)
+		if node == nil {
+			return nil, nil, err
+		}
+		passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToPods[node], g.cachedNodeInfoMap, g.extenders)
+		if passes && pErr == nil {
+			return node, nodeToPods[node], err
+		}
+		if pErr != nil {
+			glog.Errorf("Error occurred while checking extenders for preemption on node %v: %v", node, pErr)
+		}
+		// Remove the node from the map and try to pick a different node.
+		delete(nodeToPods, node)
+	}
+	return nil, nil, err
+}
+
 // Filters the nodes to find the ones that fit based on the given predicate functions
 // Each node is passed through the predicate functions to determine if it is a fit
 func findNodesThatFit(
```
```diff
@@ -167,7 +235,7 @@ func findNodesThatFit(
 	nodes []*v1.Node,
 	predicateFuncs map[string]algorithm.FitPredicate,
 	extenders []algorithm.SchedulerExtender,
-	metadataProducer algorithm.MetadataProducer,
+	metadataProducer algorithm.PredicateMetadataProducer,
 	ecache *EquivalenceCache,
 ) ([]*v1.Node, FailedPredicateMap, error) {
 	var filtered []*v1.Node
```
```diff
@@ -232,7 +300,7 @@ func findNodesThatFit(
 }

 // Checks whether node with a given name and NodeInfo satisfies all predicateFuncs.
-func podFitsOnNode(pod *v1.Pod, meta interface{}, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate,
+func podFitsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate,
 	ecache *EquivalenceCache) (bool, []algorithm.PredicateFailureReason, error) {
 	var (
 		equivalenceHash uint64
```
```diff
@@ -422,11 +490,288 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf
 	}, nil
 }

+// pickOneNodeForPreemption chooses one node among the given nodes. It assumes
+// pods in each map entry are ordered by decreasing priority.
+// It picks a node based on the following criteria:
+// 1. A node with minimum highest priority victim is picked.
+// 2. Ties are broken by sum of priorities of all victims.
+// 3. If there are still ties, node with the minimum number of victims is picked.
+// 4. If there are still ties, the first such node is picked (sort of randomly).
+//TODO(bsalamat): Try to reuse the "nodeScore" slices in order to save GC time.
+func pickOneNodeForPreemption(nodesToPods map[*v1.Node][]*v1.Pod) *v1.Node {
+	type nodeScore struct {
+		node            *v1.Node
+		highestPriority int32
+		sumPriorities   int64
+		numPods         int
+	}
+	if len(nodesToPods) == 0 {
+		return nil
+	}
+	minHighestPriority := int32(math.MaxInt32)
+	minPriorityScores := []*nodeScore{}
+	for node, pods := range nodesToPods {
+		if len(pods) == 0 {
+			// We found a node that doesn't need any preemption. Return it!
+			// This should happen rarely when one or more pods are terminated between
+			// the time that scheduler tries to schedule the pod and the time that
+			// preemption logic tries to find nodes for preemption.
+			return node
+		}
+		// highestPodPriority is the highest priority among the victims on this node.
+		highestPodPriority := util.GetPodPriority(pods[0])
+		if highestPodPriority < minHighestPriority {
+			minHighestPriority = highestPodPriority
+			minPriorityScores = nil
+		}
+		if highestPodPriority == minHighestPriority {
+			minPriorityScores = append(minPriorityScores, &nodeScore{node: node, highestPriority: highestPodPriority, numPods: len(pods)})
+		}
+	}
+	if len(minPriorityScores) == 1 {
+		return minPriorityScores[0].node
+	}
+	// There are a few nodes with minimum highest priority victim. Find the
+	// smallest sum of priorities.
+	minSumPriorities := int64(math.MaxInt64)
+	minSumPriorityScores := []*nodeScore{}
+	for _, nodeScore := range minPriorityScores {
+		var sumPriorities int64
+		for _, pod := range nodesToPods[nodeScore.node] {
+			// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
+			// needed so that a node with a few pods with negative priority is not
+			// picked over a node with a smaller number of pods with the same negative
+			// priority (and similar scenarios).
+			sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1)
+		}
+		if sumPriorities < minSumPriorities {
+			minSumPriorities = sumPriorities
+			minSumPriorityScores = nil
+		}
+		nodeScore.sumPriorities = sumPriorities
+		if sumPriorities == minSumPriorities {
+			minSumPriorityScores = append(minSumPriorityScores, nodeScore)
+		}
+	}
+	if len(minSumPriorityScores) == 1 {
+		return minSumPriorityScores[0].node
+	}
+	// There are a few nodes with minimum highest priority victim and sum of priorities.
+	// Find one with the minimum number of pods.
+	minNumPods := math.MaxInt32
+	minNumPodScores := []*nodeScore{}
+	for _, nodeScore := range minSumPriorityScores {
+		if nodeScore.numPods < minNumPods {
+			minNumPods = nodeScore.numPods
+			minNumPodScores = nil
+		}
+		if nodeScore.numPods == minNumPods {
+			minNumPodScores = append(minNumPodScores, nodeScore)
+		}
+	}
+	// At this point, even if there are more than one node with the same score,
+	// return the first one.
+	if len(minNumPodScores) > 0 {
+		return minNumPodScores[0].node
+	}
+	glog.Errorf("Error in logic of node scoring for preemption. We should never reach here!")
+	return nil
+}
```
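To make the tie-breaking rules concrete, a hypothetical test-style snippet (it would have to sit in the same package to reach the unexported function; the pod `Priority` field and the import paths are assumptions about the surrounding tree):

```go
// Hypothetical snippet, e.g. in a _test.go file next to this code: a concrete
// run of the tie-breaking rules in pickOneNodeForPreemption.
package core

import (
	"fmt"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func podWithPriority(name string, prio int32) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec:       v1.PodSpec{Priority: &prio},
	}
}

func demoPickOneNodeForPreemption() {
	nodeA := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}}
	nodeB := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-b"}}
	// Victims must already be sorted by decreasing priority, as the function assumes.
	nodesToPods := map[*v1.Node][]*v1.Pod{
		nodeA: {podWithPriority("a1", 100)},
		nodeB: {podWithPriority("b1", 50), podWithPriority("b2", 40)},
	}
	// Rule 1 decides: node-b's highest-priority victim (50) is lower than
	// node-a's (100), so node-b is picked despite having more victims.
	fmt.Println(pickOneNodeForPreemption(nodesToPods).Name) // prints "node-b"
}
```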
```diff
+// selectNodesForPreemption finds all the nodes with possible victims for
+// preemption in parallel.
+func selectNodesForPreemption(pod *v1.Pod,
+	nodeNameToInfo map[string]*schedulercache.NodeInfo,
+	potentialNodes []*v1.Node,
+	predicates map[string]algorithm.FitPredicate,
+	metadataProducer algorithm.PredicateMetadataProducer,
+) (map[*v1.Node][]*v1.Pod, error) {
+
+	nodeNameToPods := map[*v1.Node][]*v1.Pod{}
+	var resultLock sync.Mutex
+
+	// We can use the same metadata producer for all nodes.
+	meta := metadataProducer(pod, nodeNameToInfo)
+	checkNode := func(i int) {
+		nodeName := potentialNodes[i].Name
+		var metaCopy algorithm.PredicateMetadata
+		if meta != nil {
+			metaCopy = meta.ShallowCopy()
+		}
+		pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates)
+		if fits {
+			resultLock.Lock()
+			nodeNameToPods[potentialNodes[i]] = pods
+			resultLock.Unlock()
+		}
+	}
+	workqueue.Parallelize(16, len(potentialNodes), checkNode)
+	return nodeNameToPods, nil
+}
```
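The fan-out above is the usual `workqueue.Parallelize` shape: index-addressed work pieces plus a mutex around the shared result map. A standalone sketch of that pattern with a toy fitness check, independent of the scheduler types (assuming the parallelizer lives in `k8s.io/client-go/util/workqueue`, as it does in trees of this vintage):

```go
// Standalone sketch of the fan-out pattern used by selectNodesForPreemption:
// check candidates in parallel and collect results under a mutex. The fitness
// check here is a toy stand-in for the real victim selection.
package main

import (
	"fmt"
	"sync"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	candidates := []string{"node-1", "node-2", "node-3", "node-4"}
	fits := map[string]bool{}
	var mu sync.Mutex

	checkOne := func(i int) {
		name := candidates[i]
		ok := name != "node-3" // toy stand-in for the real fit/victim check
		if ok {
			mu.Lock()
			fits[name] = true
			mu.Unlock()
		}
	}
	// 16 workers over len(candidates) work pieces, mirroring the call in the diff.
	workqueue.Parallelize(16, len(candidates), checkOne)
	fmt.Println(fits)
}
```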
```diff
+func nodePassesExtendersForPreemption(
+	pod *v1.Pod,
+	nodeName string,
+	victims []*v1.Pod,
+	nodeNameToInfo map[string]*schedulercache.NodeInfo,
+	extenders []algorithm.SchedulerExtender) (bool, error) {
+	// If there are any extenders, run them and filter the list of candidate nodes.
+	if len(extenders) == 0 {
+		return true, nil
+	}
+	// Remove the victims from the corresponding nodeInfo and send nodes to the
+	// extenders for filtering.
+	originalNodeInfo := nodeNameToInfo[nodeName]
+	nodeInfoCopy := nodeNameToInfo[nodeName].Clone()
+	for _, victim := range victims {
+		nodeInfoCopy.RemovePod(victim)
+	}
+	nodeNameToInfo[nodeName] = nodeInfoCopy
+	defer func() { nodeNameToInfo[nodeName] = originalNodeInfo }()
+	filteredNodes := []*v1.Node{nodeInfoCopy.Node()}
+	for _, extender := range extenders {
+		var err error
+		var failedNodesMap map[string]string
+		filteredNodes, failedNodesMap, err = extender.Filter(pod, filteredNodes, nodeNameToInfo)
+		if err != nil {
+			return false, err
+		}
+		if _, found := failedNodesMap[nodeName]; found || len(filteredNodes) == 0 {
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
+// selectVictimsOnNode finds minimum set of pods on the given node that should
+// be preempted in order to make enough room for "pod" to be scheduled. The
+// minimum set selected is subject to the constraint that a higher-priority pod
+// is never preempted when a lower-priority pod could be (higher/lower relative
+// to one another, not relative to the preemptor "pod").
+// The algorithm first checks if the pod can be scheduled on the node when all the
+// lower priority pods are gone. If so, it sorts all the lower priority pods by
+// their priority and starts from the highest priority one, tries to keep as
+// many of them as possible while checking that the "pod" can still fit on the node.
+// NOTE: This function assumes that it is never called if "pod" cannot be scheduled
+// due to pod affinity, node affinity, or node anti-affinity reasons. None of
+// these predicates can be satisfied by removing more pods from the node.
+// TODO(bsalamat): Add support for PodDisruptionBudget.
+func selectVictimsOnNode(
+	pod *v1.Pod,
+	meta algorithm.PredicateMetadata,
+	nodeInfo *schedulercache.NodeInfo,
+	fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) {
+	potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod}
+	nodeInfoCopy := nodeInfo.Clone()
+
+	removePod := func(rp *v1.Pod) {
+		nodeInfoCopy.RemovePod(rp)
+		if meta != nil {
+			meta.RemovePod(rp)
+		}
+	}
+	addPod := func(ap *v1.Pod) {
+		nodeInfoCopy.AddPod(ap)
+		if meta != nil {
+			meta.AddPod(ap, nodeInfoCopy)
+		}
+	}
+	// As the first step, remove all the lower priority pods from the node and
+	// check if the given pod can be scheduled.
+	podPriority := util.GetPodPriority(pod)
+	for _, p := range nodeInfoCopy.Pods() {
+		if util.GetPodPriority(p) < podPriority {
+			potentialVictims.Items = append(potentialVictims.Items, p)
+			removePod(p)
+		}
+	}
+	potentialVictims.Sort()
+	// If the new pod does not fit after removing all the lower priority pods,
+	// we are almost done and this node is not suitable for preemption. The only condition
+	// that we should check is if the "pod" is failing to schedule due to pod affinity
+	// failure.
+	// TODO(bsalamat): Consider checking affinity to lower priority pods if feasible with reasonable performance.
+	if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+		if err != nil {
+			glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
+		}
+		return nil, false
+	}
+	victims := []*v1.Pod{}
+	// Try to reprieve as many pods as possible starting from the highest priority one.
+	for _, p := range potentialVictims.Items {
+		lpp := p.(*v1.Pod)
+		addPod(lpp)
+		if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+			removePod(lpp)
+			victims = append(victims, lpp)
+			glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", lpp.Name, nodeInfo.Node().Name)
+		}
+	}
+	return victims, true
+}
```
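A toy numeric walkthrough of the two-step shape described in that comment (remove everything of lower priority, then reprieve greedily from the top), with made-up capacities in place of real predicate checks:

```go
// Toy model of selectVictimsOnNode's shape, not the real implementation: a node
// with capacity 10 runs one higher-priority pod using 2 units plus three
// lower-priority pods using 3, 2, and 2; the preemptor needs 3.
package main

import "fmt"

func main() {
	const capacity, preemptorNeeds = 10, 3
	used := 2 // higher-priority pod that can never be a victim

	// Lower-priority pods, already sorted by decreasing priority.
	candidates := []struct {
		name string
		use  int
	}{{"p-high", 3}, {"p-mid", 2}, {"p-low", 2}}

	// Step 1: with all lower-priority pods removed, does the preemptor fit?
	if used+preemptorNeeds > capacity {
		fmt.Println("preemption cannot help on this node")
		return
	}
	// Step 2: reprieve pods from the highest priority down while the preemptor
	// still fits; whatever cannot be re-added becomes a victim.
	victims := []string{}
	for _, c := range candidates {
		if used+c.use+preemptorNeeds <= capacity {
			used += c.use
		} else {
			victims = append(victims, c.name)
		}
	}
	fmt.Println("victims:", victims) // victims: [p-low]
}
```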
```diff
+// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
+// that may be satisfied by removing pods from the node.
+func nodesWherePreemptionMightHelp(pod *v1.Pod, nodes []*v1.Node, failedPredicatesMap FailedPredicateMap) []*v1.Node {
+	potentialNodes := []*v1.Node{}
+	for _, node := range nodes {
+		unresolvableReasonExist := false
+		failedPredicates, found := failedPredicatesMap[node.Name]
+		// If we assume that scheduler looks at all nodes and populates the failedPredicateMap
+		// (which is the case today), the !found case should never happen, but we'd prefer
+		// to rely less on such assumptions in the code when checking does not impose
+		// significant overhead.
+		for _, failedPredicate := range failedPredicates {
+			switch failedPredicate {
+			case
+				predicates.ErrNodeSelectorNotMatch,
+				predicates.ErrPodNotMatchHostName,
+				predicates.ErrTaintsTolerationsNotMatch,
+				predicates.ErrNodeLabelPresenceViolated,
+				predicates.ErrNodeNotReady,
+				predicates.ErrNodeNetworkUnavailable,
+				predicates.ErrNodeUnschedulable,
+				predicates.ErrNodeUnknownCondition:
+				unresolvableReasonExist = true
+				break
+				// TODO(bsalamat): Please add affinity failure cases once we have specific affinity failure errors.
+			}
+		}
+		if !found || !unresolvableReasonExist {
+			glog.V(3).Infof("Node %v is a potential node for preemption.", node.Name)
+			potentialNodes = append(potentialNodes, node)
+		}
+	}
+	return potentialNodes
+}
+
+// podEligibleToPreemptOthers determines whether this pod should be considered
+// for preempting other pods or not. If this pod has already preempted other
+// pods and those are in their graceful termination period, it shouldn't be
+// considered for preemption.
+// We look at the node that is nominated for this pod and as long as there are
+// terminating pods on the node, we don't consider this for preempting more pods.
+// TODO(bsalamat): Revisit this algorithm once scheduling by priority is added.
+func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) bool {
+	if nodeName, found := pod.Annotations[NominatedNodeAnnotationKey]; found {
+		if nodeInfo, found := nodeNameToInfo[nodeName]; found {
+			for _, p := range nodeInfo.Pods() {
+				if p.DeletionTimestamp != nil && util.GetPodPriority(p) < util.GetPodPriority(pod) {
+					// There is a terminating pod on the nominated node.
+					return false
+				}
+			}
+		}
+	}
+	return true
+}
+
 func NewGenericScheduler(
 	cache schedulercache.Cache,
 	eCache *EquivalenceCache,
 	predicates map[string]algorithm.FitPredicate,
-	predicateMetaProducer algorithm.MetadataProducer,
+	predicateMetaProducer algorithm.PredicateMetadataProducer,
 	prioritizers []algorithm.PriorityConfig,
 	priorityMetaProducer algorithm.MetadataProducer,
 	extenders []algorithm.SchedulerExtender) algorithm.ScheduleAlgorithm {
```