From 058e3d425812cf5559a1e4c6a90fcd9de2427633 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 18 Jun 2020 16:21:16 -0700 Subject: [PATCH] Move Preempt() and its related functions to defaultpreemption package Refactor genericScheduler and signature of preemption funcs - remove podNominator from genericScheduler - simplify signature of preemption functions Make Preempt() private --- pkg/scheduler/core/BUILD | 18 - pkg/scheduler/core/extender_test.go | 10 +- pkg/scheduler/core/generic_scheduler.go | 565 +------ pkg/scheduler/core/generic_scheduler_test.go | 1478 +---------------- pkg/scheduler/factory.go | 1 - .../framework/plugins/defaultpreemption/BUILD | 50 +- .../defaultpreemption/default_preemption.go | 546 +++++- .../default_preemption_test.go | 1476 ++++++++++++++++ pkg/scheduler/scheduler_test.go | 4 +- 9 files changed, 2127 insertions(+), 2021 deletions(-) create mode 100644 pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go diff --git a/pkg/scheduler/core/BUILD b/pkg/scheduler/core/BUILD index 6696aaa933d..c415e12dfd2 100644 --- a/pkg/scheduler/core/BUILD +++ b/pkg/scheduler/core/BUILD @@ -10,7 +10,6 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/api/v1/pod:go_default_library", - "//pkg/features:go_default_library", "//pkg/scheduler/apis/config:go_default_library", "//pkg/scheduler/framework/runtime:go_default_library", "//pkg/scheduler/framework/v1alpha1:go_default_library", @@ -20,12 +19,8 @@ go_library( "//pkg/scheduler/profile:go_default_library", "//pkg/scheduler/util:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", - "//staging/src/k8s.io/api/policy/v1beta1:go_default_library", - "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", - "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/net:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library", - "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/listers/core/v1:go_default_library", "//staging/src/k8s.io/client-go/rest:go_default_library", "//staging/src/k8s.io/kube-scheduler/extender/v1:go_default_library", @@ -42,21 +37,12 @@ go_test( ], embed = [":go_default_library"], deps = [ - "//pkg/controller/volume/scheduling:go_default_library", "//pkg/scheduler/apis/config:go_default_library", "//pkg/scheduler/framework/plugins/defaultbinder:go_default_library", "//pkg/scheduler/framework/plugins/defaultpodtopologyspread:go_default_library", - "//pkg/scheduler/framework/plugins/interpodaffinity:go_default_library", - "//pkg/scheduler/framework/plugins/nodeaffinity:go_default_library", - "//pkg/scheduler/framework/plugins/nodelabel:go_default_library", - "//pkg/scheduler/framework/plugins/nodename:go_default_library", "//pkg/scheduler/framework/plugins/noderesources:go_default_library", - "//pkg/scheduler/framework/plugins/nodeunschedulable:go_default_library", "//pkg/scheduler/framework/plugins/podtopologyspread:go_default_library", "//pkg/scheduler/framework/plugins/queuesort:go_default_library", - "//pkg/scheduler/framework/plugins/tainttoleration:go_default_library", - "//pkg/scheduler/framework/plugins/volumerestrictions:go_default_library", - "//pkg/scheduler/framework/plugins/volumezone:go_default_library", "//pkg/scheduler/framework/runtime:go_default_library", "//pkg/scheduler/framework/v1alpha1:go_default_library", "//pkg/scheduler/framework/v1alpha1/fake:go_default_library", @@ -66,7 
+52,6 @@ go_test( "//pkg/scheduler/testing:go_default_library", "//pkg/scheduler/util:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", - "//staging/src/k8s.io/api/policy/v1beta1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", @@ -75,9 +60,6 @@ go_test( "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//staging/src/k8s.io/client-go/informers:go_default_library", "//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library", - "//staging/src/k8s.io/client-go/testing:go_default_library", - "//staging/src/k8s.io/client-go/tools/events:go_default_library", - "//staging/src/k8s.io/kube-scheduler/extender/v1:go_default_library", ], ) diff --git a/pkg/scheduler/core/extender_test.go b/pkg/scheduler/core/extender_test.go index 53367786048..919fa283136 100644 --- a/pkg/scheduler/core/extender_test.go +++ b/pkg/scheduler/core/extender_test.go @@ -260,7 +260,7 @@ func TestGenericSchedulerWithExtenders(t *testing.T) { client := clientsetfake.NewSimpleClientset() informerFactory := informers.NewSharedInformerFactory(client, 0) - extenders := []framework.Extender{} + var extenders []framework.Extender for ii := range test.extenders { extenders = append(extenders, &test.extenders[ii]) } @@ -268,9 +268,12 @@ func TestGenericSchedulerWithExtenders(t *testing.T) { for _, name := range test.nodes { cache.AddNode(createNode(name)) } - queue := internalqueue.NewSchedulingQueue(nil) - fwk, err := st.NewFramework(test.registerPlugins, runtime.WithClientSet(client)) + fwk, err := st.NewFramework( + test.registerPlugins, + runtime.WithClientSet(client), + runtime.WithPodNominator(internalqueue.NewPodNominator()), + ) if err != nil { t.Fatal(err) } @@ -280,7 +283,6 @@ func TestGenericSchedulerWithExtenders(t *testing.T) { scheduler := NewGenericScheduler( cache, - queue, emptySnapshot, extenders, informerFactory.Core().V1().PersistentVolumeClaims().Lister(), diff --git a/pkg/scheduler/core/generic_scheduler.go b/pkg/scheduler/core/generic_scheduler.go index a3e0f6c056a..e6c2bcb3956 100644 --- a/pkg/scheduler/core/generic_scheduler.go +++ b/pkg/scheduler/core/generic_scheduler.go @@ -19,7 +19,6 @@ package core import ( "context" "fmt" - "math" "math/rand" "sort" "strings" @@ -30,14 +29,9 @@ import ( "k8s.io/klog/v2" v1 "k8s.io/api/core/v1" - policy "k8s.io/api/policy/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - utilfeature "k8s.io/apiserver/pkg/util/feature" corelisters "k8s.io/client-go/listers/core/v1" extenderv1 "k8s.io/kube-scheduler/extender/v1" podutil "k8s.io/kubernetes/pkg/api/v1/pod" - kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework/runtime" framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1" internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" @@ -120,7 +114,6 @@ type ScheduleResult struct { type genericScheduler struct { cache internalcache.Cache - podNominator framework.PodNominator extenders []framework.Extender nodeInfoSnapshot *internalcache.Snapshot pvcLister corelisters.PersistentVolumeClaimLister @@ -233,169 +226,6 @@ func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (st return selected, nil } -// Preempt finds nodes with pods that can be preempted to make room for "pod" to -// schedule. 
It chooses one of the nodes and preempts the pods on the node and -// returns 1) the node, 2) the list of preempted pods if such a node is found, -// 3) A list of pods whose nominated node name should be cleared, and 4) any -// possible error. -// Preempt does not update its snapshot. It uses the same snapshot used in the -// scheduling cycle. This is to avoid a scenario where preempt finds feasible -// nodes without preempting any pod. When there are many pending pods in the -// scheduling queue a nominated pod will go back to the queue and behind -// other pods with the same priority. The nominated pod prevents other pods from -// using the nominated resources and the nominated pod could take a long time -// before it is retried after many other pending pods. -func Preempt(ctx context.Context, fh framework.FrameworkHandle, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusMap) (string, error) { - cs := fh.ClientSet() - // TODO(Huang-Wei): get pod from informer cache instead of API server. - pod, err := util.GetUpdatedPod(cs, pod) - if err != nil { - klog.Errorf("Error getting the updated preemptor pod object: %v", err) - return "", err - } - - if !podEligibleToPreemptOthers(pod, fh.SnapshotSharedLister().NodeInfos()) { - klog.V(5).Infof("Pod %v/%v is not eligible for more preemption.", pod.Namespace, pod.Name) - return "", nil - } - allNodes, err := fh.SnapshotSharedLister().NodeInfos().List() - if err != nil { - return "", err - } - if len(allNodes) == 0 { - return "", ErrNoNodesAvailable - } - potentialNodes := nodesWherePreemptionMightHelp(allNodes, m) - if len(potentialNodes) == 0 { - klog.V(3).Infof("Preemption will not help schedule pod %v/%v on any node.", pod.Namespace, pod.Name) - // In this case, we should clean-up any existing nominated node name of the pod. - if err := util.ClearNominatedNodeName(cs, pod); err != nil { - klog.Errorf("Cannot clear 'NominatedNodeName' field of pod %v/%v: %v", pod.Namespace, pod.Name, err) - // We do not return as this error is not critical. - } - return "", nil - } - if klog.V(5).Enabled() { - var sample []string - for i := 0; i < 10 && i < len(potentialNodes); i++ { - sample = append(sample, potentialNodes[i].Node().Name) - } - klog.Infof("%v potential nodes for preemption, first %v are: %v", len(potentialNodes), len(sample), sample) - } - pdbs, err := getPodDisruptionBudgets(fh) - if err != nil { - return "", err - } - nodeNameToVictims, err := selectNodesForPreemption(ctx, fh.PreemptHandle(), fh.PreemptHandle(), state, pod, potentialNodes, pdbs) - if err != nil { - return "", err - } - - // We will only check nodeNameToVictims with extenders that support preemption. - // Extenders which do not support preemption may later prevent preemptor from being scheduled on the nominated - // node. In that case, scheduler will find a different host for the preemptor in subsequent scheduling cycles. 
- nodeNameToVictims, err = processPreemptionWithExtenders(fh, pod, nodeNameToVictims) - if err != nil { - return "", err - } - - candidateNode := pickOneNodeForPreemption(nodeNameToVictims) - if len(candidateNode) == 0 { - return "", nil - } - - victims := nodeNameToVictims[candidateNode].Pods - for _, victim := range victims { - if err := util.DeletePod(cs, victim); err != nil { - klog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) - return "", err - } - // If the victim is a WaitingPod, send a reject message to the PermitPlugin - if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil { - waitingPod.Reject("preempted") - } - fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by %v/%v on node %v", pod.Namespace, pod.Name, candidateNode) - } - metrics.PreemptionVictims.Observe(float64(len(victims))) - - // Lower priority pods nominated to run on this node, may no longer fit on - // this node. So, we should remove their nomination. Removing their - // nomination updates these pods and moves them to the active queue. It - // lets scheduler find another place for them. - nominatedPods := getLowerPriorityNominatedPods(fh.PreemptHandle(), pod, candidateNode) - if err := util.ClearNominatedNodeName(cs, nominatedPods...); err != nil { - klog.Errorf("Cannot clear 'NominatedNodeName' field: %v", err) - // We do not return as this error is not critical. - } - - return candidateNode, nil -} - -func getPodDisruptionBudgets(fh framework.FrameworkHandle) ([]*policy.PodDisruptionBudget, error) { - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodDisruptionBudget) { - return fh.SharedInformerFactory().Policy().V1beta1().PodDisruptionBudgets().Lister().List(labels.Everything()) - } - return nil, nil -} - -// processPreemptionWithExtenders processes preemption with extenders -func processPreemptionWithExtenders(fh framework.FrameworkHandle, pod *v1.Pod, nodeNameToVictims map[string]*extenderv1.Victims) (map[string]*extenderv1.Victims, error) { - if len(nodeNameToVictims) > 0 { - for _, extender := range fh.PreemptHandle().Extenders() { - if extender.SupportsPreemption() && extender.IsInterested(pod) { - newNodeNameToVictims, err := extender.ProcessPreemption( - pod, - nodeNameToVictims, - fh.SnapshotSharedLister().NodeInfos(), - ) - if err != nil { - if extender.IsIgnorable() { - klog.Warningf("Skipping extender %v as it returned error %v and has ignorable flag set", - extender, err) - continue - } - return nil, err - } - - // Replace nodeNameToVictims with new result after preemption. So the - // rest of extenders can continue use it as parameter. - nodeNameToVictims = newNodeNameToVictims - - // If node list becomes empty, no preemption can happen regardless of other extenders. - if len(nodeNameToVictims) == 0 { - break - } - } - } - } - - return nodeNameToVictims, nil -} - -// getLowerPriorityNominatedPods returns pods whose priority is smaller than the -// priority of the given "pod" and are nominated to run on the given node. -// Note: We could possibly check if the nominated lower priority pods still fit -// and return those that no longer fit, but that would require lots of -// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be -// worth the complexity, especially because we generally expect to have a very -// small number of nominated pods per node. 
-func getLowerPriorityNominatedPods(pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod { - pods := pn.NominatedPodsForNode(nodeName) - - if len(pods) == 0 { - return nil - } - - var lowerPriorityPods []*v1.Pod - podPriority := podutil.GetPodPriority(pod) - for _, p := range pods { - if podutil.GetPodPriority(p) < podPriority { - lowerPriorityPods = append(lowerPriorityPods, p) - } - } - return lowerPriorityPods -} - // numFeasibleNodesToFind returns the number of feasible nodes that once found, the scheduler stops // its search for more feasible nodes. func (g *genericScheduler) numFeasibleNodesToFind(numAllNodes int32) (numNodes int32) { @@ -472,7 +302,7 @@ func (g *genericScheduler) findNodesThatPassFilters(ctx context.Context, prof *p // We check the nodes starting from where we left off in the previous scheduling cycle, // this is to make sure all nodes have the same chance of being examined across pods. nodeInfo := allNodes[(g.nextStartNodeIndex+i)%len(allNodes)] - fits, status, err := podPassesFiltersOnNode(ctx, prof, g.podNominator, state, pod, nodeInfo) + fits, status, err := PodPassesFiltersOnNode(ctx, prof.PreemptHandle(), state, pod, nodeInfo) if err != nil { errCh.SendErrorWithCancel(err, cancel) return @@ -550,12 +380,12 @@ func (g *genericScheduler) findNodesThatPassExtenders(pod *v1.Pod, filtered []*v // addNominatedPods adds pods with equal or greater priority which are nominated // to run on the node. It returns 1) whether any pod was added, 2) augmented cycleState, // 3) augmented nodeInfo. -func addNominatedPods(ctx context.Context, pr framework.PluginsRunner, nominator framework.PodNominator, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) { - if nominator == nil || nodeInfo == nil || nodeInfo.Node() == nil { +func addNominatedPods(ctx context.Context, ph framework.PreemptHandle, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) { + if ph == nil || nodeInfo == nil || nodeInfo.Node() == nil { // This may happen only in tests. return false, state, nodeInfo, nil } - nominatedPods := nominator.NominatedPodsForNode(nodeInfo.Node().Name) + nominatedPods := ph.NominatedPodsForNode(nodeInfo.Node().Name) if len(nominatedPods) == 0 { return false, state, nodeInfo, nil } @@ -565,7 +395,7 @@ func addNominatedPods(ctx context.Context, pr framework.PluginsRunner, nominator for _, p := range nominatedPods { if podutil.GetPodPriority(p) >= podutil.GetPodPriority(pod) && p.UID != pod.UID { nodeInfoOut.AddPod(p) - status := pr.RunPreFilterExtensionAddPod(ctx, stateOut, pod, p, nodeInfoOut) + status := ph.RunPreFilterExtensionAddPod(ctx, stateOut, pod, p, nodeInfoOut) if !status.IsSuccess() { return false, state, nodeInfo, status.AsError() } @@ -575,7 +405,7 @@ func addNominatedPods(ctx context.Context, pr framework.PluginsRunner, nominator return podsAdded, stateOut, nodeInfoOut, nil } -// podPassesFiltersOnNode checks whether a node given by NodeInfo satisfies the +// PodPassesFiltersOnNode checks whether a node given by NodeInfo satisfies the // filter plugins. // This function is called from two different places: Schedule and Preempt. // When it is called from Schedule, we want to test whether the pod is @@ -585,10 +415,10 @@ func addNominatedPods(ctx context.Context, pr framework.PluginsRunner, nominator // and add the nominated pods. Removal of the victims is done by // SelectVictimsOnNode(). 
Preempt removes victims from PreFilter state and // NodeInfo before calling this function. -func podPassesFiltersOnNode( +// TODO: move this out so that plugins don't need to depend on pkg. +func PodPassesFiltersOnNode( ctx context.Context, - pr framework.PluginsRunner, - nominator framework.PodNominator, + ph framework.PreemptHandle, state *framework.CycleState, pod *v1.Pod, info *framework.NodeInfo, @@ -619,7 +449,7 @@ func podPassesFiltersOnNode( nodeInfoToUse := info if i == 0 { var err error - podsAdded, stateToUse, nodeInfoToUse, err = addNominatedPods(ctx, pr, nominator, pod, state, info) + podsAdded, stateToUse, nodeInfoToUse, err = addNominatedPods(ctx, ph, pod, state, info) if err != nil { return false, nil, err } @@ -627,7 +457,7 @@ func podPassesFiltersOnNode( break } - statusMap := pr.RunFilterPlugins(ctx, stateToUse, pod, nodeInfoToUse) + statusMap := ph.RunFilterPlugins(ctx, stateToUse, pod, nodeInfoToUse) status = statusMap.Merge() if !status.IsSuccess() && !status.IsUnschedulable() { return false, status, status.AsError() @@ -738,377 +568,6 @@ func (g *genericScheduler) prioritizeNodes( return result, nil } -// pickOneNodeForPreemption chooses one node among the given nodes. It assumes -// pods in each map entry are ordered by decreasing priority. -// It picks a node based on the following criteria: -// 1. A node with minimum number of PDB violations. -// 2. A node with minimum highest priority victim is picked. -// 3. Ties are broken by sum of priorities of all victims. -// 4. If there are still ties, node with the minimum number of victims is picked. -// 5. If there are still ties, node with the latest start time of all highest priority victims is picked. -// 6. If there are still ties, the first such node is picked (sort of randomly). -// The 'minNodes1' and 'minNodes2' are being reused here to save the memory -// allocation and garbage collection time. -func pickOneNodeForPreemption(nodesToVictims map[string]*extenderv1.Victims) string { - if len(nodesToVictims) == 0 { - return "" - } - minNumPDBViolatingPods := int64(math.MaxInt32) - var minNodes1 []string - lenNodes1 := 0 - for node, victims := range nodesToVictims { - if len(victims.Pods) == 0 { - // We found a node that doesn't need any preemption. Return it! - // This should happen rarely when one or more pods are terminated between - // the time that scheduler tries to schedule the pod and the time that - // preemption logic tries to find nodes for preemption. - return node - } - numPDBViolatingPods := victims.NumPDBViolations - if numPDBViolatingPods < minNumPDBViolatingPods { - minNumPDBViolatingPods = numPDBViolatingPods - minNodes1 = nil - lenNodes1 = 0 - } - if numPDBViolatingPods == minNumPDBViolatingPods { - minNodes1 = append(minNodes1, node) - lenNodes1++ - } - } - if lenNodes1 == 1 { - return minNodes1[0] - } - - // There are more than one node with minimum number PDB violating pods. Find - // the one with minimum highest priority victim. - minHighestPriority := int32(math.MaxInt32) - var minNodes2 = make([]string, lenNodes1) - lenNodes2 := 0 - for i := 0; i < lenNodes1; i++ { - node := minNodes1[i] - victims := nodesToVictims[node] - // highestPodPriority is the highest priority among the victims on this node. 
- highestPodPriority := podutil.GetPodPriority(victims.Pods[0]) - if highestPodPriority < minHighestPriority { - minHighestPriority = highestPodPriority - lenNodes2 = 0 - } - if highestPodPriority == minHighestPriority { - minNodes2[lenNodes2] = node - lenNodes2++ - } - } - if lenNodes2 == 1 { - return minNodes2[0] - } - - // There are a few nodes with minimum highest priority victim. Find the - // smallest sum of priorities. - minSumPriorities := int64(math.MaxInt64) - lenNodes1 = 0 - for i := 0; i < lenNodes2; i++ { - var sumPriorities int64 - node := minNodes2[i] - for _, pod := range nodesToVictims[node].Pods { - // We add MaxInt32+1 to all priorities to make all of them >= 0. This is - // needed so that a node with a few pods with negative priority is not - // picked over a node with a smaller number of pods with the same negative - // priority (and similar scenarios). - sumPriorities += int64(podutil.GetPodPriority(pod)) + int64(math.MaxInt32+1) - } - if sumPriorities < minSumPriorities { - minSumPriorities = sumPriorities - lenNodes1 = 0 - } - if sumPriorities == minSumPriorities { - minNodes1[lenNodes1] = node - lenNodes1++ - } - } - if lenNodes1 == 1 { - return minNodes1[0] - } - - // There are a few nodes with minimum highest priority victim and sum of priorities. - // Find one with the minimum number of pods. - minNumPods := math.MaxInt32 - lenNodes2 = 0 - for i := 0; i < lenNodes1; i++ { - node := minNodes1[i] - numPods := len(nodesToVictims[node].Pods) - if numPods < minNumPods { - minNumPods = numPods - lenNodes2 = 0 - } - if numPods == minNumPods { - minNodes2[lenNodes2] = node - lenNodes2++ - } - } - if lenNodes2 == 1 { - return minNodes2[0] - } - - // There are a few nodes with same number of pods. - // Find the node that satisfies latest(earliestStartTime(all highest-priority pods on node)) - latestStartTime := util.GetEarliestPodStartTime(nodesToVictims[minNodes2[0]]) - if latestStartTime == nil { - // If the earliest start time of all pods on the 1st node is nil, just return it, - // which is not expected to happen. - klog.Errorf("earliestStartTime is nil for node %s. Should not reach here.", minNodes2[0]) - return minNodes2[0] - } - nodeToReturn := minNodes2[0] - for i := 1; i < lenNodes2; i++ { - node := minNodes2[i] - // Get earliest start time of all pods on the current node. - earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node]) - if earliestStartTimeOnNode == nil { - klog.Errorf("earliestStartTime is nil for node %s. Should not reach here.", node) - continue - } - if earliestStartTimeOnNode.After(latestStartTime.Time) { - latestStartTime = earliestStartTimeOnNode - nodeToReturn = node - } - } - - return nodeToReturn -} - -// selectNodesForPreemption finds all the nodes with possible victims for -// preemption in parallel. 
-func selectNodesForPreemption( - ctx context.Context, - pr framework.PluginsRunner, - nominator framework.PodNominator, - state *framework.CycleState, - pod *v1.Pod, - potentialNodes []*framework.NodeInfo, - pdbs []*policy.PodDisruptionBudget, -) (map[string]*extenderv1.Victims, error) { - nodeNameToVictims := map[string]*extenderv1.Victims{} - var resultLock sync.Mutex - - checkNode := func(i int) { - nodeInfoCopy := potentialNodes[i].Clone() - stateCopy := state.Clone() - pods, numPDBViolations, fits := selectVictimsOnNode(ctx, pr, nominator, stateCopy, pod, nodeInfoCopy, pdbs) - if fits { - resultLock.Lock() - victims := extenderv1.Victims{ - Pods: pods, - NumPDBViolations: int64(numPDBViolations), - } - nodeNameToVictims[potentialNodes[i].Node().Name] = &victims - resultLock.Unlock() - } - } - parallelize.Until(ctx, len(potentialNodes), checkNode) - return nodeNameToVictims, nil -} - -// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods" -// and "nonViolatingPods" based on whether their PDBs will be violated if they are -// preempted. -// This function is stable and does not change the order of received pods. So, if it -// receives a sorted list, grouping will preserve the order of the input list. -func filterPodsWithPDBViolation(pods []*v1.Pod, pdbs []*policy.PodDisruptionBudget) (violatingPods, nonViolatingPods []*v1.Pod) { - pdbsAllowed := make([]int32, len(pdbs)) - for i, pdb := range pdbs { - pdbsAllowed[i] = pdb.Status.DisruptionsAllowed - } - - for _, obj := range pods { - pod := obj - pdbForPodIsViolated := false - // A pod with no labels will not match any PDB. So, no need to check. - if len(pod.Labels) != 0 { - for i, pdb := range pdbs { - if pdb.Namespace != pod.Namespace { - continue - } - selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector) - if err != nil { - continue - } - // A PDB with a nil or empty selector matches nothing. - if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) { - continue - } - - // Existing in DisruptedPods means it has been processed in API server, - // we don't treat it as a violating case. - if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist { - continue - } - // Only decrement the matched pdb when it's not in its ; - // otherwise we may over-decrement the budget number. - pdbsAllowed[i]-- - // We have found a matching PDB. - if pdbsAllowed[i] < 0 { - pdbForPodIsViolated = true - } - } - } - if pdbForPodIsViolated { - violatingPods = append(violatingPods, pod) - } else { - nonViolatingPods = append(nonViolatingPods, pod) - } - } - return violatingPods, nonViolatingPods -} - -// selectVictimsOnNode finds minimum set of pods on the given node that should -// be preempted in order to make enough room for "pod" to be scheduled. The -// minimum set selected is subject to the constraint that a higher-priority pod -// is never preempted when a lower-priority pod could be (higher/lower relative -// to one another, not relative to the preemptor "pod"). -// The algorithm first checks if the pod can be scheduled on the node when all the -// lower priority pods are gone. If so, it sorts all the lower priority pods by -// their priority and then puts them into two groups of those whose PodDisruptionBudget -// will be violated if preempted and other non-violating pods. Both groups are -// sorted by priority. It first tries to reprieve as many PDB violating pods as -// possible and then does them same for non-PDB-violating pods while checking -// that the "pod" can still fit on the node. 
-// NOTE: This function assumes that it is never called if "pod" cannot be scheduled -// due to pod affinity, node affinity, or node anti-affinity reasons. None of -// these predicates can be satisfied by removing more pods from the node. -func selectVictimsOnNode( - ctx context.Context, - pr framework.PluginsRunner, - nominator framework.PodNominator, - state *framework.CycleState, - pod *v1.Pod, - nodeInfo *framework.NodeInfo, - pdbs []*policy.PodDisruptionBudget, -) ([]*v1.Pod, int, bool) { - var potentialVictims []*v1.Pod - - removePod := func(rp *v1.Pod) error { - if err := nodeInfo.RemovePod(rp); err != nil { - return err - } - status := pr.RunPreFilterExtensionRemovePod(ctx, state, pod, rp, nodeInfo) - if !status.IsSuccess() { - return status.AsError() - } - return nil - } - addPod := func(ap *v1.Pod) error { - nodeInfo.AddPod(ap) - status := pr.RunPreFilterExtensionAddPod(ctx, state, pod, ap, nodeInfo) - if !status.IsSuccess() { - return status.AsError() - } - return nil - } - // As the first step, remove all the lower priority pods from the node and - // check if the given pod can be scheduled. - podPriority := podutil.GetPodPriority(pod) - for _, p := range nodeInfo.Pods { - if podutil.GetPodPriority(p.Pod) < podPriority { - potentialVictims = append(potentialVictims, p.Pod) - if err := removePod(p.Pod); err != nil { - return nil, 0, false - } - } - } - // If the new pod does not fit after removing all the lower priority pods, - // we are almost done and this node is not suitable for preemption. The only - // condition that we could check is if the "pod" is failing to schedule due to - // inter-pod affinity to one or more victims, but we have decided not to - // support this case for performance reasons. Having affinity to lower - // priority pods is not a recommended configuration anyway. - if fits, _, err := podPassesFiltersOnNode(ctx, pr, nominator, state, pod, nodeInfo); !fits { - if err != nil { - klog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err) - } - - return nil, 0, false - } - var victims []*v1.Pod - numViolatingVictim := 0 - sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i], potentialVictims[j]) }) - // Try to reprieve as many pods as possible. We first try to reprieve the PDB - // violating victims and then other non-violating ones. In both cases, we start - // from the highest priority victims. - violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs) - reprievePod := func(p *v1.Pod) (bool, error) { - if err := addPod(p); err != nil { - return false, err - } - fits, _, _ := podPassesFiltersOnNode(ctx, pr, nominator, state, pod, nodeInfo) - if !fits { - if err := removePod(p); err != nil { - return false, err - } - victims = append(victims, p) - klog.V(5).Infof("Pod %v/%v is a potential preemption victim on node %v.", p.Namespace, p.Name, nodeInfo.Node().Name) - } - return fits, nil - } - for _, p := range violatingVictims { - if fits, err := reprievePod(p); err != nil { - klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err) - return nil, 0, false - } else if !fits { - numViolatingVictim++ - } - } - // Now we try to reprieve non-violating victims. 
- for _, p := range nonViolatingVictims { - if _, err := reprievePod(p); err != nil { - klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err) - return nil, 0, false - } - } - return victims, numViolatingVictim, true -} - -// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates -// that may be satisfied by removing pods from the node. -func nodesWherePreemptionMightHelp(nodes []*framework.NodeInfo, m framework.NodeToStatusMap) []*framework.NodeInfo { - var potentialNodes []*framework.NodeInfo - for _, node := range nodes { - name := node.Node().Name - // We reply on the status by each plugin - 'Unschedulable' or 'UnschedulableAndUnresolvable' - // to determine whether preemption may help or not on the node. - if m[name].Code() == framework.UnschedulableAndUnresolvable { - continue - } - potentialNodes = append(potentialNodes, node) - } - return potentialNodes -} - -// podEligibleToPreemptOthers determines whether this pod should be considered -// for preempting other pods or not. If this pod has already preempted other -// pods and those are in their graceful termination period, it shouldn't be -// considered for preemption. -// We look at the node that is nominated for this pod and as long as there are -// terminating pods on the node, we don't consider this for preempting more pods. -func podEligibleToPreemptOthers(pod *v1.Pod, nodeInfos framework.NodeInfoLister) bool { - if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever { - klog.V(5).Infof("Pod %v/%v is not eligible for preemption because it has a preemptionPolicy of %v", pod.Namespace, pod.Name, v1.PreemptNever) - return false - } - nomNodeName := pod.Status.NominatedNodeName - if len(nomNodeName) > 0 { - if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil { - podPriority := podutil.GetPodPriority(pod) - for _, p := range nodeInfo.Pods { - if p.Pod.DeletionTimestamp != nil && podutil.GetPodPriority(p.Pod) < podPriority { - // There is a terminating pod on the nominated node. - return false - } - } - } - } - return true -} - // podPassesBasicChecks makes sanity checks on the pod if it can be scheduled. func podPassesBasicChecks(pod *v1.Pod, pvcLister corelisters.PersistentVolumeClaimLister) error { // Check PVCs used by the pod @@ -1138,7 +597,6 @@ func podPassesBasicChecks(pod *v1.Pod, pvcLister corelisters.PersistentVolumeCla // NewGenericScheduler creates a genericScheduler object. 
func NewGenericScheduler( cache internalcache.Cache, - podNominator framework.PodNominator, nodeInfoSnapshot *internalcache.Snapshot, extenders []framework.Extender, pvcLister corelisters.PersistentVolumeClaimLister, @@ -1146,7 +604,6 @@ func NewGenericScheduler( percentageOfNodesToScore int32) ScheduleAlgorithm { return &genericScheduler{ cache: cache, - podNominator: podNominator, extenders: extenders, nodeInfoSnapshot: nodeInfoSnapshot, pvcLister: pvcLister, diff --git a/pkg/scheduler/core/generic_scheduler_test.go b/pkg/scheduler/core/generic_scheduler_test.go index 0a4ccbc08dd..4ac535dbb61 100644 --- a/pkg/scheduler/core/generic_scheduler_test.go +++ b/pkg/scheduler/core/generic_scheduler_test.go @@ -22,12 +22,10 @@ import ( "math" "reflect" "strconv" - "strings" "testing" "time" v1 "k8s.io/api/core/v1" - policy "k8s.io/api/policy/v1beta1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -36,24 +34,12 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" clientsetfake "k8s.io/client-go/kubernetes/fake" - clienttesting "k8s.io/client-go/testing" - "k8s.io/client-go/tools/events" - extenderv1 "k8s.io/kube-scheduler/extender/v1" - volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpodtopologyspread" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodelabel" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1" fakeframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1/fake" @@ -706,7 +692,11 @@ func TestGenericScheduler(t *testing.T) { } snapshot := internalcache.NewSnapshot(test.pods, nodes) - fwk, err := st.NewFramework(test.registerPlugins, frameworkruntime.WithSnapshotSharedLister(snapshot)) + fwk, err := st.NewFramework( + test.registerPlugins, + frameworkruntime.WithSnapshotSharedLister(snapshot), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), + ) if err != nil { t.Fatal(err) } @@ -720,7 +710,6 @@ func TestGenericScheduler(t *testing.T) { scheduler := NewGenericScheduler( cache, - internalqueue.NewSchedulingQueue(nil), snapshot, []framework.Extender{}, pvcLister, @@ -749,7 +738,6 @@ func makeScheduler(nodes []*v1.Node) *genericScheduler { s := NewGenericScheduler( cache, - internalqueue.NewSchedulingQueue(nil), emptySnapshot, nil, nil, false, schedulerapi.DefaultPercentageOfNodesToScore) @@ -757,28 +745,22 @@ func makeScheduler(nodes []*v1.Node) *genericScheduler { return s.(*genericScheduler) } -func makeProfile(fns ...st.RegisterPluginFunc) (*profile.Profile, error) { - fwk, err := st.NewFramework(fns) - if err 
!= nil { - return nil, err - } - return &profile.Profile{ - Framework: fwk, - }, nil -} - func TestFindFitAllError(t *testing.T) { nodes := makeNodeList([]string{"3", "2", "1"}) scheduler := makeScheduler(nodes) - prof, err := makeProfile( - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), ) if err != nil { t.Fatal(err) } + prof := &profile.Profile{Framework: fwk} _, nodeToStatusMap, err := scheduler.findNodesThatFitPod(context.Background(), prof, framework.NewCycleState(), &v1.Pod{}) @@ -807,15 +789,19 @@ func TestFindFitAllError(t *testing.T) { func TestFindFitSomeError(t *testing.T) { nodes := makeNodeList([]string{"3", "2", "1"}) scheduler := makeScheduler(nodes) - prof, err := makeProfile( - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), ) if err != nil { t.Fatal(err) } + prof := &profile.Profile{Framework: fwk} pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}} _, nodeToStatusMap, err := scheduler.findNodesThatFitPod(context.Background(), prof, framework.NewCycleState(), pod) @@ -878,16 +864,20 @@ func TestFindFitPredicateCallCounts(t *testing.T) { registerFakeFilterFunc, st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), } - prof, err := makeProfile(registerPlugins...) 
+ fwk, err := st.NewFramework( + registerPlugins, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), + ) if err != nil { t.Fatal(err) } + prof := &profile.Profile{Framework: fwk} scheduler := makeScheduler(nodes) if err := scheduler.cache.UpdateSnapshot(scheduler.nodeInfoSnapshot); err != nil { t.Fatal(err) } - scheduler.podNominator.AddNominatedPod(&v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "nominated"}, Spec: v1.PodSpec{Priority: &midPriority}}, "1") + fwk.PreemptHandle().AddNominatedPod(&v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "nominated"}, Spec: v1.PodSpec{Priority: &midPriority}}, "1") _, _, err = scheduler.findNodesThatFitPod(context.Background(), prof, framework.NewCycleState(), test.pod) @@ -1032,6 +1022,7 @@ func TestZeroRequest(t *testing.T) { frameworkruntime.WithInformerFactory(informerFactory), frameworkruntime.WithSnapshotSharedLister(snapshot), frameworkruntime.WithClientSet(client), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), ) if err != nil { t.Fatalf("error creating framework: %+v", err) @@ -1039,7 +1030,6 @@ func TestZeroRequest(t *testing.T) { prof := &profile.Profile{Framework: fwk} scheduler := NewGenericScheduler( - nil, nil, emptySnapshot, []framework.Extender{}, @@ -1068,1368 +1058,7 @@ func TestZeroRequest(t *testing.T) { } } -func printNodeNameToVictims(nodeNameToVictims map[string]*extenderv1.Victims) string { - var output string - for nodeName, victims := range nodeNameToVictims { - output += nodeName + ": [" - for _, pod := range victims.Pods { - output += pod.Name + ", " - } - output += "]" - } - return output -} - -type victims struct { - pods sets.String - numPDBViolations int64 -} - -func checkPreemptionVictims(expected map[string]victims, nodeToPods map[string]*extenderv1.Victims) error { - if len(expected) == len(nodeToPods) { - for k, victims := range nodeToPods { - if expVictims, ok := expected[k]; ok { - if len(victims.Pods) != len(expVictims.pods) { - return fmt.Errorf("unexpected number of pods. expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) - } - prevPriority := int32(math.MaxInt32) - for _, p := range victims.Pods { - // Check that pods are sorted by their priority. - if *p.Spec.Priority > prevPriority { - return fmt.Errorf("pod %v of node %v was not sorted by priority", p.Name, k) - } - prevPriority = *p.Spec.Priority - if !expVictims.pods.Has(p.Name) { - return fmt.Errorf("pod %v was not expected. Expected: %v", p.Name, expVictims.pods) - } - } - if expVictims.numPDBViolations != victims.NumPDBViolations { - return fmt.Errorf("unexpected numPDBViolations. expected: %d, got: %d", expVictims.numPDBViolations, victims.NumPDBViolations) - } - } else { - return fmt.Errorf("unexpected machines. expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) - } - } - } else { - return fmt.Errorf("unexpected number of machines. 
expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) - } - return nil -} - -var smallContainers = []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest, 10)), - }, - }, - }, -} -var mediumContainers = []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest*2, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest*2, 10)), - }, - }, - }, -} -var largeContainers = []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest*3, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest*3, 10)), - }, - }, - }, -} -var veryLargeContainers = []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest*5, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest*5, 10)), - }, - }, - }, -} -var negPriority, lowPriority, midPriority, highPriority, veryHighPriority = int32(-100), int32(0), int32(100), int32(1000), int32(10000) - -var startTime = metav1.Date(2019, 1, 1, 1, 1, 1, 0, time.UTC) - -var startTime20190102 = metav1.Date(2019, 1, 2, 1, 1, 1, 0, time.UTC) -var startTime20190103 = metav1.Date(2019, 1, 3, 1, 1, 1, 0, time.UTC) -var startTime20190104 = metav1.Date(2019, 1, 4, 1, 1, 1, 0, time.UTC) -var startTime20190105 = metav1.Date(2019, 1, 5, 1, 1, 1, 0, time.UTC) -var startTime20190106 = metav1.Date(2019, 1, 6, 1, 1, 1, 0, time.UTC) -var startTime20190107 = metav1.Date(2019, 1, 7, 1, 1, 1, 0, time.UTC) - -// TestSelectNodesForPreemption tests selectNodesForPreemption. This test assumes -// that podsFitsOnNode works correctly and is tested separately. 
-func TestSelectNodesForPreemption(t *testing.T) { - tests := []struct { - name string - registerPlugins []st.RegisterPluginFunc - nodes []string - pod *v1.Pod - pods []*v1.Pod - pdbs []*policy.PodDisruptionBudget - filterReturnCode framework.Code - expected map[string]victims - expectedNumFilterCalled int32 - }{ - { - name: "a pod that does not fit on any machine", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new", UID: types.UID("new")}, Spec: v1.PodSpec{Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{}, - expectedNumFilterCalled: 2, - }, - { - name: "a pod that fits with no preemption", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new", UID: types.UID("new")}, Spec: v1.PodSpec{Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {}, "machine2": {}}, - expectedNumFilterCalled: 4, - }, - { - name: "a pod that fits on one machine with no preemption", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {}}, - expectedNumFilterCalled: 3, - }, - { - name: "a pod that fits on both machines when lower priority pods are preempted", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: 
"machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a")}, "machine2": {pods: sets.NewString("b")}}, - expectedNumFilterCalled: 4, - }, - { - name: "a pod that would fit on the machines, but other pods running are higher priority", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &lowPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{}, - expectedNumFilterCalled: 2, - }, - { - name: "medium priority pod is preempted, but lower priority one stays as it is small", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("b")}, "machine2": {pods: sets.NewString("c")}}, - expectedNumFilterCalled: 5, - }, - { - name: "mixed priority pods are preempted", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: 
"machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("b", "c")}}, - expectedNumFilterCalled: 5, - }, - { - name: "mixed priority pods are preempted, pick later StartTime one when priorities are equal", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, - {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, - {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190103}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a", "c")}}, - expectedNumFilterCalled: 5, - }, - { - name: "pod with anti-affinity is preempted", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterPluginAsExtensions(interpodaffinity.Name, interpodaffinity.New, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{ - Name: "machine1", - Labels: map[string]string{"pod": "preemptor"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1", Affinity: &v1.Affinity{ - PodAntiAffinity: &v1.PodAntiAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ - { - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "pod", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"preemptor", "value2"}, - }, - }, - }, - TopologyKey: "hostname", - }, - }, - }}}}, - {ObjectMeta: 
metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a")}, "machine2": {}}, - expectedNumFilterCalled: 4, - }, - { - name: "preemption to resolve even pods spread FitError", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions( - podtopologyspread.Name, - podtopologyspread.New, - "PreFilter", - "Filter", - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "p", - Labels: map[string]string{"foo": ""}, - }, - Spec: v1.PodSpec{ - Priority: &highPriority, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: "zone", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - { - MaxSkew: 1, - TopologyKey: "hostname", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - }, - }, - }, - pods: []*v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-a", Priority: &midPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-a", Priority: &lowPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - }, - expected: map[string]victims{ - "node-a": {pods: sets.NewString("pod-a2")}, - "node-b": {pods: sets.NewString("pod-b1")}, - }, - expectedNumFilterCalled: 6, - }, - { - name: "get Unschedulable in the preemption phase when the filter plugins filtering the nodes", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, - filterReturnCode: framework.Unschedulable, - expected: map[string]victims{}, - expectedNumFilterCalled: 2, - }, - { - name: "preemption with violation of same pdb", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, - pdbs: []*policy.PodDisruptionBudget{ - {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 1}}, - expectedNumFilterCalled: 3, - }, - { - name: "preemption with violation of the pdb with pod whose eviction was processed, the victim doesn't belong to DisruptedPods", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, - pdbs: []*policy.PodDisruptionBudget{ - {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"c": {Time: time.Now()}}}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 1}}, - expectedNumFilterCalled: 3, - }, - { - name: "preemption with violation of the pdb with pod whose eviction was processed, the victim belongs to DisruptedPods", - registerPlugins: []st.RegisterPluginFunc{ - 
st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, - pdbs: []*policy.PodDisruptionBudget{ - {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"b": {Time: time.Now()}}}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 0}}, - expectedNumFilterCalled: 3, - }, - { - name: "preemption with violation of the pdb with pod whose eviction was processed, the victim which belongs to DisruptedPods is treated as 'nonViolating'", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, - pdbs: []*policy.PodDisruptionBudget{ - {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"c": {Time: time.Now()}}}}}, - expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b", "c"), numPDBViolations: 1}}, - expectedNumFilterCalled: 4, - }, - } - labelKeys := []string{"hostname", "zone", "region"} - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - filterFailedNodeReturnCodeMap := map[string]framework.Code{} - cache := internalcache.New(time.Duration(0), wait.NeverStop) - for _, pod := range test.pods { - cache.AddPod(pod) - } - for _, name := range test.nodes { - filterFailedNodeReturnCodeMap[name] = test.filterReturnCode - cache.AddNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name, Labels: map[string]string{"hostname": name}}}) - } - - var nodes []*v1.Node - for _, n := range test.nodes { - node := 
makeNode(n, 1000*5, schedutil.DefaultMemoryRequest*5) - // if possible, split node name by '/' to form labels in a format of - // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} - node.ObjectMeta.Labels = make(map[string]string) - for i, label := range strings.Split(node.Name, "/") { - node.ObjectMeta.Labels[labelKeys[i]] = label - } - node.Name = node.ObjectMeta.Labels["hostname"] - nodes = append(nodes, node) - } - - // For each test, prepend a FakeFilterPlugin. - fakePlugin := st.FakeFilterPlugin{} - fakePlugin.FailedNodeReturnCodeMap = filterFailedNodeReturnCodeMap - registerFakeFilterFunc := st.RegisterFilterPlugin( - "FakeFilter", - func(_ runtime.Object, fh framework.FrameworkHandle) (framework.Plugin, error) { - return &fakePlugin, nil - }, - ) - registerPlugins := append([]st.RegisterPluginFunc{registerFakeFilterFunc}, test.registerPlugins...) - // Use a real snapshot since it's needed in some Filter Plugin (e.g., PodAffinity) - snapshot := internalcache.NewSnapshot(test.pods, nodes) - fwk, err := st.NewFramework(registerPlugins, frameworkruntime.WithSnapshotSharedLister(snapshot)) - if err != nil { - t.Fatal(err) - } - prof := &profile.Profile{Framework: fwk} - - scheduler := NewGenericScheduler( - nil, - internalqueue.NewSchedulingQueue(nil), - snapshot, - []framework.Extender{}, - nil, - false, - schedulerapi.DefaultPercentageOfNodesToScore) - g := scheduler.(*genericScheduler) - - assignDefaultStartTime(test.pods) - - state := framework.NewCycleState() - // Some tests rely on PreFilter plugin to compute its CycleState. - preFilterStatus := prof.RunPreFilterPlugins(context.Background(), state, test.pod) - if !preFilterStatus.IsSuccess() { - t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) - } - nodeInfos, err := nodesToNodeInfos(nodes, snapshot) - if err != nil { - t.Fatal(err) - } - nodeToPods, err := selectNodesForPreemption(context.Background(), prof, g.podNominator, state, test.pod, nodeInfos, test.pdbs) - if err != nil { - t.Error(err) - } - - if test.expectedNumFilterCalled != fakePlugin.NumFilterCalled { - t.Errorf("expected fakePlugin.numFilterCalled is %d, but got %d", test.expectedNumFilterCalled, fakePlugin.NumFilterCalled) - } - - if err := checkPreemptionVictims(test.expected, nodeToPods); err != nil { - t.Error(err) - } - }) - } -} - -// TestPickOneNodeForPreemption tests pickOneNodeForPreemption. 
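[Editor's note - not part of the patch] The deleted TestPickOneNodeForPreemption table that follows exercises the tie-breaking order used when several nodes are viable preemption targets: a node needing no eviction wins outright; otherwise candidates are compared by the lowest highest-priority victim, then the lowest sum of victim priorities (summed after shifting priorities to be non-negative, which is the "adjusted priorities" idea one of the in-table comments mentions), then the fewest victims, and finally the latest start time among the highest-priority victims. The standalone Go sketch below illustrates only that ordering; it is a simplification under stated assumptions, it omits the PDB-violation criterion the scheduler weighs before these tie-breakers, and it uses plain structs rather than *v1.Pod.

package main

import (
	"fmt"
	"time"
)

// victim is an illustrative stand-in for a pod that would be evicted on a node.
type victim struct {
	priority  int32
	startTime time.Time
}

// maxPriority returns the highest priority among the victims.
func maxPriority(vs []victim) int32 {
	m := vs[0].priority
	for _, v := range vs[1:] {
		if v.priority > m {
			m = v.priority
		}
	}
	return m
}

// sumPriorities sums victim priorities, shifted so negative priorities
// cannot offset positive ones ("adjusted priorities").
func sumPriorities(vs []victim) int64 {
	var sum int64
	for _, v := range vs {
		sum += int64(v.priority) + int64(1)<<31
	}
	return sum
}

// latestStartOfHighest returns the most recent start time among the
// highest-priority victims.
func latestStartOfHighest(vs []victim) time.Time {
	hp := maxPriority(vs)
	var latest time.Time
	for _, v := range vs {
		if v.priority == hp && v.startTime.After(latest) {
			latest = v.startTime
		}
	}
	return latest
}

// cheaper reports whether evicting victims a is preferable to evicting b,
// applying the tie-breaking order the test table exercises.
func cheaper(a, b []victim) bool {
	if ha, hb := maxPriority(a), maxPriority(b); ha != hb {
		return ha < hb // lower highest-priority victim is preferred
	}
	if sa, sb := sumPriorities(a), sumPriorities(b); sa != sb {
		return sa < sb // lower total priority is preferred
	}
	if len(a) != len(b) {
		return len(a) < len(b) // fewer victims are preferred
	}
	// Prefer the node whose highest-priority victim started most recently.
	return latestStartOfHighest(a).After(latestStartOfHighest(b))
}

// pickNode returns the candidate node whose victims are cheapest to evict.
func pickNode(candidates map[string][]victim) string {
	best := ""
	for name, vs := range candidates {
		if len(vs) == 0 {
			return name // a node that needs no preemption wins outright
		}
		if best == "" || cheaper(vs, candidates[best]) {
			best = name
		}
	}
	return best
}

func main() {
	now := time.Now()
	fmt.Println(pickNode(map[string][]victim{
		"machine1": {{priority: 100, startTime: now}, {priority: 0, startTime: now}},
		"machine2": {{priority: 100, startTime: now}},
	})) // machine2: same highest victim priority, but smaller sum and fewer victims
}

Running the sketch prints machine2, mirroring the "minimum sum of priorities is picked" cases in the deleted table.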
-func TestPickOneNodeForPreemption(t *testing.T) { - tests := []struct { - name string - registerPlugins []st.RegisterPluginFunc - nodes []string - pod *v1.Pod - pods []*v1.Pod - expected []string // any of the items is valid - }{ - { - name: "No node needs preemption", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}}, - expected: []string{"machine1"}, - }, - { - name: "a pod that fits on both machines when lower priority pods are preempted", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}}, - expected: []string{"machine1", "machine2"}, - }, - { - name: "a pod that fits on a machine with no preemption", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}}, - expected: []string{"machine3"}, - }, - { - name: "machine with min highest priority pod is picked", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - }, - expected: []string{"machine3"}, - }, - { - name: "when highest priorities are the same, minimum sum of priorities is picked", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - }, - expected: []string{"machine2"}, - }, - { - name: "when highest priority and sum are the same, minimum number of pods is picked", - registerPlugins: []st.RegisterPluginFunc{ - 
st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.4", UID: types.UID("m1.4")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - }, - expected: []string{"machine2"}, - }, - { - // pickOneNodeForPreemption adjusts pod priorities when finding the sum of the victims. This - // test ensures that the logic works correctly. 
- name: "sum of adjusted priorities is considered", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - }, - expected: []string{"machine2"}, - }, - { - name: "non-overlapping lowest high priority, sum priorities, and number of pods", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3", "machine4"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &veryHighPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, 
- - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.4", UID: types.UID("m3.4")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m4.1", UID: types.UID("m4.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m4.2", UID: types.UID("m4.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m4.3", UID: types.UID("m4.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m4.4", UID: types.UID("m4.4")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, - }, - expected: []string{"machine1"}, - }, - { - name: "same priority, same number of victims, different start time for each machine's pod", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: 
mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, - }, - expected: []string{"machine2"}, - }, - { - name: "same priority, same number of victims, different start time for all pods", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, - }, - expected: []string{"machine3"}, - }, - { - name: "different priority, same number of victims, different start time for all pods", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, 
NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, - }, - expected: []string{"machine2"}, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - var nodes []*v1.Node - for _, n := range test.nodes { - nodes = append(nodes, makeNode(n, schedutil.DefaultMilliCPURequest*5, schedutil.DefaultMemoryRequest*5)) - } - snapshot := internalcache.NewSnapshot(test.pods, nodes) - fwk, err := st.NewFramework(test.registerPlugins, frameworkruntime.WithSnapshotSharedLister(snapshot)) - if err != nil { - t.Fatal(err) - } - prof := &profile.Profile{Framework: fwk} - - g := &genericScheduler{ - nodeInfoSnapshot: snapshot, - } - assignDefaultStartTime(test.pods) - - nodeInfos, err := nodesToNodeInfos(nodes, snapshot) - if err != nil { - t.Fatal(err) - } - state := framework.NewCycleState() - // Some tests rely on PreFilter plugin to compute its CycleState. - preFilterStatus := prof.RunPreFilterPlugins(context.Background(), state, test.pod) - if !preFilterStatus.IsSuccess() { - t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) - } - candidateNodes, _ := selectNodesForPreemption(context.Background(), prof, g.podNominator, state, test.pod, nodeInfos, nil) - node := pickOneNodeForPreemption(candidateNodes) - found := false - for _, nodeName := range test.expected { - if node == nodeName { - found = true - break - } - } - if !found { - t.Errorf("unexpected node: %v", node) - } - }) - } -} - -func TestNodesWherePreemptionMightHelp(t *testing.T) { - // Prepare 4 node names. - nodeNames := make([]string, 0, 4) - for i := 1; i < 5; i++ { - nodeNames = append(nodeNames, fmt.Sprintf("machine%d", i)) - } - - tests := []struct { - name string - nodesStatuses framework.NodeToStatusMap - expected map[string]bool // set of expected node names. Value is ignored. 
- }{ - { - name: "No node should be attempted", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeaffinity.ErrReason), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), - "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, tainttoleration.ErrReasonNotMatch), - "machine4": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodelabel.ErrReasonPresenceViolated), - }, - expected: map[string]bool{}, - }, - { - name: "ErrReasonAffinityNotMatch should be tried as it indicates that the pod is unschedulable due to inter-pod affinity or anti-affinity", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), - "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeunschedulable.ErrReasonUnschedulable), - }, - expected: map[string]bool{"machine1": true, "machine4": true}, - }, - { - name: "pod with both pod affinity and anti-affinity should be tried", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), - }, - expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, - }, - { - name: "ErrReasonAffinityRulesNotMatch should not be tried as it indicates that the pod is unschedulable due to inter-pod affinity, but ErrReasonAffinityNotMatch should be tried as it indicates that the pod is unschedulable due to inter-pod affinity or anti-affinity", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, interpodaffinity.ErrReasonAffinityRulesNotMatch), - "machine2": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), - }, - expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, - }, - { - name: "Mix of failed predicates works fine", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, volumerestrictions.ErrReasonDiskConflict), - "machine2": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), - }, - expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, - }, - { - name: "Node condition errors should be considered unresolvable", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeunschedulable.ErrReasonUnknownCondition), - }, - expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, - }, - { - name: "ErrVolume... 
errors should not be tried as it indicates that the pod is unschedulable due to no matching volumes for pod on node", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, volumezone.ErrReasonConflict), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, string(volumescheduling.ErrReasonNodeConflict)), - "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, string(volumescheduling.ErrReasonBindConflict)), - }, - expected: map[string]bool{"machine4": true}, - }, - { - name: "ErrTopologySpreadConstraintsNotMatch should be tried as it indicates that the pod is unschedulable due to topology spread constraints", - nodesStatuses: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), - "machine3": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), - }, - expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, - }, - { - name: "UnschedulableAndUnresolvable status should be skipped but Unschedulable should be tried", - nodesStatuses: framework.NodeToStatusMap{ - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, ""), - "machine3": framework.NewStatus(framework.Unschedulable, ""), - "machine4": framework.NewStatus(framework.UnschedulableAndUnresolvable, ""), - }, - expected: map[string]bool{"machine1": true, "machine3": true}, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - var nodeInfos []*framework.NodeInfo - for _, n := range makeNodeList(nodeNames) { - ni := framework.NewNodeInfo() - ni.SetNode(n) - nodeInfos = append(nodeInfos, ni) - } - nodes := nodesWherePreemptionMightHelp(nodeInfos, test.nodesStatuses) - if len(test.expected) != len(nodes) { - t.Errorf("number of nodes is not the same as expected. exptectd: %d, got: %d. Nodes: %v", len(test.expected), len(nodes), nodes) - } - for _, node := range nodes { - name := node.Node().Name - if _, found := test.expected[name]; !found { - t.Errorf("node %v is not expected.", name) - } - } - }) - } -} - -func TestPreempt(t *testing.T) { - defaultFailedNodeToStatusMap := framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), - "machine2": framework.NewStatus(framework.Unschedulable, volumerestrictions.ErrReasonDiskConflict), - "machine3": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), - } - // Prepare 3 node names. 
- var defaultNodeNames []string - for i := 1; i < 4; i++ { - defaultNodeNames = append(defaultNodeNames, fmt.Sprintf("machine%d", i)) - } - var ( - preemptLowerPriority = v1.PreemptLowerPriority - preemptNever = v1.PreemptNever - ) - tests := []struct { - name string - pod *v1.Pod - pods []*v1.Pod - extenders []*st.FakeExtender - failedNodeToStatusMap framework.NodeToStatusMap - nodeNames []string - registerPlugins []st.RegisterPluginFunc - expectedNode string - expectedPods []string // list of preempted pods - }{ - { - name: "basic preemption logic", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine1", - expectedPods: []string{"m1.1", "m1.2"}, - }, - { - name: "One node doesn't need any preemption", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine3", - expectedPods: []string{}, - }, - { - name: "preemption for topology spread constraints", - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "p", - Labels: map[string]string{"foo": ""}, - }, - Spec: v1.PodSpec{ - Priority: &highPriority, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: 
"zone", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - { - MaxSkew: 1, - TopologyKey: "hostname", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - }, - }, - }, - pods: []*v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, - Status: v1.PodStatus{Phase: v1.PodRunning}, - }, - }, - failedNodeToStatusMap: framework.NodeToStatusMap{ - "node-a": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), - "node-b": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), - "node-x": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), - }, - nodeNames: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions( - podtopologyspread.Name, - podtopologyspread.New, - "PreFilter", - "Filter", - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "node-b", - expectedPods: []string{"pod-b1"}, - }, - { - name: "Scheduler extenders allow only machine1, otherwise machine3 would have been chosen", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - extenders: []*st.FakeExtender{ - { - Predicates: []st.FitPredicate{st.TruePredicateExtender}, - }, 
- { - Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, - }, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine1", - expectedPods: []string{"m1.1", "m1.2"}, - }, - { - name: "Scheduler extenders do not allow any preemption", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - extenders: []*st.FakeExtender{ - { - Predicates: []st.FitPredicate{st.FalsePredicateExtender}, - }, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "", - expectedPods: []string{}, - }, - { - name: "One scheduler extender allows only machine1, the other returns error but ignorable. 
Only machine1 would be chosen", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - extenders: []*st.FakeExtender{ - { - Predicates: []st.FitPredicate{st.ErrorPredicateExtender}, - Ignorable: true, - }, - { - Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, - }, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine1", - expectedPods: []string{"m1.1", "m1.2"}, - }, - { - name: "One scheduler extender allows only machine1, but it is not interested in given pod, otherwise machine1 would have been chosen", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptLowerPriority}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - extenders: []*st.FakeExtender{ - { - Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, - UnInterested: true, - }, - { - Predicates: []st.FitPredicate{st.TruePredicateExtender}, - }, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine3", - expectedPods: []string{}, - }, - { - name: "no preempting in pod", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: &preemptNever}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: 
v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "", - expectedPods: nil, - }, - { - name: "PreemptionPolicy is nil", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ - Containers: veryLargeContainers, - Priority: &highPriority, - PreemptionPolicy: nil}, - }, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, - }, - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - expectedNode: "machine1", - expectedPods: []string{"m1.1", "m1.2"}, - }, - } - - labelKeys := []string{"hostname", "zone", "region"} - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - apiObjs := mergeObjs(test.pod, test.pods) - client := clientsetfake.NewSimpleClientset(apiObjs...) 
- deletedPodNames := make(sets.String) - client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName()) - return true, nil, nil - }) - - stop := make(chan struct{}) - cache := internalcache.New(time.Duration(0), stop) - for _, pod := range test.pods { - cache.AddPod(pod) - } - cachedNodeInfoMap := map[string]*framework.NodeInfo{} - nodeNames := defaultNodeNames - if len(test.nodeNames) != 0 { - nodeNames = test.nodeNames - } - var nodes []*v1.Node - for i, name := range nodeNames { - node := makeNode(name, 1000*5, schedutil.DefaultMemoryRequest*5) - // if possible, split node name by '/' to form labels in a format of - // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} - node.ObjectMeta.Labels = make(map[string]string) - for i, label := range strings.Split(node.Name, "/") { - node.ObjectMeta.Labels[labelKeys[i]] = label - } - node.Name = node.ObjectMeta.Labels["hostname"] - cache.AddNode(node) - nodes = append(nodes, node) - nodeNames[i] = node.Name - - // Set nodeInfo to extenders to mock extenders' cache for preemption. - cachedNodeInfo := framework.NewNodeInfo() - cachedNodeInfo.SetNode(node) - cachedNodeInfoMap[node.Name] = cachedNodeInfo - } - var extenders []framework.Extender - for _, extender := range test.extenders { - // Set nodeInfoMap as extenders cached node information. - extender.CachedNodeNameToInfo = cachedNodeInfoMap - extenders = append(extenders, extender) - } - - podNominator := internalqueue.NewPodNominator() - snapshot := internalcache.NewSnapshot(test.pods, nodes) - fwk, err := st.NewFramework( - test.registerPlugins, - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), - frameworkruntime.WithExtenders(extenders), - frameworkruntime.WithPodNominator(podNominator), - frameworkruntime.WithSnapshotSharedLister(snapshot), - frameworkruntime.WithInformerFactory(informers.NewSharedInformerFactory(client, 0)), - ) - if err != nil { - t.Fatal(err) - } - - state := framework.NewCycleState() - // Some tests rely on PreFilter plugin to compute its CycleState. - preFilterStatus := fwk.RunPreFilterPlugins(context.Background(), state, test.pod) - if !preFilterStatus.IsSuccess() { - t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) - } - // Call Preempt and check the expected results. 
- failedNodeToStatusMap := defaultFailedNodeToStatusMap - if test.failedNodeToStatusMap != nil { - failedNodeToStatusMap = test.failedNodeToStatusMap - } - node, err := Preempt(context.Background(), fwk, state, test.pod, failedNodeToStatusMap) - if err != nil { - t.Errorf("unexpected error in preemption: %v", err) - } - if len(node) != 0 && node != test.expectedNode { - t.Errorf("expected node: %v, got: %v", test.expectedNode, node) - } - if len(node) == 0 && len(test.expectedNode) != 0 { - t.Errorf("expected node: %v, got: nothing", test.expectedNode) - } - if len(deletedPodNames) != len(test.expectedPods) { - t.Errorf("expected %v pods, got %v.", len(test.expectedPods), len(deletedPodNames)) - } - for victimName := range deletedPodNames { - found := false - for _, expPod := range test.expectedPods { - if expPod == victimName { - found = true - break - } - } - if !found { - t.Fatalf("pod %v is not expected to be a victim.", victimName) - } - } - test.pod.Status.NominatedNodeName = node - client.CoreV1().Pods(test.pod.Namespace).Update(context.TODO(), test.pod, metav1.UpdateOptions{}) - - // Manually set the deleted Pods' deletionTimestamp to non-nil. - for _, pod := range test.pods { - if deletedPodNames.Has(pod.Name) { - now := metav1.Now() - pod.DeletionTimestamp = &now - deletedPodNames.Delete(pod.Name) - } - } - - // Call preempt again and make sure it doesn't preempt any more pods. - node, err = Preempt(context.Background(), fwk, state, test.pod, failedNodeToStatusMap) - if err != nil { - t.Errorf("unexpected error in preemption: %v", err) - } - if len(node) != 0 && len(deletedPodNames) > 0 { - t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", node) - } - close(stop) - }) - } -} +var lowPriority, midPriority, highPriority = int32(0), int32(100), int32(1000) func TestNumFeasibleNodesToFind(t *testing.T) { tests := []struct { @@ -2484,16 +1113,6 @@ func TestNumFeasibleNodesToFind(t *testing.T) { } } -func assignDefaultStartTime(pods []*v1.Pod) { - now := metav1.Now() - for i := range pods { - pod := pods[i] - if pod.Status.StartTime == nil { - pod.Status.StartTime = &now - } - } -} - func TestFairEvaluationForNodes(t *testing.T) { numAllNodes := 500 nodeNames := make([]string, 0, numAllNodes) @@ -2502,14 +1121,18 @@ func TestFairEvaluationForNodes(t *testing.T) { } nodes := makeNodeList(nodeNames) g := makeScheduler(nodes) - prof, err := makeProfile( - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), ) if err != nil { t.Fatal(err) } + prof := &profile.Profile{Framework: fwk} // To make numAllNodes % nodesToFind != 0 g.percentageOfNodesToScore = 30 nodesToFind := int(g.numFeasibleNodesToFind(int32(numAllNodes))) @@ -2528,26 +1151,3 @@ func TestFairEvaluationForNodes(t *testing.T) { } } } - -func nodesToNodeInfos(nodes []*v1.Node, snapshot *internalcache.Snapshot) ([]*framework.NodeInfo, error) { - var nodeInfos []*framework.NodeInfo - for _, n := range nodes { - nodeInfo, err := snapshot.NodeInfos().Get(n.Name) - if err != nil { - return nil, err - } - nodeInfos = append(nodeInfos, nodeInfo) - } - 
return nodeInfos, nil -} - -func mergeObjs(pod *v1.Pod, pods []*v1.Pod) []runtime.Object { - var objs []runtime.Object - if pod != nil { - objs = append(objs, pod) - } - for i := range pods { - objs = append(objs, pods[i]) - } - return objs -} diff --git a/pkg/scheduler/factory.go b/pkg/scheduler/factory.go index 948550c56d6..8165968d28a 100644 --- a/pkg/scheduler/factory.go +++ b/pkg/scheduler/factory.go @@ -186,7 +186,6 @@ func (c *Configurator) create() (*Scheduler, error) { algo := core.NewGenericScheduler( c.schedulerCache, - nominator, c.nodeInfoSnapshot, extenders, c.informerFactory.Core().V1().PersistentVolumeClaims().Lister(), diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/BUILD b/pkg/scheduler/framework/plugins/defaultpreemption/BUILD index 37b8e17c490..fd0417dbf36 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/BUILD +++ b/pkg/scheduler/framework/plugins/defaultpreemption/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "go_default_library", @@ -6,13 +6,21 @@ go_library( importpath = "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption", visibility = ["//visibility:public"], deps = [ + "//pkg/api/v1/pod:go_default_library", "//pkg/features:go_default_library", "//pkg/scheduler/core:go_default_library", "//pkg/scheduler/framework/v1alpha1:go_default_library", + "//pkg/scheduler/internal/parallelize:go_default_library", "//pkg/scheduler/metrics:go_default_library", + "//pkg/scheduler/util:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/api/policy/v1beta1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//staging/src/k8s.io/kube-scheduler/extender/v1:go_default_library", + "//vendor/k8s.io/klog/v2:go_default_library", ], ) @@ -29,3 +37,43 @@ filegroup( tags = ["automanaged"], visibility = ["//visibility:public"], ) + +go_test( + name = "go_default_test", + srcs = ["default_preemption_test.go"], + embed = [":go_default_library"], + deps = [ + "//pkg/controller/volume/scheduling:go_default_library", + "//pkg/scheduler/framework/plugins/defaultbinder:go_default_library", + "//pkg/scheduler/framework/plugins/interpodaffinity:go_default_library", + "//pkg/scheduler/framework/plugins/nodeaffinity:go_default_library", + "//pkg/scheduler/framework/plugins/nodelabel:go_default_library", + "//pkg/scheduler/framework/plugins/nodename:go_default_library", + "//pkg/scheduler/framework/plugins/noderesources:go_default_library", + "//pkg/scheduler/framework/plugins/nodeunschedulable:go_default_library", + "//pkg/scheduler/framework/plugins/podtopologyspread:go_default_library", + "//pkg/scheduler/framework/plugins/queuesort:go_default_library", + "//pkg/scheduler/framework/plugins/tainttoleration:go_default_library", + "//pkg/scheduler/framework/plugins/volumerestrictions:go_default_library", + "//pkg/scheduler/framework/plugins/volumezone:go_default_library", + "//pkg/scheduler/framework/runtime:go_default_library", + "//pkg/scheduler/framework/v1alpha1:go_default_library", + "//pkg/scheduler/internal/cache:go_default_library", + "//pkg/scheduler/internal/queue:go_default_library", + "//pkg/scheduler/testing:go_default_library", + 
"//pkg/scheduler/util:go_default_library", + "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/api/policy/v1beta1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//staging/src/k8s.io/client-go/informers:go_default_library", + "//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library", + "//staging/src/k8s.io/client-go/testing:go_default_library", + "//staging/src/k8s.io/client-go/tools/events:go_default_library", + "//staging/src/k8s.io/kube-scheduler/extender/v1:go_default_library", + ], +) diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go index cdfbaeafcf7..c72cb959083 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go @@ -18,15 +18,27 @@ package defaultpreemption import ( "context" + "math" + "sort" + "sync" "time" + "k8s.io/klog/v2" + v1 "k8s.io/api/core/v1" + policy "k8s.io/api/policy/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" utilfeature "k8s.io/apiserver/pkg/util/feature" + extenderv1 "k8s.io/kube-scheduler/extender/v1" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/core" framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1" + "k8s.io/kubernetes/pkg/scheduler/internal/parallelize" "k8s.io/kubernetes/pkg/scheduler/metrics" + "k8s.io/kubernetes/pkg/scheduler/util" ) const ( @@ -65,7 +77,7 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy metrics.DeprecatedSchedulingDuration.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime)) }() - nnn, err := core.Preempt(ctx, pl.fh, state, pod, m) + nnn, err := preempt(ctx, pl.fh, state, pod, m) if err != nil { return nil, framework.NewStatus(framework.Error, err.Error()) } @@ -74,3 +86,535 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy } return &framework.PostFilterResult{NominatedNodeName: nnn}, framework.NewStatus(framework.Success) } + +// preempt finds nodes with pods that can be preempted to make room for "pod" to +// schedule. It chooses one of the nodes and preempts the pods on the node and +// returns 1) the node, 2) the list of preempted pods if such a node is found, +// 3) A list of pods whose nominated node name should be cleared, and 4) any +// possible error. +// preempt does not update its snapshot. It uses the same snapshot used in the +// scheduling cycle. This is to avoid a scenario where preempt finds feasible +// nodes without preempting any pod. When there are many pending pods in the +// scheduling queue a nominated pod will go back to the queue and behind +// other pods with the same priority. The nominated pod prevents other pods from +// using the nominated resources and the nominated pod could take a long time +// before it is retried after many other pending pods. 
+func preempt(ctx context.Context, fh framework.FrameworkHandle, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusMap) (string, error) { + cs := fh.ClientSet() + // TODO(Huang-Wei): get pod from informer cache instead of API server. + pod, err := util.GetUpdatedPod(cs, pod) + if err != nil { + klog.Errorf("Error getting the updated preemptor pod object: %v", err) + return "", err + } + + if !podEligibleToPreemptOthers(pod, fh.SnapshotSharedLister().NodeInfos()) { + klog.V(5).Infof("Pod %v/%v is not eligible for more preemption.", pod.Namespace, pod.Name) + return "", nil + } + allNodes, err := fh.SnapshotSharedLister().NodeInfos().List() + if err != nil { + return "", err + } + if len(allNodes) == 0 { + return "", core.ErrNoNodesAvailable + } + potentialNodes := nodesWherePreemptionMightHelp(allNodes, m) + if len(potentialNodes) == 0 { + klog.V(3).Infof("Preemption will not help schedule pod %v/%v on any node.", pod.Namespace, pod.Name) + // In this case, we should clean-up any existing nominated node name of the pod. + if err := util.ClearNominatedNodeName(cs, pod); err != nil { + klog.Errorf("Cannot clear 'NominatedNodeName' field of pod %v/%v: %v", pod.Namespace, pod.Name, err) + // We do not return as this error is not critical. + } + return "", nil + } + if klog.V(5).Enabled() { + var sample []string + for i := 0; i < 10 && i < len(potentialNodes); i++ { + sample = append(sample, potentialNodes[i].Node().Name) + } + klog.Infof("%v potential nodes for preemption, first %v are: %v", len(potentialNodes), len(sample), sample) + } + pdbs, err := getPodDisruptionBudgets(fh) + if err != nil { + return "", err + } + nodeNameToVictims, err := selectNodesForPreemption(ctx, fh.PreemptHandle(), state, pod, potentialNodes, pdbs) + if err != nil { + return "", err + } + + // We will only check nodeNameToVictims with extenders that support preemption. + // Extenders which do not support preemption may later prevent preemptor from being scheduled on the nominated + // node. In that case, scheduler will find a different host for the preemptor in subsequent scheduling cycles. + nodeNameToVictims, err = processPreemptionWithExtenders(fh, pod, nodeNameToVictims) + if err != nil { + return "", err + } + + candidateNode := pickOneNodeForPreemption(nodeNameToVictims) + if len(candidateNode) == 0 { + return "", nil + } + + victims := nodeNameToVictims[candidateNode].Pods + for _, victim := range victims { + if err := util.DeletePod(cs, victim); err != nil { + klog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) + return "", err + } + // If the victim is a WaitingPod, send a reject message to the PermitPlugin + if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil { + waitingPod.Reject("preempted") + } + fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by %v/%v on node %v", pod.Namespace, pod.Name, candidateNode) + } + metrics.PreemptionVictims.Observe(float64(len(victims))) + + // Lower priority pods nominated to run on this node, may no longer fit on + // this node. So, we should remove their nomination. Removing their + // nomination updates these pods and moves them to the active queue. It + // lets scheduler find another place for them. 
+	nominatedPods := getLowerPriorityNominatedPods(fh.PreemptHandle(), pod, candidateNode)
+	if err := util.ClearNominatedNodeName(cs, nominatedPods...); err != nil {
+		klog.Errorf("Cannot clear 'NominatedNodeName' field: %v", err)
+		// We do not return as this error is not critical.
+	}
+
+	return candidateNode, nil
+}
+
+// podEligibleToPreemptOthers determines whether this pod should be considered
+// for preempting other pods or not. If this pod has already preempted other
+// pods and those are in their graceful termination period, it shouldn't be
+// considered for preemption.
+// We look at the node that is nominated for this pod and as long as there are
+// terminating pods on that node, we don't consider this pod for preempting more pods.
+func podEligibleToPreemptOthers(pod *v1.Pod, nodeInfos framework.NodeInfoLister) bool {
+	if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever {
+		klog.V(5).Infof("Pod %v/%v is not eligible for preemption because it has a preemptionPolicy of %v", pod.Namespace, pod.Name, v1.PreemptNever)
+		return false
+	}
+	nomNodeName := pod.Status.NominatedNodeName
+	if len(nomNodeName) > 0 {
+		if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
+			podPriority := podutil.GetPodPriority(pod)
+			for _, p := range nodeInfo.Pods {
+				if p.Pod.DeletionTimestamp != nil && podutil.GetPodPriority(p.Pod) < podPriority {
+					// There is a terminating pod on the nominated node.
+					return false
+				}
+			}
+		}
+	}
+	return true
+}
+
+// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
+// that may be satisfied by removing pods from the node.
+func nodesWherePreemptionMightHelp(nodes []*framework.NodeInfo, m framework.NodeToStatusMap) []*framework.NodeInfo {
+	var potentialNodes []*framework.NodeInfo
+	for _, node := range nodes {
+		name := node.Node().Name
+		// We rely on the status reported by each plugin - 'Unschedulable' or
+		// 'UnschedulableAndUnresolvable' - to determine whether preemption may
+		// help on the node.
+		if m[name].Code() == framework.UnschedulableAndUnresolvable {
+			continue
+		}
+		potentialNodes = append(potentialNodes, node)
+	}
+	return potentialNodes
+}
+
+// selectNodesForPreemption finds all the nodes with possible victims for
+// preemption in parallel.
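+// Each candidate node's NodeInfo and the CycleState are cloned before victims
+// are simulated on it, so per-node evaluation does not mutate shared state;
+// writes to the shared result map are guarded by a mutex.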
+func selectNodesForPreemption( + ctx context.Context, + fh framework.PreemptHandle, + state *framework.CycleState, + pod *v1.Pod, + potentialNodes []*framework.NodeInfo, + pdbs []*policy.PodDisruptionBudget, +) (map[string]*extenderv1.Victims, error) { + nodeNameToVictims := map[string]*extenderv1.Victims{} + var resultLock sync.Mutex + + checkNode := func(i int) { + nodeInfoCopy := potentialNodes[i].Clone() + stateCopy := state.Clone() + pods, numPDBViolations, fits := selectVictimsOnNode(ctx, fh, stateCopy, pod, nodeInfoCopy, pdbs) + if fits { + resultLock.Lock() + victims := extenderv1.Victims{ + Pods: pods, + NumPDBViolations: int64(numPDBViolations), + } + nodeNameToVictims[potentialNodes[i].Node().Name] = &victims + resultLock.Unlock() + } + } + parallelize.Until(ctx, len(potentialNodes), checkNode) + return nodeNameToVictims, nil +} + +// processPreemptionWithExtenders processes preemption with extenders +func processPreemptionWithExtenders(fh framework.FrameworkHandle, pod *v1.Pod, nodeNameToVictims map[string]*extenderv1.Victims) (map[string]*extenderv1.Victims, error) { + if len(nodeNameToVictims) > 0 { + for _, extender := range fh.PreemptHandle().Extenders() { + if extender.SupportsPreemption() && extender.IsInterested(pod) { + newNodeNameToVictims, err := extender.ProcessPreemption( + pod, + nodeNameToVictims, + fh.SnapshotSharedLister().NodeInfos(), + ) + if err != nil { + if extender.IsIgnorable() { + klog.Warningf("Skipping extender %v as it returned error %v and has ignorable flag set", + extender, err) + continue + } + return nil, err + } + + // Replace nodeNameToVictims with new result after preemption. So the + // rest of extenders can continue use it as parameter. + nodeNameToVictims = newNodeNameToVictims + + // If node list becomes empty, no preemption can happen regardless of other extenders. + if len(nodeNameToVictims) == 0 { + break + } + } + } + } + + return nodeNameToVictims, nil +} + +// pickOneNodeForPreemption chooses one node among the given nodes. It assumes +// pods in each map entry are ordered by decreasing priority. +// It picks a node based on the following criteria: +// 1. A node with minimum number of PDB violations. +// 2. A node with minimum highest priority victim is picked. +// 3. Ties are broken by sum of priorities of all victims. +// 4. If there are still ties, node with the minimum number of victims is picked. +// 5. If there are still ties, node with the latest start time of all highest priority victims is picked. +// 6. If there are still ties, the first such node is picked (sort of randomly). +// The 'minNodes1' and 'minNodes2' are being reused here to save the memory +// allocation and garbage collection time. +func pickOneNodeForPreemption(nodesToVictims map[string]*extenderv1.Victims) string { + if len(nodesToVictims) == 0 { + return "" + } + minNumPDBViolatingPods := int64(math.MaxInt32) + var minNodes1 []string + lenNodes1 := 0 + for node, victims := range nodesToVictims { + if len(victims.Pods) == 0 { + // We found a node that doesn't need any preemption. Return it! + // This should happen rarely when one or more pods are terminated between + // the time that scheduler tries to schedule the pod and the time that + // preemption logic tries to find nodes for preemption. 
+ return node + } + numPDBViolatingPods := victims.NumPDBViolations + if numPDBViolatingPods < minNumPDBViolatingPods { + minNumPDBViolatingPods = numPDBViolatingPods + minNodes1 = nil + lenNodes1 = 0 + } + if numPDBViolatingPods == minNumPDBViolatingPods { + minNodes1 = append(minNodes1, node) + lenNodes1++ + } + } + if lenNodes1 == 1 { + return minNodes1[0] + } + + // There are more than one node with minimum number PDB violating pods. Find + // the one with minimum highest priority victim. + minHighestPriority := int32(math.MaxInt32) + var minNodes2 = make([]string, lenNodes1) + lenNodes2 := 0 + for i := 0; i < lenNodes1; i++ { + node := minNodes1[i] + victims := nodesToVictims[node] + // highestPodPriority is the highest priority among the victims on this node. + highestPodPriority := podutil.GetPodPriority(victims.Pods[0]) + if highestPodPriority < minHighestPriority { + minHighestPriority = highestPodPriority + lenNodes2 = 0 + } + if highestPodPriority == minHighestPriority { + minNodes2[lenNodes2] = node + lenNodes2++ + } + } + if lenNodes2 == 1 { + return minNodes2[0] + } + + // There are a few nodes with minimum highest priority victim. Find the + // smallest sum of priorities. + minSumPriorities := int64(math.MaxInt64) + lenNodes1 = 0 + for i := 0; i < lenNodes2; i++ { + var sumPriorities int64 + node := minNodes2[i] + for _, pod := range nodesToVictims[node].Pods { + // We add MaxInt32+1 to all priorities to make all of them >= 0. This is + // needed so that a node with a few pods with negative priority is not + // picked over a node with a smaller number of pods with the same negative + // priority (and similar scenarios). + sumPriorities += int64(podutil.GetPodPriority(pod)) + int64(math.MaxInt32+1) + } + if sumPriorities < minSumPriorities { + minSumPriorities = sumPriorities + lenNodes1 = 0 + } + if sumPriorities == minSumPriorities { + minNodes1[lenNodes1] = node + lenNodes1++ + } + } + if lenNodes1 == 1 { + return minNodes1[0] + } + + // There are a few nodes with minimum highest priority victim and sum of priorities. + // Find one with the minimum number of pods. + minNumPods := math.MaxInt32 + lenNodes2 = 0 + for i := 0; i < lenNodes1; i++ { + node := minNodes1[i] + numPods := len(nodesToVictims[node].Pods) + if numPods < minNumPods { + minNumPods = numPods + lenNodes2 = 0 + } + if numPods == minNumPods { + minNodes2[lenNodes2] = node + lenNodes2++ + } + } + if lenNodes2 == 1 { + return minNodes2[0] + } + + // There are a few nodes with same number of pods. + // Find the node that satisfies latest(earliestStartTime(all highest-priority pods on node)) + latestStartTime := util.GetEarliestPodStartTime(nodesToVictims[minNodes2[0]]) + if latestStartTime == nil { + // If the earliest start time of all pods on the 1st node is nil, just return it, + // which is not expected to happen. + klog.Errorf("earliestStartTime is nil for node %s. Should not reach here.", minNodes2[0]) + return minNodes2[0] + } + nodeToReturn := minNodes2[0] + for i := 1; i < lenNodes2; i++ { + node := minNodes2[i] + // Get earliest start time of all pods on the current node. + earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node]) + if earliestStartTimeOnNode == nil { + klog.Errorf("earliestStartTime is nil for node %s. 
Should not reach here.", node)
+			continue
+		}
+		if earliestStartTimeOnNode.After(latestStartTime.Time) {
+			latestStartTime = earliestStartTimeOnNode
+			nodeToReturn = node
+		}
+	}
+
+	return nodeToReturn
+}
+
+// selectVictimsOnNode finds a minimum set of pods on the given node that should
+// be preempted in order to make enough room for "pod" to be scheduled. The
+// minimum set selected is subject to the constraint that a higher-priority pod
+// is never preempted when a lower-priority pod could be (higher/lower relative
+// to one another, not relative to the preemptor "pod").
+// The algorithm first checks if the pod can be scheduled on the node when all the
+// lower priority pods are gone. If so, it sorts all the lower priority pods by
+// their priority and then puts them into two groups: those whose PodDisruptionBudget
+// would be violated if they were preempted, and the remaining non-violating pods.
+// Both groups are sorted by priority. It first tries to reprieve as many PDB-violating
+// pods as possible and then does the same for non-PDB-violating pods, while checking
+// that the "pod" can still fit on the node.
+// NOTE: This function assumes that it is never called if "pod" cannot be scheduled
+// due to pod affinity, node affinity, or node anti-affinity reasons. None of
+// these predicates can be satisfied by removing more pods from the node.
+func selectVictimsOnNode(
+	ctx context.Context,
+	ph framework.PreemptHandle,
+	state *framework.CycleState,
+	pod *v1.Pod,
+	nodeInfo *framework.NodeInfo,
+	pdbs []*policy.PodDisruptionBudget,
+) ([]*v1.Pod, int, bool) {
+	var potentialVictims []*v1.Pod
+
+	removePod := func(rp *v1.Pod) error {
+		if err := nodeInfo.RemovePod(rp); err != nil {
+			return err
+		}
+		status := ph.RunPreFilterExtensionRemovePod(ctx, state, pod, rp, nodeInfo)
+		if !status.IsSuccess() {
+			return status.AsError()
+		}
+		return nil
+	}
+	addPod := func(ap *v1.Pod) error {
+		nodeInfo.AddPod(ap)
+		status := ph.RunPreFilterExtensionAddPod(ctx, state, pod, ap, nodeInfo)
+		if !status.IsSuccess() {
+			return status.AsError()
+		}
+		return nil
+	}
+	// As the first step, remove all the lower priority pods from the node and
+	// check if the given pod can be scheduled.
+	podPriority := podutil.GetPodPriority(pod)
+	for _, p := range nodeInfo.Pods {
+		if podutil.GetPodPriority(p.Pod) < podPriority {
+			potentialVictims = append(potentialVictims, p.Pod)
+			if err := removePod(p.Pod); err != nil {
+				return nil, 0, false
+			}
+		}
+	}
+	// If the new pod does not fit after removing all the lower priority pods,
+	// we are almost done and this node is not suitable for preemption. The only
+	// condition that we could check is if the "pod" is failing to schedule due to
+	// inter-pod affinity to one or more victims, but we have decided not to
+	// support this case for performance reasons. Having affinity to lower
+	// priority pods is not a recommended configuration anyway.
+	if fits, _, err := core.PodPassesFiltersOnNode(ctx, ph, state, pod, nodeInfo); !fits {
+		if err != nil {
+			klog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
+		}
+
+		return nil, 0, false
+	}
+	var victims []*v1.Pod
+	numViolatingVictim := 0
+	sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i], potentialVictims[j]) })
+	// Try to reprieve as many pods as possible. We first try to reprieve the PDB
+	// violating victims and then other non-violating ones. In both cases, we start
+	// from the highest priority victims.
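+	// A victim is "reprieved" (added back to the node) only if the preemptor
+	// still fits with it present; otherwise it stays in the victims list.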
+	violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs)
+	reprievePod := func(p *v1.Pod) (bool, error) {
+		if err := addPod(p); err != nil {
+			return false, err
+		}
+		fits, _, _ := core.PodPassesFiltersOnNode(ctx, ph, state, pod, nodeInfo)
+		if !fits {
+			if err := removePod(p); err != nil {
+				return false, err
+			}
+			victims = append(victims, p)
+			klog.V(5).Infof("Pod %v/%v is a potential preemption victim on node %v.", p.Namespace, p.Name, nodeInfo.Node().Name)
+		}
+		return fits, nil
+	}
+	for _, p := range violatingVictims {
+		if fits, err := reprievePod(p); err != nil {
+			klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err)
+			return nil, 0, false
+		} else if !fits {
+			numViolatingVictim++
+		}
+	}
+	// Now we try to reprieve non-violating victims.
+	for _, p := range nonViolatingVictims {
+		if _, err := reprievePod(p); err != nil {
+			klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err)
+			return nil, 0, false
+		}
+	}
+	return victims, numViolatingVictim, true
+}
+
+// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
+// priority of the given "pod" and that are nominated to run on the given node.
+// Note: We could possibly check if the nominated lower priority pods still fit
+// and return those that no longer fit, but that would require lots of
+// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
+// worth the complexity, especially because we generally expect to have a very
+// small number of nominated pods per node.
+func getLowerPriorityNominatedPods(pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
+	pods := pn.NominatedPodsForNode(nodeName)
+
+	if len(pods) == 0 {
+		return nil
+	}
+
+	var lowerPriorityPods []*v1.Pod
+	podPriority := podutil.GetPodPriority(pod)
+	for _, p := range pods {
+		if podutil.GetPodPriority(p) < podPriority {
+			lowerPriorityPods = append(lowerPriorityPods, p)
+		}
+	}
+	return lowerPriorityPods
+}
+
+// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
+// and "nonViolatingPods" based on whether their PDBs will be violated if they are
+// preempted.
+// This function is stable and does not change the order of received pods. So, if it
+// receives a sorted list, grouping will preserve the order of the input list.
+func filterPodsWithPDBViolation(pods []*v1.Pod, pdbs []*policy.PodDisruptionBudget) (violatingPods, nonViolatingPods []*v1.Pod) {
+	pdbsAllowed := make([]int32, len(pdbs))
+	for i, pdb := range pdbs {
+		pdbsAllowed[i] = pdb.Status.DisruptionsAllowed
+	}
+
+	for _, obj := range pods {
+		pod := obj
+		pdbForPodIsViolated := false
+		// A pod with no labels will not match any PDB. So, no need to check.
+		if len(pod.Labels) != 0 {
+			for i, pdb := range pdbs {
+				if pdb.Namespace != pod.Namespace {
+					continue
+				}
+				selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
+				if err != nil {
+					continue
+				}
+				// A PDB with a nil or empty selector matches nothing.
+				if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
+					continue
+				}
+
+				// Existing in DisruptedPods means it has been processed in the API server,
+				// so we don't treat it as a violating case.
+				if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist {
+					continue
+				}
+				// Only decrement the matched pdb when the pod is not in its DisruptedPods;
+				// otherwise we may over-decrement the budget number.
+				pdbsAllowed[i]--
+				// We have found a matching PDB.
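+				// A negative remaining budget means preempting this pod would exceed
+				// what the PDB allows.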
+ if pdbsAllowed[i] < 0 { + pdbForPodIsViolated = true + } + } + } + if pdbForPodIsViolated { + violatingPods = append(violatingPods, pod) + } else { + nonViolatingPods = append(nonViolatingPods, pod) + } + } + return violatingPods, nonViolatingPods +} + +func getPodDisruptionBudgets(fh framework.FrameworkHandle) ([]*policy.PodDisruptionBudget, error) { + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodDisruptionBudget) { + return fh.SharedInformerFactory().Policy().V1beta1().PodDisruptionBudgets().Lister().List(labels.Everything()) + } + return nil, nil +} diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go new file mode 100644 index 00000000000..b189f69c045 --- /dev/null +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go @@ -0,0 +1,1476 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package defaultpreemption + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + policy "k8s.io/api/policy/v1beta1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" + clientsetfake "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" + "k8s.io/client-go/tools/events" + extenderv1 "k8s.io/kube-scheduler/extender/v1" + volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodelabel" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone" + frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1" + internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" + internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" + st "k8s.io/kubernetes/pkg/scheduler/testing" + schedutil "k8s.io/kubernetes/pkg/scheduler/util" +) + +var smallContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + 
strconv.FormatInt(schedutil.DefaultMilliCPURequest, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest, 10)), + }, + }, + }, +} +var mediumContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMilliCPURequest*2, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest*2, 10)), + }, + }, + }, +} +var largeContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMilliCPURequest*3, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest*3, 10)), + }, + }, + }, +} +var veryLargeContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMilliCPURequest*5, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest*5, 10)), + }, + }, + }, +} + +var negPriority, lowPriority, midPriority, highPriority, veryHighPriority = int32(-100), int32(0), int32(100), int32(1000), int32(10000) + +var startTime = metav1.Date(2019, 1, 1, 1, 1, 1, 0, time.UTC) +var startTime20190102 = metav1.Date(2019, 1, 2, 1, 1, 1, 0, time.UTC) +var startTime20190103 = metav1.Date(2019, 1, 3, 1, 1, 1, 0, time.UTC) +var startTime20190104 = metav1.Date(2019, 1, 4, 1, 1, 1, 0, time.UTC) +var startTime20190105 = metav1.Date(2019, 1, 5, 1, 1, 1, 0, time.UTC) +var startTime20190106 = metav1.Date(2019, 1, 6, 1, 1, 1, 0, time.UTC) +var startTime20190107 = metav1.Date(2019, 1, 7, 1, 1, 1, 0, time.UTC) + +type victims struct { + pods sets.String + numPDBViolations int64 +} + +func checkPreemptionVictims(expected map[string]victims, nodeToPods map[string]*extenderv1.Victims) error { + if len(expected) == len(nodeToPods) { + for k, victims := range nodeToPods { + if expVictims, ok := expected[k]; ok { + if len(victims.Pods) != len(expVictims.pods) { + return fmt.Errorf("unexpected number of pods. expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) + } + prevPriority := int32(math.MaxInt32) + for _, p := range victims.Pods { + // Check that pods are sorted by their priority. + if *p.Spec.Priority > prevPriority { + return fmt.Errorf("pod %v of node %v was not sorted by priority", p.Name, k) + } + prevPriority = *p.Spec.Priority + if !expVictims.pods.Has(p.Name) { + return fmt.Errorf("pod %v was not expected. Expected: %v", p.Name, expVictims.pods) + } + } + if expVictims.numPDBViolations != victims.NumPDBViolations { + return fmt.Errorf("unexpected numPDBViolations. expected: %d, got: %d", expVictims.numPDBViolations, victims.NumPDBViolations) + } + } else { + return fmt.Errorf("unexpected machines. expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) + } + } + } else { + return fmt.Errorf("unexpected number of machines. 
expected: %v, got: %v", expected, printNodeNameToVictims(nodeToPods)) + } + return nil +} + +func printNodeNameToVictims(nodeNameToVictims map[string]*extenderv1.Victims) string { + var output string + for nodeName, victims := range nodeNameToVictims { + output += nodeName + ": [" + for _, pod := range victims.Pods { + output += pod.Name + ", " + } + output += "]" + } + return output +} + +func assignDefaultStartTime(pods []*v1.Pod) { + now := metav1.Now() + for i := range pods { + pod := pods[i] + if pod.Status.StartTime == nil { + pod.Status.StartTime = &now + } + } +} + +func nodesToNodeInfos(nodes []*v1.Node, snapshot *internalcache.Snapshot) ([]*framework.NodeInfo, error) { + var nodeInfos []*framework.NodeInfo + for _, n := range nodes { + nodeInfo, err := snapshot.NodeInfos().Get(n.Name) + if err != nil { + return nil, err + } + nodeInfos = append(nodeInfos, nodeInfo) + } + return nodeInfos, nil +} + +func makeNode(node string, milliCPU, memory int64) *v1.Node { + return &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: node}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), + }, + Allocatable: v1.ResourceList{ + + v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), + }, + }, + } +} + +func makeNodeList(nodeNames []string) []*v1.Node { + result := make([]*v1.Node, 0, len(nodeNames)) + for _, nodeName := range nodeNames { + result = append(result, &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}) + } + return result +} + +func mergeObjs(pod *v1.Pod, pods []*v1.Pod) []runtime.Object { + var objs []runtime.Object + if pod != nil { + objs = append(objs, pod) + } + for i := range pods { + objs = append(objs, pods[i]) + } + return objs +} + +// TestSelectNodesForPreemption tests selectNodesForPreemption. This test assumes +// that podsFitsOnNode works correctly and is tested separately. 
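+// Each case builds a framework from its registered plugins plus a fake filter
+// plugin, runs selectNodesForPreemption over the listed nodes and pods, and
+// compares the returned victims per node (and the number of Filter calls)
+// against the expectations.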
+func TestSelectNodesForPreemption(t *testing.T) { + tests := []struct { + name string + registerPlugins []st.RegisterPluginFunc + nodes []string + pod *v1.Pod + pods []*v1.Pod + pdbs []*policy.PodDisruptionBudget + filterReturnCode framework.Code + expected map[string]victims + expectedNumFilterCalled int32 + }{ + { + name: "a pod that does not fit on any machine", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new", UID: types.UID("new")}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{}, + expectedNumFilterCalled: 2, + }, + { + name: "a pod that fits with no preemption", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new", UID: types.UID("new")}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {}, "machine2": {}}, + expectedNumFilterCalled: 4, + }, + { + name: "a pod that fits on one machine with no preemption", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {}}, + expectedNumFilterCalled: 3, + }, + { + name: "a pod that fits on both machines when lower priority pods are preempted", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: 
"machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a")}, "machine2": {pods: sets.NewString("b")}}, + expectedNumFilterCalled: 4, + }, + { + name: "a pod that would fit on the machines, but other pods running are higher priority", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &lowPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{}, + expectedNumFilterCalled: 2, + }, + { + name: "medium priority pod is preempted, but lower priority one stays as it is small", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("b")}, "machine2": {pods: sets.NewString("c")}}, + expectedNumFilterCalled: 5, + }, + { + name: "mixed priority pods are preempted", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: 
"machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("b", "c")}}, + expectedNumFilterCalled: 5, + }, + { + name: "mixed priority pods are preempted, pick later StartTime one when priorities are equal", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190103}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a", "c")}}, + expectedNumFilterCalled: 5, + }, + { + name: "pod with anti-affinity is preempted", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterPluginAsExtensions(interpodaffinity.Name, interpodaffinity.New, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{ + Name: "machine1", + Labels: map[string]string{"pod": "preemptor"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1", Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"preemptor", "value2"}, + }, + }, + }, + TopologyKey: "hostname", + }, + }, + }}}}, + {ObjectMeta: 
metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d", UID: types.UID("d")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e", UID: types.UID("e")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a")}, "machine2": {}}, + expectedNumFilterCalled: 4, + }, + { + name: "preemption to resolve even pods spread FitError", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions( + podtopologyspread.Name, + podtopologyspread.New, + "PreFilter", + "Filter", + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", + Labels: map[string]string{"foo": ""}, + }, + Spec: v1.PodSpec{ + Priority: &highPriority, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + { + MaxSkew: 1, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &midPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + }, + expected: map[string]victims{ + "node-a": {pods: sets.NewString("pod-a2")}, + "node-b": {pods: sets.NewString("pod-b1")}, + }, + expectedNumFilterCalled: 6, + }, + { + name: "get Unschedulable in the preemption phase when the filter plugins filtering the nodes", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + filterReturnCode: framework.Unschedulable, + expected: map[string]victims{}, + expectedNumFilterCalled: 2, + }, + { + name: "preemption with violation of same pdb", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, + pdbs: []*policy.PodDisruptionBudget{ + {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 1}}, + expectedNumFilterCalled: 3, + }, + { + name: "preemption with violation of the pdb with pod whose eviction was processed, the victim doesn't belong to DisruptedPods", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, + pdbs: []*policy.PodDisruptionBudget{ + {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"c": {Time: time.Now()}}}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 1}}, + expectedNumFilterCalled: 3, + }, + { + name: "preemption with violation of the pdb with pod whose eviction was processed, the victim belongs to DisruptedPods", + registerPlugins: []st.RegisterPluginFunc{ + 
st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, + pdbs: []*policy.PodDisruptionBudget{ + {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"b": {Time: time.Now()}}}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b"), numPDBViolations: 0}}, + expectedNumFilterCalled: 3, + }, + { + name: "preemption with violation of the pdb with pod whose eviction was processed, the victim which belongs to DisruptedPods is treated as 'nonViolating'", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", UID: types.UID("a"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", UID: types.UID("b"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c", UID: types.UID("c"), Labels: map[string]string{"app": "foo"}}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}}, + pdbs: []*policy.PodDisruptionBudget{ + {Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"c": {Time: time.Now()}}}}}, + expected: map[string]victims{"machine1": {pods: sets.NewString("a", "b", "c"), numPDBViolations: 1}}, + expectedNumFilterCalled: 4, + }, + } + labelKeys := []string{"hostname", "zone", "region"} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + filterFailedNodeReturnCodeMap := map[string]framework.Code{} + cache := internalcache.New(time.Duration(0), wait.NeverStop) + for _, pod := range test.pods { + cache.AddPod(pod) + } + for _, name := range test.nodes { + filterFailedNodeReturnCodeMap[name] = test.filterReturnCode + cache.AddNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name, Labels: map[string]string{"hostname": name}}}) + } + + var nodes []*v1.Node + for _, n := range test.nodes { + node := 
makeNode(n, 1000*5, schedutil.DefaultMemoryRequest*5) + // if possible, split node name by '/' to form labels in a format of + // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} + node.ObjectMeta.Labels = make(map[string]string) + for i, label := range strings.Split(node.Name, "/") { + node.ObjectMeta.Labels[labelKeys[i]] = label + } + node.Name = node.ObjectMeta.Labels["hostname"] + nodes = append(nodes, node) + } + + // For each test, prepend a FakeFilterPlugin. + fakePlugin := st.FakeFilterPlugin{} + fakePlugin.FailedNodeReturnCodeMap = filterFailedNodeReturnCodeMap + registerFakeFilterFunc := st.RegisterFilterPlugin( + "FakeFilter", + func(_ runtime.Object, fh framework.FrameworkHandle) (framework.Plugin, error) { + return &fakePlugin, nil + }, + ) + registerPlugins := append([]st.RegisterPluginFunc{registerFakeFilterFunc}, test.registerPlugins...) + // Use a real snapshot since it's needed in some Filter Plugin (e.g., PodAffinity) + snapshot := internalcache.NewSnapshot(test.pods, nodes) + fwk, err := st.NewFramework( + registerPlugins, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), + frameworkruntime.WithSnapshotSharedLister(snapshot), + ) + if err != nil { + t.Fatal(err) + } + + assignDefaultStartTime(test.pods) + + state := framework.NewCycleState() + // Some tests rely on PreFilter plugin to compute its CycleState. + preFilterStatus := fwk.RunPreFilterPlugins(context.Background(), state, test.pod) + if !preFilterStatus.IsSuccess() { + t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) + } + nodeInfos, err := nodesToNodeInfos(nodes, snapshot) + if err != nil { + t.Fatal(err) + } + nodeToPods, err := selectNodesForPreemption(context.Background(), fwk.PreemptHandle(), state, test.pod, nodeInfos, test.pdbs) + if err != nil { + t.Error(err) + } + + if test.expectedNumFilterCalled != fakePlugin.NumFilterCalled { + t.Errorf("expected fakePlugin.numFilterCalled is %d, but got %d", test.expectedNumFilterCalled, fakePlugin.NumFilterCalled) + } + + if err := checkPreemptionVictims(test.expected, nodeToPods); err != nil { + t.Error(err) + } + }) + } +} + +// TestPickOneNodeForPreemption tests pickOneNodeForPreemption. 
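+// The expected field lists every node that pickOneNodeForPreemption may
+// legitimately return for a case; the test passes if any one of them is picked.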
+func TestPickOneNodeForPreemption(t *testing.T) { + tests := []struct { + name string + registerPlugins []st.RegisterPluginFunc + nodes []string + pod *v1.Pod + pods []*v1.Pod + expected []string // any of the items is valid + }{ + { + name: "No node needs preemption", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}}, + expected: []string{"machine1"}, + }, + { + name: "a pod that fits on both machines when lower priority pods are preempted", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}}, + expected: []string{"machine1", "machine2"}, + }, + { + name: "a pod that fits on a machine with no preemption", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}}, + expected: []string{"machine3"}, + }, + { + name: "machine with min highest priority pod is picked", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + }, + expected: []string{"machine3"}, + }, + { + name: "when highest priorities are the same, minimum sum of priorities is picked", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + }, + expected: []string{"machine2"}, + }, + { + name: "when highest priority and sum are the same, minimum number of pods is picked", + registerPlugins: []st.RegisterPluginFunc{ + 
st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.4", UID: types.UID("m1.4")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + }, + expected: []string{"machine2"}, + }, + { + // pickOneNodeForPreemption adjusts pod priorities when finding the sum of the victims. This + // test ensures that the logic works correctly. 
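+ // (The adjustment referred to above shifts each victim's priority by a large constant
+ // so the summed values are non-negative; this keeps a node with several negative-priority
+ // victims from looking cheaper than a node with fewer victims of the same priority.)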
+ name: "sum of adjusted priorities is considered", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + }, + expected: []string{"machine2"}, + }, + { + name: "non-overlapping lowest high priority, sum priorities, and number of pods", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3", "machine4"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &veryHighPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3", UID: types.UID("m1.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime}}, 
+ + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3", UID: types.UID("m3.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.4", UID: types.UID("m3.4")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m4.1", UID: types.UID("m4.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.2", UID: types.UID("m4.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.3", UID: types.UID("m4.3")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.4", UID: types.UID("m4.4")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine4"}, Status: v1.PodStatus{StartTime: &startTime}}, + }, + expected: []string{"machine1"}, + }, + { + name: "same priority, same number of victims, different start time for each machine's pod", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: 
mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, + }, + expected: []string{"machine2"}, + }, + { + name: "same priority, same number of victims, different start time for all pods", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, + }, + expected: []string{"machine3"}, + }, + { + name: "different priority, same number of victims, different start time for all pods", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190105}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{StartTime: &startTime20190103}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, 
NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190107}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2", UID: types.UID("m2.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}, Status: v1.PodStatus{StartTime: &startTime20190102}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190104}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2", UID: types.UID("m3.2")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{StartTime: &startTime20190106}}, + }, + expected: []string{"machine2"}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var nodes []*v1.Node + for _, n := range test.nodes { + nodes = append(nodes, makeNode(n, schedutil.DefaultMilliCPURequest*5, schedutil.DefaultMemoryRequest*5)) + } + snapshot := internalcache.NewSnapshot(test.pods, nodes) + fwk, err := st.NewFramework( + test.registerPlugins, + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator()), + frameworkruntime.WithSnapshotSharedLister(snapshot), + ) + if err != nil { + t.Fatal(err) + } + + assignDefaultStartTime(test.pods) + + nodeInfos, err := nodesToNodeInfos(nodes, snapshot) + if err != nil { + t.Fatal(err) + } + state := framework.NewCycleState() + // Some tests rely on PreFilter plugin to compute its CycleState. + preFilterStatus := fwk.RunPreFilterPlugins(context.Background(), state, test.pod) + if !preFilterStatus.IsSuccess() { + t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) + } + candidateNodes, _ := selectNodesForPreemption(context.Background(), fwk.PreemptHandle(), state, test.pod, nodeInfos, nil) + node := pickOneNodeForPreemption(candidateNodes) + found := false + for _, nodeName := range test.expected { + if node == nodeName { + found = true + break + } + } + if !found { + t.Errorf("unexpected node: %v", node) + } + }) + } +} + +func TestNodesWherePreemptionMightHelp(t *testing.T) { + // Prepare 4 node names. + nodeNames := make([]string, 0, 4) + for i := 1; i < 5; i++ { + nodeNames = append(nodeNames, fmt.Sprintf("machine%d", i)) + } + + tests := []struct { + name string + nodesStatuses framework.NodeToStatusMap + expected map[string]bool // set of expected node names. Value is ignored. 
+ }{ + { + name: "No node should be attempted", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeaffinity.ErrReason), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), + "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, tainttoleration.ErrReasonNotMatch), + "machine4": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodelabel.ErrReasonPresenceViolated), + }, + expected: map[string]bool{}, + }, + { + name: "ErrReasonAffinityNotMatch should be tried as it indicates that the pod is unschedulable due to inter-pod affinity or anti-affinity", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), + "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeunschedulable.ErrReasonUnschedulable), + }, + expected: map[string]bool{"machine1": true, "machine4": true}, + }, + { + name: "pod with both pod affinity and anti-affinity should be tried", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), + }, + expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, + }, + { + name: "ErrReasonAffinityRulesNotMatch should not be tried as it indicates that the pod is unschedulable due to inter-pod affinity, but ErrReasonAffinityNotMatch should be tried as it indicates that the pod is unschedulable due to inter-pod affinity or anti-affinity", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, interpodaffinity.ErrReasonAffinityRulesNotMatch), + "machine2": framework.NewStatus(framework.Unschedulable, interpodaffinity.ErrReasonAffinityNotMatch), + }, + expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, + }, + { + name: "Mix of failed predicates works fine", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, volumerestrictions.ErrReasonDiskConflict), + "machine2": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), + }, + expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, + }, + { + name: "Node condition errors should be considered unresolvable", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodeunschedulable.ErrReasonUnknownCondition), + }, + expected: map[string]bool{"machine2": true, "machine3": true, "machine4": true}, + }, + { + name: "ErrVolume... 
errors should not be tried as it indicates that the pod is unschedulable due to no matching volumes for pod on node", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, volumezone.ErrReasonConflict), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, string(volumescheduling.ErrReasonNodeConflict)), + "machine3": framework.NewStatus(framework.UnschedulableAndUnresolvable, string(volumescheduling.ErrReasonBindConflict)), + }, + expected: map[string]bool{"machine4": true}, + }, + { + name: "ErrTopologySpreadConstraintsNotMatch should be tried as it indicates that the pod is unschedulable due to topology spread constraints", + nodesStatuses: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, nodename.ErrReason), + "machine3": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), + }, + expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, + }, + { + name: "UnschedulableAndUnresolvable status should be skipped but Unschedulable should be tried", + nodesStatuses: framework.NodeToStatusMap{ + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, ""), + "machine3": framework.NewStatus(framework.Unschedulable, ""), + "machine4": framework.NewStatus(framework.UnschedulableAndUnresolvable, ""), + }, + expected: map[string]bool{"machine1": true, "machine3": true}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var nodeInfos []*framework.NodeInfo + for _, n := range makeNodeList(nodeNames) { + ni := framework.NewNodeInfo() + ni.SetNode(n) + nodeInfos = append(nodeInfos, ni) + } + nodes := nodesWherePreemptionMightHelp(nodeInfos, test.nodesStatuses) + if len(test.expected) != len(nodes) { + t.Errorf("number of nodes is not the same as expected. exptectd: %d, got: %d. Nodes: %v", len(test.expected), len(nodes), nodes) + } + for _, node := range nodes { + name := node.Node().Name + if _, found := test.expected[name]; !found { + t.Errorf("node %v is not expected.", name) + } + } + }) + } +} + +func TestPreempt(t *testing.T) { + defaultFailedNodeToStatusMap := framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), + "machine2": framework.NewStatus(framework.Unschedulable, volumerestrictions.ErrReasonDiskConflict), + "machine3": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Insufficient %v", v1.ResourceMemory)), + } + // Prepare 3 node names. 
+ var defaultNodeNames []string + for i := 1; i < 4; i++ { + defaultNodeNames = append(defaultNodeNames, fmt.Sprintf("machine%d", i)) + } + var ( + preemptLowerPriority = v1.PreemptLowerPriority + preemptNever = v1.PreemptNever + ) + tests := []struct { + name string + pod *v1.Pod + pods []*v1.Pod + extenders []*st.FakeExtender + failedNodeToStatusMap framework.NodeToStatusMap + nodeNames []string + registerPlugins []st.RegisterPluginFunc + expectedNode string + expectedPods []string // list of preempted pods + }{ + { + name: "basic preemption logic", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + { + name: "One node doesn't need any preemption", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine3", + expectedPods: []string{}, + }, + { + name: "preemption for topology spread constraints", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", + Labels: map[string]string{"foo": ""}, + }, + Spec: v1.PodSpec{ + Priority: &highPriority, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: 
"zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + { + MaxSkew: 1, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + }, + failedNodeToStatusMap: framework.NodeToStatusMap{ + "node-a": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), + "node-b": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), + "node-x": framework.NewStatus(framework.Unschedulable, podtopologyspread.ErrReasonConstraintsNotMatch), + }, + nodeNames: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions( + podtopologyspread.Name, + podtopologyspread.New, + "PreFilter", + "Filter", + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "node-b", + expectedPods: []string{"pod-b1"}, + }, + { + name: "Scheduler extenders allow only machine1, otherwise machine3 would have been chosen", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*st.FakeExtender{ + { + Predicates: []st.FitPredicate{st.TruePredicateExtender}, + }, 
+ { + Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, + }, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + { + name: "Scheduler extenders do not allow any preemption", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*st.FakeExtender{ + { + Predicates: []st.FitPredicate{st.FalsePredicateExtender}, + }, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "", + expectedPods: []string{}, + }, + { + name: "One scheduler extender allows only machine1, the other returns error but ignorable. 
Only machine1 would be chosen", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*st.FakeExtender{ + { + Predicates: []st.FitPredicate{st.ErrorPredicateExtender}, + Ignorable: true, + }, + { + Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, + }, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + { + name: "One scheduler extender allows only machine1, but it is not interested in given pod, otherwise machine1 would have been chosen", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptLowerPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*st.FakeExtender{ + { + Predicates: []st.FitPredicate{st.Machine1PredicateExtender}, + UnInterested: true, + }, + { + Predicates: []st.FitPredicate{st.TruePredicateExtender}, + }, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine3", + expectedPods: []string{}, + }, + { + name: "no preempting in pod", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: &preemptNever}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: 
v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "", + expectedPods: nil, + }, + { + name: "PreemptionPolicy is nil", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority, + PreemptionPolicy: nil}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1", UID: types.UID("m1.1")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2", UID: types.UID("m1.2")}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1", UID: types.UID("m2.1")}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1", UID: types.UID("m3.1")}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + } + + labelKeys := []string{"hostname", "zone", "region"} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + apiObjs := mergeObjs(test.pod, test.pods) + client := clientsetfake.NewSimpleClientset(apiObjs...) 
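+ // The delete reactor below records the name of every pod that the preemption logic
+ // evicts through the fake clientset; the recorded set is later compared against
+ // test.expectedPods.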
+ deletedPodNames := make(sets.String) + client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName()) + return true, nil, nil + }) + + stop := make(chan struct{}) + cache := internalcache.New(time.Duration(0), stop) + for _, pod := range test.pods { + cache.AddPod(pod) + } + cachedNodeInfoMap := map[string]*framework.NodeInfo{} + nodeNames := defaultNodeNames + if len(test.nodeNames) != 0 { + nodeNames = test.nodeNames + } + var nodes []*v1.Node + for i, name := range nodeNames { + node := makeNode(name, 1000*5, schedutil.DefaultMemoryRequest*5) + // if possible, split node name by '/' to form labels in a format of + // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} + node.ObjectMeta.Labels = make(map[string]string) + for i, label := range strings.Split(node.Name, "/") { + node.ObjectMeta.Labels[labelKeys[i]] = label + } + node.Name = node.ObjectMeta.Labels["hostname"] + cache.AddNode(node) + nodes = append(nodes, node) + nodeNames[i] = node.Name + + // Set nodeInfo to extenders to mock extenders' cache for preemption. + cachedNodeInfo := framework.NewNodeInfo() + cachedNodeInfo.SetNode(node) + cachedNodeInfoMap[node.Name] = cachedNodeInfo + } + var extenders []framework.Extender + for _, extender := range test.extenders { + // Set nodeInfoMap as extenders cached node information. + extender.CachedNodeNameToInfo = cachedNodeInfoMap + extenders = append(extenders, extender) + } + + podNominator := internalqueue.NewPodNominator() + snapshot := internalcache.NewSnapshot(test.pods, nodes) + fwk, err := st.NewFramework( + test.registerPlugins, + frameworkruntime.WithClientSet(client), + frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), + frameworkruntime.WithExtenders(extenders), + frameworkruntime.WithPodNominator(podNominator), + frameworkruntime.WithSnapshotSharedLister(snapshot), + frameworkruntime.WithInformerFactory(informers.NewSharedInformerFactory(client, 0)), + ) + if err != nil { + t.Fatal(err) + } + + state := framework.NewCycleState() + // Some tests rely on PreFilter plugin to compute its CycleState. + preFilterStatus := fwk.RunPreFilterPlugins(context.Background(), state, test.pod) + if !preFilterStatus.IsSuccess() { + t.Errorf("Unexpected preFilterStatus: %v", preFilterStatus) + } + // Call preempt and check the expected results. 
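+ // preempt returns the name of the node nominated for the preemptor (empty when no
+ // preemption happened); the victims it evicted are observed via deletedPodNames above.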
+ failedNodeToStatusMap := defaultFailedNodeToStatusMap + if test.failedNodeToStatusMap != nil { + failedNodeToStatusMap = test.failedNodeToStatusMap + } + node, err := preempt(context.Background(), fwk, state, test.pod, failedNodeToStatusMap) + if err != nil { + t.Errorf("unexpected error in preemption: %v", err) + } + if len(node) != 0 && node != test.expectedNode { + t.Errorf("expected node: %v, got: %v", test.expectedNode, node) + } + if len(node) == 0 && len(test.expectedNode) != 0 { + t.Errorf("expected node: %v, got: nothing", test.expectedNode) + } + if len(deletedPodNames) != len(test.expectedPods) { + t.Errorf("expected %v pods, got %v.", len(test.expectedPods), len(deletedPodNames)) + } + for victimName := range deletedPodNames { + found := false + for _, expPod := range test.expectedPods { + if expPod == victimName { + found = true + break + } + } + if !found { + t.Errorf("pod %v is not expected to be a victim.", victimName) + } + } + test.pod.Status.NominatedNodeName = node + client.CoreV1().Pods(test.pod.Namespace).Update(context.TODO(), test.pod, metav1.UpdateOptions{}) + + // Manually set the deleted Pods' deletionTimestamp to non-nil. + for _, pod := range test.pods { + if deletedPodNames.Has(pod.Name) { + now := metav1.Now() + pod.DeletionTimestamp = &now + deletedPodNames.Delete(pod.Name) + } + } + + // Call preempt again and make sure it doesn't preempt any more pods. + node, err = preempt(context.Background(), fwk, state, test.pod, failedNodeToStatusMap) + if err != nil { + t.Errorf("unexpected error in preemption: %v", err) + } + if len(node) != 0 && len(deletedPodNames) > 0 { + t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", node) + } + close(stop) + }) + } +} diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 02b7d649d50..2f505d50127 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -767,7 +767,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C return true, b, nil }) - fwk, _ := st.NewFramework(fns, frameworkruntime.WithClientSet(client)) + fwk, _ := st.NewFramework(fns, frameworkruntime.WithClientSet(client), frameworkruntime.WithPodNominator(internalqueue.NewPodNominator())) prof := &profile.Profile{ Framework: fwk, Recorder: &events.FakeRecorder{}, @@ -782,7 +782,6 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C algo := core.NewGenericScheduler( scache, - internalqueue.NewSchedulingQueue(nil), internalcache.NewEmptySnapshot(), []framework.Extender{}, informerFactory.Core().V1().PersistentVolumeClaims().Lister(), @@ -1135,7 +1134,6 @@ func TestSchedulerBinding(t *testing.T) { algo := core.NewGenericScheduler( scache, nil, - nil, test.extenders, nil, false,