Add PDB support during pod preemption
commit 3d4ae31d91
parent 1489d19443
@@ -26,6 +26,9 @@ import (
     "time"
 
     "k8s.io/api/core/v1"
+    policy "k8s.io/api/policy/v1beta1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/labels"
     "k8s.io/apimachinery/pkg/util/errors"
     utiltrace "k8s.io/apiserver/pkg/util/trace"
     "k8s.io/client-go/util/workqueue"
@@ -46,6 +49,11 @@ type FitError struct {
     FailedPredicates FailedPredicateMap
 }
 
+type Victims struct {
+    pods             []*v1.Pod
+    numPDBViolations int
+}
+
 var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 
 const (
@@ -209,29 +217,33 @@ func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister,
         // In this case, we should clean-up any existing nominated node name of the pod.
         return nil, nil, []*v1.Pod{pod}, nil
     }
-    nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer, g.schedulingQueue)
+    pdbs, err := g.cache.ListPDBs(labels.Everything())
     if err != nil {
         return nil, nil, nil, err
     }
-    for len(nodeToPods) > 0 {
-        node := pickOneNodeForPreemption(nodeToPods)
+    nodeToVictims, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer, g.schedulingQueue, pdbs)
+    if err != nil {
+        return nil, nil, nil, err
+    }
+    for len(nodeToVictims) > 0 {
+        node := pickOneNodeForPreemption(nodeToVictims)
         if node == nil {
             return nil, nil, nil, err
         }
-        passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToPods[node], g.cachedNodeInfoMap, g.extenders)
+        passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToVictims[node].pods, g.cachedNodeInfoMap, g.extenders)
         if passes && pErr == nil {
             // Lower priority pods nominated to run on this node, may no longer fit on
             // this node. So, we should remove their nomination. Removing their
             // nomination updates these pods and moves them to the active queue. It
             // lets scheduler find another place for them.
             nominatedPods := g.getLowerPriorityNominatedPods(pod, node.Name)
-            return node, nodeToPods[node], nominatedPods, err
+            return node, nodeToVictims[node].pods, nominatedPods, err
         }
         if pErr != nil {
             glog.Errorf("Error occurred while checking extenders for preemption on node %v: %v", node, pErr)
         }
         // Remove the node from the map and try to pick a different node.
-        delete(nodeToPods, node)
+        delete(nodeToVictims, node)
     }
     return nil, nil, nil, err
 }
@@ -625,51 +637,66 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf
 // pickOneNodeForPreemption chooses one node among the given nodes. It assumes
 // pods in each map entry are ordered by decreasing priority.
 // It picks a node based on the following criteria:
-// 1. A node with minimum highest priority victim is picked.
-// 2. Ties are broken by sum of priorities of all victims.
-// 3. If there are still ties, node with the minimum number of victims is picked.
-// 4. If there are still ties, the first such node is picked (sort of randomly).
-//TODO(bsalamat): Try to reuse the "nodeScore" slices in order to save GC time.
-func pickOneNodeForPreemption(nodesToPods map[*v1.Node][]*v1.Pod) *v1.Node {
-    type nodeScore struct {
-        node            *v1.Node
-        highestPriority int32
-        sumPriorities   int64
-        numPods         int
-    }
-    if len(nodesToPods) == 0 {
+// 1. A node with minimum number of PDB violations.
+// 2. A node with minimum highest priority victim is picked.
+// 3. Ties are broken by sum of priorities of all victims.
+// 4. If there are still ties, node with the minimum number of victims is picked.
+// 5. If there are still ties, the first such node is picked (sort of randomly).
+//TODO(bsalamat): Try to reuse the "min*Nodes" slices in order to save GC time.
+func pickOneNodeForPreemption(nodesToVictims map[*v1.Node]*Victims) *v1.Node {
+    if len(nodesToVictims) == 0 {
         return nil
     }
-    minHighestPriority := int32(math.MaxInt32)
-    minPriorityScores := []*nodeScore{}
-    for node, pods := range nodesToPods {
-        if len(pods) == 0 {
+    minNumPDBViolatingPods := math.MaxInt32
+    var minPDBViolatingNodes []*v1.Node
+    for node, victims := range nodesToVictims {
+        if len(victims.pods) == 0 {
             // We found a node that doesn't need any preemption. Return it!
             // This should happen rarely when one or more pods are terminated between
             // the time that scheduler tries to schedule the pod and the time that
             // preemption logic tries to find nodes for preemption.
             return node
         }
+        numPDBViolatingPods := victims.numPDBViolations
+        if numPDBViolatingPods < minNumPDBViolatingPods {
+            minNumPDBViolatingPods = numPDBViolatingPods
+            minPDBViolatingNodes = nil
+        }
+        if numPDBViolatingPods == minNumPDBViolatingPods {
+            minPDBViolatingNodes = append(minPDBViolatingNodes, node)
+        }
+    }
+    if len(minPDBViolatingNodes) == 1 {
+        return minPDBViolatingNodes[0]
+    }
+
+    // There are more than one node with minimum number PDB violating pods. Find
+    // the one with minimum highest priority victim.
+    minHighestPriority := int32(math.MaxInt32)
+    var minPriorityNodes []*v1.Node
+    for _, node := range minPDBViolatingNodes {
+        victims := nodesToVictims[node]
         // highestPodPriority is the highest priority among the victims on this node.
-        highestPodPriority := util.GetPodPriority(pods[0])
+        highestPodPriority := util.GetPodPriority(victims.pods[0])
         if highestPodPriority < minHighestPriority {
             minHighestPriority = highestPodPriority
-            minPriorityScores = nil
+            minPriorityNodes = nil
         }
         if highestPodPriority == minHighestPriority {
-            minPriorityScores = append(minPriorityScores, &nodeScore{node: node, highestPriority: highestPodPriority, numPods: len(pods)})
+            minPriorityNodes = append(minPriorityNodes, node)
         }
     }
-    if len(minPriorityScores) == 1 {
-        return minPriorityScores[0].node
+    if len(minPriorityNodes) == 1 {
+        return minPriorityNodes[0]
     }
 
     // There are a few nodes with minimum highest priority victim. Find the
     // smallest sum of priorities.
     minSumPriorities := int64(math.MaxInt64)
-    minSumPriorityScores := []*nodeScore{}
-    for _, nodeScore := range minPriorityScores {
+    var minSumPriorityNodes []*v1.Node
+    for _, node := range minPriorityNodes {
         var sumPriorities int64
-        for _, pod := range nodesToPods[nodeScore.node] {
+        for _, pod := range nodesToVictims[node].pods {
             // We add MaxInt32+1 to all priorities to make all of them >= 0. This is
             // needed so that a node with a few pods with negative priority is not
             // picked over a node with a smaller number of pods with the same negative
@@ -678,33 +705,34 @@ func pickOneNodeForPreemption(nodesToPods map[*v1.Node][]*v1.Pod) *v1.Node {
         }
         if sumPriorities < minSumPriorities {
             minSumPriorities = sumPriorities
-            minSumPriorityScores = nil
+            minSumPriorityNodes = nil
         }
-        nodeScore.sumPriorities = sumPriorities
         if sumPriorities == minSumPriorities {
-            minSumPriorityScores = append(minSumPriorityScores, nodeScore)
+            minSumPriorityNodes = append(minSumPriorityNodes, node)
         }
     }
-    if len(minSumPriorityScores) == 1 {
-        return minSumPriorityScores[0].node
+    if len(minSumPriorityNodes) == 1 {
+        return minSumPriorityNodes[0]
     }
 
     // There are a few nodes with minimum highest priority victim and sum of priorities.
     // Find one with the minimum number of pods.
     minNumPods := math.MaxInt32
-    minNumPodScores := []*nodeScore{}
-    for _, nodeScore := range minSumPriorityScores {
-        if nodeScore.numPods < minNumPods {
-            minNumPods = nodeScore.numPods
-            minNumPodScores = nil
+    var minNumPodNodes []*v1.Node
+    for _, node := range minSumPriorityNodes {
+        numPods := len(nodesToVictims[node].pods)
+        if numPods < minNumPods {
+            minNumPods = numPods
+            minNumPodNodes = nil
         }
-        if nodeScore.numPods == minNumPods {
-            minNumPodScores = append(minNumPodScores, nodeScore)
+        if numPods == minNumPods {
+            minNumPodNodes = append(minNumPodNodes, node)
         }
     }
     // At this point, even if there are more than one node with the same score,
     // return the first one.
-    if len(minNumPodScores) > 0 {
-        return minNumPodScores[0].node
+    if len(minNumPodNodes) > 0 {
+        return minNumPodNodes[0]
     }
     glog.Errorf("Error in logic of node scoring for preemption. We should never reach here!")
     return nil
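The five criteria above form a tie-breaking cascade: each step narrows the surviving candidate nodes, and the next step only breaks the remaining ties. Below is a minimal, self-contained sketch of that cascade; the `candidate` struct, the `filterMin` helper, and the sample data are illustrative stand-ins, not the scheduler's types.

```go
package main

import "fmt"

// candidate summarizes the cost of preempting on one node. The fields mirror
// the quantities the criteria compare; the struct itself is only illustrative.
type candidate struct {
	node             string
	numPDBViolations int
	highestPriority  int32 // priority of the highest-priority victim
	sumPriorities    int64 // sum of victim priorities
	numVictims       int
}

// filterMin keeps only the candidates whose key equals the minimum key.
func filterMin(cands []candidate, key func(candidate) int64) []candidate {
	min := key(cands[0])
	for _, c := range cands[1:] {
		if k := key(c); k < min {
			min = k
		}
	}
	var out []candidate
	for _, c := range cands {
		if key(c) == min {
			out = append(out, c)
		}
	}
	return out
}

func pickOne(cands []candidate) string {
	if len(cands) == 0 {
		return ""
	}
	// 1. Fewest PDB violations.
	cands = filterMin(cands, func(c candidate) int64 { return int64(c.numPDBViolations) })
	// 2. Lowest "highest victim priority".
	cands = filterMin(cands, func(c candidate) int64 { return int64(c.highestPriority) })
	// 3. Smallest sum of victim priorities.
	cands = filterMin(cands, func(c candidate) int64 { return c.sumPriorities })
	// 4. Fewest victims.
	cands = filterMin(cands, func(c candidate) int64 { return int64(c.numVictims) })
	// 5. Any remaining tie: take the first survivor.
	return cands[0].node
}

func main() {
	fmt.Println(pickOne([]candidate{
		{"node-1", 1, 100, 150, 2},
		{"node-2", 0, 200, 200, 1},
		{"node-3", 0, 100, 100, 1},
	}))
}
```

Running it prints `node-3`: both node-2 and node-3 avoid PDB violations, and node-3's highest-priority victim is lower. The sketch omits the real function's early return for a node that needs no victims at all.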
@@ -718,9 +746,10 @@ func selectNodesForPreemption(pod *v1.Pod,
     predicates map[string]algorithm.FitPredicate,
     metadataProducer algorithm.PredicateMetadataProducer,
     queue SchedulingQueue,
-) (map[*v1.Node][]*v1.Pod, error) {
+    pdbs []*policy.PodDisruptionBudget,
+) (map[*v1.Node]*Victims, error) {
 
-    nodeNameToPods := map[*v1.Node][]*v1.Pod{}
+    nodeNameToVictims := map[*v1.Node]*Victims{}
     var resultLock sync.Mutex
 
     // We can use the same metadata producer for all nodes.
@@ -731,15 +760,19 @@ func selectNodesForPreemption(pod *v1.Pod,
         if meta != nil {
             metaCopy = meta.ShallowCopy()
         }
-        pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates, queue)
+        pods, numPDBViolations, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates, queue, pdbs)
         if fits {
             resultLock.Lock()
-            nodeNameToPods[potentialNodes[i]] = pods
+            victims := Victims{
+                pods:             pods,
+                numPDBViolations: numPDBViolations,
+            }
+            nodeNameToVictims[potentialNodes[i]] = &victims
             resultLock.Unlock()
         }
     }
     workqueue.Parallelize(16, len(potentialNodes), checkNode)
-    return nodeNameToPods, nil
+    return nodeNameToVictims, nil
 }
 
 func nodePassesExtendersForPreemption(
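`selectNodesForPreemption` evaluates the candidate nodes in parallel via `workqueue.Parallelize(16, len(potentialNodes), checkNode)` and records a `*Victims` for each feasible node under a mutex. The sketch below models that fan-out with a plain `sync.WaitGroup` instead of the workqueue helper; the `victims` struct and `selectVictimsStub` function are illustrative stand-ins, not the scheduler's code.

```go
package main

import (
	"fmt"
	"sync"
)

// victims is a stand-in for the scheduler's Victims struct above.
type victims struct {
	pods             []string
	numPDBViolations int
}

// selectVictimsStub stands in for selectVictimsOnNode: it decides whether the
// preemptor would fit on a node after evicting some pods, and at what cost.
func selectVictimsStub(node string) (pods []string, violations int, fits bool) {
	if node == "node-3" {
		return nil, 0, false // pretend preemption cannot help on this node
	}
	return []string{"victim-on-" + node}, 0, true
}

func main() {
	potentialNodes := []string{"node-1", "node-2", "node-3"}

	nodeToVictims := map[string]*victims{}
	var resultLock sync.Mutex
	var wg sync.WaitGroup

	// The real code uses workqueue.Parallelize(16, len(potentialNodes), checkNode);
	// a WaitGroup models the same fan-out/fan-in here.
	for _, node := range potentialNodes {
		wg.Add(1)
		go func(node string) {
			defer wg.Done()
			pods, violations, fits := selectVictimsStub(node)
			if fits {
				resultLock.Lock()
				nodeToVictims[node] = &victims{pods: pods, numPDBViolations: violations}
				resultLock.Unlock()
			}
		}(node)
	}
	wg.Wait()

	fmt.Println(len(nodeToVictims), "candidate nodes") // prints: 2 candidate nodes
}
```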
@@ -776,6 +809,45 @@ func nodePassesExtendersForPreemption(
     return true, nil
 }
 
+// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
+// and "nonViolatingPods" based on whether their PDBs will be violated if they are
+// preempted.
+// This function is stable and does not change the order of received pods. So, if it
+// receives a sorted list, grouping will preserve the order of the input list.
+func filterPodsWithPDBViolation(pods []interface{}, pdbs []*policy.PodDisruptionBudget) (violatingPods, nonViolatingPods []*v1.Pod) {
+    for _, obj := range pods {
+        pod := obj.(*v1.Pod)
+        pdbForPodIsViolated := false
+        // A pod with no labels will not match any PDB. So, no need to check.
+        if len(pod.Labels) != 0 {
+            for _, pdb := range pdbs {
+                if pdb.Namespace != pod.Namespace {
+                    continue
+                }
+                selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
+                if err != nil {
+                    continue
+                }
+                // A PDB with a nil or empty selector matches nothing.
+                if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
+                    continue
+                }
+                // We have found a matching PDB.
+                if pdb.Status.PodDisruptionsAllowed <= 0 {
+                    pdbForPodIsViolated = true
+                    break
+                }
+            }
+        }
+        if pdbForPodIsViolated {
+            violatingPods = append(violatingPods, pod)
+        } else {
+            nonViolatingPods = append(nonViolatingPods, pod)
+        }
+    }
+    return violatingPods, nonViolatingPods
+}
+
 // selectVictimsOnNode finds minimum set of pods on the given node that should
 // be preempted in order to make enough room for "pod" to be scheduled. The
 // minimum set selected is subject to the constraint that a higher-priority pod
@@ -783,19 +855,22 @@ func nodePassesExtendersForPreemption(
 // to one another, not relative to the preemptor "pod").
 // The algorithm first checks if the pod can be scheduled on the node when all the
 // lower priority pods are gone. If so, it sorts all the lower priority pods by
-// their priority and starts from the highest priority one, tries to keep as
-// many of them as possible while checking that the "pod" can still fit on the node.
+// their priority and then puts them into two groups of those whose PodDisruptionBudget
+// will be violated if preempted and other non-violating pods. Both groups are
+// sorted by priority. It first tries to reprieve as many PDB violating pods as
+// possible and then does the same for non-PDB-violating pods while checking
+// that the "pod" can still fit on the node.
 // NOTE: This function assumes that it is never called if "pod" cannot be scheduled
 // due to pod affinity, node affinity, or node anti-affinity reasons. None of
 // these predicates can be satisfied by removing more pods from the node.
-// TODO(bsalamat): Add support for PodDisruptionBudget.
 func selectVictimsOnNode(
     pod *v1.Pod,
     meta algorithm.PredicateMetadata,
     nodeInfo *schedulercache.NodeInfo,
     fitPredicates map[string]algorithm.FitPredicate,
     queue SchedulingQueue,
-) ([]*v1.Pod, bool) {
+    pdbs []*policy.PodDisruptionBudget,
+) ([]*v1.Pod, int, bool) {
     potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod}
     nodeInfoCopy := nodeInfo.Clone()
 
@@ -830,20 +905,34 @@ func selectVictimsOnNode(
         if err != nil {
             glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
         }
-        return nil, false
+        return nil, 0, false
     }
-    victims := []*v1.Pod{}
-    // Try to reprieve as many pods as possible starting from the highest priority one.
-    for _, p := range potentialVictims.Items {
-        lpp := p.(*v1.Pod)
-        addPod(lpp)
-        if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, queue); !fits {
-            removePod(lpp)
-            victims = append(victims, lpp)
-            glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", lpp.Name, nodeInfo.Node().Name)
+    var victims []*v1.Pod
+    numViolatingVictim := 0
+    // Try to reprieve as many pods as possible. We first try to reprieve the PDB
+    // violating victims and then other non-violating ones. In both cases, we start
+    // from the highest priority victims.
+    violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims.Items, pdbs)
+    reprievePod := func(p *v1.Pod) bool {
+        addPod(p)
+        fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, queue)
+        if !fits {
+            removePod(p)
+            victims = append(victims, p)
+            glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", p.Name, nodeInfo.Node().Name)
+        }
+        return fits
+    }
+    for _, p := range violatingVictims {
+        if !reprievePod(p) {
+            numViolatingVictim++
         }
     }
-    return victims, true
+    // Now we try to reprieve non-violating victims.
+    for _, p := range nonViolatingVictims {
+        reprievePod(p)
+    }
+    return victims, numViolatingVictim, true
 }
 
 // nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
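`selectVictimsOnNode` follows a "remove everything lower priority, then reprieve greedily" strategy: victims whose PDB would be violated are reprieved first, then the rest, always highest priority first, and a pod stays on the victim list only if adding it back would keep the preemptor from fitting. The toy model below illustrates the idea with a single CPU dimension; the `pod` struct, the `fits` closure, and the capacity numbers are assumptions for illustration, not the scheduler's predicate machinery.

```go
package main

import (
	"fmt"
	"sort"
)

type pod struct {
	name        string
	priority    int32
	cpu         int64 // toy resource demand
	violatesPDB bool  // true if evicting this pod would violate its PDB
}

func main() {
	const nodeCapacity = int64(10)
	preemptor := pod{name: "preemptor", priority: 100, cpu: 6}

	lowerPriority := []pod{
		{"a", 10, 3, false},
		{"b", 20, 4, true},
		{"c", 5, 2, false},
	}
	// Sort potential victims by decreasing priority, as the scheduler does.
	sort.Slice(lowerPriority, func(i, j int) bool { return lowerPriority[i].priority > lowerPriority[j].priority })

	// Start from the state where all lower-priority pods are gone.
	used := preemptor.cpu
	fits := func(extra int64) bool { return used+extra <= nodeCapacity }

	// Split into PDB-violating and non-violating groups, preserving order.
	var violating, nonViolating []pod
	for _, p := range lowerPriority {
		if p.violatesPDB {
			violating = append(violating, p)
		} else {
			nonViolating = append(nonViolating, p)
		}
	}

	var victims []pod
	reprieve := func(p pod) {
		if fits(p.cpu) {
			used += p.cpu // keep the pod: it can stay alongside the preemptor
		} else {
			victims = append(victims, p) // it must be evicted
		}
	}
	// Reprieve PDB-violating pods first, then the others.
	for _, p := range violating {
		reprieve(p)
	}
	for _, p := range nonViolating {
		reprieve(p)
	}

	for _, v := range victims {
		fmt.Println("would preempt:", v.name)
	}
}
```

With these numbers the PDB-violating pod `b` is reprieved and the two non-violating pods end up as victims, which mirrors why the real code then reports how many victims still violate a PDB and why nodes with fewer such violations are preferred.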
@@ -560,11 +560,11 @@ func TestZeroRequest(t *testing.T) {
     }
 }
 
-func printNodeToPods(nodeToPods map[*v1.Node][]*v1.Pod) string {
+func printNodeToVictims(nodeToVictims map[*v1.Node]*Victims) string {
     var output string
-    for node, pods := range nodeToPods {
+    for node, victims := range nodeToVictims {
         output += node.Name + ": ["
-        for _, pod := range pods {
+        for _, pod := range victims.pods {
             output += pod.Name + ", "
         }
         output += "]"
@@ -572,15 +572,15 @@ func printNodeToPods(nodeToPods map[*v1.Node][]*v1.Pod) string {
     return output
 }
 
-func checkPreemptionVictims(testName string, expected map[string]map[string]bool, nodeToPods map[*v1.Node][]*v1.Pod) error {
+func checkPreemptionVictims(testName string, expected map[string]map[string]bool, nodeToPods map[*v1.Node]*Victims) error {
     if len(expected) == len(nodeToPods) {
-        for k, pods := range nodeToPods {
+        for k, victims := range nodeToPods {
             if expPods, ok := expected[k.Name]; ok {
-                if len(pods) != len(expPods) {
-                    return fmt.Errorf("test [%v]: unexpected number of pods. expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods))
+                if len(victims.pods) != len(expPods) {
+                    return fmt.Errorf("test [%v]: unexpected number of pods. expected: %v, got: %v", testName, expected, printNodeToVictims(nodeToPods))
                 }
                 prevPriority := int32(math.MaxInt32)
-                for _, p := range pods {
+                for _, p := range victims.pods {
                     // Check that pods are sorted by their priority.
                     if *p.Spec.Priority > prevPriority {
                         return fmt.Errorf("test [%v]: pod %v of node %v was not sorted by priority", testName, p.Name, k)
@@ -591,11 +591,11 @@ func checkPreemptionVictims(testName string, expected map[string]map[string]bool
                     }
                 }
             } else {
-                return fmt.Errorf("test [%v]: unexpected machines. expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods))
+                return fmt.Errorf("test [%v]: unexpected machines. expected: %v, got: %v", testName, expected, printNodeToVictims(nodeToPods))
             }
         }
     } else {
-        return fmt.Errorf("test [%v]: unexpected number of machines. expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods))
+        return fmt.Errorf("test [%v]: unexpected number of machines. expected: %v, got: %v", testName, expected, printNodeToVictims(nodeToPods))
     }
     return nil
 }
@@ -790,7 +790,7 @@ func TestSelectNodesForPreemption(t *testing.T) {
             test.predicates[predicates.MatchInterPodAffinity] = algorithmpredicates.NewPodAffinityPredicate(FakeNodeInfo(*nodes[0]), schedulertesting.FakePodLister(test.pods))
         }
         nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes)
-        nodeToPods, err := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata, nil)
+        nodeToPods, err := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata, nil, nil)
         if err != nil {
             t.Error(err)
         }
@@ -947,7 +947,7 @@ func TestPickOneNodeForPreemption(t *testing.T) {
             nodes = append(nodes, makeNode(n, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5))
         }
         nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes)
-        candidateNodes, _ := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata, nil)
+        candidateNodes, _ := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata, nil, nil)
         node := pickOneNodeForPreemption(candidateNodes)
         found := false
         for _, nodeName := range test.expected {
@@ -24,8 +24,11 @@ import (
     "time"
 
     "k8s.io/api/core/v1"
+    policy "k8s.io/api/policy/v1beta1"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/labels"
+    "k8s.io/apimachinery/pkg/util/intstr"
     "k8s.io/apimachinery/pkg/util/wait"
     utilfeature "k8s.io/apiserver/pkg/util/feature"
     clientset "k8s.io/client-go/kubernetes"
@@ -495,3 +498,272 @@ func TestNominatedNodeCleanUp(t *testing.T) {
         t.Errorf("The nominated node name of the medium priority pod was not cleared: %v", err)
     }
 }
+
+func mkMinAvailablePDB(name, namespace string, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
+    intMinAvailable := intstr.FromInt(minAvailable)
+    return &policy.PodDisruptionBudget{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      name,
+            Namespace: namespace,
+        },
+        Spec: policy.PodDisruptionBudgetSpec{
+            MinAvailable: &intMinAvailable,
+            Selector:     &metav1.LabelSelector{MatchLabels: matchLabels},
+        },
+    }
+}
+
+// TestPDBInPreemption tests PodDisruptionBudget support in preemption.
+func TestPDBInPreemption(t *testing.T) {
+    // Enable PodPriority feature gate.
+    utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority))
+    // Initialize scheduler.
+    context := initTest(t, "preemption-pdb")
+    defer cleanupTest(t, context)
+    cs := context.clientSet
+
+    defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
+        v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
+        v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
+    }
+    defaultNodeRes := &v1.ResourceList{
+        v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
+        v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
+        v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+    }
+
+    type nodeConfig struct {
+        name string
+        res  *v1.ResourceList
+    }
+
+    tests := []struct {
+        description         string
+        nodes               []*nodeConfig
+        pdbs                []*policy.PodDisruptionBudget
+        existingPods        []*v1.Pod
+        pod                 *v1.Pod
+        preemptedPodIndexes map[int]struct{}
+    }{
+        {
+            description: "A non-PDB violating pod is preempted despite its higher priority",
+            nodes:       []*nodeConfig{{name: "node-1", res: defaultNodeRes}},
+            pdbs: []*policy.PodDisruptionBudget{
+                mkMinAvailablePDB("pdb-1", context.ns.Name, 2, map[string]string{"foo": "bar"}),
+            },
+            existingPods: []*v1.Pod{
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod1",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    Labels:    map[string]string{"foo": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod2",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    Labels:    map[string]string{"foo": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "mid-pod3",
+                    Namespace: context.ns.Name,
+                    Priority:  &mediumPriority,
+                    Resources: defaultPodRes,
+                }),
+            },
+            pod: initPausePod(cs, &pausePodConfig{
+                Name:      "preemptor-pod",
+                Namespace: context.ns.Name,
+                Priority:  &highPriority,
+                Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+                    v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
+                    v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+                },
+            }),
+            preemptedPodIndexes: map[int]struct{}{2: {}},
+        },
+        {
+            description: "A node without any PDB violating pods is preferred for preemption",
+            nodes: []*nodeConfig{
+                {name: "node-1", res: defaultNodeRes},
+                {name: "node-2", res: defaultNodeRes},
+            },
+            pdbs: []*policy.PodDisruptionBudget{
+                mkMinAvailablePDB("pdb-1", context.ns.Name, 2, map[string]string{"foo": "bar"}),
+            },
+            existingPods: []*v1.Pod{
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod1",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-1",
+                    Labels:    map[string]string{"foo": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "mid-pod2",
+                    Namespace: context.ns.Name,
+                    Priority:  &mediumPriority,
+                    NodeName:  "node-2",
+                    Resources: defaultPodRes,
+                }),
+            },
+            pod: initPausePod(cs, &pausePodConfig{
+                Name:      "preemptor-pod",
+                Namespace: context.ns.Name,
+                Priority:  &highPriority,
+                Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+                    v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
+                    v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+                },
+            }),
+            preemptedPodIndexes: map[int]struct{}{1: {}},
+        },
+        {
+            description: "A node with fewer PDB violating pods is preferred for preemption",
+            nodes: []*nodeConfig{
+                {name: "node-1", res: defaultNodeRes},
+                {name: "node-2", res: defaultNodeRes},
+                {name: "node-3", res: defaultNodeRes},
+            },
+            pdbs: []*policy.PodDisruptionBudget{
+                mkMinAvailablePDB("pdb-1", context.ns.Name, 2, map[string]string{"foo1": "bar"}),
+                mkMinAvailablePDB("pdb-2", context.ns.Name, 2, map[string]string{"foo2": "bar"}),
+            },
+            existingPods: []*v1.Pod{
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod1",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-1",
+                    Labels:    map[string]string{"foo1": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "mid-pod1",
+                    Namespace: context.ns.Name,
+                    Priority:  &mediumPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-1",
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod2",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-2",
+                    Labels:    map[string]string{"foo2": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "mid-pod2",
+                    Namespace: context.ns.Name,
+                    Priority:  &mediumPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-2",
+                    Labels:    map[string]string{"foo2": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod4",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-3",
+                    Labels:    map[string]string{"foo2": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod5",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-3",
+                    Labels:    map[string]string{"foo2": "bar"},
+                }),
+                initPausePod(context.clientSet, &pausePodConfig{
+                    Name:      "low-pod6",
+                    Namespace: context.ns.Name,
+                    Priority:  &lowPriority,
+                    Resources: defaultPodRes,
+                    NodeName:  "node-3",
+                    Labels:    map[string]string{"foo2": "bar"},
+                }),
+            },
+            pod: initPausePod(cs, &pausePodConfig{
+                Name:      "preemptor-pod",
+                Namespace: context.ns.Name,
+                Priority:  &highPriority,
+                Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+                    v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
+                    v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+                },
+            }),
+            preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
+        },
+    }
+
+    for _, test := range tests {
+        for _, nodeConf := range test.nodes {
+            _, err := createNode(cs, nodeConf.name, nodeConf.res)
+            if err != nil {
+                t.Fatalf("Error creating node %v: %v", nodeConf.name, err)
+            }
+        }
+        // Create PDBs.
+        for _, pdb := range test.pdbs {
+            _, err := context.clientSet.PolicyV1beta1().PodDisruptionBudgets(context.ns.Name).Create(pdb)
+            if err != nil {
+                t.Fatalf("Failed to create PDB: %v", err)
+            }
+        }
+        // Wait for PDBs to show up in the scheduler's cache.
+        if err := wait.Poll(time.Second, 15*time.Second, func() (bool, error) {
+            cachedPDBs, err := context.scheduler.Config().SchedulerCache.ListPDBs(labels.Everything())
+            if err != nil {
+                t.Errorf("Error while polling for PDB: %v", err)
+                return false, err
+            }
+            return len(cachedPDBs) == len(test.pdbs), err
+        }); err != nil {
+            t.Fatalf("Not all PDBs were added to the cache: %v", err)
+        }
+
+        pods := make([]*v1.Pod, len(test.existingPods))
+        var err error
+        // Create and run existingPods.
+        for i, p := range test.existingPods {
+            if pods[i], err = runPausePod(cs, p); err != nil {
+                t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
+            }
+        }
+        // Create the "pod".
+        preemptor, err := createPausePod(cs, test.pod)
+        if err != nil {
+            t.Errorf("Error while creating high priority pod: %v", err)
+        }
+        // Wait for preemption of pods and make sure the other ones are not preempted.
+        for i, p := range pods {
+            if _, found := test.preemptedPodIndexes[i]; found {
+                if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
+                    t.Errorf("Test [%v]: Pod %v is not getting evicted.", test.description, p.Name)
+                }
+            } else {
+                if p.DeletionTimestamp != nil {
+                    t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name)
+                }
+            }
+        }
+        // Also check that the preemptor pod gets the annotation for nominated node name.
+        if len(test.preemptedPodIndexes) > 0 {
+            if err := waitForNominatedNodeAnnotation(cs, preemptor); err != nil {
+                t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v: %v", test.description, preemptor.Name, err)
+            }
+        }
+
+        // Cleanup
+        pods = append(pods, preemptor)
+        cleanupPods(cs, t, pods)
+        cs.PolicyV1beta1().PodDisruptionBudgets(context.ns.Name).DeleteCollection(nil, metav1.ListOptions{})
+        cs.CoreV1().Nodes().DeleteCollection(nil, metav1.ListOptions{})
+    }
+}