From 1cec8bac9c1786668ccd2d339e42b5835bd5927d Mon Sep 17 00:00:00 2001
From: "Bobby (Babak) Salamat"
Date: Wed, 9 Aug 2017 11:36:35 -0700
Subject: [PATCH] Add the logic to pick one node for preemption

---
 .../pkg/scheduler/core/generic_scheduler.go   |  91 ++++++-
 .../scheduler/core/generic_scheduler_test.go  | 252 +++++++++++++++---
 2 files changed, 301 insertions(+), 42 deletions(-)

diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go
index f51326ee0fd..b3e35477c66 100644
--- a/plugin/pkg/scheduler/core/generic_scheduler.go
+++ b/plugin/pkg/scheduler/core/generic_scheduler.go
@@ -18,6 +18,7 @@ package core
 
 import (
 	"fmt"
+	"math"
 	"sort"
 	"strings"
 	"sync"
@@ -176,8 +177,9 @@ func (g *genericScheduler) preempt(pod *v1.Pod, nodeLister algorithm.NodeLister)
 	if len(nodeToPods) == 0 {
 		return "", nil
 	}
-	// TODO: Add a node scoring mechanism and perform preemption
-	return "", nil
+	node := pickOneNodeForPreemption(nodeToPods)
+	// TODO: Add actual preemption of pods
+	return node, nil
 }
 
 // Filters the nodes to find the ones that fit based on the given predicate functions
@@ -444,6 +446,89 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf
 	}, nil
 }
 
+// pickOneNodeForPreemption chooses one node among the given nodes. It assumes
+// pods in each map entry are ordered by decreasing priority.
+// It picks a node based on the following criteria:
+// 1. The node whose highest-priority victim has the lowest priority is picked.
+// 2. Ties are broken by the sum of the priorities of all victims.
+// 3. If there are still ties, the node with the minimum number of victims is picked.
+// 4. If there are still ties, the first such node found is picked (effectively at
+//    random, since map iteration order is not deterministic).
+func pickOneNodeForPreemption(nodesToPods map[string][]*v1.Pod) string {
+	type nodeScore struct {
+		nodeName        string
+		highestPriority int32
+		sumPriorities   int64
+		numPods         int
+	}
+	if len(nodesToPods) == 0 {
+		return ""
+	}
+	minHighestPriority := int32(math.MaxInt32)
+	nodeScores := []*nodeScore{}
+	for nodeName, pods := range nodesToPods {
+		if len(pods) == 0 {
+			// We found a node that doesn't need any preemption. Return it!
+			// This can happen, rarely, when one or more pods are terminated between
+			// the time that the scheduler tried to schedule the pod and the time that
+			// the preemption logic tries to find nodes for preemption.
+			return nodeName
+		}
+		// highestPodPriority is the highest priority among the victims on this node.
+		highestPodPriority := util.GetPodPriority(pods[0])
+		if highestPodPriority < minHighestPriority {
+			minHighestPriority = highestPodPriority
+		}
+		nodeScores = append(nodeScores, &nodeScore{nodeName: nodeName, highestPriority: highestPodPriority, numPods: len(pods)})
+	}
+	// Find the nodes whose highest-priority victim has the minimum priority,
+	// computing the sum of victim priorities for each of them along the way.
+	minSumPriorities := int64(math.MaxInt64)
+	lowestHighPriorityNodes := []*nodeScore{}
+	for _, nodeScore := range nodeScores {
+		if nodeScore.highestPriority == minHighestPriority {
+			lowestHighPriorityNodes = append(lowestHighPriorityNodes, nodeScore)
+			var sumPriorities int64
+			for _, pod := range nodesToPods[nodeScore.nodeName] {
+				// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
+				// needed so that a node with a few pods with negative priority is not
+				// picked over a node with a smaller number of pods with the same negative
+				// priority (and similar scenarios).
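+				// For example, with the shift S = MaxInt32+1, two victims of
+				// priority -100 sum to -200+2S, while a single victim of -100
+				// sums to -100+S; since S > 100, the single-victim node now has
+				// the smaller sum and is preferred, as intended.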
+				sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1)
+			}
+			if sumPriorities < minSumPriorities {
+				minSumPriorities = sumPriorities
+			}
+			nodeScore.sumPriorities = sumPriorities
+		}
+	}
+	if len(lowestHighPriorityNodes) == 1 {
+		return lowestHighPriorityNodes[0].nodeName
+	}
+	// There are multiple nodes with the same minimum highest-priority victim.
+	// Choose the one(s) with the lowest sum of victim priorities.
+	minNumPods := math.MaxInt32
+	lowestSumPriorityNodes := []*nodeScore{}
+	for _, nodeScore := range lowestHighPriorityNodes {
+		if nodeScore.sumPriorities == minSumPriorities {
+			lowestSumPriorityNodes = append(lowestSumPriorityNodes, nodeScore)
+			if nodeScore.numPods < minNumPods {
+				minNumPods = nodeScore.numPods
+			}
+		}
+	}
+	if len(lowestSumPriorityNodes) == 1 {
+		return lowestSumPriorityNodes[0].nodeName
+	}
+	// There is still more than one node with the minimum highest-priority victim
+	// and the lowest sum of victim priorities. Return one with the minimum number
+	// of victims.
+	for _, nodeScore := range lowestSumPriorityNodes {
+		if nodeScore.numPods == minNumPods {
+			return nodeScore.nodeName
+		}
+	}
+	glog.Errorf("We should never reach here!")
+	return ""
+}
+
 // selectNodesForPreemption finds all the nodes with possible victims for
 // preemption in parallel.
 func selectNodesForPreemption(pod *v1.Pod,
@@ -461,7 +546,7 @@ func selectNodesForPreemption(pod *v1.Pod,
 	checkNode := func(i int) {
 		nodeName := nodes[i].Name
 		pods, fits := selectVictimsOnNode(pod, meta.ShallowCopy(), nodeNameToInfo[nodeName], predicates)
-		if fits && len(pods) != 0 {
+		if fits {
 			resultLock.Lock()
 			nodeNameToPods[nodeName] = pods
 			resultLock.Unlock()
diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go
index cfe1f6708ea..d2ed01dff7f 100644
--- a/plugin/pkg/scheduler/core/generic_scheduler_test.go
+++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go
@@ -587,46 +587,59 @@ func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo)
 	return algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister{p})(p, nodeInfo)
 }
 
+var smallContainers = []v1.Container{
+	{
+		Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"cpu": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMilliCpuRequest, 10) + "m"),
+				"memory": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMemoryRequest, 10)),
+			},
+		},
+	},
+}
+var mediumContainers = []v1.Container{
+	{
+		Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"cpu": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*2, 10) + "m"),
+				"memory": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMemoryRequest*2, 10)),
+			},
+		},
+	},
+}
+var largeContainers = []v1.Container{
+	{
+		Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"cpu": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*3, 10) + "m"),
+				"memory": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMemoryRequest*3, 10)),
+			},
+		},
+	},
+}
+var veryLargeContainers = []v1.Container{
+	{
+		Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"cpu": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*5, 10) + "m"),
+				"memory": resource.MustParse(
+					strconv.FormatInt(priorityutil.DefaultMemoryRequest*5, 10)),
+			},
+		},
+	},
+}
+var negPriority, lowPriority, midPriority, highPriority, veryHighPriority = int32(-100), int32(0),
int32(100), int32(1000), int32(10000) + // TestSelectNodesForPreemption tests selectNodesForPreemption. This test assumes // that podsFitsOnNode works correctly and is tested separately. func TestSelectNodesForPreemption(t *testing.T) { - smallContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest, 10)), - }, - }, - }, - } - mediumContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*2, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest*2, 10)), - }, - }, - }, - } - largeContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*3, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest*3, 10)), - }, - }, - }, - } - lowPriority, midPriority, highPriority := int32(0), int32(100), int32(1000) tests := []struct { name string predicates map[string]algorithm.FitPredicate @@ -654,7 +667,7 @@ func TestSelectNodesForPreemption(t *testing.T) { pods: []*v1.Pod{ {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{}, + expected: map[string]map[string]bool{"machine1": {}, "machine2": {}}, }, { name: "a pod that fits on one machine with no preemption", @@ -664,7 +677,7 @@ func TestSelectNodesForPreemption(t *testing.T) { pods: []*v1.Pod{ {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{}, + expected: map[string]map[string]bool{"machine1": {}}, }, { name: "a pod that fits on both machines when lower priority pods are preempted", @@ -789,3 +802,164 @@ func TestSelectNodesForPreemption(t *testing.T) { } } } + +// TestPickOneNodeForPreemption tests pickOneNodeForPreemption. 
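+// Each entry in a case's expected slice is an acceptable result; when several
+// nodes are equally good, Go's randomized map iteration makes the pick among
+// them nondeterministic, so the test accepts any of the listed names.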
+func TestPickOneNodeForPreemption(t *testing.T) { + tests := []struct { + name string + predicates map[string]algorithm.FitPredicate + nodes []string + pod *v1.Pod + pods []*v1.Pod + expected []string // any of the items is valid + }{ + { + name: "No node needs preemption", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}}, + expected: []string{"machine1"}, + }, + { + name: "a pod that fits on both machines when lower priority pods are preempted", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: []string{"machine1", "machine2"}, + }, + { + name: "a pod that fits on a machine with no preemption", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: []string{"machine3"}, + }, + { + name: "machine with min highest priority pod is picked", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine3"}, + }, + { + name: "when highest priorities are the same, minimum sum of priorities is picked", + predicates: 
map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + name: "when highest priority and sum are the same, minimum number of pods is picked", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.4"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + // pickOneNodeForPreemption adjusts pod priorities when finding the sum of the victims. This + // test ensures that the logic works correctly. 
+ name: "sum of adjusted priorities is considered", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + name: "non-overlapping lowest high priority, sum priorities, and number of pods", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3", "machine4"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &veryHighPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.4"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m4.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: 
metav1.ObjectMeta{Name: "m4.4"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine4"}}, + }, + expected: []string{"machine1"}, + }, + } + for _, test := range tests { + nodes := []*v1.Node{} + for _, n := range test.nodes { + nodes = append(nodes, makeNode(n, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5)) + } + nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes) + node := pickOneNodeForPreemption(selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata)) + found := false + for _, nodeName := range test.expected { + if node == nodeName { + found = true + break + } + } + if !found { + t.Errorf("test [%v]: unexpected node: %v", test.name, node) + } + } +}