From a4edc6c87134680646030df83780559ed7152c7e Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Wed, 2 Aug 2017 18:20:45 -0700 Subject: [PATCH 1/7] Add preemption victim selector logic to scheduler --- .../algorithm/predicates/predicates.go | 5 +- .../pkg/scheduler/core/generic_scheduler.go | 140 ++++++++++ .../scheduler/core/generic_scheduler_test.go | 245 ++++++++++++++++++ plugin/pkg/scheduler/schedulercache/cache.go | 4 +- .../pkg/scheduler/schedulercache/node_info.go | 41 ++- plugin/pkg/scheduler/schedulercache/util.go | 2 +- plugin/pkg/scheduler/util/utils.go | 42 +++ plugin/pkg/scheduler/util/utils_test.go | 95 +++++++ 8 files changed, 564 insertions(+), 10 deletions(-) create mode 100644 plugin/pkg/scheduler/util/utils_test.go diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index dc811b8e80f..c9f55181412 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -781,6 +781,7 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no s.serviceAffinityMetadataProducer(pm) pods, services = pm.serviceAffinityMatchingPodList, pm.serviceAffinityMatchingPodServices } + filteredPods := nodeInfo.FilterOutPods(pods) node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -790,8 +791,8 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no // Step 1: If we don't have all constraints, introspect nodes to find the missing constraints. if len(s.labels) > len(affinityLabels) { if len(services) > 0 { - if len(pods) > 0 { - nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(pods[0].Spec.NodeName) + if len(filteredPods) > 0 { + nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(filteredPods[0].Spec.NodeName) if err != nil { return false, nil, err } diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go index a9244f6636e..bb635739ad4 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler.go +++ b/plugin/pkg/scheduler/core/generic_scheduler.go @@ -32,6 +32,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" + "k8s.io/kubernetes/plugin/pkg/scheduler/util" "github.com/golang/glog" ) @@ -159,6 +160,26 @@ func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList return priorityList[ix].Host, nil } +// preempt finds nodes with pods that can be preempted to make room for "pod" to +// schedule. It chooses one of the nodes and preempts the pods on the node and +// returns the node name if such a node is found. +// TODO(bsalamat): This function is under construction! DO NOT USE! 
+func (g *genericScheduler) preempt(pod *v1.Pod, nodeLister algorithm.NodeLister) (string, error) { + nodes, err := nodeLister.List() + if err != nil { + return "", err + } + if len(nodes) == 0 { + return "", ErrNoNodesAvailable + } + nodeToPods := selectNodesForPreemption(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.predicateMetaProducer) + if len(nodeToPods) == 0 { + return "", nil + } + // TODO: Add a node scoring mechanism and perform preemption + return "", nil +} + // Filters the nodes to find the ones that fit based on the given predicate functions // Each node is passed through the predicate functions to determine if it is a fit func findNodesThatFit( @@ -423,6 +444,125 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf }, nil } +// selectNodesForPreemption finds all the nodes with possible victims for +// preemption in parallel. +func selectNodesForPreemption(pod *v1.Pod, + nodeNameToInfo map[string]*schedulercache.NodeInfo, + nodes []*v1.Node, + predicates map[string]algorithm.FitPredicate, + metadataProducer algorithm.MetadataProducer, +) map[string][]*v1.Pod { + + nodeNameToPods := map[string][]*v1.Pod{} + var resultLock sync.Mutex + + // We can use the same metadata producer for all nodes. + meta := metadataProducer(pod, nodeNameToInfo) + checkNode := func(i int) { + nodeName := nodes[i].Name + pods, fits := selectVictimsOnNode(pod, meta.ShallowCopy(), nodeNameToInfo[nodeName], predicates) + if fits && len(pods) != 0 { + resultLock.Lock() + nodeNameToPods[nodeName] = pods + resultLock.Unlock() + } + } + workqueue.Parallelize(16, len(nodes), checkNode) + return nodeNameToPods +} + +// selectVictimsOnNode finds minimum set of pods on the given node that should +// be preempted in order to make enough room for "pod" to be scheduled. The +// minimum set selected is subject to the constraint that a higher-priority pod +// is never preempted when a lower-priority pod could be (higher/lower relative +// to one another, not relative to the preemptor "pod"). +// The algorithm first checks if the pod can be scheduled on the node when all the +// lower priority pods are gone. If so, it sorts all the lower priority pods by +// their priority and starting from the highest priority one, tries to keep as +// many of them as possible while checking that the "pod" can still fit on the node. +// NOTE: This function assumes that it is never called if "pod" cannot be scheduled +// due to pod affinity, node affinity, or node anti-affinity reasons. None of +// these predicates can be satisfied by removing more pods from the node. +func selectVictimsOnNode(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo, fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) { + higherPriority := func(pod1, pod2 interface{}) bool { + return util.GetPodPriority(pod1.(*v1.Pod)) > util.GetPodPriority(pod2.(*v1.Pod)) + } + potentialVictims := util.SortableList{CompFunc: higherPriority} + nodeInfoCopy := nodeInfo.Clone() + + removePod := func(rp *v1.Pod) { + nodeInfoCopy.RemovePod(rp) + meta.RemovePod(rp) + } + addPod := func(ap *v1.Pod) { + nodeInfoCopy.AddPod(ap) + meta.AddPod(ap, nodeInfoCopy) + } + // As the first step, remove all the lower priority pods from the node and + // check if the given pod can be scheduled. 
+ podPriority := util.GetPodPriority(pod) + for _, p := range nodeInfoCopy.Pods() { + if util.GetPodPriority(p) < podPriority { + potentialVictims.Items = append(potentialVictims.Items, p) + removePod(p) + } + } + potentialVictims.Sort() + // If the new pod does not fit after removing all the lower priority pods, + // we are almost done and this node is not suitable for preemption. The only condition + // that we should check is if the "pod" is failing to schedule due to pod affinity + // failure. + if fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { + if err != nil { + glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err) + return nil, false + } + // If the new pod still cannot be scheduled for any reason other than pod + // affinity, the new pod will not fit on this node and we are done here. + affinity := pod.Spec.Affinity + if affinity == nil || affinity.PodAffinity == nil { + return nil, false + } + for _, failedPred := range failedPredicates { + if failedPred != predicates.ErrPodAffinityNotMatch { + return nil, false + } + } + // If we reach here, it means that the pod cannot be scheduled due to pod + // affinity or anti-affinity. Since failure reason for both affinity and + // anti-affinity is the same, we cannot say which one caused it. So, we try + // adding pods one at a time and see if any of them satisfies the affinity rules. + for i, p := range potentialVictims.Items { + existingPod := p.(*v1.Pod) + addPod(existingPod) + if fits, _, _ = podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { + removePod(existingPod) + } else { + // We found the pod needed to satisfy pod affinity. Let's remove it from + // potential victims list. + // NOTE: We assume that pod affinity can be satisfied by only one pod, + // not multiple pods. This is how scheduler works today. + potentialVictims.Items = append(potentialVictims.Items[:i], potentialVictims.Items[i+1:]...) + break + } + } + if !fits { + return nil, false + } + } + victims := []*v1.Pod{} + // Try to reprieve as may pods as possible starting from the highest priority one. 
+ for _, p := range potentialVictims.Items { + lpp := p.(*v1.Pod) + addPod(lpp) + if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { + removePod(lpp) + victims = append(victims, lpp) + } + } + return victims, true +} + func NewGenericScheduler( cache schedulercache.Cache, eCache *EquivalenceCache, diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go index f4abcaa5057..bbd95f85cb9 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go @@ -392,10 +392,13 @@ func makeNode(node string, milliCPU, memory int64) *v1.Node { Capacity: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), }, Allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), }, }, } @@ -544,3 +547,245 @@ func TestZeroRequest(t *testing.T) { } } } + +func checkPreemptionVictims(testName string, expected map[string]map[string]bool, nodeToPods map[string][]*v1.Pod) error { + if len(expected) == len(nodeToPods) { + for k, pods := range nodeToPods { + if expPods, ok := expected[k]; ok { + if len(pods) != len(expPods) { + return fmt.Errorf("test [%v]: unexpected number of pods. expected: %v, got: %v", testName, expected, nodeToPods) + } + prevPriority := int32(math.MaxInt32) + for _, p := range pods { + // Check that pods are sorted by their priority. + if *p.Spec.Priority > prevPriority { + return fmt.Errorf("test [%v]: pod %v of node %v was not sorted by priority", testName, p.Name, k) + } + prevPriority = *p.Spec.Priority + if _, ok := expPods[p.Name]; !ok { + return fmt.Errorf("test [%v]: pod %v was not expected. Expected: %v", testName, p.Name, expPods) + } + } + } else { + return fmt.Errorf("test [%v]: unexpected machines. expected: %v, got: %v", testName, expected, nodeToPods) + } + } + } else { + return fmt.Errorf("test [%v]: unexpected number of machines. expected: %v, got: %v", testName, expected, nodeToPods) + } + return nil +} + +type FakeNodeInfo v1.Node + +func (n FakeNodeInfo) GetNodeInfo(nodeName string) (*v1.Node, error) { + node := v1.Node(n) + return &node, nil +} + +func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) interface{} { + return algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister{p})(p, nodeInfo) +} + +// TestSelectNodesForPreemption tests selectNodesForPreemption. This test assumes +// that podsFitsOnNode works correctly and is tested separately. 
+func TestSelectNodesForPreemption(t *testing.T) { + smallContainers := []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest, 10)), + }, + }, + }, + } + mediumContainers := []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*2, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest*2, 10)), + }, + }, + }, + } + largeContainers := []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*3, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest*3, 10)), + }, + }, + }, + } + lowPriority, midPriority, highPriority := int32(0), int32(100), int32(1000) + tests := []struct { + name string + predicates map[string]algorithm.FitPredicate + nodes []string + pod *v1.Pod + pods []*v1.Pod + expected map[string]map[string]bool // Map from node name to a list of pods names which should be preempted. + addAffinityPredicate bool + }{ + { + name: "a pod that does not fit on any machine", + predicates: map[string]algorithm.FitPredicate{"matches": falsePredicate}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new"}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{}, + }, + { + name: "a pod that fits with no preemption", + predicates: map[string]algorithm.FitPredicate{"matches": truePredicate}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "new"}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{}, + }, + { + name: "a pod that fits on one machine with no preemption", + predicates: map[string]algorithm.FitPredicate{"matches": matchesPredicate}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{}, + }, + { + name: "a pod that fits on both machines when lower priority pods are preempted", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: 
&midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{"machine1": {"a": true}, "machine2": {"b": true}}, + }, + { + name: "a pod that would fit on the machines, but other pods running are higher priority", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &lowPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{}, + }, + { + name: "medium priority pod is preempted, but lower priority one stays as it is small", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{"machine1": {"b": true}, "machine2": {"c": true}}, + }, + { + name: "mixed priority pods are preempted", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{"machine1": {"b": true, "c": true}}, + }, + { + name: "lower priority pod is not preempted to satisfy pod affinity", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, Affinity: &v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + 
MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "service", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"securityscan", "value2"}, + }, + }, + }, + TopologyKey: "hostname", + }, + }, + }}}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{"machine1": {"b": true, "c": true}}, + addAffinityPredicate: true, + }, + { + name: "between two pods that satisfy affinity, the higher priority one stays", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, Affinity: &v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "service", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"securityscan", "value2"}, + }, + }, + }, + TopologyKey: "hostname", + }, + }, + }}}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "b", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "d"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "e"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, + expected: map[string]map[string]bool{"machine1": {"a": true, "c": true}}, + addAffinityPredicate: true, + }, + } + + for _, test := range tests { + nodes := []*v1.Node{} + for _, n := range test.nodes { + node := makeNode(n, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5) + node.ObjectMeta.Labels = map[string]string{"hostname": node.Name} + nodes = append(nodes, node) + } + if test.addAffinityPredicate { + test.predicates["affinity"] = algorithmpredicates.NewPodAffinityPredicate(FakeNodeInfo(*nodes[0]), schedulertesting.FakePodLister(test.pods)) + } + nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes) + nodeToPods := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata) + if err := checkPreemptionVictims(test.name, test.expected, nodeToPods); err != nil { + 
t.Error(err) + } + } +} diff --git a/plugin/pkg/scheduler/schedulercache/cache.go b/plugin/pkg/scheduler/schedulercache/cache.go index be436f7b7e2..0f562e25925 100644 --- a/plugin/pkg/scheduler/schedulercache/cache.go +++ b/plugin/pkg/scheduler/schedulercache/cache.go @@ -193,7 +193,7 @@ func (cache *schedulerCache) addPod(pod *v1.Pod) { n = NewNodeInfo() cache.nodes[pod.Spec.NodeName] = n } - n.addPod(pod) + n.AddPod(pod) } // Assumes that lock is already acquired. @@ -208,7 +208,7 @@ func (cache *schedulerCache) updatePod(oldPod, newPod *v1.Pod) error { // Assumes that lock is already acquired. func (cache *schedulerCache) removePod(pod *v1.Pod) error { n := cache.nodes[pod.Spec.NodeName] - if err := n.removePod(pod); err != nil { + if err := n.RemovePod(pod); err != nil { return err } if len(n.pods) == 0 && n.node == nil { diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go index dd3f8206b09..4192aea7264 100644 --- a/plugin/pkg/scheduler/schedulercache/node_info.go +++ b/plugin/pkg/scheduler/schedulercache/node_info.go @@ -162,7 +162,7 @@ func NewNodeInfo(pods ...*v1.Pod) *NodeInfo { usedPorts: make(map[int]bool), } for _, pod := range pods { - ni.addPod(pod) + ni.AddPod(pod) } return ni } @@ -294,8 +294,8 @@ func hasPodAffinityConstraints(pod *v1.Pod) bool { return affinity != nil && (affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil) } -// addPod adds pod information to this NodeInfo. -func (n *NodeInfo) addPod(pod *v1.Pod) { +// AddPod adds pod information to this NodeInfo. +func (n *NodeInfo) AddPod(pod *v1.Pod) { res, non0_cpu, non0_mem := calculateResource(pod) n.requestedResource.MilliCPU += res.MilliCPU n.requestedResource.Memory += res.Memory @@ -320,8 +320,8 @@ func (n *NodeInfo) addPod(pod *v1.Pod) { n.generation++ } -// removePod subtracts pod information to this NodeInfo. -func (n *NodeInfo) removePod(pod *v1.Pod) error { +// RemovePod subtracts pod information to this NodeInfo. +func (n *NodeInfo) RemovePod(pod *v1.Pod) error { k1, err := getPodKey(pod) if err != nil { return err @@ -441,6 +441,37 @@ func (n *NodeInfo) RemoveNode(node *v1.Node) error { return nil } +// FilterOutPods receives a list of pods and filters out those whose node names +// are equal to the node of this NodeInfo, but are not found in the pods of this NodeInfo. +// +// Preemption logic simulates removal of pods on a node by removing them from the +// corresponding NodeInfo. In order for the simulation to work, we call this method +// on the pods returned from SchedulerCache, so that predicate functions see +// only the pods that are not removed from the NodeInfo. +func (n *NodeInfo) FilterOutPods(pods []*v1.Pod) []*v1.Pod { + node := n.Node() + if node == nil { + return pods + } + filtered := make([]*v1.Pod, 0, len(pods)) + for _, p := range pods { + if p.Spec.NodeName == node.Name { + // If pod is on the given node, add it to 'filtered' only if it is present in nodeInfo. + podKey, _ := getPodKey(p) + for _, np := range n.Pods() { + npodkey, _ := getPodKey(np) + if npodkey == podKey { + filtered = append(filtered, p) + break + } + } + } else { + filtered = append(filtered, p) + } + } + return filtered +} + // getPodKey returns the string key of a pod. 
func getPodKey(pod *v1.Pod) (string, error) { return clientcache.MetaNamespaceKeyFunc(pod) diff --git a/plugin/pkg/scheduler/schedulercache/util.go b/plugin/pkg/scheduler/schedulercache/util.go index dc3c28a6e30..e2ac2d90777 100644 --- a/plugin/pkg/scheduler/schedulercache/util.go +++ b/plugin/pkg/scheduler/schedulercache/util.go @@ -27,7 +27,7 @@ func CreateNodeNameToInfoMap(pods []*v1.Pod, nodes []*v1.Node) map[string]*NodeI if _, ok := nodeNameToInfo[nodeName]; !ok { nodeNameToInfo[nodeName] = NewNodeInfo() } - nodeNameToInfo[nodeName].addPod(pod) + nodeNameToInfo[nodeName].AddPod(pod) } for _, node := range nodes { if _, ok := nodeNameToInfo[node.Name]; !ok { diff --git a/plugin/pkg/scheduler/util/utils.go b/plugin/pkg/scheduler/util/utils.go index a230cf55fa6..2bf10adf6b1 100644 --- a/plugin/pkg/scheduler/util/utils.go +++ b/plugin/pkg/scheduler/util/utils.go @@ -17,7 +17,10 @@ limitations under the License. package util import ( + "sort" + "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/apis/scheduling" ) // GetUsedPorts returns the used host ports of Pods: if 'port' was used, a 'port:true' pair @@ -46,3 +49,42 @@ func GetPodFullName(pod *v1.Pod) string { // (DNS subdomain format). return pod.Name + "_" + pod.Namespace } + +// GetPodPriority return priority of the given pod. +func GetPodPriority(pod *v1.Pod) int32 { + if pod.Spec.Priority != nil { + return *pod.Spec.Priority + } + // When priority of a running pod is nil, it means it was created at a time + // that there was no global default priority class and the priority class + // name of the pod was empty. So, we resolve to the static default priority. + return scheduling.DefaultPriorityWhenNoDefaultClassExists +} + +// SortableList is a list that implements sort.Interface. +type SortableList struct { + Items []interface{} + CompFunc LessFunc +} + +// LessFunc is a function that receives two Pods and returns true if the first +// pod should be placed before pod2 when the list is sorted. +type LessFunc func(item1, item2 interface{}) bool + +var _ = sort.Interface(&SortableList{}) + +func (l *SortableList) Len() int { return len(l.Items) } + +func (l *SortableList) Less(i, j int) bool { + return l.CompFunc(l.Items[i], l.Items[j]) +} + +func (l *SortableList) Swap(i, j int) { + l.Items[i], l.Items[j] = l.Items[j], l.Items[i] +} + +// Sort sorts the items in the list using the given CompFunc. Item1 is placed +// before Item2 when CompFunc(Item1, Item2) returns true. +func (l *SortableList) Sort() { + sort.Sort(l) +} diff --git a/plugin/pkg/scheduler/util/utils_test.go b/plugin/pkg/scheduler/util/utils_test.go new file mode 100644 index 00000000000..653c3b9b0d6 --- /dev/null +++ b/plugin/pkg/scheduler/util/utils_test.go @@ -0,0 +1,95 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +import ( + "testing" + + "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/apis/scheduling" +) + +// TestGetPodPriority tests GetPodPriority function. 
+func TestGetPodPriority(t *testing.T) { + p := int32(20) + tests := []struct { + name string + pod *v1.Pod + expectedPriority int32 + }{ + { + name: "no priority pod resolves to static default priority", + pod: &v1.Pod{ + Spec: v1.PodSpec{Containers: []v1.Container{ + {Name: "container", Image: "image"}}, + }, + }, + expectedPriority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + }, + { + name: "pod with priority resolves correctly", + pod: &v1.Pod{ + Spec: v1.PodSpec{Containers: []v1.Container{ + {Name: "container", Image: "image"}}, + Priority: &p, + }, + }, + expectedPriority: p, + }, + } + for _, test := range tests { + if GetPodPriority(test.pod) != test.expectedPriority { + t.Errorf("expected pod priority: %v, got %v", test.expectedPriority, GetPodPriority(test.pod)) + } + + } +} + +// TestSortableList tests SortableList by storing pods in the list and sorting +// them by their priority. +func TestSortableList(t *testing.T) { + higherPriority := func(pod1, pod2 interface{}) bool { + return GetPodPriority(pod1.(*v1.Pod)) > GetPodPriority(pod2.(*v1.Pod)) + } + podList := SortableList{CompFunc: higherPriority} + // Add a few Pods with different priorities from lowest to highest priority. + for i := 0; i < 10; i++ { + var p int32 = int32(i) + pod := &v1.Pod{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "container", + Image: "image", + }, + }, + Priority: &p, + }, + } + podList.Items = append(podList.Items, pod) + } + podList.Sort() + if len(podList.Items) != 10 { + t.Errorf("expected length of list was 10, got: %v", len(podList.Items)) + } + var prevPriority = int32(10) + for _, p := range podList.Items { + if *p.(*v1.Pod).Spec.Priority >= prevPriority { + t.Errorf("Pods are not soreted. Current pod pririty is %v, while previous one was %v.", *p.(*v1.Pod).Spec.Priority, prevPriority) + } + } +} From 20931d7a68ba3a02102cc9145f6166c9a99b6848 Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Thu, 17 Aug 2017 17:08:41 -0700 Subject: [PATCH 2/7] Add specific types for PredicateMetadata and PredicateMetadataProducer --- .../algorithm/predicates/metadata.go | 27 ++++++++++++- .../algorithm/predicates/predicates.go | 40 +++++++++---------- .../algorithm/predicates/predicates_test.go | 4 +- plugin/pkg/scheduler/algorithm/types.go | 21 ++++++++-- .../algorithmprovider/defaults/defaults.go | 2 +- plugin/pkg/scheduler/core/extender_test.go | 2 +- .../pkg/scheduler/core/generic_scheduler.go | 12 +++--- .../scheduler/core/generic_scheduler_test.go | 16 ++++---- plugin/pkg/scheduler/factory/factory.go | 2 +- plugin/pkg/scheduler/factory/factory_test.go | 4 +- plugin/pkg/scheduler/factory/plugins.go | 12 ++++-- plugin/pkg/scheduler/scheduler.go | 2 +- plugin/pkg/scheduler/scheduler_test.go | 4 +- plugin/pkg/scheduler/testutil.go | 2 +- test/integration/scheduler/scheduler_test.go | 4 +- 15 files changed, 98 insertions(+), 56 deletions(-) diff --git a/plugin/pkg/scheduler/algorithm/predicates/metadata.go b/plugin/pkg/scheduler/algorithm/predicates/metadata.go index 5f4ece5a24d..0adaf84dc81 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/metadata.go +++ b/plugin/pkg/scheduler/algorithm/predicates/metadata.go @@ -54,6 +54,9 @@ type predicateMetadata struct { serviceAffinityMatchingPodServices []*v1.Service } +// Ensure that predicateMetadata implements algorithm.PredicateMetadata. +var _ algorithm.PredicateMetadata = &predicateMetadata{} + // PredicateMetadataProducer: Helper types/variables... 
type PredicateMetadataProducer func(pm *predicateMetadata) @@ -66,7 +69,7 @@ func RegisterPredicateMetadataProducer(predicateName string, precomp PredicateMe predicateMetadataProducers[predicateName] = precomp } -func NewPredicateMetadataFactory(podLister algorithm.PodLister) algorithm.MetadataProducer { +func NewPredicateMetadataFactory(podLister algorithm.PodLister) algorithm.PredicateMetadataProducer { factory := &PredicateMetadataFactory{ podLister, } @@ -74,7 +77,7 @@ func NewPredicateMetadataFactory(podLister algorithm.PodLister) algorithm.Metada } // GetMetadata returns the predicateMetadata used which will be used by various predicates. -func (pfactory *PredicateMetadataFactory) GetMetadata(pod *v1.Pod, nodeNameToInfoMap map[string]*schedulercache.NodeInfo) interface{} { +func (pfactory *PredicateMetadataFactory) GetMetadata(pod *v1.Pod, nodeNameToInfoMap map[string]*schedulercache.NodeInfo) algorithm.PredicateMetadata { // If we cannot compute metadata, just return nil if pod == nil { return nil @@ -159,3 +162,23 @@ func (meta *predicateMetadata) AddPod(addedPod *v1.Pod, nodeInfo *schedulercache } return nil } + +// ShallowCopy copies a metadata struct into a new struct and creates a copy of +// its maps and slices, but it does not copy the contents of pointer values. +func (meta *predicateMetadata) ShallowCopy() algorithm.PredicateMetadata { + newPredMeta := &predicateMetadata{ + pod: meta.pod, + podBestEffort: meta.podBestEffort, + podRequest: meta.podRequest, + serviceAffinityInUse: meta.serviceAffinityInUse, + } + for k, v := range meta.podPorts { + newPredMeta.podPorts[k] = v + } + for k, v := range meta.matchingAntiAffinityTerms { + newPredMeta.matchingAntiAffinityTerms[k] = append([]matchingPodAntiAffinityTerm(nil), v...) + } + newPredMeta.serviceAffinityMatchingPodServices = append([]*v1.Service(nil), meta.serviceAffinityMatchingPodServices...) + newPredMeta.serviceAffinityMatchingPodList = append([]*v1.Pod(nil), meta.serviceAffinityMatchingPodList...) + return (algorithm.PredicateMetadata)(newPredMeta) +} diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index c9f55181412..f28c7a901ce 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -152,7 +152,7 @@ func isVolumeConflict(volume v1.Volume, pod *v1.Pod) bool { // - Ceph RBD forbids if any two pods share at least same monitor, and match pool and image. // - ISCSI forbids if any two pods share at least same IQN, LUN and Target // TODO: migrate this into some per-volume specific code? 
-func NoDiskConflict(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func NoDiskConflict(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { for _, v := range pod.Spec.Volumes { for _, ev := range nodeInfo.Pods() { if isVolumeConflict(v, ev) { @@ -250,7 +250,7 @@ func (c *MaxPDVolumeCountChecker) filterVolumes(volumes []v1.Volume, namespace s return nil } -func (c *MaxPDVolumeCountChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (c *MaxPDVolumeCountChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { // If a pod doesn't have any volume attached to it, the predicate will always be true. // Thus we make a fast path for it, to avoid unnecessary computations in this case. if len(pod.Spec.Volumes) == 0 { @@ -371,7 +371,7 @@ func NewVolumeZonePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolum return c.predicate } -func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { // If a pod doesn't have any volume attached to it, the predicate will always be true. // Thus we make a fast path for it, to avoid unnecessary computations in this case. if len(pod.Spec.Volumes) == 0 { @@ -517,7 +517,7 @@ func podName(pod *v1.Pod) string { // PodFitsResources checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod. // First return value indicates whether a node has sufficient resources to run a pod while the second return value indicates the // predicate failure reasons if the node has insufficient resources to run the pod. -func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -635,7 +635,7 @@ func podMatchesNodeLabels(pod *v1.Pod, node *v1.Node) bool { } // PodMatchNodeSelector checks if a pod node selector matches the node label. -func PodMatchNodeSelector(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodMatchNodeSelector(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -647,7 +647,7 @@ func PodMatchNodeSelector(pod *v1.Pod, meta interface{}, nodeInfo *schedulercach } // PodFitsHost checks if a pod spec node name matches the current node. 
-func PodFitsHost(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodFitsHost(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { if len(pod.Spec.NodeName) == 0 { return true, nil, nil } @@ -686,7 +686,7 @@ func NewNodeLabelPredicate(labels []string, presence bool) algorithm.FitPredicat // Alternately, eliminating nodes that have a certain label, regardless of value, is also useful // A node may have a label with "retiring" as key and the date as the value // and it may be desirable to avoid scheduling new pods on this node -func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -769,7 +769,7 @@ func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister al // // WARNING: This Predicate is NOT guaranteed to work if some of the predicateMetadata data isn't precomputed... // For that reason it is not exported, i.e. it is highly coupled to the implementation of the FitPredicate construction. -func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var services []*v1.Service var pods []*v1.Pod if pm, ok := meta.(*predicateMetadata); ok && (pm.serviceAffinityMatchingPodList != nil || pm.serviceAffinityMatchingPodServices != nil) { @@ -808,7 +808,7 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no } // PodFitsHostPorts checks if a node has free ports for the requested pod ports. -func PodFitsHostPorts(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodFitsHostPorts(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var wantPorts map[int]bool if predicateMeta, ok := meta.(*predicateMetadata); ok { wantPorts = predicateMeta.podPorts @@ -849,7 +849,7 @@ func haveSame(a1, a2 []string) bool { // GeneralPredicates checks whether noncriticalPredicates and EssentialPredicates pass. 
noncriticalPredicates are the predicates // that only non-critical pods need and EssentialPredicates are the predicates that all pods, including critical pods, need -func GeneralPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func GeneralPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var predicateFails []algorithm.PredicateFailureReason fit, reasons, err := noncriticalPredicates(pod, meta, nodeInfo) if err != nil { @@ -871,7 +871,7 @@ func GeneralPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.N } // noncriticalPredicates are the predicates that only non-critical pods need -func noncriticalPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func noncriticalPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var predicateFails []algorithm.PredicateFailureReason fit, reasons, err := PodFitsResources(pod, meta, nodeInfo) if err != nil { @@ -885,7 +885,7 @@ func noncriticalPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercac } // EssentialPredicates are the predicates that all pods, including critical pods, need -func EssentialPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func EssentialPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var predicateFails []algorithm.PredicateFailureReason fit, reasons, err := PodFitsHost(pod, meta, nodeInfo) if err != nil { @@ -931,7 +931,7 @@ func NewPodAffinityPredicate(info NodeInfo, podLister algorithm.PodLister) algor // InterPodAffinityMatches checks if a pod can be scheduled on the specified node with pod affinity/anti-affinity configuration. // First return value indicates whether a pod can be scheduled on the specified node while the second return value indicates the // predicate failure reasons if the pod cannot be scheduled on the specified node. -func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -1116,7 +1116,7 @@ func (c *PodAffinityChecker) getMatchingAntiAffinityTerms(pod *v1.Pod, allPods [ // Checks if scheduling the pod onto this node would break any anti-affinity // rules indicated by the existing pods. 
-func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) bool { +func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) bool { node := nodeInfo.Node() if node == nil { return false @@ -1224,7 +1224,7 @@ func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod, node } // PodToleratesNodeTaints checks if a pod tolerations can tolerate the node taints -func PodToleratesNodeTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodToleratesNodeTaints(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool { // PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints. return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute @@ -1232,7 +1232,7 @@ func PodToleratesNodeTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulerca } // PodToleratesNodeNoExecuteTaints checks if a pod tolerations can tolerate the node's NoExecute taints -func PodToleratesNodeNoExecuteTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PodToleratesNodeNoExecuteTaints(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool { return t.Effect == v1.TaintEffectNoExecute }) @@ -1257,7 +1257,7 @@ func isPodBestEffort(pod *v1.Pod) bool { // CheckNodeMemoryPressurePredicate checks if a pod can be scheduled on a node // reporting memory pressure condition. -func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { var podBestEffort bool if predicateMeta, ok := meta.(*predicateMetadata); ok { podBestEffort = predicateMeta.podBestEffort @@ -1279,7 +1279,7 @@ func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *s // CheckNodeDiskPressurePredicate checks if a pod can be scheduled on a node // reporting disk pressure condition. -func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { // check if node is under disk pressure if nodeInfo.DiskPressureCondition() == v1.ConditionTrue { return false, []algorithm.PredicateFailureReason{ErrNodeUnderDiskPressure}, nil @@ -1289,7 +1289,7 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *sch // CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk, // network unavailable and not ready condition. Only node conditions are accounted in this predicate. 
-func CheckNodeConditionPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { reasons := []algorithm.PredicateFailureReason{} if nodeInfo == nil || nodeInfo.Node() == nil { @@ -1337,7 +1337,7 @@ func NewVolumeNodePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolum return c.predicate } -func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { if !utilfeature.DefaultFeatureGate.Enabled(features.PersistentLocalVolumes) { return true, nil, nil } diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go index 7c878c5ac3c..a2605627dcf 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go @@ -139,7 +139,7 @@ func newResourceInitPod(pod *v1.Pod, usage ...schedulercache.Resource) *v1.Pod { return pod } -func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) interface{} { +func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) algorithm.PredicateMetadata { pm := PredicateMetadataFactory{schedulertesting.FakePodLister{p}} return pm.GetMetadata(p, nodeInfo) } @@ -2984,7 +2984,7 @@ func TestInterPodAffinityWithMultipleNodes(t *testing.T) { nodeInfo.SetNode(&node) nodeInfoMap := map[string]*schedulercache.NodeInfo{node.Name: nodeInfo} - var meta interface{} = nil + var meta algorithm.PredicateMetadata = nil if !test.nometa { meta = PredicateMetadata(test.pod, nodeInfoMap) diff --git a/plugin/pkg/scheduler/algorithm/types.go b/plugin/pkg/scheduler/algorithm/types.go index 4223f69a8a2..8932b1f6bd9 100644 --- a/plugin/pkg/scheduler/algorithm/types.go +++ b/plugin/pkg/scheduler/algorithm/types.go @@ -27,8 +27,7 @@ import ( // FitPredicate is a function that indicates if a pod fits into an existing node. // The failure information is given by the error. -// TODO: Change interface{} to a specific type. -type FitPredicate func(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []PredicateFailureReason, error) +type FitPredicate func(pod *v1.Pod, meta PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []PredicateFailureReason, error) // PriorityMapFunction is a function that computes per-node results for a given node. // TODO: Figure out the exact API of this method. @@ -41,7 +40,12 @@ type PriorityMapFunction func(pod *v1.Pod, meta interface{}, nodeInfo *scheduler // TODO: Change interface{} to a specific type. type PriorityReduceFunction func(pod *v1.Pod, meta interface{}, nodeNameToInfo map[string]*schedulercache.NodeInfo, result schedulerapi.HostPriorityList) error -// MetadataProducer is a function that computes metadata for a given pod. +// PredicateMetadataProducer is a function that computes predicate metadata for a given pod. +type PredicateMetadataProducer func(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) PredicateMetadata + +// MetadataProducer is a function that computes metadata for a given pod. 
This +// is now used for only for priority functions. For predicates please use PredicateMetadataProducer. +// TODO: Rename this once we have a specific type for priority metadata producer. type MetadataProducer func(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) interface{} // DEPRECATED @@ -57,6 +61,11 @@ type PriorityConfig struct { Weight int } +// EmptyPredicateMetadataProducer returns a no-op MetadataProducer type. +func EmptyPredicateMetadataProducer(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) PredicateMetadata { + return nil +} + // EmptyMetadataProducer returns a no-op MetadataProducer type. func EmptyMetadataProducer(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) interface{} { return nil @@ -147,3 +156,9 @@ type EmptyStatefulSetLister struct{} func (f EmptyStatefulSetLister) GetPodStatefulSets(pod *v1.Pod) (sss []*apps.StatefulSet, err error) { return nil, nil } + +type PredicateMetadata interface { + ShallowCopy() PredicateMetadata + AddPod(addedPod *v1.Pod, nodeInfo *schedulercache.NodeInfo) error + RemovePod(deletedPod *v1.Pod) error +} diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index cb868fa543d..d38fbb78f62 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -52,7 +52,7 @@ const ( func init() { // Register functions that extract metadata used by predicates and priorities computations. factory.RegisterPredicateMetadataProducerFactory( - func(args factory.PluginFactoryArgs) algorithm.MetadataProducer { + func(args factory.PluginFactoryArgs) algorithm.PredicateMetadataProducer { return predicates.NewPredicateMetadataFactory(args.PodLister) }) factory.RegisterPriorityMetadataProducerFactory( diff --git a/plugin/pkg/scheduler/core/extender_test.go b/plugin/pkg/scheduler/core/extender_test.go index 455ddb69f94..cb211e27f46 100644 --- a/plugin/pkg/scheduler/core/extender_test.go +++ b/plugin/pkg/scheduler/core/extender_test.go @@ -314,7 +314,7 @@ func TestGenericSchedulerWithExtenders(t *testing.T) { cache.AddNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name}}) } scheduler := NewGenericScheduler( - cache, nil, test.predicates, algorithm.EmptyMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, extenders) + cache, nil, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, extenders) podIgnored := &v1.Pod{} machine, err := scheduler.Schedule(podIgnored, schedulertesting.FakeNodeLister(makeNodeList(test.nodes))) if test.expectsErr { diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go index bb635739ad4..f51326ee0fd 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler.go +++ b/plugin/pkg/scheduler/core/generic_scheduler.go @@ -74,7 +74,7 @@ type genericScheduler struct { equivalenceCache *EquivalenceCache predicates map[string]algorithm.FitPredicate priorityMetaProducer algorithm.MetadataProducer - predicateMetaProducer algorithm.MetadataProducer + predicateMetaProducer algorithm.PredicateMetadataProducer prioritizers []algorithm.PriorityConfig extenders []algorithm.SchedulerExtender pods algorithm.PodLister @@ -188,7 +188,7 @@ func findNodesThatFit( nodes []*v1.Node, predicateFuncs map[string]algorithm.FitPredicate, extenders []algorithm.SchedulerExtender, - metadataProducer algorithm.MetadataProducer, + 
metadataProducer algorithm.PredicateMetadataProducer, ecache *EquivalenceCache, ) ([]*v1.Node, FailedPredicateMap, error) { var filtered []*v1.Node @@ -253,7 +253,7 @@ func findNodesThatFit( } // Checks whether node with a given name and NodeInfo satisfies all predicateFuncs. -func podFitsOnNode(pod *v1.Pod, meta interface{}, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate, +func podFitsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate, ecache *EquivalenceCache) (bool, []algorithm.PredicateFailureReason, error) { var ( equivalenceHash uint64 @@ -450,7 +450,7 @@ func selectNodesForPreemption(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*v1.Node, predicates map[string]algorithm.FitPredicate, - metadataProducer algorithm.MetadataProducer, + metadataProducer algorithm.PredicateMetadataProducer, ) map[string][]*v1.Pod { nodeNameToPods := map[string][]*v1.Pod{} @@ -483,7 +483,7 @@ func selectNodesForPreemption(pod *v1.Pod, // NOTE: This function assumes that it is never called if "pod" cannot be scheduled // due to pod affinity, node affinity, or node anti-affinity reasons. None of // these predicates can be satisfied by removing more pods from the node. -func selectVictimsOnNode(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo, fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) { +func selectVictimsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo, fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) { higherPriority := func(pod1, pod2 interface{}) bool { return util.GetPodPriority(pod1.(*v1.Pod)) > util.GetPodPriority(pod2.(*v1.Pod)) } @@ -567,7 +567,7 @@ func NewGenericScheduler( cache schedulercache.Cache, eCache *EquivalenceCache, predicates map[string]algorithm.FitPredicate, - predicateMetaProducer algorithm.MetadataProducer, + predicateMetaProducer algorithm.PredicateMetadataProducer, prioritizers []algorithm.PriorityConfig, priorityMetaProducer algorithm.MetadataProducer, extenders []algorithm.SchedulerExtender) algorithm.ScheduleAlgorithm { diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go index bbd95f85cb9..cfe1f6708ea 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go @@ -41,15 +41,15 @@ import ( schedulertesting "k8s.io/kubernetes/plugin/pkg/scheduler/testing" ) -func falsePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func falsePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return false, []algorithm.PredicateFailureReason{algorithmpredicates.ErrFakePredicate}, nil } -func truePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func truePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return true, nil, nil } -func matchesPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func matchesPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, 
[]algorithm.PredicateFailureReason, error) { node := nodeInfo.Node() if node == nil { return false, nil, fmt.Errorf("node not found") @@ -60,7 +60,7 @@ func matchesPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No return false, []algorithm.PredicateFailureReason{algorithmpredicates.ErrFakePredicate}, nil } -func hasNoPodsPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func hasNoPodsPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { if len(nodeInfo.Pods()) == 0 { return true, nil, nil } @@ -307,7 +307,7 @@ func TestGenericScheduler(t *testing.T) { } scheduler := NewGenericScheduler( - cache, nil, test.predicates, algorithm.EmptyMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, + cache, nil, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}) machine, err := scheduler.Schedule(test.pod, schedulertesting.FakeNodeLister(makeNodeList(test.nodes))) @@ -328,7 +328,7 @@ func TestFindFitAllError(t *testing.T) { "2": schedulercache.NewNodeInfo(), "1": schedulercache.NewNodeInfo(), } - _, predicateMap, err := findNodesThatFit(&v1.Pod{}, nodeNameToInfo, makeNodeList(nodes), predicates, nil, algorithm.EmptyMetadataProducer, nil) + _, predicateMap, err := findNodesThatFit(&v1.Pod{}, nodeNameToInfo, makeNodeList(nodes), predicates, nil, algorithm.EmptyPredicateMetadataProducer, nil) if err != nil { t.Errorf("unexpected error: %v", err) @@ -362,7 +362,7 @@ func TestFindFitSomeError(t *testing.T) { nodeNameToInfo[name].SetNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name}}) } - _, predicateMap, err := findNodesThatFit(pod, nodeNameToInfo, makeNodeList(nodes), predicates, nil, algorithm.EmptyMetadataProducer, nil) + _, predicateMap, err := findNodesThatFit(pod, nodeNameToInfo, makeNodeList(nodes), predicates, nil, algorithm.EmptyPredicateMetadataProducer, nil) if err != nil { t.Errorf("unexpected error: %v", err) } @@ -583,7 +583,7 @@ func (n FakeNodeInfo) GetNodeInfo(nodeName string) (*v1.Node, error) { return &node, nil } -func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) interface{} { +func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) algorithm.PredicateMetadata { return algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister{p})(p, nodeInfo) } diff --git a/plugin/pkg/scheduler/factory/factory.go b/plugin/pkg/scheduler/factory/factory.go index 10675413640..31cb17548ed 100644 --- a/plugin/pkg/scheduler/factory/factory.go +++ b/plugin/pkg/scheduler/factory/factory.go @@ -753,7 +753,7 @@ func (f *ConfigFactory) GetPriorityMetadataProducer() (algorithm.MetadataProduce return getPriorityMetadataProducer(*pluginArgs) } -func (f *ConfigFactory) GetPredicateMetadataProducer() (algorithm.MetadataProducer, error) { +func (f *ConfigFactory) GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) { pluginArgs, err := f.getPluginArgs() if err != nil { return nil, err diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go index 28cfaa56a64..f75f782e257 100644 --- a/plugin/pkg/scheduler/factory/factory_test.go +++ b/plugin/pkg/scheduler/factory/factory_test.go @@ -226,11 +226,11 @@ func TestCreateFromEmptyConfig(t *testing.T) { factory.CreateFromConfig(policy) } 
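Note on the signature migration in these hunks: any out-of-tree predicate has to move from "meta interface{}" to "meta algorithm.PredicateMetadata" as well, and it must tolerate a nil metadata value, since producers such as algorithm.EmptyPredicateMetadataProducer simply return nil. A minimal sketch of a custom predicate under the new signature (the predicate itself, its 10-pod limit, and the reuse of ErrFakePredicate as a failure reason are illustrative assumptions, not part of this series):

package example // sketch only, not part of this patch series

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
	algorithmpredicates "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)

// maxTenPodsPredicate shows the post-refactor FitPredicate shape. It ignores
// meta, which must be safe because the producer may hand predicates nil.
func maxTenPodsPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
	if nodeInfo.Node() == nil {
		return false, nil, fmt.Errorf("node not found")
	}
	if len(nodeInfo.Pods()) >= 10 {
		// Reuse the placeholder failure reason the tests in this series use.
		return false, []algorithm.PredicateFailureReason{algorithmpredicates.ErrFakePredicate}, nil
	}
	return true, nil, nil
}

Nil-checking meta, as the updated selectVictimsOnNode does further down, keeps such predicates usable both with the real metadata producer and with EmptyPredicateMetadataProducer.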
-func PredicateOne(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PredicateOne(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return true, nil, nil } -func PredicateTwo(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PredicateTwo(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return true, nil, nil } diff --git a/plugin/pkg/scheduler/factory/plugins.go b/plugin/pkg/scheduler/factory/plugins.go index 14776a7dbe1..08138bea6ac 100644 --- a/plugin/pkg/scheduler/factory/plugins.go +++ b/plugin/pkg/scheduler/factory/plugins.go @@ -47,8 +47,12 @@ type PluginFactoryArgs struct { } // MetadataProducerFactory produces MetadataProducer from the given args. +// TODO: Rename this to PriorityMetadataProducerFactory. type MetadataProducerFactory func(PluginFactoryArgs) algorithm.MetadataProducer +// PredicateMetadataProducerFactory produces PredicateMetadataProducer from the given args. +type PredicateMetadataProducerFactory func(PluginFactoryArgs) algorithm.PredicateMetadataProducer + // A FitPredicateFactory produces a FitPredicate from the given args. type FitPredicateFactory func(PluginFactoryArgs) algorithm.FitPredicate @@ -80,7 +84,7 @@ var ( // Registered metadata producers priorityMetadataProducer MetadataProducerFactory - predicateMetadataProducer MetadataProducerFactory + predicateMetadataProducer PredicateMetadataProducerFactory // get equivalence pod function getEquivalencePodFunc algorithm.GetEquivalencePodFunc @@ -181,7 +185,7 @@ func RegisterPriorityMetadataProducerFactory(factory MetadataProducerFactory) { priorityMetadataProducer = factory } -func RegisterPredicateMetadataProducerFactory(factory MetadataProducerFactory) { +func RegisterPredicateMetadataProducerFactory(factory PredicateMetadataProducerFactory) { schedulerFactoryMutex.Lock() defer schedulerFactoryMutex.Unlock() predicateMetadataProducer = factory @@ -343,12 +347,12 @@ func getPriorityMetadataProducer(args PluginFactoryArgs) (algorithm.MetadataProd return priorityMetadataProducer(args), nil } -func getPredicateMetadataProducer(args PluginFactoryArgs) (algorithm.MetadataProducer, error) { +func getPredicateMetadataProducer(args PluginFactoryArgs) (algorithm.PredicateMetadataProducer, error) { schedulerFactoryMutex.Lock() defer schedulerFactoryMutex.Unlock() if predicateMetadataProducer == nil { - return algorithm.EmptyMetadataProducer, nil + return algorithm.EmptyPredicateMetadataProducer, nil } return predicateMetadataProducer(args), nil } diff --git a/plugin/pkg/scheduler/scheduler.go b/plugin/pkg/scheduler/scheduler.go index c54ce22a7db..a58a31bc34a 100644 --- a/plugin/pkg/scheduler/scheduler.go +++ b/plugin/pkg/scheduler/scheduler.go @@ -66,7 +66,7 @@ func (sched *Scheduler) StopEverything() { type Configurator interface { GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) GetPriorityMetadataProducer() (algorithm.MetadataProducer, error) - GetPredicateMetadataProducer() (algorithm.MetadataProducer, error) + GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error) GetHardPodAffinitySymmetricWeight() int GetSchedulerName() string diff --git 
a/plugin/pkg/scheduler/scheduler_test.go b/plugin/pkg/scheduler/scheduler_test.go index 8febe8f9b24..3ad54d400b3 100644 --- a/plugin/pkg/scheduler/scheduler_test.go +++ b/plugin/pkg/scheduler/scheduler_test.go @@ -500,7 +500,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache schedulercache. scache, nil, predicateMap, - algorithm.EmptyMetadataProducer, + algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{}, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}) @@ -536,7 +536,7 @@ func setupTestSchedulerLongBindingWithRetry(queuedPodStore *clientcache.FIFO, sc scache, nil, predicateMap, - algorithm.EmptyMetadataProducer, + algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{}, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}) diff --git a/plugin/pkg/scheduler/testutil.go b/plugin/pkg/scheduler/testutil.go index 25f64f98134..d4bd356aff6 100644 --- a/plugin/pkg/scheduler/testutil.go +++ b/plugin/pkg/scheduler/testutil.go @@ -45,7 +45,7 @@ func (fc *FakeConfigurator) GetPriorityMetadataProducer() (algorithm.MetadataPro } // GetPredicateMetadataProducer is not implemented yet. -func (fc *FakeConfigurator) GetPredicateMetadataProducer() (algorithm.MetadataProducer, error) { +func (fc *FakeConfigurator) GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) { return nil, fmt.Errorf("not implemented") } diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index 6322e551631..e0c2b586b46 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -56,11 +56,11 @@ type nodeStateManager struct { makeUnSchedulable nodeMutationFunc } -func PredicateOne(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PredicateOne(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return true, nil, nil } -func PredicateTwo(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { +func PredicateTwo(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { return true, nil, nil } From 2c63705e09efa4abadd2422ae7e1b68e489190d5 Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Sat, 5 Aug 2017 03:31:59 -0700 Subject: [PATCH 3/7] autogenerated files --- plugin/pkg/scheduler/core/BUILD | 1 + plugin/pkg/scheduler/util/BUILD | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/plugin/pkg/scheduler/core/BUILD b/plugin/pkg/scheduler/core/BUILD index 6b985508a5a..f7c3d6b9c62 100644 --- a/plugin/pkg/scheduler/core/BUILD +++ b/plugin/pkg/scheduler/core/BUILD @@ -45,6 +45,7 @@ go_library( "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", "//plugin/pkg/scheduler/api:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", + "//plugin/pkg/scheduler/util:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/golang/groupcache/lru:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", diff --git a/plugin/pkg/scheduler/util/BUILD b/plugin/pkg/scheduler/util/BUILD index e3381b02f70..8bc4cac5e95 100644 --- a/plugin/pkg/scheduler/util/BUILD +++ b/plugin/pkg/scheduler/util/BUILD @@ -8,9 +8,16 @@ load( go_test( name = 
"go_default_test", - srcs = ["backoff_utils_test.go"], + srcs = [ + "backoff_utils_test.go", + "utils_test.go", + ], library = ":go_default_library", - deps = ["//vendor/k8s.io/apimachinery/pkg/types:go_default_library"], + deps = [ + "//pkg/apis/scheduling:go_default_library", + "//vendor/k8s.io/api/core/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", + ], ) go_library( @@ -23,6 +30,7 @@ go_library( deps = [ "//pkg/api:go_default_library", "//pkg/api/install:go_default_library", + "//pkg/apis/scheduling:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library", From 1cec8bac9c1786668ccd2d339e42b5835bd5927d Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Wed, 9 Aug 2017 11:36:35 -0700 Subject: [PATCH 4/7] Add the logic to pick one node for preemption --- .../pkg/scheduler/core/generic_scheduler.go | 91 ++++++- .../scheduler/core/generic_scheduler_test.go | 252 +++++++++++++++--- 2 files changed, 301 insertions(+), 42 deletions(-) diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go index f51326ee0fd..b3e35477c66 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler.go +++ b/plugin/pkg/scheduler/core/generic_scheduler.go @@ -18,6 +18,7 @@ package core import ( "fmt" + "math" "sort" "strings" "sync" @@ -176,8 +177,9 @@ func (g *genericScheduler) preempt(pod *v1.Pod, nodeLister algorithm.NodeLister) if len(nodeToPods) == 0 { return "", nil } - // TODO: Add a node scoring mechanism and perform preemption - return "", nil + node := pickOneNodeForPreemption(nodeToPods) + // TODO: Add actual preemption of pods + return node, nil } // Filters the nodes to find the ones that fit based on the given predicate functions @@ -444,6 +446,89 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf }, nil } +// pickOneNodeForPreemption chooses one node among the given nodes. It assumes +// pods in each map entry are ordered by decreasing priority. +// It picks a node based on the following criteria: +// 1. A node with minimum highest priority victim is picked. +// 2. Ties are broken by sum of priorities of all victims. +// 3. If there are still ties, node with the minimum number of victims is picked. +// 4. If there are still ties, the first such node in the map is picked (sort of randomly). +func pickOneNodeForPreemption(nodesToPods map[string][]*v1.Pod) string { + type nodeScore struct { + nodeName string + highestPriority int32 + sumPriorities int64 + numPods int + } + if len(nodesToPods) == 0 { + return "" + } + minHighestPriority := int32(math.MaxInt32) + nodeScores := []*nodeScore{} + for nodeName, pods := range nodesToPods { + if len(pods) == 0 { + // We found a node that doesn't need any preemption. Return it! + // This should happen rarely when one or more pods are terminated between + // the time that scheduler tries to schedule the pod and the time that + // preemption logic tries to find nodes for preemption. + return nodeName + } + // highestPodPriority is the highest priority among the victims on this node. 
+ highestPodPriority := util.GetPodPriority(pods[0]) + if highestPodPriority < minHighestPriority { + minHighestPriority = highestPodPriority + } + nodeScores = append(nodeScores, &nodeScore{nodeName: nodeName, highestPriority: highestPodPriority, numPods: len(pods)}) + } + // Find the nodes with minimum highest priority victim. + minSumPriorities := int64(math.MaxInt64) + lowestHighPriorityNodes := []*nodeScore{} + for _, nodeScore := range nodeScores { + if nodeScore.highestPriority == minHighestPriority { + lowestHighPriorityNodes = append(lowestHighPriorityNodes, nodeScore) + var sumPriorities int64 + for _, pod := range nodesToPods[nodeScore.nodeName] { + // We add MaxInt32+1 to all priorities to make all of them >= 0. This is + // needed so that a node with a few pods with negative priority is not + // picked over a node with a smaller number of pods with the same negative + // priority (and similar scenarios). + sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1) + } + if sumPriorities < minSumPriorities { + minSumPriorities = sumPriorities + } + nodeScore.sumPriorities = sumPriorities + } + } + if len(lowestHighPriorityNodes) == 1 { + return lowestHighPriorityNodes[0].nodeName + } + // There are multiple nodes with the same minimum highest priority victim. + // Choose the one(s) with lowest sum of priorities. + minNumPods := math.MaxInt32 + lowestSumPriorityNodes := []*nodeScore{} + for _, nodeScore := range lowestHighPriorityNodes { + if nodeScore.sumPriorities == minSumPriorities { + lowestSumPriorityNodes = append(lowestSumPriorityNodes, nodeScore) + if nodeScore.numPods < minNumPods { + minNumPods = nodeScore.numPods + } + } + } + if len(lowestSumPriorityNodes) == 1 { + return lowestSumPriorityNodes[0].nodeName + } + // There are still more than one node with minimum highest priority victim and + // lowest sum of victim priorities. Find the anyone with minimum number of victims. + for _, nodeScore := range lowestSumPriorityNodes { + if nodeScore.numPods == minNumPods { + return nodeScore.nodeName + } + } + glog.Errorf("We should never reach here!") + return "" +} + // selectNodesForPreemption finds all the nodes with possible victims for // preemption in parallel. 
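A concrete reading of the MaxInt32+1 shift in the sum above: suppose two candidate nodes whose victims all have priority -100 (the negPriority value used by the tests below), one node with a single victim and one with two. Summing raw priorities would prefer the node that evicts two pods, because its sum is smaller; shifting each victim by MaxInt32+1 = 2^31 restores the intended order:

raw sums:     -100 (one victim)  vs  -200 (two victims)                       -> two-victim node picked
shifted sums: 2147483548 (= 2^31 - 100)  vs  4294967096 (= 2*(2^31 - 100))    -> one-victim node picked

so the node that loses only one pod correctly wins the tie-break.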
func selectNodesForPreemption(pod *v1.Pod, @@ -461,7 +546,7 @@ func selectNodesForPreemption(pod *v1.Pod, checkNode := func(i int) { nodeName := nodes[i].Name pods, fits := selectVictimsOnNode(pod, meta.ShallowCopy(), nodeNameToInfo[nodeName], predicates) - if fits && len(pods) != 0 { + if fits { resultLock.Lock() nodeNameToPods[nodeName] = pods resultLock.Unlock() diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go index cfe1f6708ea..d2ed01dff7f 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go @@ -587,46 +587,59 @@ func PredicateMetadata(p *v1.Pod, nodeInfo map[string]*schedulercache.NodeInfo) return algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister{p})(p, nodeInfo) } +var smallContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest, 10)), + }, + }, + }, +} +var mediumContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*2, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest*2, 10)), + }, + }, + }, +} +var largeContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*3, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest*3, 10)), + }, + }, + }, +} +var veryLargeContainers = []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*5, 10) + "m"), + "memory": resource.MustParse( + strconv.FormatInt(priorityutil.DefaultMemoryRequest*5, 10)), + }, + }, + }, +} +var negPriority, lowPriority, midPriority, highPriority, veryHighPriority = int32(-100), int32(0), int32(100), int32(1000), int32(10000) + // TestSelectNodesForPreemption tests selectNodesForPreemption. This test assumes // that podsFitsOnNode works correctly and is tested separately. 
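The concurrency pattern in selectNodesForPreemption, here and in the patch 5 rewrite below, is the scheduler's usual fan-out: one closure per candidate node handed to workqueue.Parallelize with 16 workers, plus a sync.Mutex around the shared result map. A stripped-down sketch of that pattern, with hypothetical names and an import path that is assumed to match what generic_scheduler.go already pulls in:

package example // sketch of the fan-out pattern only

import (
	"sync"

	"k8s.io/client-go/util/workqueue" // assumed path for the Parallelize helper
)

// collectPassing fans check() out over 16 workers and gathers the names that
// pass. The mutex is required because all workers write to the same map.
func collectPassing(names []string, check func(string) bool) map[string]bool {
	passed := map[string]bool{}
	var lock sync.Mutex
	workqueue.Parallelize(16, len(names), func(i int) {
		if check(names[i]) {
			lock.Lock()
			passed[names[i]] = true
			lock.Unlock()
		}
	})
	return passed
}

selectNodesForPreemption follows the same shape, with selectVictimsOnNode as the per-node check and a per-node ShallowCopy of the predicate metadata so the workers do not share mutable state.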
func TestSelectNodesForPreemption(t *testing.T) { - smallContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest, 10)), - }, - }, - }, - } - mediumContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*2, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest*2, 10)), - }, - }, - }, - } - largeContainers := []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "cpu": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMilliCpuRequest*3, 10) + "m"), - "memory": resource.MustParse( - strconv.FormatInt(priorityutil.DefaultMemoryRequest*3, 10)), - }, - }, - }, - } - lowPriority, midPriority, highPriority := int32(0), int32(100), int32(1000) tests := []struct { name string predicates map[string]algorithm.FitPredicate @@ -654,7 +667,7 @@ func TestSelectNodesForPreemption(t *testing.T) { pods: []*v1.Pod{ {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{}, + expected: map[string]map[string]bool{"machine1": {}, "machine2": {}}, }, { name: "a pod that fits on one machine with no preemption", @@ -664,7 +677,7 @@ func TestSelectNodesForPreemption(t *testing.T) { pods: []*v1.Pod{ {ObjectMeta: metav1.ObjectMeta{Name: "a"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Priority: &midPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{}, + expected: map[string]map[string]bool{"machine1": {}}, }, { name: "a pod that fits on both machines when lower priority pods are preempted", @@ -789,3 +802,164 @@ func TestSelectNodesForPreemption(t *testing.T) { } } } + +// TestPickOneNodeForPreemption tests pickOneNodeForPreemption. 
+func TestPickOneNodeForPreemption(t *testing.T) { + tests := []struct { + name string + predicates map[string]algorithm.FitPredicate + nodes []string + pod *v1.Pod + pods []*v1.Pod + expected []string // any of the items is valid + }{ + { + name: "No node needs preemption", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}}, + expected: []string{"machine1"}, + }, + { + name: "a pod that fits on both machines when lower priority pods are preempted", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: []string{"machine1", "machine2"}, + }, + { + name: "a pod that fits on a machine with no preemption", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}}, + expected: []string{"machine3"}, + }, + { + name: "machine with min highest priority pod is picked", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine3"}, + }, + { + name: "when highest priorities are the same, minimum sum of priorities is picked", + predicates: 
map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + name: "when highest priority and sum are the same, minimum number of pods is picked", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.4"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + // pickOneNodeForPreemption adjusts pod priorities when finding the sum of the victims. This + // test ensures that the logic works correctly. 
+ name: "sum of adjusted priorities is considered", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m2.2"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &negPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + }, + expected: []string{"machine2"}, + }, + { + name: "non-overlapping lowest high priority, sum priorities, and number of pods", + predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, + nodes: []string{"machine1", "machine2", "machine3", "machine4"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{Containers: veryLargeContainers, Priority: &veryHighPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine3"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m3.4"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &lowPriority, NodeName: "machine3"}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m4.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m4.3"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine4"}}, + {ObjectMeta: 
metav1.ObjectMeta{Name: "m4.4"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &negPriority, NodeName: "machine4"}}, + }, + expected: []string{"machine1"}, + }, + } + for _, test := range tests { + nodes := []*v1.Node{} + for _, n := range test.nodes { + nodes = append(nodes, makeNode(n, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5)) + } + nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes) + node := pickOneNodeForPreemption(selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata)) + found := false + for _, nodeName := range test.expected { + if node == nodeName { + found = true + break + } + } + if !found { + t.Errorf("test [%v]: unexpected node: %v", test.name, node) + } + } +} From 4a08dff1681c7c9fb1a989ec4659bbe1f4a900f6 Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Wed, 9 Aug 2017 18:15:40 -0700 Subject: [PATCH 5/7] Add pod eviction logic for scheduler preemption Add Preempt to scheduler interface Add preemption to the scheduling workflow Minor changes to the scheduler integration test library --- .../scheduler/algorithm/predicates/error.go | 5 + .../algorithm/predicates/metadata.go | 12 +- .../algorithm/predicates/metadata_test.go | 43 ++ .../algorithm/predicates/predicates.go | 4 + .../algorithm/scheduler_interface.go | 4 + .../algorithmprovider/defaults/defaults.go | 2 +- plugin/pkg/scheduler/core/extender_test.go | 2 + .../pkg/scheduler/core/generic_scheduler.go | 342 ++++++++++------ .../scheduler/core/generic_scheduler_test.go | 375 +++++++++++++++--- plugin/pkg/scheduler/factory/factory.go | 26 ++ plugin/pkg/scheduler/scheduler.go | 54 +++ plugin/pkg/scheduler/scheduler_test.go | 4 + .../pkg/scheduler/schedulercache/node_info.go | 2 +- plugin/pkg/scheduler/util/utils.go | 11 +- test/e2e/scheduling/predicates.go | 6 +- test/e2e/scheduling/preemption.go | 128 ++++++ test/integration/scheduler/priorities_test.go | 12 +- test/integration/scheduler/scheduler_test.go | 257 +++++++++++- test/integration/scheduler/util.go | 38 +- 19 files changed, 1128 insertions(+), 199 deletions(-) create mode 100644 test/e2e/scheduling/preemption.go diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go index 8cc85bce2c0..155fe18f431 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/error.go +++ b/plugin/pkg/scheduler/algorithm/predicates/error.go @@ -25,6 +25,11 @@ import ( var ( // The predicateName tries to be consistent as the predicate name used in DefaultAlgorithmProvider defined in // defaults.go (which tend to be stable for backward compatibility) + + // NOTE: If you add a new predicate failure error for a predicate that can never + // be made to pass by removing pods, or you change an existing predicate so that + // it can never be made to pass by removing pods, you need to add the predicate + // failure error in nodesWherePreemptionMightHelp() in scheduler/core/generic_scheduler.go ErrDiskConflict = newPredicateFailureError("NoDiskConflict") ErrVolumeZoneConflict = newPredicateFailureError("NoVolumeZoneConflict") ErrNodeSelectorNotMatch = newPredicateFailureError("MatchNodeSelector") diff --git a/plugin/pkg/scheduler/algorithm/predicates/metadata.go b/plugin/pkg/scheduler/algorithm/predicates/metadata.go index 0adaf84dc81..bb15272db3d 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/metadata.go +++ b/plugin/pkg/scheduler/algorithm/predicates/metadata.go @@ -40,8 +40,8 @@ type matchingPodAntiAffinityTerm 
struct { node *v1.Node } -// NOTE: When new fields are added/removed or logic is changed, please make sure -// that RemovePod and AddPod functions are updated to work with the new changes. +// NOTE: When new fields are added/removed or logic is changed, please make sure that +// RemovePod, AddPod, and ShallowCopy functions are updated to work with the new changes. type predicateMetadata struct { pod *v1.Pod podBestEffort bool @@ -172,13 +172,17 @@ func (meta *predicateMetadata) ShallowCopy() algorithm.PredicateMetadata { podRequest: meta.podRequest, serviceAffinityInUse: meta.serviceAffinityInUse, } + newPredMeta.podPorts = map[int]bool{} for k, v := range meta.podPorts { newPredMeta.podPorts[k] = v } + newPredMeta.matchingAntiAffinityTerms = map[string][]matchingPodAntiAffinityTerm{} for k, v := range meta.matchingAntiAffinityTerms { newPredMeta.matchingAntiAffinityTerms[k] = append([]matchingPodAntiAffinityTerm(nil), v...) } - newPredMeta.serviceAffinityMatchingPodServices = append([]*v1.Service(nil), meta.serviceAffinityMatchingPodServices...) - newPredMeta.serviceAffinityMatchingPodList = append([]*v1.Pod(nil), meta.serviceAffinityMatchingPodList...) + newPredMeta.serviceAffinityMatchingPodServices = append([]*v1.Service(nil), + meta.serviceAffinityMatchingPodServices...) + newPredMeta.serviceAffinityMatchingPodList = append([]*v1.Pod(nil), + meta.serviceAffinityMatchingPodList...) return (algorithm.PredicateMetadata)(newPredMeta) } diff --git a/plugin/pkg/scheduler/algorithm/predicates/metadata_test.go b/plugin/pkg/scheduler/algorithm/predicates/metadata_test.go index 40476283e8c..640da1fcace 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/metadata_test.go +++ b/plugin/pkg/scheduler/algorithm/predicates/metadata_test.go @@ -355,3 +355,46 @@ func TestPredicateMetadata_AddRemovePod(t *testing.T) { } } } + +// TestPredicateMetadata_ShallowCopy tests the ShallowCopy function. It is based +// on the idea that shallow-copy should produce an object that is deep-equal to the original +// object. +func TestPredicateMetadata_ShallowCopy(t *testing.T) { + source := predicateMetadata{ + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "testns", + }, + }, + podBestEffort: true, + podRequest: &schedulercache.Resource{ + MilliCPU: 1000, + Memory: 300, + AllowedPodNumber: 4, + }, + podPorts: map[int]bool{1234: true, 456: false}, + matchingAntiAffinityTerms: map[string][]matchingPodAntiAffinityTerm{ + "term1": { + { + term: &v1.PodAffinityTerm{TopologyKey: "node"}, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, + }, + }, + }, + }, + serviceAffinityInUse: true, + serviceAffinityMatchingPodList: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "pod2"}}, + }, + serviceAffinityMatchingPodServices: []*v1.Service{ + {ObjectMeta: metav1.ObjectMeta{Name: "service1"}}, + }, + } + + if !reflect.DeepEqual(source.ShallowCopy().(*predicateMetadata), &source) { + t.Errorf("Copy is not equal to source!") + } +} diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index f28c7a901ce..94e6aa70219 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -45,6 +45,10 @@ import ( "github.com/golang/glog" ) +const ( + MatchInterPodAffinity = "MatchInterPodAffinity" +) + // NodeInfo: Other types for predicate functions... 
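The two map allocations added to ShallowCopy above are a correctness fix, not a style change: the copy's podPorts and matchingAntiAffinityTerms fields start out nil, and while ranging over the source maps is harmless, assigning into a nil destination map panics. The new TestPredicateMetadata_ShallowCopy exercises exactly that path. A standalone illustration of the failure mode and the fix, plain Go with no scheduler types:

package main

import "fmt"

func main() {
	src := map[int]bool{1234: true, 456: false}

	// Without this allocation the loop below would panic with
	// "assignment to entry in nil map", which is what the old ShallowCopy
	// risked for any metadata that actually carried ports or affinity terms.
	dst := map[int]bool{}
	for k, v := range src {
		dst[k] = v
	}
	fmt.Println(dst)
}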
type NodeInfo interface { GetNodeInfo(nodeID string) (*v1.Node, error) diff --git a/plugin/pkg/scheduler/algorithm/scheduler_interface.go b/plugin/pkg/scheduler/algorithm/scheduler_interface.go index 7214f8ac4d1..41a6dbb48bb 100644 --- a/plugin/pkg/scheduler/algorithm/scheduler_interface.go +++ b/plugin/pkg/scheduler/algorithm/scheduler_interface.go @@ -47,6 +47,10 @@ type SchedulerExtender interface { // onto machines. type ScheduleAlgorithm interface { Schedule(*v1.Pod, NodeLister) (selectedMachine string, err error) + // Preempt receives scheduling errors for a pod and tries to create room for + // the pod by preempting lower priority pods if possible. + // It returns the node where preemption happened, a list of preempted pods, and error if any. + Preempt(*v1.Pod, NodeLister, error) (selectedNode *v1.Node, preemptedPods []*v1.Pod, err error) // Predicates() returns a pointer to a map of predicate functions. This is // exposed for testing. Predicates() map[string]FitPredicate diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index d38fbb78f62..668db25d64d 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -155,7 +155,7 @@ func defaultPredicates() sets.String { ), // Fit is determined by inter-pod affinity. factory.RegisterFitPredicateFactory( - "MatchInterPodAffinity", + predicates.MatchInterPodAffinity, func(args factory.PluginFactoryArgs) algorithm.FitPredicate { return predicates.NewPodAffinityPredicate(args.NodeInfo, args.PodLister) }, diff --git a/plugin/pkg/scheduler/core/extender_test.go b/plugin/pkg/scheduler/core/extender_test.go index cb211e27f46..8b26ccdfeba 100644 --- a/plugin/pkg/scheduler/core/extender_test.go +++ b/plugin/pkg/scheduler/core/extender_test.go @@ -183,6 +183,8 @@ func (f *FakeExtender) IsBinder() bool { return true } +var _ algorithm.SchedulerExtender = &FakeExtender{} + func TestGenericSchedulerWithExtenders(t *testing.T) { tests := []struct { name string diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go index b3e35477c66..686b52e170c 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler.go +++ b/plugin/pkg/scheduler/core/generic_scheduler.go @@ -47,7 +47,14 @@ type FitError struct { var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods") -const NoNodeAvailableMsg = "No nodes are available that match all of the following predicates" +const ( + NoNodeAvailableMsg = "No nodes are available that match all of the predicates" + // NominatedNodeAnnotationKey is used to annotate a pod that has preempted other pods. + // The scheduler uses the annotation to find that the pod shouldn't preempt more pods + // when it gets to the head of scheduling queue again. + // See podEligibleToPreemptOthers() for more information. + NominatedNodeAnnotationKey = "NominatedNodeName" +) // Error returns detailed information of why the pod failed to fit on each node func (f *FitError) Error() string { @@ -163,23 +170,61 @@ func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList // preempt finds nodes with pods that can be preempted to make room for "pod" to // schedule. It chooses one of the nodes and preempts the pods on the node and -// returns the node name if such a node is found. -// TODO(bsalamat): This function is under construction! DO NOT USE! 
-func (g *genericScheduler) preempt(pod *v1.Pod, nodeLister algorithm.NodeLister) (string, error) { - nodes, err := nodeLister.List() +// returns the node and the list of preempted pods if such a node is found. +// TODO(bsalamat): Add priority-based scheduling. More info: today one or more +// pending pods (different from the pod that triggered the preemption(s)) may +// schedule into some portion of the resources freed up by the preemption(s) +// before the pod that triggered the preemption(s) has a chance to schedule +// there, thereby preventing the pod that triggered the preemption(s) from +// scheduling. Solution is given at: +// https://github.com/kubernetes/community/blob/master/contributors/design-proposals/pod-preemption.md#preemption-mechanics +func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, error) { + // Scheduler may return various types of errors. Consider preemption only if + // the error is of type FitError. + fitError, ok := scheduleErr.(*FitError) + if !ok || fitError == nil { + return nil, nil, nil + } + err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap) if err != nil { - return "", err + return nil, nil, err } - if len(nodes) == 0 { - return "", ErrNoNodesAvailable + if !podEligibleToPreemptOthers(pod, g.cachedNodeInfoMap) { + glog.V(5).Infof("Pod %v is not eligible for more preemption.", pod.Name) + return nil, nil, nil } - nodeToPods := selectNodesForPreemption(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.predicateMetaProducer) - if len(nodeToPods) == 0 { - return "", nil + allNodes, err := nodeLister.List() + if err != nil { + return nil, nil, err } - node := pickOneNodeForPreemption(nodeToPods) - // TODO: Add actual preemption of pods - return node, nil + if len(allNodes) == 0 { + return nil, nil, ErrNoNodesAvailable + } + potentialNodes := nodesWherePreemptionMightHelp(pod, allNodes, fitError.FailedPredicates) + if len(potentialNodes) == 0 { + glog.V(3).Infof("Preemption will not help schedule pod %v on any node.", pod.Name) + return nil, nil, nil + } + nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer) + if err != nil { + return nil, nil, err + } + for len(nodeToPods) > 0 { + node := pickOneNodeForPreemption(nodeToPods) + if node == nil { + return nil, nil, err + } + passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToPods[node], g.cachedNodeInfoMap, g.extenders) + if passes && pErr == nil { + return node, nodeToPods[node], err + } + if pErr != nil { + glog.Errorf("Error occurred while checking extenders for preemption on node %v: %v", node, pErr) + } + // Remove the node from the map and try to pick a different node. + delete(nodeToPods, node) + } + return nil, nil, err } // Filters the nodes to find the ones that fit based on the given predicate functions @@ -452,108 +497,151 @@ func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInf // 1. A node with minimum highest priority victim is picked. // 2. Ties are broken by sum of priorities of all victims. // 3. If there are still ties, node with the minimum number of victims is picked. -// 4. If there are still ties, the first such node in the map is picked (sort of randomly). -func pickOneNodeForPreemption(nodesToPods map[string][]*v1.Pod) string { +// 4. If there are still ties, the first such node is picked (sort of randomly). +//TODO(bsalamat): Try to reuse the "nodeScore" slices in order to save GC time. 
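For orientation before the scoring helpers below: the new Preempt method is meant to be driven off a failed Schedule call. Only the interface and the generic_scheduler implementation are visible in these hunks; the scheduler.go wiring is elsewhere in this patch, so the call site below is a hypothetical sketch of the implied contract, not the actual change.

package example // hypothetical call site, for illustration only

import (
	"github.com/golang/glog"

	"k8s.io/api/core/v1"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)

// scheduleOrPreempt tries to place the pod and, on failure, asks the algorithm
// whether evicting lower-priority pods would make room somewhere.
func scheduleOrPreempt(alg algorithm.ScheduleAlgorithm, pod *v1.Pod, nodes algorithm.NodeLister) (string, error) {
	host, err := alg.Schedule(pod, nodes)
	if err == nil {
		return host, nil
	}
	// Preempt inspects the error itself and only acts on a *FitError.
	node, victims, pErr := alg.Preempt(pod, nodes, err)
	if pErr != nil || node == nil {
		return "", err // preemption cannot help; surface the original error
	}
	glog.V(3).Infof("would preempt %d pod(s) on %v for %v/%v", len(victims), node.Name, pod.Namespace, pod.Name)
	// A real caller would delete the victims, presumably record the nominated
	// node (see NominatedNodeAnnotationKey above), and let the pod retry.
	return "", err
}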
+func pickOneNodeForPreemption(nodesToPods map[*v1.Node][]*v1.Pod) *v1.Node { type nodeScore struct { - nodeName string + node *v1.Node highestPriority int32 sumPriorities int64 numPods int } if len(nodesToPods) == 0 { - return "" + return nil } minHighestPriority := int32(math.MaxInt32) - nodeScores := []*nodeScore{} - for nodeName, pods := range nodesToPods { + minPriorityScores := []*nodeScore{} + for node, pods := range nodesToPods { if len(pods) == 0 { // We found a node that doesn't need any preemption. Return it! // This should happen rarely when one or more pods are terminated between // the time that scheduler tries to schedule the pod and the time that // preemption logic tries to find nodes for preemption. - return nodeName + return node } // highestPodPriority is the highest priority among the victims on this node. highestPodPriority := util.GetPodPriority(pods[0]) if highestPodPriority < minHighestPriority { minHighestPriority = highestPodPriority + minPriorityScores = nil + } + if highestPodPriority == minHighestPriority { + minPriorityScores = append(minPriorityScores, &nodeScore{node: node, highestPriority: highestPodPriority, numPods: len(pods)}) } - nodeScores = append(nodeScores, &nodeScore{nodeName: nodeName, highestPriority: highestPodPriority, numPods: len(pods)}) } - // Find the nodes with minimum highest priority victim. + if len(minPriorityScores) == 1 { + return minPriorityScores[0].node + } + // There are a few nodes with minimum highest priority victim. Find the + // smallest sum of priorities. minSumPriorities := int64(math.MaxInt64) - lowestHighPriorityNodes := []*nodeScore{} - for _, nodeScore := range nodeScores { - if nodeScore.highestPriority == minHighestPriority { - lowestHighPriorityNodes = append(lowestHighPriorityNodes, nodeScore) - var sumPriorities int64 - for _, pod := range nodesToPods[nodeScore.nodeName] { - // We add MaxInt32+1 to all priorities to make all of them >= 0. This is - // needed so that a node with a few pods with negative priority is not - // picked over a node with a smaller number of pods with the same negative - // priority (and similar scenarios). - sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1) - } - if sumPriorities < minSumPriorities { - minSumPriorities = sumPriorities - } - nodeScore.sumPriorities = sumPriorities + minSumPriorityScores := []*nodeScore{} + for _, nodeScore := range minPriorityScores { + var sumPriorities int64 + for _, pod := range nodesToPods[nodeScore.node] { + // We add MaxInt32+1 to all priorities to make all of them >= 0. This is + // needed so that a node with a few pods with negative priority is not + // picked over a node with a smaller number of pods with the same negative + // priority (and similar scenarios). + sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1) + } + if sumPriorities < minSumPriorities { + minSumPriorities = sumPriorities + minSumPriorityScores = nil + } + nodeScore.sumPriorities = sumPriorities + if sumPriorities == minSumPriorities { + minSumPriorityScores = append(minSumPriorityScores, nodeScore) } } - if len(lowestHighPriorityNodes) == 1 { - return lowestHighPriorityNodes[0].nodeName + if len(minSumPriorityScores) == 1 { + return minSumPriorityScores[0].node } - // There are multiple nodes with the same minimum highest priority victim. - // Choose the one(s) with lowest sum of priorities. + // There are a few nodes with minimum highest priority victim and sum of priorities. + // Find one with the minimum number of pods. 
minNumPods := math.MaxInt32 - lowestSumPriorityNodes := []*nodeScore{} - for _, nodeScore := range lowestHighPriorityNodes { - if nodeScore.sumPriorities == minSumPriorities { - lowestSumPriorityNodes = append(lowestSumPriorityNodes, nodeScore) - if nodeScore.numPods < minNumPods { - minNumPods = nodeScore.numPods - } + minNumPodScores := []*nodeScore{} + for _, nodeScore := range minSumPriorityScores { + if nodeScore.numPods < minNumPods { + minNumPods = nodeScore.numPods + minNumPodScores = nil } - } - if len(lowestSumPriorityNodes) == 1 { - return lowestSumPriorityNodes[0].nodeName - } - // There are still more than one node with minimum highest priority victim and - // lowest sum of victim priorities. Find the anyone with minimum number of victims. - for _, nodeScore := range lowestSumPriorityNodes { if nodeScore.numPods == minNumPods { - return nodeScore.nodeName + minNumPodScores = append(minNumPodScores, nodeScore) } } - glog.Errorf("We should never reach here!") - return "" + // At this point, even if there are more than one node with the same score, + // return the first one. + if len(minNumPodScores) > 0 { + return minNumPodScores[0].node + } + glog.Errorf("Error in logic of node scoring for preemption. We should never reach here!") + return nil } // selectNodesForPreemption finds all the nodes with possible victims for // preemption in parallel. func selectNodesForPreemption(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, - nodes []*v1.Node, + potentialNodes []*v1.Node, predicates map[string]algorithm.FitPredicate, metadataProducer algorithm.PredicateMetadataProducer, -) map[string][]*v1.Pod { +) (map[*v1.Node][]*v1.Pod, error) { - nodeNameToPods := map[string][]*v1.Pod{} + nodeNameToPods := map[*v1.Node][]*v1.Pod{} var resultLock sync.Mutex // We can use the same metadata producer for all nodes. meta := metadataProducer(pod, nodeNameToInfo) checkNode := func(i int) { - nodeName := nodes[i].Name - pods, fits := selectVictimsOnNode(pod, meta.ShallowCopy(), nodeNameToInfo[nodeName], predicates) + nodeName := potentialNodes[i].Name + var metaCopy algorithm.PredicateMetadata + if meta != nil { + metaCopy = meta.ShallowCopy() + } + pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates) if fits { resultLock.Lock() - nodeNameToPods[nodeName] = pods + nodeNameToPods[potentialNodes[i]] = pods resultLock.Unlock() } } - workqueue.Parallelize(16, len(nodes), checkNode) - return nodeNameToPods + workqueue.Parallelize(16, len(potentialNodes), checkNode) + return nodeNameToPods, nil +} + +func nodePassesExtendersForPreemption( + pod *v1.Pod, + nodeName string, + victims []*v1.Pod, + nodeNameToInfo map[string]*schedulercache.NodeInfo, + extenders []algorithm.SchedulerExtender) (bool, error) { + // If there are any extenders, run them and filter the list of candidate nodes. + if len(extenders) == 0 { + return true, nil + } + // Remove the victims from the corresponding nodeInfo and send nodes to the + // extenders for filtering. 
+ originalNodeInfo := nodeNameToInfo[nodeName] + nodeInfoCopy := nodeNameToInfo[nodeName].Clone() + for _, victim := range victims { + nodeInfoCopy.RemovePod(victim) + } + nodeNameToInfo[nodeName] = nodeInfoCopy + defer func() { nodeNameToInfo[nodeName] = originalNodeInfo }() + filteredNodes := []*v1.Node{nodeInfoCopy.Node()} + for _, extender := range extenders { + var err error + var failedNodesMap map[string]string + filteredNodes, failedNodesMap, err = extender.Filter(pod, filteredNodes, nodeNameToInfo) + if err != nil { + return false, err + } + if _, found := failedNodesMap[nodeName]; found || len(filteredNodes) == 0 { + return false, nil + } + } + return true, nil } // selectVictimsOnNode finds minimum set of pods on the given node that should @@ -563,25 +651,31 @@ func selectNodesForPreemption(pod *v1.Pod, // to one another, not relative to the preemptor "pod"). // The algorithm first checks if the pod can be scheduled on the node when all the // lower priority pods are gone. If so, it sorts all the lower priority pods by -// their priority and starting from the highest priority one, tries to keep as +// their priority and starts from the highest priority one, tries to keep as // many of them as possible while checking that the "pod" can still fit on the node. // NOTE: This function assumes that it is never called if "pod" cannot be scheduled // due to pod affinity, node affinity, or node anti-affinity reasons. None of // these predicates can be satisfied by removing more pods from the node. -func selectVictimsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo, fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) { - higherPriority := func(pod1, pod2 interface{}) bool { - return util.GetPodPriority(pod1.(*v1.Pod)) > util.GetPodPriority(pod2.(*v1.Pod)) - } - potentialVictims := util.SortableList{CompFunc: higherPriority} +// TODO(bsalamat): Add support for PodDisruptionBudget. +func selectVictimsOnNode( + pod *v1.Pod, + meta algorithm.PredicateMetadata, + nodeInfo *schedulercache.NodeInfo, + fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) { + potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod} nodeInfoCopy := nodeInfo.Clone() removePod := func(rp *v1.Pod) { nodeInfoCopy.RemovePod(rp) - meta.RemovePod(rp) + if meta != nil { + meta.RemovePod(rp) + } } addPod := func(ap *v1.Pod) { nodeInfoCopy.AddPod(ap) - meta.AddPod(ap, nodeInfoCopy) + if meta != nil { + meta.AddPod(ap, nodeInfoCopy) + } } // As the first step, remove all the lower priority pods from the node and // check if the given pod can be scheduled. @@ -597,57 +691,83 @@ func selectVictimsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo // we are almost done and this node is not suitable for preemption. The only condition // that we should check is if the "pod" is failing to schedule due to pod affinity // failure. - if fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { + // TODO(bsalamat): Consider checking affinity to lower priority pods if feasible with reasonable performance. + if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { if err != nil { glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err) - return nil, false - } - // If the new pod still cannot be scheduled for any reason other than pod - // affinity, the new pod will not fit on this node and we are done here. 
- affinity := pod.Spec.Affinity - if affinity == nil || affinity.PodAffinity == nil { - return nil, false - } - for _, failedPred := range failedPredicates { - if failedPred != predicates.ErrPodAffinityNotMatch { - return nil, false - } - } - // If we reach here, it means that the pod cannot be scheduled due to pod - // affinity or anti-affinity. Since failure reason for both affinity and - // anti-affinity is the same, we cannot say which one caused it. So, we try - // adding pods one at a time and see if any of them satisfies the affinity rules. - for i, p := range potentialVictims.Items { - existingPod := p.(*v1.Pod) - addPod(existingPod) - if fits, _, _ = podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { - removePod(existingPod) - } else { - // We found the pod needed to satisfy pod affinity. Let's remove it from - // potential victims list. - // NOTE: We assume that pod affinity can be satisfied by only one pod, - // not multiple pods. This is how scheduler works today. - potentialVictims.Items = append(potentialVictims.Items[:i], potentialVictims.Items[i+1:]...) - break - } - } - if !fits { - return nil, false } + return nil, false } victims := []*v1.Pod{} - // Try to reprieve as may pods as possible starting from the highest priority one. + // Try to reprieve as many pods as possible starting from the highest priority one. for _, p := range potentialVictims.Items { lpp := p.(*v1.Pod) addPod(lpp) if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits { removePod(lpp) victims = append(victims, lpp) + glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", lpp.Name, nodeInfo.Node().Name) } } return victims, true } +// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates +// that may be satisfied by removing pods from the node. +func nodesWherePreemptionMightHelp(pod *v1.Pod, nodes []*v1.Node, failedPredicatesMap FailedPredicateMap) []*v1.Node { + potentialNodes := []*v1.Node{} + for _, node := range nodes { + unresolvableReasonExist := false + failedPredicates, found := failedPredicatesMap[node.Name] + // If we assume that scheduler looks at all nodes and populates the failedPredicateMap + // (which is the case today), the !found case should never happen, but we'd prefer + // to rely less on such assumptions in the code when checking does not impose + // significant overhead. + for _, failedPredicate := range failedPredicates { + switch failedPredicate { + case + predicates.ErrNodeSelectorNotMatch, + predicates.ErrPodNotMatchHostName, + predicates.ErrTaintsTolerationsNotMatch, + predicates.ErrNodeLabelPresenceViolated, + predicates.ErrNodeNotReady, + predicates.ErrNodeNetworkUnavailable, + predicates.ErrNodeUnschedulable, + predicates.ErrNodeUnknownCondition: + unresolvableReasonExist = true + break + // TODO(bsalamat): Please add affinity failure cases once we have specific affinity failure errors. + } + } + if !found || !unresolvableReasonExist { + glog.V(3).Infof("Node %v is a potential node for preemption.", node.Name) + potentialNodes = append(potentialNodes, node) + } + } + return potentialNodes +} + +// podEligibleToPreemptOthers determines whether this pod should be considered +// for preempting other pods or not. If this pod has already preempted other +// pods and those are in their graceful termination period, it shouldn't be +// considered for preemption. 
+// We look at the node that is nominated for this pod and as long as there are +// terminating pods on the node, we don't consider this for preempting more pods. +// TODO(bsalamat): Revisit this algorithm once scheduling by priority is added. +func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) bool { + if nodeName, found := pod.Annotations[NominatedNodeAnnotationKey]; found { + if nodeInfo, found := nodeNameToInfo[nodeName]; found { + for _, p := range nodeInfo.Pods() { + if p.DeletionTimestamp != nil && util.GetPodPriority(p) < util.GetPodPriority(pod) { + // There is a terminating pod on the nominated node. + return false + } + } + } + } + return true +} + func NewGenericScheduler( cache schedulercache.Cache, eCache *EquivalenceCache, diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go index d2ed01dff7f..cde1b5e1b4e 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go @@ -33,6 +33,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" algorithmpredicates "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" algorithmpriorities "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities" priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util" @@ -307,8 +308,7 @@ func TestGenericScheduler(t *testing.T) { } scheduler := NewGenericScheduler( - cache, nil, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, - []algorithm.SchedulerExtender{}) + cache, nil, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}) machine, err := scheduler.Schedule(test.pod, schedulertesting.FakeNodeLister(makeNodeList(test.nodes))) if !reflect.DeepEqual(err, test.wErr) { @@ -548,12 +548,24 @@ func TestZeroRequest(t *testing.T) { } } -func checkPreemptionVictims(testName string, expected map[string]map[string]bool, nodeToPods map[string][]*v1.Pod) error { +func printNodeToPods(nodeToPods map[*v1.Node][]*v1.Pod) string { + var output string + for node, pods := range nodeToPods { + output += node.Name + ": [" + for _, pod := range pods { + output += pod.Name + ", " + } + output += "]" + } + return output +} + +func checkPreemptionVictims(testName string, expected map[string]map[string]bool, nodeToPods map[*v1.Node][]*v1.Pod) error { if len(expected) == len(nodeToPods) { for k, pods := range nodeToPods { - if expPods, ok := expected[k]; ok { + if expPods, ok := expected[k.Name]; ok { if len(pods) != len(expPods) { - return fmt.Errorf("test [%v]: unexpected number of pods. expected: %v, got: %v", testName, expected, nodeToPods) + return fmt.Errorf("test [%v]: unexpected number of pods. expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods)) } prevPriority := int32(math.MaxInt32) for _, p := range pods { @@ -567,11 +579,11 @@ func checkPreemptionVictims(testName string, expected map[string]map[string]bool } } } else { - return fmt.Errorf("test [%v]: unexpected machines. expected: %v, got: %v", testName, expected, nodeToPods) + return fmt.Errorf("test [%v]: unexpected machines. 
expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods)) } } } else { - return fmt.Errorf("test [%v]: unexpected number of machines. expected: %v, got: %v", testName, expected, nodeToPods) + return fmt.Errorf("test [%v]: unexpected number of machines. expected: %v, got: %v", testName, expected, printNodeToPods(nodeToPods)) } return nil } @@ -724,67 +736,37 @@ func TestSelectNodesForPreemption(t *testing.T) { expected: map[string]map[string]bool{"machine1": {"b": true, "c": true}}, }, { - name: "lower priority pod is not preempted to satisfy pod affinity", + name: "pod with anti-affinity is preempted", predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, Affinity: &v1.Affinity{ - PodAffinity: &v1.PodAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ - { - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "service", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"securityscan", "value2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{ + Name: "machine1", + Labels: map[string]string{"pod": "preemptor"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority}}, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "a", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1", Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"preemptor", "value2"}, + }, }, }, + TopologyKey: "hostname", }, - TopologyKey: "hostname", }, - }, - }}}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, + }}}}, {ObjectMeta: metav1.ObjectMeta{Name: "b"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "d"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, {ObjectMeta: metav1.ObjectMeta{Name: "e"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{"machine1": {"b": true, "c": true}}, - addAffinityPredicate: true, - }, - { - name: "between two pods that satisfy affinity, the higher priority one stays", - predicates: map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, Affinity: &v1.Affinity{ - PodAffinity: &v1.PodAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ - { - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "service", - Operator: 
metav1.LabelSelectorOpIn, - Values: []string{"securityscan", "value2"}, - }, - }, - }, - TopologyKey: "hostname", - }, - }, - }}}}, - pods: []*v1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "a", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "b", Labels: map[string]string{"service": "securityscan"}}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "c"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "d"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &highPriority, NodeName: "machine1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "e"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}}}, - expected: map[string]map[string]bool{"machine1": {"a": true, "c": true}}, + expected: map[string]map[string]bool{"machine1": {"a": true}, "machine2": {}}, addAffinityPredicate: true, }, } - for _, test := range tests { nodes := []*v1.Node{} for _, n := range test.nodes { @@ -793,10 +775,13 @@ func TestSelectNodesForPreemption(t *testing.T) { nodes = append(nodes, node) } if test.addAffinityPredicate { - test.predicates["affinity"] = algorithmpredicates.NewPodAffinityPredicate(FakeNodeInfo(*nodes[0]), schedulertesting.FakePodLister(test.pods)) + test.predicates[predicates.MatchInterPodAffinity] = algorithmpredicates.NewPodAffinityPredicate(FakeNodeInfo(*nodes[0]), schedulertesting.FakePodLister(test.pods)) } nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes) - nodeToPods := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata) + nodeToPods, err := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata) + if err != nil { + t.Error(err) + } if err := checkPreemptionVictims(test.name, test.expected, nodeToPods); err != nil { t.Error(err) } @@ -950,10 +935,11 @@ func TestPickOneNodeForPreemption(t *testing.T) { nodes = append(nodes, makeNode(n, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5)) } nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, nodes) - node := pickOneNodeForPreemption(selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata)) + candidateNodes, _ := selectNodesForPreemption(test.pod, nodeNameToInfo, nodes, test.predicates, PredicateMetadata) + node := pickOneNodeForPreemption(candidateNodes) found := false for _, nodeName := range test.expected { - if node == nodeName { + if node.Name == nodeName { found = true break } @@ -963,3 +949,272 @@ func TestPickOneNodeForPreemption(t *testing.T) { } } } + +func TestNodesWherePreemptionMightHelp(t *testing.T) { + // Prepare 4 node names. + nodeNames := []string{} + for i := 1; i < 5; i++ { + nodeNames = append(nodeNames, fmt.Sprintf("machine%d", i)) + } + + tests := []struct { + name string + failedPredMap FailedPredicateMap + pod *v1.Pod + expected map[string]bool // set of expected node names. Value is ignored. 
+ }{ + { + name: "No node should be attempted", + failedPredMap: FailedPredicateMap{ + "machine1": []algorithm.PredicateFailureReason{predicates.ErrNodeSelectorNotMatch}, + "machine2": []algorithm.PredicateFailureReason{predicates.ErrPodNotMatchHostName}, + "machine3": []algorithm.PredicateFailureReason{predicates.ErrTaintsTolerationsNotMatch}, + "machine4": []algorithm.PredicateFailureReason{predicates.ErrNodeLabelPresenceViolated}, + }, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + expected: map[string]bool{}, + }, + { + name: "pod affinity should be tried", + failedPredMap: FailedPredicateMap{ + "machine1": []algorithm.PredicateFailureReason{predicates.ErrPodAffinityNotMatch}, + "machine2": []algorithm.PredicateFailureReason{predicates.ErrPodNotMatchHostName}, + "machine3": []algorithm.PredicateFailureReason{predicates.ErrNodeUnschedulable}, + }, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{Affinity: &v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "service", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"securityscan", "value2"}, + }, + }, + }, + TopologyKey: "hostname", + }, + }, + }}}}, + expected: map[string]bool{"machine1": true, "machine4": true}, + }, + { + name: "pod with both pod affinity and anti-affinity should be tried", + failedPredMap: FailedPredicateMap{ + "machine1": []algorithm.PredicateFailureReason{predicates.ErrPodAffinityNotMatch}, + "machine2": []algorithm.PredicateFailureReason{predicates.ErrPodNotMatchHostName}, + }, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{Affinity: &v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "service", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"securityscan", "value2"}, + }, + }, + }, + TopologyKey: "hostname", + }, + }, + }, + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "service", + Operator: metav1.LabelSelectorOpNotIn, + Values: []string{"blah", "foo"}, + }, + }, + }, + TopologyKey: "region", + }, + }, + }, + }}}, + expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, + }, + { + name: "Mix of failed predicates works fine", + failedPredMap: FailedPredicateMap{ + "machine1": []algorithm.PredicateFailureReason{predicates.ErrNodeSelectorNotMatch, predicates.ErrNodeOutOfDisk, predicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 500, 300)}, + "machine2": []algorithm.PredicateFailureReason{predicates.ErrPodNotMatchHostName, predicates.ErrDiskConflict}, + "machine3": []algorithm.PredicateFailureReason{predicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 600, 400)}, + "machine4": []algorithm.PredicateFailureReason{}, + }, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + expected: map[string]bool{"machine3": true, "machine4": true}, + }, + } + + for _, test := range tests { + nodes := nodesWherePreemptionMightHelp(test.pod, makeNodeList(nodeNames), test.failedPredMap) + if len(test.expected) != len(nodes) { + t.Errorf("test [%v]:number of nodes is 
not the same as expected. exptectd: %d, got: %d. Nodes: %v", test.name, len(test.expected), len(nodes), nodes) + } + for _, node := range nodes { + if _, found := test.expected[node.Name]; !found { + t.Errorf("test [%v]: node %v is not expected.", test.name, node.Name) + } + } + } +} + +func TestPreempt(t *testing.T) { + failedPredMap := FailedPredicateMap{ + "machine1": []algorithm.PredicateFailureReason{predicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 500, 300)}, + "machine2": []algorithm.PredicateFailureReason{predicates.ErrDiskConflict}, + "machine3": []algorithm.PredicateFailureReason{predicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 600, 400)}, + } + // Prepare 3 node names. + nodeNames := []string{} + for i := 1; i < 4; i++ { + nodeNames = append(nodeNames, fmt.Sprintf("machine%d", i)) + } + tests := []struct { + name string + pod *v1.Pod + pods []*v1.Pod + extenders []*FakeExtender + expectedNode string + expectedPods []string // list of preempted pods + }{ + { + name: "basic preemption logic", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m3.1"}, Spec: v1.PodSpec{Containers: mediumContainers, Priority: &midPriority, NodeName: "machine3"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + { + name: "One node doesn't need any preemption", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &highPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + expectedNode: "machine3", + expectedPods: []string{}, + }, + { + name: "Scheduler extenders allow only machine1, otherwise machine3 would have been chosen", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: 
metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*FakeExtender{ + { + predicates: []fitPredicate{truePredicateExtender}, + }, + { + predicates: []fitPredicate{machine1PredicateExtender}, + }, + }, + expectedNode: "machine1", + expectedPods: []string{"m1.1", "m1.2"}, + }, + { + name: "Scheduler extenders do not allow any preemption", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Spec: v1.PodSpec{ + Containers: veryLargeContainers, + Priority: &highPriority}, + }, + pods: []*v1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "m1.1"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &midPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "m1.2"}, Spec: v1.PodSpec{Containers: smallContainers, Priority: &lowPriority, NodeName: "machine1"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + + {ObjectMeta: metav1.ObjectMeta{Name: "m2.1"}, Spec: v1.PodSpec{Containers: largeContainers, Priority: &midPriority, NodeName: "machine2"}, Status: v1.PodStatus{Phase: v1.PodRunning}}, + }, + extenders: []*FakeExtender{ + { + predicates: []fitPredicate{falsePredicateExtender}, + }, + }, + expectedNode: "", + expectedPods: []string{}, + }, + } + + for _, test := range tests { + stop := make(chan struct{}) + cache := schedulercache.New(time.Duration(0), stop) + for _, pod := range test.pods { + cache.AddPod(pod) + } + for _, name := range nodeNames { + cache.AddNode(makeNode(name, priorityutil.DefaultMilliCpuRequest*5, priorityutil.DefaultMemoryRequest*5)) + } + extenders := []algorithm.SchedulerExtender{} + for _, extender := range test.extenders { + extenders = append(extenders, extender) + } + scheduler := NewGenericScheduler( + cache, nil, map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, algorithm.EmptyMetadataProducer, extenders) + // Call Preempt and check the expected results. + node, victims, err := scheduler.Preempt(test.pod, schedulertesting.FakeNodeLister(makeNodeList(nodeNames)), error(&FitError{test.pod, failedPredMap})) + if err != nil { + t.Errorf("test [%v]: unexpected error in preemption: %v", test.name, err) + } + if (node != nil && node.Name != test.expectedNode) || (node == nil && len(test.expectedNode) != 0) { + t.Errorf("test [%v]: expected node: %v, got: %v", test.name, test.expectedNode, node) + } + if len(victims) != len(test.expectedPods) { + t.Errorf("test [%v]: expected %v pods, got %v.", test.name, len(test.expectedPods), len(victims)) + } + for _, victim := range victims { + found := false + for _, expPod := range test.expectedPods { + if expPod == victim.Name { + found = true + break + } + } + if !found { + t.Errorf("test [%v]: pod %v is not expected to be a victim.", test.name, victim.Name) + } + // Mark the victims for deletion and record the preemptor's nominated node name. + now := metav1.Now() + victim.DeletionTimestamp = &now + test.pod.Annotations = make(map[string]string) + test.pod.Annotations[NominatedNodeAnnotationKey] = node.Name + } + // Call preempt again and make sure it doesn't preempt any more pods. 
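+			// The victims were marked with a deletion timestamp and the preemptor was annotated
+			// with its nominated node above, which should make it ineligible to preempt again.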
+ node, victims, err = scheduler.Preempt(test.pod, schedulertesting.FakeNodeLister(makeNodeList(nodeNames)), error(&FitError{test.pod, failedPredMap})) + if err != nil { + t.Errorf("test [%v]: unexpected error in preemption: %v", test.name, err) + } + if node != nil && len(victims) > 0 { + t.Errorf("test [%v]: didn't expect any more preemption. Node %v is selected for preemption.", test.name, node) + } + close(stop) + } +} diff --git a/plugin/pkg/scheduler/factory/factory.go b/plugin/pkg/scheduler/factory/factory.go index 31cb17548ed..069707ddf21 100644 --- a/plugin/pkg/scheduler/factory/factory.go +++ b/plugin/pkg/scheduler/factory/factory.go @@ -716,6 +716,7 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, Algorithm: algo, Binder: f.getBinder(extenders), PodConditionUpdater: &podConditionUpdater{f.client}, + PodPreemptor: &podPreemptor{f.client}, WaitForCacheSync: func() bool { return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodsHasSynced) }, @@ -991,3 +992,28 @@ func (p *podConditionUpdater) Update(pod *v1.Pod, condition *v1.PodCondition) er } return nil } + +type podPreemptor struct { + Client clientset.Interface +} + +func (p *podPreemptor) GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) { + return p.Client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}) +} + +func (p *podPreemptor) DeletePod(pod *v1.Pod) error { + return p.Client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{}) +} + +//TODO(bsalamat): change this to patch PodStatus to avoid overwriting potential pending status updates. +func (p *podPreemptor) UpdatePodAnnotations(pod *v1.Pod, annotations map[string]string) error { + podCopy := pod.DeepCopy() + if podCopy.Annotations == nil { + podCopy.Annotations = map[string]string{} + } + for k, v := range annotations { + podCopy.Annotations[k] = v + } + _, err := p.Client.CoreV1().Pods(podCopy.Namespace).UpdateStatus(podCopy) + return err +} diff --git a/plugin/pkg/scheduler/scheduler.go b/plugin/pkg/scheduler/scheduler.go index a58a31bc34a..8f1c8ea38ea 100644 --- a/plugin/pkg/scheduler/scheduler.go +++ b/plugin/pkg/scheduler/scheduler.go @@ -23,10 +23,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" "k8s.io/kubernetes/plugin/pkg/scheduler/core" @@ -48,6 +50,14 @@ type PodConditionUpdater interface { Update(pod *v1.Pod, podCondition *v1.PodCondition) error } +// PodPreemptor has methods needed to delete a pod and to update +// annotations of the preemptor pod. +type PodPreemptor interface { + GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) + DeletePod(pod *v1.Pod) error + UpdatePodAnnotations(pod *v1.Pod, annots map[string]string) error +} + // Scheduler watches for new unscheduled pods. It attempts to find // nodes that they fit on and writes bindings back to the api server. type Scheduler struct { @@ -102,6 +112,8 @@ type Config struct { // with scheduling, PodScheduled condition will be updated in apiserver in /bind // handler so that binding and setting PodCondition it is atomic. 
PodConditionUpdater PodConditionUpdater + // PodPreemptor is used to evict pods and update pod annotations. + PodPreemptor PodPreemptor // NextPod should be a function that blocks until the next pod // is available. We don't use a channel for this, because scheduling @@ -176,6 +188,41 @@ func (sched *Scheduler) schedule(pod *v1.Pod) (string, error) { return host, err } +func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) { + if !utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) { + glog.V(3).Infof("Pod priority feature is not enabled. No preemption is performed.") + return "", nil + } + preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor) + if err != nil { + glog.Errorf("Error getting the updated preemptor pod object: %v", err) + return "", err + } + node, victims, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr) + if err != nil { + glog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name) + return "", err + } + if node == nil { + return "", err + } + glog.Infof("Preempting %d pod(s) on node %v to make room for %v/%v.", len(victims), node.Name, preemptor.Namespace, preemptor.Name) + annotations := map[string]string{core.NominatedNodeAnnotationKey: node.Name} + err = sched.config.PodPreemptor.UpdatePodAnnotations(preemptor, annotations) + if err != nil { + glog.Errorf("Error in preemption process. Cannot update pod %v annotations: %v", preemptor.Name, err) + return "", err + } + for _, victim := range victims { + if err := sched.config.PodPreemptor.DeletePod(victim); err != nil { + glog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) + return "", err + } + sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, node.Name) + } + return node.Name, err +} + // assume signals to the cache that a pod is already in the cache, so that binding can be asnychronous. // assume modifies `assumed`. func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { @@ -258,6 +305,13 @@ func (sched *Scheduler) scheduleOne() { suggestedHost, err := sched.schedule(pod) metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start)) if err != nil { + // schedule() may have failed because the pod would not fit on any host, so we try to + // preempt, with the expectation that the next time the pod is tried for scheduling it + // will fit due to the preemption. It is also possible that a different pod will schedule + // into the resources that were preempted, but this is harmless. 
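+			// Note that the pod is not bound in this cycle; scheduleOne returns after the
+			// preemption attempt and the pod is retried in a later scheduling cycle.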
+ if fitError, ok := err.(*core.FitError); ok { + sched.preempt(pod, fitError) + } return } diff --git a/plugin/pkg/scheduler/scheduler_test.go b/plugin/pkg/scheduler/scheduler_test.go index 3ad54d400b3..b0743b7e17b 100644 --- a/plugin/pkg/scheduler/scheduler_test.go +++ b/plugin/pkg/scheduler/scheduler_test.go @@ -103,6 +103,10 @@ func (es mockScheduler) Prioritizers() []algorithm.PriorityConfig { return nil } +func (es mockScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, error) { + return nil, nil, nil +} + func TestScheduler(t *testing.T) { eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartLogging(t.Logf).Stop() diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go index 4192aea7264..cda57e8b433 100644 --- a/plugin/pkg/scheduler/schedulercache/node_info.go +++ b/plugin/pkg/scheduler/schedulercache/node_info.go @@ -320,7 +320,7 @@ func (n *NodeInfo) AddPod(pod *v1.Pod) { n.generation++ } -// RemovePod subtracts pod information to this NodeInfo. +// RemovePod subtracts pod information from this NodeInfo. func (n *NodeInfo) RemovePod(pod *v1.Pod) error { k1, err := getPodKey(pod) if err != nil { diff --git a/plugin/pkg/scheduler/util/utils.go b/plugin/pkg/scheduler/util/utils.go index 2bf10adf6b1..2cbe26f2e3a 100644 --- a/plugin/pkg/scheduler/util/utils.go +++ b/plugin/pkg/scheduler/util/utils.go @@ -67,8 +67,8 @@ type SortableList struct { CompFunc LessFunc } -// LessFunc is a function that receives two Pods and returns true if the first -// pod should be placed before pod2 when the list is sorted. +// LessFunc is a function that receives two items and returns true if the first +// item should be placed before the second one when the list is sorted. type LessFunc func(item1, item2 interface{}) bool var _ = sort.Interface(&SortableList{}) @@ -88,3 +88,10 @@ func (l *SortableList) Swap(i, j int) { func (l *SortableList) Sort() { sort.Sort(l) } + +// HigherPriorityPod return true when priority of the first pod is higher than +// the second one. It takes arguments of the type "interface{}" to be used with +// SortableList, but expects those arguments to be *v1.Pod. +func HigherPriorityPod(pod1, pod2 interface{}) bool { + return GetPodPriority(pod1.(*v1.Pod)) > GetPodPriority(pod2.(*v1.Pod)) +} diff --git a/test/e2e/scheduling/predicates.go b/test/e2e/scheduling/predicates.go index 8443e26b38d..9c14b20a5f4 100644 --- a/test/e2e/scheduling/predicates.go +++ b/test/e2e/scheduling/predicates.go @@ -52,6 +52,7 @@ type pausePodConfig struct { NodeName string Ports []v1.ContainerPort OwnerReferences []metav1.OwnerReference + PriorityClassName string } var _ = SIGDescribe("SchedulerPredicates [Serial]", func() { @@ -555,8 +556,9 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod { Ports: conf.Ports, }, }, - Tolerations: conf.Tolerations, - NodeName: conf.NodeName, + Tolerations: conf.Tolerations, + NodeName: conf.NodeName, + PriorityClassName: conf.PriorityClassName, }, } if conf.Resources != nil { diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go new file mode 100644 index 00000000000..ef1af225362 --- /dev/null +++ b/test/e2e/scheduling/preemption.go @@ -0,0 +1,128 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "fmt" + "time" + + "k8s.io/api/core/v1" + "k8s.io/api/scheduling/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + _ "github.com/stretchr/testify/assert" +) + +var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func() { + var cs clientset.Interface + var nodeList *v1.NodeList + var ns string + f := framework.NewDefaultFramework("sched-preemption") + + lowPriority, mediumPriority, highPriority := int32(1), int32(100), int32(1000) + lowPriorityClassName := f.BaseName + "-low-priority" + mediumPriorityClassName := f.BaseName + "-medium-priority" + highPriorityClassName := f.BaseName + "-high-priority" + + AfterEach(func() { + }) + + BeforeEach(func() { + cs = f.ClientSet + ns = f.Namespace.Name + nodeList = &v1.NodeList{} + + _, err := f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority}) + Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true)) + _, err = f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: mediumPriorityClassName}, Value: mediumPriority}) + Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true)) + _, err = f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: lowPriorityClassName}, Value: lowPriority}) + Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true)) + + framework.WaitForAllNodesHealthy(cs, time.Minute) + masterNodes, nodeList = framework.GetMasterAndWorkerNodesOrDie(cs) + + err = framework.CheckTestingNSDeletedExcept(cs, ns) + framework.ExpectNoError(err) + }) + + // This test verifies that when a higher priority pod is created and no node with + // enough resources is found, scheduler preempts a lower priority pod to schedule + // the high priority pod. + It("validates basic preemption works", func() { + var podRes v1.ResourceList + // Create one pod per node that uses a lot of the node's resources. + By("Create pods that use 60% of node resources.") + pods := make([]*v1.Pod, len(nodeList.Items)) + for i, node := range nodeList.Items { + cpuAllocatable, found := node.Status.Allocatable["cpu"] + Expect(found).To(Equal(true)) + milliCPU := cpuAllocatable.MilliValue() * 40 / 100 + memAllocatable, found := node.Status.Allocatable["memory"] + Expect(found).To(Equal(true)) + memory := memAllocatable.Value() * 60 / 100 + podRes = v1.ResourceList{} + podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI) + podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI) + + // make the first pod low priority and the rest medium priority. 
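+			// The low priority pod is the one expected to be preempted by the high priority
+			// pod created later in this test; the medium priority pods should keep running.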
+ priorityName := mediumPriorityClassName + if i == 0 { + priorityName = lowPriorityClassName + } + pods[i] = createPausePod(f, pausePodConfig{ + Name: fmt.Sprintf("pod%d-%v", i, priorityName), + PriorityClassName: priorityName, + Resources: &v1.ResourceRequirements{ + Requests: podRes, + }, + }) + framework.Logf("Created pod: %v", pods[i].Name) + } + By("Wait for pods to be scheduled.") + for _, pod := range pods { + framework.ExpectNoError(framework.WaitForPodRunningInNamespace(cs, pod)) + } + + By("Run a high priority pod that use 60% of a node resources.") + // Create a high priority pod and make sure it is scheduled. + runPausePod(f, pausePodConfig{ + Name: "preemptor-pod", + PriorityClassName: highPriorityClassName, + Resources: &v1.ResourceRequirements{ + Requests: podRes, + }, + }) + // Make sure that the lowest priority pod is deleted. + preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{}) + podDeleted := (err != nil && errors.IsNotFound(err)) || + (err == nil && preemptedPod.DeletionTimestamp != nil) + Expect(podDeleted).To(BeTrue()) + // Other pods (mid priority ones) should be present. + for i := 1; i < len(pods); i++ { + livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + Expect(livePod.DeletionTimestamp).To(BeNil()) + } + }) +}) diff --git a/test/integration/scheduler/priorities_test.go b/test/integration/scheduler/priorities_test.go index 5271ad82ff3..6dd75ee670a 100644 --- a/test/integration/scheduler/priorities_test.go +++ b/test/integration/scheduler/priorities_test.go @@ -51,7 +51,7 @@ func TestNodeAffinity(t *testing.T) { } // Create a pod with node affinity. podName := "pod-with-node-affinity" - pod, err := runPausePod(context.clientSet, &pausePodConfig{ + pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{ Name: podName, Namespace: context.ns.Name, Affinity: &v1.Affinity{ @@ -72,7 +72,7 @@ func TestNodeAffinity(t *testing.T) { }, }, }, - }) + })) if err != nil { t.Fatalf("Error running pause pod: %v", err) } @@ -110,11 +110,11 @@ func TestPodAffinity(t *testing.T) { // Add a pod with a label and wait for it to schedule. labelKey := "service" labelValue := "S1" - _, err = runPausePod(context.clientSet, &pausePodConfig{ + _, err = runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{ Name: "attractor-pod", Namespace: context.ns.Name, Labels: map[string]string{labelKey: labelValue}, - }) + })) if err != nil { t.Fatalf("Error running the attractor pod: %v", err) } @@ -125,7 +125,7 @@ func TestPodAffinity(t *testing.T) { } // Add a new pod with affinity to the attractor pod. 
podName := "pod-with-podaffinity" - pod, err := runPausePod(context.clientSet, &pausePodConfig{ + pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{ Name: podName, Namespace: context.ns.Name, Affinity: &v1.Affinity{ @@ -158,7 +158,7 @@ func TestPodAffinity(t *testing.T) { }, }, }, - }) + })) if err != nil { t.Fatalf("Error running pause pod: %v", err) } diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index e0c2b586b46..770a9bead28 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -24,9 +24,11 @@ import ( "time" "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" clientset "k8s.io/client-go/kubernetes" clientv1core "k8s.io/client-go/kubernetes/typed/core/v1" @@ -36,15 +38,18 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/testapi" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/plugin/cmd/kube-scheduler/app" "k8s.io/kubernetes/plugin/cmd/kube-scheduler/app/options" "k8s.io/kubernetes/plugin/pkg/scheduler" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" _ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" + "k8s.io/kubernetes/plugin/pkg/scheduler/core" "k8s.io/kubernetes/plugin/pkg/scheduler/factory" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" "k8s.io/kubernetes/test/integration/framework" + testutils "k8s.io/kubernetes/test/utils" ) const enableEquivalenceCache = true @@ -457,13 +462,13 @@ func TestMultiScheduler(t *testing.T) { } defaultScheduler := "default-scheduler" - testPodFitsDefault, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler}) + testPodFitsDefault, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler})) if err != nil { t.Fatalf("Failed to create pod: %v", err) } fooScheduler := "foo-scheduler" - testPodFitsFoo, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler}) + testPodFitsFoo, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler})) if err != nil { t.Fatalf("Failed to create pod: %v", err) } @@ -647,3 +652,251 @@ func TestAllocatable(t *testing.T) { t.Logf("Test allocatable awareness: %s Pod not scheduled as expected", testAllocPod2.Name) } } + +// TestPreemption tests a few preemption scenarios. +func TestPreemption(t *testing.T) { + // Enable PodPriority feature gate. + utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority)) + // Initialize scheduler. 
+ context := initTest(t, "preemption") + defer cleanupTest(t, context) + cs := context.clientSet + + lowPriority, mediumPriority, highPriority := int32(100), int32(200), int32(300) + defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(100, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)}, + } + + tests := []struct { + description string + existingPods []*v1.Pod + pod *v1.Pod + preemptedPodIndexes map[int]struct{} + }{ + { + description: "basic pod preemption", + existingPods: []*v1.Pod{ + initPausePod(context.clientSet, &pausePodConfig{ + Name: "victim-pod", + Namespace: context.ns.Name, + Priority: &lowPriority, + Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(400, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)}, + }, + }), + }, + pod: initPausePod(cs, &pausePodConfig{ + Name: "preemptor-pod", + Namespace: context.ns.Name, + Priority: &highPriority, + Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(300, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)}, + }, + }), + preemptedPodIndexes: map[int]struct{}{0: {}}, + }, + { + description: "preemption is performed to satisfy anti-affinity", + existingPods: []*v1.Pod{ + initPausePod(cs, &pausePodConfig{ + Name: "pod-0", Namespace: context.ns.Name, + Priority: &mediumPriority, + Labels: map[string]string{"pod": "p0"}, + Resources: defaultPodRes, + }), + initPausePod(cs, &pausePodConfig{ + Name: "pod-1", Namespace: context.ns.Name, + Priority: &lowPriority, + Labels: map[string]string{"pod": "p1"}, + Resources: defaultPodRes, + Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"preemptor"}, + }, + }, + }, + TopologyKey: "node", + }, + }, + }, + }, + }), + }, + // A higher priority pod with anti-affinity. + pod: initPausePod(cs, &pausePodConfig{ + Name: "preemptor-pod", + Namespace: context.ns.Name, + Priority: &highPriority, + Labels: map[string]string{"pod": "preemptor"}, + Resources: defaultPodRes, + Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"p0"}, + }, + }, + }, + TopologyKey: "node", + }, + }, + }, + }, + }), + preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}}, + }, + { + // This is similar to the previous case only pod-1 is high priority. 
+ description: "preemption is not performed when anti-affinity is not satisfied", + existingPods: []*v1.Pod{ + initPausePod(cs, &pausePodConfig{ + Name: "pod-0", Namespace: context.ns.Name, + Priority: &mediumPriority, + Labels: map[string]string{"pod": "p0"}, + Resources: defaultPodRes, + }), + initPausePod(cs, &pausePodConfig{ + Name: "pod-1", Namespace: context.ns.Name, + Priority: &highPriority, + Labels: map[string]string{"pod": "p1"}, + Resources: defaultPodRes, + Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"preemptor"}, + }, + }, + }, + TopologyKey: "node", + }, + }, + }, + }, + }), + }, + // A higher priority pod with anti-affinity. + pod: initPausePod(cs, &pausePodConfig{ + Name: "preemptor-pod", + Namespace: context.ns.Name, + Priority: &highPriority, + Labels: map[string]string{"pod": "preemptor"}, + Resources: defaultPodRes, + Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "pod", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"p0"}, + }, + }, + }, + TopologyKey: "node", + }, + }, + }, + }, + }), + preemptedPodIndexes: map[int]struct{}{}, + }, + } + + // Create a node with some resources and a label. + nodeRes := &v1.ResourceList{ + v1.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(500, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI), + } + node, err := createNode(context.clientSet, "node1", nodeRes) + if err != nil { + t.Fatalf("Error creating nodes: %v", err) + } + nodeLabels := map[string]string{"node": node.Name} + if err = testutils.AddLabelsToNode(context.clientSet, node.Name, nodeLabels); err != nil { + t.Fatalf("Cannot add labels to node: %v", err) + } + if err = waitForNodeLabels(context.clientSet, node.Name, nodeLabels); err != nil { + t.Fatalf("Adding labels to node didn't succeed: %v", err) + } + + for _, test := range tests { + pods := make([]*v1.Pod, len(test.existingPods)) + // Create and run existingPods. + for i, p := range test.existingPods { + pods[i], err = runPausePod(cs, p) + if err != nil { + t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err) + } + } + // Create the "pod". + preemptor, err := createPausePod(cs, test.pod) + if err != nil { + t.Errorf("Error while creating high priority pod: %v", err) + } + // Wait for preemption of pods and make sure the other ones are not preempted. + for i, p := range pods { + if _, found := test.preemptedPodIndexes[i]; found { + if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil { + t.Errorf("Test [%v]: Pod %v is not getting evicted.", test.description, p.Name) + } + } else { + if p.DeletionTimestamp != nil { + t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name) + } + } + } + // Also check that the preemptor pod gets the annotation for nominated node name. 
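+		// The annotation is written by the scheduler before the victim pods are deleted,
+		// so polling for it here should succeed once preemption has happened.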
+ if len(test.preemptedPodIndexes) > 0 { + if err = wait.Poll(time.Second, wait.ForeverTestTimeout, func() (bool, error) { + pod, err := context.clientSet.CoreV1().Pods(context.ns.Name).Get("preemptor-pod", metav1.GetOptions{}) + if err != nil { + t.Errorf("Test [%v]: error getting pod: %v", test.description, err) + } + annot, found := pod.Annotations[core.NominatedNodeAnnotationKey] + if found && len(annot) > 0 { + return true, nil + } + return false, err + }); err != nil { + t.Errorf("Test [%v]: Pod annotation did not get set.", test.description) + } + } + + // Cleanup + pods = append(pods, preemptor) + for _, p := range pods { + err = cs.CoreV1().Pods(p.Namespace).Delete(p.Name, metav1.NewDeleteOptions(0)) + if err != nil && !errors.IsNotFound(err) { + t.Errorf("Test [%v]: error, %v, while deleting pod during test.", test.description, err) + } + err = wait.Poll(time.Second, wait.ForeverTestTimeout, podDeleted(cs, p.Namespace, p.Name)) + if err != nil { + t.Errorf("Test [%v]: error, %v, while waiting for pod to get deleted.", test.description, err) + } + } + } +} diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go index 8cf8c6c90c2..81726f1caba 100644 --- a/test/integration/scheduler/util.go +++ b/test/integration/scheduler/util.go @@ -205,6 +205,7 @@ type pausePodConfig struct { Tolerations []v1.Toleration NodeName string SchedulerName string + Priority *int32 } // initPausePod initializes a pod API object from the given config. It is used @@ -213,6 +214,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod { pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: conf.Name, + Namespace: conf.Namespace, Labels: conf.Labels, Annotations: conf.Annotations, }, @@ -228,6 +230,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod { Tolerations: conf.Tolerations, NodeName: conf.NodeName, SchedulerName: conf.SchedulerName, + Priority: conf.Priority, }, } if conf.Resources != nil { @@ -238,9 +241,8 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod { // createPausePod creates a pod with "Pause" image and the given config and // return its pointer and error status. -func createPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) { - p := initPausePod(cs, conf) - return cs.CoreV1().Pods(conf.Namespace).Create(p) +func createPausePod(cs clientset.Interface, p *v1.Pod) (*v1.Pod, error) { + return cs.CoreV1().Pods(p.Namespace).Create(p) } // createPausePodWithResource creates a pod with "Pause" image and the given @@ -262,22 +264,21 @@ func createPausePodWithResource(cs clientset.Interface, podName string, nsName s }, } } - return createPausePod(cs, &conf) + return createPausePod(cs, initPausePod(cs, &conf)) } // runPausePod creates a pod with "Pause" image and the given config and waits // until it is scheduled. It returns its pointer and error status. -func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) { - p := initPausePod(cs, conf) - pod, err := cs.CoreV1().Pods(conf.Namespace).Create(p) +func runPausePod(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) { + pod, err := cs.CoreV1().Pods(pod.Namespace).Create(pod) if err != nil { return nil, fmt.Errorf("Error creating pause pod: %v", err) } if err = waitForPodToSchedule(cs, pod); err != nil { return pod, fmt.Errorf("Pod %v didn't schedule successfully. 
Error: %v", pod.Name, err) } - if pod, err = cs.CoreV1().Pods(conf.Namespace).Get(conf.Name, metav1.GetOptions{}); err != nil { - return pod, fmt.Errorf("Error getting pod %v info: %v", conf.Name, err) + if pod, err = cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil { + return pod, fmt.Errorf("Error getting pod %v info: %v", pod.Name, err) } return pod, nil } @@ -285,7 +286,10 @@ func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) // podDeleted returns true if a pod is not found in the given namespace. func podDeleted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc { return func() (bool, error) { - _, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{}) + pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{}) + if pod.DeletionTimestamp != nil { + return true, nil + } if errors.IsNotFound(err) { return true, nil } @@ -293,6 +297,20 @@ func podDeleted(c clientset.Interface, podNamespace, podName string) wait.Condit } } +// podIsGettingEvicted returns true if the pod's deletion timestamp is set. +func podIsGettingEvicted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc { + return func() (bool, error) { + pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{}) + if err != nil { + return false, err + } + if pod.DeletionTimestamp != nil { + return true, nil + } + return false, nil + } +} + // podScheduled returns true if a node is assigned to the given pod. func podScheduled(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc { return func() (bool, error) { From 86b06c3832a024d0d83c5471079acd06949b1e29 Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Fri, 25 Aug 2017 10:34:30 -0700 Subject: [PATCH 6/7] autogenerated files --- plugin/pkg/scheduler/BUILD | 2 ++ test/e2e/scheduling/BUILD | 2 ++ test/integration/scheduler/BUILD | 3 +++ 3 files changed, 7 insertions(+) diff --git a/plugin/pkg/scheduler/BUILD b/plugin/pkg/scheduler/BUILD index e322315259a..e68c63c3360 100644 --- a/plugin/pkg/scheduler/BUILD +++ b/plugin/pkg/scheduler/BUILD @@ -36,6 +36,7 @@ go_library( "testutil.go", ], deps = [ + "//pkg/features:go_default_library", "//plugin/pkg/scheduler/algorithm:go_default_library", "//plugin/pkg/scheduler/api:go_default_library", "//plugin/pkg/scheduler/core:go_default_library", @@ -47,6 +48,7 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", "//vendor/k8s.io/client-go/tools/cache:go_default_library", diff --git a/test/e2e/scheduling/BUILD b/test/e2e/scheduling/BUILD index c39f21b35a1..50828eb4591 100644 --- a/test/e2e/scheduling/BUILD +++ b/test/e2e/scheduling/BUILD @@ -15,6 +15,7 @@ go_library( "nvidia-gpus.go", "opaque_resource.go", "predicates.go", + "preemption.go", "priorities.go", "rescheduler.go", ], @@ -32,6 +33,7 @@ go_library( "//vendor/github.com/stretchr/testify/assert:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/extensions/v1beta1:go_default_library", + "//vendor/k8s.io/api/scheduling/v1alpha1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", 
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", diff --git a/test/integration/scheduler/BUILD b/test/integration/scheduler/BUILD index 34c1505287d..118815cb7a3 100644 --- a/test/integration/scheduler/BUILD +++ b/test/integration/scheduler/BUILD @@ -21,12 +21,14 @@ go_test( deps = [ "//pkg/api:go_default_library", "//pkg/api/testapi:go_default_library", + "//pkg/features:go_default_library", "//plugin/cmd/kube-scheduler/app:go_default_library", "//plugin/cmd/kube-scheduler/app/options:go_default_library", "//plugin/pkg/scheduler:go_default_library", "//plugin/pkg/scheduler/algorithm:go_default_library", "//plugin/pkg/scheduler/algorithmprovider:go_default_library", "//plugin/pkg/scheduler/api:go_default_library", + "//plugin/pkg/scheduler/core:go_default_library", "//plugin/pkg/scheduler/factory:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//test/e2e/framework:go_default_library", @@ -37,6 +39,7 @@ go_test( "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/informers:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library", From c0b718373befb2168befc0c4579fd9a02155d5bc Mon Sep 17 00:00:00 2001 From: "Bobby (Babak) Salamat" Date: Thu, 24 Aug 2017 18:18:02 -0700 Subject: [PATCH 7/7] Fix RBAC rules to allow scheduler update annotations of pods. --- plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go | 3 ++- .../rbac/bootstrappolicy/testdata/cluster-roles.yaml | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go index f69dfabd6ea..04e65441068 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go @@ -330,7 +330,8 @@ func ClusterRoles() []rbac.ClusterRole { rbac.NewRule("get", "update", "patch", "delete").Groups(legacyGroup).Resources("endpoints").Names("kube-scheduler").RuleOrDie(), // fundamental resources - rbac.NewRule(Read...).Groups(legacyGroup).Resources("nodes", "pods").RuleOrDie(), + rbac.NewRule(Read...).Groups(legacyGroup).Resources("nodes").RuleOrDie(), + rbac.NewRule("get", "list", "watch", "delete").Groups(legacyGroup).Resources("pods").RuleOrDie(), rbac.NewRule("create").Groups(legacyGroup).Resources("pods/binding", "bindings").RuleOrDie(), rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(), // things that select pods diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml index 0f572371e2f..b16948997c8 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml @@ -580,8 +580,16 @@ items: - "" resources: - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: - pods verbs: + - delete - get - list - watch