From 68f7a7c682cbfe06c6e820d2fd7b46c6d34ca655 Mon Sep 17 00:00:00 2001
From: NoicFank <1015542478@qq.com>
Date: Thu, 24 Oct 2024 11:50:46 +0800
Subject: [PATCH] bugfix(scheduler): preemption picks wrong victim node with
 higher priority pod on it.

Introducing PDBs to preemption disrupted the ordering of pods in the
victims list, which could lead to picking the wrong victim node when a
higher-priority pod is running on it.
---
 .../defaultpreemption/default_preemption.go   |  7 +++
 .../default_preemption_test.go                | 38 ++++++++++++
 pkg/scheduler/testing/wrappers.go             | 60 +++++++++++++++++++
 3 files changed, 105 insertions(+)

diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
index f1c58de9389..1d573973e3e 100644
--- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
+++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
@@ -191,6 +191,8 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
 	}
 	var victims []*v1.Pod
 	numViolatingVictim := 0
+	// Sort potentialVictims by pod priority from high to low, which ensures to
+	// reprieve higher priority pods first.
 	sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
 	// Try to reprieve as many pods as possible. We first try to reprieve the PDB
 	// violating victims and then other non-violating ones. In both cases, we start
@@ -225,6 +227,11 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
 			return nil, 0, framework.AsStatus(err)
 		}
 	}
+
+	// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
+	if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
+		sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
+	}
 	return victims, numViolatingVictim, framework.NewStatus(framework.Success)
 }
 
diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go
index c5b97f0aae9..89192d4ffca 100644
--- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go
+++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go
@@ -21,6 +21,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"math"
 	"math/rand"
 	"sort"
 	"strings"
@@ -144,6 +145,12 @@ func (pl *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, p
 	return nil
 }
 
+const (
+	LabelKeyIsViolatingPDB    = "test.kubernetes.io/is-violating-pdb"
+	LabelValueViolatingPDB    = "violating"
+	LabelValueNonViolatingPDB = "non-violating"
+)
+
 func TestPostFilter(t *testing.T) {
 	metrics.Register()
 	onePodRes := map[v1.ResourceName]string{v1.ResourcePods: "1"}
@@ -152,6 +159,7 @@
 		name                  string
 		pod                   *v1.Pod
 		pods                  []*v1.Pod
+		pdbs                  []*policy.PodDisruptionBudget
 		nodes                 []*v1.Node
 		filteredNodesStatuses *framework.NodeToStatus
 		extender              framework.Extender
@@ -234,6 +242,29 @@
 			wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
 			wantStatus: framework.NewStatus(framework.Success),
 		},
+		{
+			name: "pod can be made schedulable on minHighestPriority node",
+			pod:  st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(veryHighPriority).Obj(),
+			pods: []*v1.Pod{
+				st.MakePod().Name("p1").UID("p1").Label(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).Namespace(v1.NamespaceDefault).Priority(highPriority).Node("node1").Obj(),
+				st.MakePod().Name("p2").UID("p2").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(lowPriority).Node("node1").Obj(),
+				st.MakePod().Name("p3").UID("p3").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(midPriority).Node("node2").Obj(),
+			},
+			pdbs: []*policy.PodDisruptionBudget{
+				st.MakePDB().Name("violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).MinAvailable("100%").Obj(),
+				st.MakePDB().Name("non-violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).MinAvailable("0").DisruptionsAllowed(math.MaxInt32).Obj(),
+			},
+			nodes: []*v1.Node{
+				st.MakeNode().Name("node1").Capacity(onePodRes).Obj(),
+				st.MakeNode().Name("node2").Capacity(onePodRes).Obj(),
+			},
+			filteredNodesStatuses: framework.NewNodeToStatus(map[string]*framework.Status{
+				"node1": framework.NewStatus(framework.Unschedulable),
+				"node2": framework.NewStatus(framework.Unschedulable),
+			}, framework.NewStatus(framework.UnschedulableAndUnresolvable)),
+			wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
+			wantStatus: framework.NewStatus(framework.Success),
+		},
 		{
 			name: "preemption result filtered out by extenders",
 			pod:  st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(),
@@ -365,6 +396,13 @@ func TestPostFilter(t *testing.T) {
 			for i := range tt.pods {
 				podInformer.GetStore().Add(tt.pods[i])
 			}
+			pdbInformer := informerFactory.Policy().V1().PodDisruptionBudgets().Informer()
+			for i := range tt.pdbs {
+				if err := pdbInformer.GetStore().Add(tt.pdbs[i]); err != nil {
+					t.Fatal(err)
+				}
+			}
+
 			// Register NodeResourceFit as the Filter & PreFilter plugin.
 			registeredPlugins := []tf.RegisterPluginFunc{
 				tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
diff --git a/pkg/scheduler/testing/wrappers.go b/pkg/scheduler/testing/wrappers.go
index 43c87ab2667..7d0c402f102 100644
--- a/pkg/scheduler/testing/wrappers.go
+++ b/pkg/scheduler/testing/wrappers.go
@@ -22,12 +22,14 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
+	policy "k8s.io/api/policy/v1"
 	resourceapi "k8s.io/api/resource/v1alpha3"
 	storagev1 "k8s.io/api/storage/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 	"k8s.io/utils/ptr"
 )
@@ -231,6 +233,64 @@ func (c *ContainerWrapper) RestartPolicy(restartPolicy v1.ContainerRestartPolicy
 	return c
 }
 
+// PodDisruptionBudgetWrapper wraps a PodDisruptionBudget inside.
+type PodDisruptionBudgetWrapper struct {
+	policy.PodDisruptionBudget
+}
+
+// MakePDB creates a PodDisruptionBudget wrapper.
+func MakePDB() *PodDisruptionBudgetWrapper {
+	return &PodDisruptionBudgetWrapper{policy.PodDisruptionBudget{}}
+}
+
+// Obj returns the inner PodDisruptionBudget.
+func (p *PodDisruptionBudgetWrapper) Obj() *policy.PodDisruptionBudget {
+	return &p.PodDisruptionBudget
+}
+
+// Name sets `name` as the name of the inner PodDisruptionBudget.
+func (p *PodDisruptionBudgetWrapper) Name(name string) *PodDisruptionBudgetWrapper {
+	p.SetName(name)
+	return p
+}
+
+// Namespace sets `namespace` as the namespace of the inner PodDisruptionBudget.
+func (p *PodDisruptionBudgetWrapper) Namespace(namespace string) *PodDisruptionBudgetWrapper {
+	p.SetNamespace(namespace)
+	return p
+}
+
+// MinAvailable sets `minAvailable` to the inner PodDisruptionBudget.Spec.MinAvailable.
+func (p *PodDisruptionBudgetWrapper) MinAvailable(minAvailable string) *PodDisruptionBudgetWrapper {
+	p.Spec.MinAvailable = &intstr.IntOrString{
+		Type:   intstr.String,
+		StrVal: minAvailable,
+	}
+	return p
+}
+
+// MatchLabel adds a {key,value} to the inner PodDisruptionBudget.Spec.Selector.MatchLabels.
+func (p *PodDisruptionBudgetWrapper) MatchLabel(key, value string) *PodDisruptionBudgetWrapper {
+	selector := p.Spec.Selector
+	if selector == nil {
+		selector = &metav1.LabelSelector{}
+	}
+	matchLabels := selector.MatchLabels
+	if matchLabels == nil {
+		matchLabels = map[string]string{}
+	}
+	matchLabels[key] = value
+	selector.MatchLabels = matchLabels
+	p.Spec.Selector = selector
+	return p
+}
+
+// DisruptionsAllowed sets `disruptionsAllowed` to the inner PodDisruptionBudget.Status.DisruptionsAllowed.
+func (p *PodDisruptionBudgetWrapper) DisruptionsAllowed(disruptionsAllowed int32) *PodDisruptionBudgetWrapper {
+	p.Status.DisruptionsAllowed = disruptionsAllowed
+	return p
+}
+
 // PodWrapper wraps a Pod inside.
 type PodWrapper struct{ v1.Pod }
 
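For context on why the ordering of the victims slice matters: when preemption chooses among candidate nodes, it prefers the node whose highest-priority victim is the lowest (the "minHighestPriority" rule exercised by the new test case), and that comparison assumes each node's victims slice is sorted from high to low priority, with the highest-priority victim at the front. The sketch below is a minimal, self-contained illustration of how an unsorted slice leads to the wrong node; pickNode, the victim struct, and the hard-coded priorities are hypothetical stand-ins, not the scheduler's actual types or API.

// Illustrative sketch only, not scheduler code: why victims must stay sorted
// from highest to lowest priority for the node comparison to be correct.
package main

import "fmt"

// victim is a hypothetical stand-in for *v1.Pod that carries only a priority.
type victim struct {
	name     string
	priority int32
}

// pickNode returns the candidate node whose highest-priority victim has the
// lowest priority. It trusts that victims[0] is the highest-priority victim
// on each node, which is exactly the invariant the patch restores.
func pickNode(candidates map[string][]victim) string {
	best := ""
	bestPriority := int32(1<<31 - 1)
	for node, victims := range candidates {
		if len(victims) == 0 {
			// A node that needs no preemption always wins.
			return node
		}
		if victims[0].priority < bestPriority {
			bestPriority = victims[0].priority
			best = node
		}
	}
	return best
}

func main() {
	high, mid, low := int32(1000), int32(500), int32(100)

	// node1's victims are correctly sorted: highest priority first.
	sorted := map[string][]victim{
		"node1": {{"p1", high}, {"p2", low}},
		"node2": {{"p3", mid}},
	}
	fmt.Println(pickNode(sorted)) // node2: preempting mid beats preempting high

	// If the PDB reprieve pass leaves node1's victims unsorted, victims[0]
	// is the low-priority pod and node1 wrongly looks cheaper than node2.
	unsorted := map[string][]victim{
		"node1": {{"p2", low}, {"p1", high}},
		"node2": {{"p3", mid}},
	}
	fmt.Println(pickNode(unsorted)) // node1: the wrong victim node
}

With sorted victims the sketch nominates node2, matching the wantResult of the new test case; with node1's victims left unsorted after the PDB reprieve pass, node1 wrongly appears cheaper, which is the mis-selection that the added sort.Slice call in SelectVictimsOnNode prevents.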