Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #128307 from NoicFank/bugfix-scheduler-preemption
bugfix(scheduler): preemption picks wrong victim node with higher priority pod on it
Commit 988769933e
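Background on the bug: when several nodes are preemption candidates, the scheduler prefers the node whose highest-priority victim has the lowest priority, and that comparison only inspects the first pod in each node's victim list, on the assumption that the list is sorted from high to low priority. The sketch below is a minimal, self-contained illustration of that assumption, not scheduler code; the function name and priorities are invented for the example. If a node's victims are appended out of priority order, the first entry understates how important the evicted pods really are, and a node carrying a higher-priority pod can wrongly be picked as the victim node.

package main

import (
	"fmt"
	"sort"
)

// pickNodeWithMinHighestVictim is a hypothetical stand-in for the scheduler's
// "minimum highest-priority victim" rule: it assumes victims[0] holds the
// highest-priority victim on each node and picks the node where that value
// is lowest.
func pickNodeWithMinHighestVictim(victimsByNode map[string][]int32) string {
	best, bestPrio := "", int32(1<<30)
	for node, victims := range victimsByNode {
		if len(victims) == 0 {
			continue
		}
		if victims[0] < bestPrio {
			best, bestPrio = node, victims[0]
		}
	}
	return best
}

func main() {
	// node1 hosts victims with priorities 100 and 1000 appended out of order;
	// node2 hosts a single victim with priority 500.
	victimsByNode := map[string][]int32{
		"node1": {100, 1000}, // not sorted high-to-low: the first entry hides the priority-1000 pod
		"node2": {500},
	}
	fmt.Println(pickNodeWithMinHighestVictim(victimsByNode)) // node1, even though picking it evicts a priority-1000 pod

	// After sorting each node's victims from high to low, node2 is correctly preferred.
	for _, v := range victimsByNode {
		sort.Slice(v, func(i, j int) bool { return v[i] > v[j] })
	}
	fmt.Println(pickNodeWithMinHighestVictim(victimsByNode)) // node2
}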
@@ -191,6 +191,8 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
	}
	var victims []*v1.Pod
	numViolatingVictim := 0
	// Sort potentialVictims by pod priority from high to low, which ensures to
	// reprieve higher priority pods first.
	sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
	// Try to reprieve as many pods as possible. We first try to reprieve the PDB
	// violating victims and then other non-violating ones. In both cases, we start
@@ -225,6 +227,11 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
			return nil, 0, framework.AsStatus(err)
		}
	}

	// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
	if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
		sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
	}
	return victims, numViolatingVictim, framework.NewStatus(framework.Success)
}
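Why the added re-sort is needed: victims is assembled by first walking the PDB-violating candidates and then the non-violating ones, so pods that cannot be reprieved are appended group by group. Each group is processed in priority order, but when both groups contribute victims the concatenation is no longer sorted, which is presumably why the new sort is guarded by both slices being non-empty. A rough sketch with made-up priorities, where plain integer comparison stands in for util.MoreImportantPod:

package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical priorities of pods that could not be reprieved, appended in
	// the order SelectVictimsOnNode processes them: PDB-violating victims first,
	// then non-violating ones. Each group is internally high-to-low, but the
	// concatenation is not.
	violating := []int32{500}     // e.g. a mid-priority pod protected by a PDB
	nonViolating := []int32{1000} // e.g. a high-priority pod with no PDB constraint
	victims := append(append([]int32{}, violating...), nonViolating...)
	fmt.Println(victims) // [500 1000] -- victims[0] is no longer the highest-priority pod

	// The fix re-sorts the merged slice so downstream node selection can rely on
	// victims[0] being the most important pod.
	sort.Slice(victims, func(i, j int) bool { return victims[i] > victims[j] })
	fmt.Println(victims) // [1000 500]
}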
@@ -21,6 +21,7 @@ import (
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"math/rand"
	"sort"
	"strings"
@@ -144,6 +145,12 @@ func (pl *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, p
	return nil
}

const (
	LabelKeyIsViolatingPDB    = "test.kubernetes.io/is-violating-pdb"
	LabelValueViolatingPDB    = "violating"
	LabelValueNonViolatingPDB = "non-violating"
)

func TestPostFilter(t *testing.T) {
	metrics.Register()
	onePodRes := map[v1.ResourceName]string{v1.ResourcePods: "1"}
@@ -152,6 +159,7 @@ func TestPostFilter(t *testing.T) {
		name                  string
		pod                   *v1.Pod
		pods                  []*v1.Pod
		pdbs                  []*policy.PodDisruptionBudget
		nodes                 []*v1.Node
		filteredNodesStatuses *framework.NodeToStatus
		extender              framework.Extender
@@ -234,6 +242,29 @@ func TestPostFilter(t *testing.T) {
			wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
			wantStatus: framework.NewStatus(framework.Success),
		},
		{
			name: "pod can be made schedulable on minHighestPriority node",
			pod:  st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(veryHighPriority).Obj(),
			pods: []*v1.Pod{
				st.MakePod().Name("p1").UID("p1").Label(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).Namespace(v1.NamespaceDefault).Priority(highPriority).Node("node1").Obj(),
				st.MakePod().Name("p2").UID("p2").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(lowPriority).Node("node1").Obj(),
				st.MakePod().Name("p3").UID("p3").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(midPriority).Node("node2").Obj(),
			},
			pdbs: []*policy.PodDisruptionBudget{
				st.MakePDB().Name("violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).MinAvailable("100%").Obj(),
				st.MakePDB().Name("non-violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).MinAvailable("0").DisruptionsAllowed(math.MaxInt32).Obj(),
			},
			nodes: []*v1.Node{
				st.MakeNode().Name("node1").Capacity(onePodRes).Obj(),
				st.MakeNode().Name("node2").Capacity(onePodRes).Obj(),
			},
			filteredNodesStatuses: framework.NewNodeToStatus(map[string]*framework.Status{
				"node1": framework.NewStatus(framework.Unschedulable),
				"node2": framework.NewStatus(framework.Unschedulable),
			}, framework.NewStatus(framework.UnschedulableAndUnresolvable)),
			wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
			wantStatus: framework.NewStatus(framework.Success),
		},
		{
			name: "preemption result filtered out by extenders",
			pod:  st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(),
@@ -365,6 +396,13 @@ func TestPostFilter(t *testing.T) {
			for i := range tt.pods {
				podInformer.GetStore().Add(tt.pods[i])
			}
			pdbInformer := informerFactory.Policy().V1().PodDisruptionBudgets().Informer()
			for i := range tt.pdbs {
				if err := pdbInformer.GetStore().Add(tt.pdbs[i]); err != nil {
					t.Fatal(err)
				}
			}

			// Register NodeResourceFit as the Filter & PreFilter plugin.
			registeredPlugins := []tf.RegisterPluginFunc{
				tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
@@ -22,12 +22,14 @@ import (
	"time"

	v1 "k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1"
	resourceapi "k8s.io/api/resource/v1alpha3"
	storagev1 "k8s.io/api/storage/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	imageutils "k8s.io/kubernetes/test/utils/image"
	"k8s.io/utils/ptr"
)
@@ -231,6 +233,64 @@ func (c *ContainerWrapper) RestartPolicy(restartPolicy v1.ContainerRestartPolicy
	return c
}

// PodDisruptionBudgetWrapper wraps a PodDisruptionBudget inside.
type PodDisruptionBudgetWrapper struct {
	policy.PodDisruptionBudget
}

// MakePDB creates a PodDisruptionBudget wrapper.
func MakePDB() *PodDisruptionBudgetWrapper {
	return &PodDisruptionBudgetWrapper{policy.PodDisruptionBudget{}}
}

// Obj returns the inner PodDisruptionBudget.
func (p *PodDisruptionBudgetWrapper) Obj() *policy.PodDisruptionBudget {
	return &p.PodDisruptionBudget
}

// Name sets `name` as the name of the inner PodDisruptionBudget.
func (p *PodDisruptionBudgetWrapper) Name(name string) *PodDisruptionBudgetWrapper {
	p.SetName(name)
	return p
}

// Namespace sets `namespace` as the namespace of the inner PodDisruptionBudget.
func (p *PodDisruptionBudgetWrapper) Namespace(namespace string) *PodDisruptionBudgetWrapper {
	p.SetNamespace(namespace)
	return p
}

// MinAvailable sets `minAvailable` to the inner PodDisruptionBudget.Spec.MinAvailable.
func (p *PodDisruptionBudgetWrapper) MinAvailable(minAvailable string) *PodDisruptionBudgetWrapper {
	p.Spec.MinAvailable = &intstr.IntOrString{
		Type:   intstr.String,
		StrVal: minAvailable,
	}
	return p
}

// MatchLabel adds a {key,value} to the inner PodDisruptionBudget.Spec.Selector.MatchLabels.
func (p *PodDisruptionBudgetWrapper) MatchLabel(key, value string) *PodDisruptionBudgetWrapper {
	selector := p.Spec.Selector
	if selector == nil {
		selector = &metav1.LabelSelector{}
	}
	matchLabels := selector.MatchLabels
	if matchLabels == nil {
		matchLabels = map[string]string{}
	}
	matchLabels[key] = value
	selector.MatchLabels = matchLabels
	p.Spec.Selector = selector
	return p
}

// DisruptionsAllowed sets `disruptionsAllowed` to the inner PodDisruptionBudget.Status.DisruptionsAllowed.
func (p *PodDisruptionBudgetWrapper) DisruptionsAllowed(disruptionsAllowed int32) *PodDisruptionBudgetWrapper {
	p.Status.DisruptionsAllowed = disruptionsAllowed
	return p
}

// PodWrapper wraps a Pod inside.
type PodWrapper struct{ v1.Pod }
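With the wrapper in place, PDB fixtures can be built fluently in tests. For example, the two PDBs used by the new test case above are constructed like this (excerpted from the test; st is the scheduler testing package alias used there, and the label constants are the ones defined in the test file):

pdbs := []*policy.PodDisruptionBudget{
	// Selects the "violating" pods; its status allows no disruptions, so
	// evicting a matching pod counts as a PDB violation.
	st.MakePDB().Name("violating-pdb").Namespace(v1.NamespaceDefault).
		MatchLabel(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).
		MinAvailable("100%").Obj(),
	// Selects the "non-violating" pods and explicitly allows disruptions.
	st.MakePDB().Name("non-violating-pdb").Namespace(v1.NamespaceDefault).
		MatchLabel(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).
		MinAvailable("0").DisruptionsAllowed(math.MaxInt32).Obj(),
}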