Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-27 21:47:07 +00:00)

Merge pull request #128170 from sanposhiho/async-preemption

feature(KEP-4832): asynchronous preemption

Commit fb033826a8
@@ -580,6 +580,14 @@ const (
 	// which benefits to reduce the useless requeueing.
 	SchedulerQueueingHints featuregate.Feature = "SchedulerQueueingHints"
+
+	// owner: @sanposhiho
+	// kep: http://kep.k8s.io/4832
+	// alpha: v1.32
+	//
+	// Running some expensive operation within the scheduler's preemption asynchronously,
+	// which improves the scheduling latency when the preemption involves in.
+	SchedulerAsyncPreemption featuregate.Feature = "SchedulerAsyncPreemption"
 
 	// owner: @atosatto @yuanchen8911
 	// kep: http://kep.k8s.io/3902
 	//

@@ -633,6 +633,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
 		{Version: version.MustParse("1.29"), Default: false, PreRelease: featuregate.Alpha},
 	},
 
+	SchedulerAsyncPreemption: {
+		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
+	},
+
 	SchedulerQueueingHints: {
 		{Version: version.MustParse("1.28"), Default: false, PreRelease: featuregate.Beta},
 		{Version: version.MustParse("1.32"), Default: true, PreRelease: featuregate.Beta},
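The two hunks above add the SchedulerAsyncPreemption gate as alpha and default-off in v1.32, so anything built on it has to check the gate explicitly. A minimal sketch of how a test could opt in, using the same feature-gate testing helper this PR uses elsewhere (the import paths are the usual ones for these helpers and are an assumption here, not part of the diff):

```go
package example

import (
	"testing"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	featuregatetesting "k8s.io/component-base/featuregate/testing"

	"k8s.io/kubernetes/pkg/features"
)

// TestWithAsyncPreemption shows how a scheduler test could opt into the
// alpha gate for the duration of a single test case.
func TestWithAsyncPreemption(t *testing.T) {
	// Flip SchedulerAsyncPreemption on; the helper restores the previous
	// value when the test finishes.
	featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true)

	if !utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption) {
		t.Fatal("expected SchedulerAsyncPreemption to be enabled")
	}
}
```

Outside of tests, the gate would be enabled on the kube-scheduler with the standard --feature-gates=SchedulerAsyncPreemption=true flag.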
@@ -52,6 +52,7 @@ var ExpandedPluginsV1 = &config.Plugins{
 	PreEnqueue: config.PluginSet{
 		Enabled: []config.Plugin{
 			{Name: names.SchedulingGates},
+			{Name: names.DefaultPreemption},
 		},
 	},
 	QueueSort: config.PluginSet{

@@ -92,9 +92,11 @@ type PreEnqueueCheck func(pod *v1.Pod) bool
 type SchedulingQueue interface {
 	framework.PodNominator
 	Add(logger klog.Logger, pod *v1.Pod)
-	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
-	// The passed-in pods are originally compiled from plugins that want to activate Pods,
-	// by injecting the pods through a reserved CycleState struct (PodsToActivate).
+	// Activate moves the given pods to activeQ.
+	// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
+	// the wildcard event is registered so that the pod will be requeued when it comes back.
+	// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
+	// Activate would ignore the pod.
 	Activate(logger klog.Logger, pods map[string]*v1.Pod)
 	// AddUnschedulableIfNotPresent adds an unschedulable pod back to scheduling queue.
 	// The podSchedulingCycle represents the current scheduling cycle number which can be
@@ -411,9 +413,22 @@ func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework
 	}
 
 	if event.IsWildCard() {
+		// If the wildcard event has a Pod in newObj,
+		// that indicates that the event wants to be effective for the Pod only.
+		// Specifically, EventForceActivate could have a target Pod in newObj.
+		if newObj != nil {
+			if pod, ok := newObj.(*v1.Pod); !ok || pod.UID != pInfo.Pod.UID {
+				// This wildcard event is not for this Pod.
+				if ok {
+					logger.V(6).Info("Not worth requeuing because the event is wildcard, but for another pod", "pod", klog.KObj(pInfo.Pod), "event", event.Label(), "newObj", klog.KObj(pod))
+				}
+				return queueSkip
+			}
+		}
+
 		// If the wildcard event is special one as someone wants to force all Pods to move to activeQ/backoffQ.
 		// We return queueAfterBackoff in this case, while resetting all blocked plugins.
-		logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod))
+		logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod), "event", event.Label())
 		return queueAfterBackoff
 	}
 
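The new branch makes a wildcard event optionally Pod-scoped: when newObj carries a *v1.Pod, only the queued pod with the matching UID is worth requeueing and every other pod gets queueSkip. The decision reduces to the small predicate below (a standalone illustration with its own helper name, not the scheduler's actual code):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

// wildcardTargetsPod models the UID check added to isPodWorthRequeuing:
// a wildcard event with a Pod in newObj is only effective for that Pod.
func wildcardTargetsPod(newObj interface{}, queuedPodUID types.UID) bool {
	if newObj == nil {
		// A plain wildcard event (e.g. UnschedulableTimeout) targets every pod.
		return true
	}
	pod, ok := newObj.(*v1.Pod)
	return ok && pod.UID == queuedPodUID
}

func main() {
	target := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: "1"}}
	fmt.Println(wildcardTargetsPod(target, "1")) // matching UID: requeue after backoff
	fmt.Println(wildcardTargetsPod(target, "2")) // different UID: skip requeueing
	fmt.Println(wildcardTargetsPod(nil, "2"))    // plain wildcard: hits all pods
}
```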
@@ -590,7 +605,11 @@ func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) {
 	}
 }
 
-// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
+// Activate moves the given pods to activeQ.
+// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
+// the wildcard event is registered so that the pod will be requeued when it comes back.
+// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
+// Activate would ignore the pod.
 func (p *PriorityQueue) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
 	p.lock.Lock()
 	defer p.lock.Unlock()

@@ -599,7 +618,15 @@ func (p *PriorityQueue) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
 	for _, pod := range pods {
 		if p.activate(logger, pod) {
 			activated = true
+			continue
 		}
+
+		// If this pod is in-flight, register the activation event (for when QHint is enabled) or update moveRequestCycle (for when QHints is disabled)
+		// so that the pod will be requeued when it comes back.
+		// Specifically in the in-tree plugins, this is for the scenario with the preemption plugin
+		// where the async preemption API calls are all done or fail at some point before the Pod comes back to the queue.
+		p.activeQ.addEventsIfPodInFlight(nil, pod, []framework.ClusterEvent{framework.EventForceActivate})
+		p.moveRequestCycle = p.activeQ.schedulingCycle()
 	}
 
 	if activated {

@@ -1294,13 +1294,17 @@ func TestPriorityQueue_Delete(t *testing.T) {
 }
 
 func TestPriorityQueue_Activate(t *testing.T) {
+	metrics.Register()
 	tests := []struct {
 		name                        string
 		qPodInfoInUnschedulablePods []*framework.QueuedPodInfo
 		qPodInfoInPodBackoffQ       []*framework.QueuedPodInfo
 		qPodInActiveQ               []*v1.Pod
 		qPodInfoToActivate          *framework.QueuedPodInfo
+		qPodInInFlightPod           *v1.Pod
+		expectedInFlightEvent       *clusterEvent
 		want                        []*framework.QueuedPodInfo
+		qHintEnabled                bool
 	}{
 		{
 			name: "pod already in activeQ",
@@ -1313,6 +1317,21 @@ func TestPriorityQueue_Activate(t *testing.T) {
 			qPodInfoToActivate: &framework.QueuedPodInfo{PodInfo: highPriNominatedPodInfo},
 			want:               []*framework.QueuedPodInfo{},
 		},
+		{
+			name:                  "[QHint] pod not in unschedulablePods/podBackoffQ but in-flight",
+			qPodInfoToActivate:    &framework.QueuedPodInfo{PodInfo: highPriNominatedPodInfo},
+			qPodInInFlightPod:     highPriNominatedPodInfo.Pod,
+			expectedInFlightEvent: &clusterEvent{oldObj: (*v1.Pod)(nil), newObj: highPriNominatedPodInfo.Pod, event: framework.EventForceActivate},
+			want:                  []*framework.QueuedPodInfo{},
+			qHintEnabled:          true,
+		},
+		{
+			name:               "[QHint] pod not in unschedulablePods/podBackoffQ and not in-flight",
+			qPodInfoToActivate: &framework.QueuedPodInfo{PodInfo: highPriNominatedPodInfo},
+			qPodInInFlightPod:  medPriorityPodInfo.Pod, // different pod is in-flight
+			want:               []*framework.QueuedPodInfo{},
+			qHintEnabled:       true,
+		},
 		{
 			name:                        "pod in unschedulablePods",
 			qPodInfoInUnschedulablePods: []*framework.QueuedPodInfo{{PodInfo: highPriNominatedPodInfo}},

@@ -1329,12 +1348,30 @@ func TestPriorityQueue_Activate(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
+			featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, tt.qHintEnabled)
 			var objs []runtime.Object
 			logger, ctx := ktesting.NewTestContext(t)
 			ctx, cancel := context.WithCancel(ctx)
 			defer cancel()
 			q := NewTestQueueWithObjects(ctx, newDefaultQueueSort(), objs)
 
+			if tt.qPodInInFlightPod != nil {
+				// Put -> Pop the Pod to make it registered in inFlightPods.
+				q.activeQ.underLock(func(unlockedActiveQ unlockedActiveQueuer) {
+					unlockedActiveQ.AddOrUpdate(newQueuedPodInfoForLookup(tt.qPodInInFlightPod))
+				})
+				p, err := q.activeQ.pop(logger)
+				if err != nil {
+					t.Fatalf("Pop failed: %v", err)
+				}
+				if p.Pod.Name != tt.qPodInInFlightPod.Name {
+					t.Errorf("Unexpected popped pod: %v", p.Pod.Name)
+				}
+				if len(q.activeQ.listInFlightEvents()) != 1 {
+					t.Fatal("Expected the pod to be recorded in in-flight events, but it doesn't")
+				}
+			}
+
 			// Prepare activeQ/unschedulablePods/podBackoffQ according to the table
 			for _, qPod := range tt.qPodInActiveQ {
 				q.Add(logger, qPod)

@@ -1353,7 +1390,29 @@ func TestPriorityQueue_Activate(t *testing.T) {
 
 			// Check the result after activation by the length of activeQ
 			if wantLen := len(tt.want); q.activeQ.len() != wantLen {
-				t.Errorf("length compare: want %v, got %v", wantLen, q.activeQ.len())
+				t.Fatalf("length compare: want %v, got %v", wantLen, q.activeQ.len())
+			}
+
+			if tt.expectedInFlightEvent != nil {
+				if len(q.activeQ.listInFlightEvents()) != 2 {
+					t.Fatalf("Expected two in-flight event to be recorded, but got %v events", len(q.activeQ.listInFlightEvents()))
+				}
+				found := false
+				for _, e := range q.activeQ.listInFlightEvents() {
+					event, ok := e.(*clusterEvent)
+					if !ok {
+						continue
+					}
+
+					if d := cmp.Diff(tt.expectedInFlightEvent, event, cmpopts.EquateComparable(clusterEvent{})); d != "" {
+						t.Fatalf("Unexpected in-flight event (-want, +got):\n%s", d)
+					}
+					found = true
+				}
+
+				if !found {
+					t.Fatalf("Expected in-flight event to be recorded, but it wasn't.")
+				}
 			}
 
 			// Check if the specific pod exists in activeQ

@@ -3779,6 +3838,7 @@ func mustNewPodInfo(pod *v1.Pod) *framework.PodInfo {
 
 // Test_isPodWorthRequeuing tests isPodWorthRequeuing function.
 func Test_isPodWorthRequeuing(t *testing.T) {
+	metrics.Register()
 	count := 0
 	queueHintReturnQueue := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
 		count++

@@ -3857,11 +3917,37 @@ func Test_isPodWorthRequeuing(t *testing.T) {
 			},
 			event:                  framework.EventUnschedulableTimeout,
 			oldObj:                 nil,
-			newObj:                 st.MakeNode().Obj(),
+			newObj:                 nil,
 			expected:               queueAfterBackoff,
 			expectedExecutionCount: 0,
 			queueingHintMap:        QueueingHintMapPerProfile{},
 		},
+		{
+			name: "return Queue when the event is wildcard and the wildcard targets the pod to be requeued right now",
+			podInfo: &framework.QueuedPodInfo{
+				UnschedulablePlugins: sets.New("fooPlugin1"),
+				PodInfo:              mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()),
+			},
+			event:                  framework.EventForceActivate,
+			oldObj:                 nil,
+			newObj:                 st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj(),
+			expected:               queueAfterBackoff,
+			expectedExecutionCount: 0,
+			queueingHintMap:        QueueingHintMapPerProfile{},
+		},
+		{
+			name: "return Skip when the event is wildcard, but the wildcard targets a different pod",
+			podInfo: &framework.QueuedPodInfo{
+				UnschedulablePlugins: sets.New("fooPlugin1"),
+				PodInfo:              mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()),
+			},
+			event:                  framework.EventForceActivate,
+			oldObj:                 nil,
+			newObj:                 st.MakePod().Name("pod-different").Namespace("ns2").UID("2").Obj(),
+			expected:               queueSkip,
+			expectedExecutionCount: 0,
+			queueingHintMap:        QueueingHintMapPerProfile{},
+		},
 		{
 			name: "interprets Queue from the Pending plugin as queueImmediately",
 			podInfo: &framework.QueuedPodInfo{
@@ -34,6 +34,9 @@ const (
 	// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
 	// to activeQ. Usually it's triggered by plugin implementations.
 	ForceActivate = "ForceActivate"
+	// UnschedulableTimeout is the event when a pod is moved from unschedulablePods
+	// due to the timeout specified at pod-max-in-unschedulable-pods-duration.
+	UnschedulableTimeout = "UnschedulableTimeout"
 )
 
 var (

@@ -50,7 +53,9 @@ var (
 	// EventUnscheduledPodDelete is the event when an unscheduled pod is deleted.
 	EventUnscheduledPodDelete = ClusterEvent{Resource: unschedulablePod, ActionType: Delete}
 	// EventUnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
-	EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, label: "UnschedulableTimeout"}
+	EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, label: UnschedulableTimeout}
+	// EventForceActivate is the event when a pod is moved from unschedulablePods/backoffQ to activeQ.
+	EventForceActivate = ClusterEvent{Resource: WildCard, ActionType: All, label: ForceActivate}
 )
 
 // PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).

@@ -770,6 +770,8 @@ type Framework interface {
 
 	// SetPodNominator sets the PodNominator
 	SetPodNominator(nominator PodNominator)
+	// SetPodActivator sets the PodActivator
+	SetPodActivator(activator PodActivator)
 
 	// Close calls Close method of each plugin.
 	Close() error

@@ -783,6 +785,8 @@ type Handle interface {
 	PodNominator
 	// PluginsRunner abstracts operations to run some plugins.
 	PluginsRunner
+	// PodActivator abstracts operations in the scheduling queue.
+	PodActivator
 	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
 	// is taken at the beginning of a scheduling cycle and remains unchanged until
 	// a pod finishes "Permit" point.

@@ -896,6 +900,16 @@ func (ni *NominatingInfo) Mode() NominatingMode {
 	return ni.NominatingMode
 }
 
+// PodActivator abstracts operations in the scheduling queue.
+type PodActivator interface {
+	// Activate moves the given pods to activeQ.
+	// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
+	// the wildcard event is registered so that the pod will be requeued when it comes back.
+	// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
+	// Activate would ignore the pod.
+	Activate(logger klog.Logger, pods map[string]*v1.Pod)
+}
+
 // PodNominator abstracts operations to maintain nominated Pods.
 type PodNominator interface {
 	// AddNominatedPod adds the given pod to the nominator or
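The PodActivator interface added here is what the preemption goroutine later calls to get the preemptor back into activeQ when an API call fails. The PR's own tests register an empty fakePodActivator; a test double that also records what was activated only needs the same one-method signature (a sketch, not part of this PR):

```go
package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
)

// recordingPodActivator is a test double matching the new PodActivator
// interface: Activate(logger klog.Logger, pods map[string]*v1.Pod).
type recordingPodActivator struct {
	activated map[string]*v1.Pod
}

func newRecordingPodActivator() *recordingPodActivator {
	return &recordingPodActivator{activated: map[string]*v1.Pod{}}
}

// Activate only records the pods it was asked to move to activeQ,
// so a test can assert which pods the plugin tried to re-activate.
func (f *recordingPodActivator) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
	for name, pod := range pods {
		logger.V(4).Info("fake activation", "pod", name)
		f.activated[name] = pod
	}
}
```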
@@ -53,9 +53,11 @@ type DefaultPreemption struct {
 	args      config.DefaultPreemptionArgs
 	podLister corelisters.PodLister
 	pdbLister policylisters.PodDisruptionBudgetLister
+	Evaluator *preemption.Evaluator
 }
 
 var _ framework.PostFilterPlugin = &DefaultPreemption{}
+var _ framework.PreEnqueuePlugin = &DefaultPreemption{}
 
 // Name returns name of the plugin. It is used in logs, etc.
 func (pl *DefaultPreemption) Name() string {

@@ -71,13 +73,19 @@ func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feat
 	if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
 		return nil, err
 	}
 
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := getPDBLister(fh.SharedInformerFactory())
+
 	pl := DefaultPreemption{
 		fh:        fh,
 		fts:       fts,
 		args:      *args,
-		podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(),
-		pdbLister: getPDBLister(fh.SharedInformerFactory()),
+		podLister: podLister,
+		pdbLister: pdbLister,
 	}
+	pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
+
 	return &pl, nil
 }
 

@@ -87,16 +95,7 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 		metrics.PreemptionAttempts.Inc()
 	}()
 
-	pe := preemption.Evaluator{
-		PluginName: names.DefaultPreemption,
-		Handler:    pl.fh,
-		PodLister:  pl.podLister,
-		PdbLister:  pl.pdbLister,
-		State:      state,
-		Interface:  pl,
-	}
-
-	result, status := pe.Preempt(ctx, pod, m)
+	result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
 	msg := status.Message()
 	if len(msg) > 0 {
 		return result, framework.NewStatus(status.Code(), "preemption: "+msg)

@@ -104,6 +103,24 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 	return result, status
 }
 
+func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
+	if !pl.fts.EnableAsyncPreemption {
+		return nil
+	}
+	if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
+		return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
+	}
+	return nil
+}
+
+// EventsToRegister returns the possible events that may make a Pod
+// failed by this plugin schedulable.
+func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
+	// The plugin moves the preemptor Pod to acviteQ/backoffQ once the preemption API calls are all done,
+	// and we don't need to move the Pod with any events.
+	return nil, nil
+}
+
 // calculateNumCandidates returns the number of candidates the FindCandidates
 // method must produce from dry running based on the constraints given by
 // <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
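The PreEnqueue added here is the other half of the async design: PostFilter kicks off the API calls in a goroutine, and until that goroutine finishes, PreEnqueue keeps the preemptor out of a new scheduling cycle with UnschedulableAndUnresolvable. EventsToRegister can stay empty because the pod is brought back either by the victims' deletion events or by an explicit Activate on error. A self-contained, plain-Go model of that gate (illustrative only; in the real code the in-flight mark is cleared just before the last victim is evicted, and Activate is only called when an API call failed):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// gate is a toy model of Evaluator.preempting plus DefaultPreemption.PreEnqueue:
// a pod is kept out of the scheduling cycle while its async API calls run.
type gate struct {
	mu       sync.RWMutex
	inFlight map[string]bool
	activate func(pod string) // stands in for re-activating the pod in the queue
}

// preEnqueue mirrors the new PreEnqueue: reject the pod while work is running.
func (g *gate) preEnqueue(pod string) error {
	g.mu.RLock()
	defer g.mu.RUnlock()
	if g.inFlight[pod] {
		return fmt.Errorf("waiting for the preemption for this pod to be finished")
	}
	return nil
}

// startAsync marks the pod in-flight, runs the API calls in a goroutine,
// then clears the mark and re-activates the pod (simplified).
func (g *gate) startAsync(pod string, apiCalls func()) {
	g.mu.Lock()
	g.inFlight[pod] = true
	g.mu.Unlock()
	go func() {
		apiCalls()
		g.mu.Lock()
		delete(g.inFlight, pod)
		g.mu.Unlock()
		g.activate(pod)
	}()
}

func main() {
	done := make(chan struct{})
	g := &gate{
		inFlight: map[string]bool{},
		activate: func(pod string) { fmt.Println("activate", pod); close(done) },
	}
	g.startAsync("preemptor", func() { time.Sleep(10 * time.Millisecond) })
	fmt.Println("while running:", g.preEnqueue("preemptor"))
	<-done
	fmt.Println("after completion:", g.preEnqueue("preemptor"))
}
```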
@@ -25,6 +25,7 @@ import (
 	"math/rand"
 	"sort"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 

@@ -37,10 +38,12 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/strategicpatch"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/informers"
 	clientsetfake "k8s.io/client-go/kubernetes/fake"
 	clienttesting "k8s.io/client-go/testing"
 	"k8s.io/client-go/tools/events"
+	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/ktesting"
 	kubeschedulerconfigv1 "k8s.io/kube-scheduler/config/v1"
 	extenderv1 "k8s.io/kube-scheduler/extender/v1"

@@ -436,6 +439,7 @@ func TestPostFilter(t *testing.T) {
 				pdbLister: getPDBLister(informerFactory),
 				args:      *getDefaultDefaultPreemptionArgs(),
 			}
+			p.Evaluator = preemption.NewEvaluator(names.DefaultPreemption, f, &p, false)
 
 			state := framework.NewCycleState()
 			// Ensure <state> is populated.

@@ -1206,11 +1210,10 @@ func TestDryRunPreemption(t *testing.T) {
 				Handler:   pl.fh,
 				PodLister: pl.podLister,
 				PdbLister: pl.pdbLister,
-				State:     state,
 				Interface: pl,
 			}
 			offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-			got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, tt.pdbs, offset, numCandidates)
+			got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, tt.pdbs, offset, numCandidates)
 			// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
 			for i := range got {
 				victims := got[i].Victims().Pods

@@ -1447,11 +1450,10 @@ func TestSelectBestCandidate(t *testing.T) {
 				Handler:   pl.fh,
 				PodLister: pl.podLister,
 				PdbLister: pl.pdbLister,
-				State:     state,
 				Interface: pl,
 			}
 			offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-			candidates, _, _ := pe.DryRunPreemption(ctx, tt.pod, nodeInfos, nil, offset, numCandidates)
+			candidates, _, _ := pe.DryRunPreemption(ctx, state, tt.pod, nodeInfos, nil, offset, numCandidates)
 			s := pe.SelectCandidate(ctx, candidates)
 			if s == nil || len(s.Name()) == 0 {
 				return

@@ -1548,7 +1550,9 @@ func TestPodEligibleToPreemptOthers(t *testing.T) {
 		})
 	}
 }
 
 func TestPreempt(t *testing.T) {
+	metrics.Register()
 	tests := []struct {
 		name string
 		pod  *v1.Pod
@@ -1713,16 +1717,29 @@ func TestPreempt(t *testing.T) {
 	}
 
 	labelKeys := []string{"hostname", "zone", "region"}
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			client := clientsetfake.NewClientset()
 			informerFactory := informers.NewSharedInformerFactory(client, 0)
 			podInformer := informerFactory.Core().V1().Pods().Informer()
-			podInformer.GetStore().Add(test.pod)
+			testPod := test.pod.DeepCopy()
+			testPods := make([]*v1.Pod, len(test.pods))
 			for i := range test.pods {
-				podInformer.GetStore().Add(test.pods[i])
+				testPods[i] = test.pods[i].DeepCopy()
 			}
+
+			if err := podInformer.GetStore().Add(testPod); err != nil {
+				t.Fatalf("Failed to add test pod %s: %v", testPod.Name, err)
+			}
+			for i := range testPods {
+				if err := podInformer.GetStore().Add(testPods[i]); err != nil {
+					t.Fatalf("Failed to add test pod %s: %v", testPods[i], err)
+				}
+			}
+
+			// Need to protect deletedPodNames and patchedPodNames to prevent DATA RACE panic.
+			var mu sync.RWMutex
 			deletedPodNames := sets.New[string]()
 			patchedPodNames := sets.New[string]()
 			patchedPods := []*v1.Pod{}

@@ -1748,10 +1765,14 @@ func TestPreempt(t *testing.T) {
 					t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err)
 				}
 				patchedPods = append(patchedPods, updatedPod)
+				mu.Lock()
+				defer mu.Unlock()
 				patchedPodNames.Insert(podName)
 				return true, nil, nil
 			})
 			client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
+				mu.Lock()
+				defer mu.Unlock()
 				deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName())
 				return true, nil, nil
 			})

@@ -1763,8 +1784,10 @@ func TestPreempt(t *testing.T) {
 			waitingPods := frameworkruntime.NewWaitingPodsMap()
 
 			cache := internalcache.New(ctx, time.Duration(0))
-			for _, pod := range test.pods {
-				cache.AddPod(logger, pod)
+			for _, pod := range testPods {
+				if err := cache.AddPod(logger, pod.DeepCopy()); err != nil {
+					t.Fatalf("Failed to add pod %s: %v", pod.Name, err)
+				}
 			}
 			cachedNodeInfoMap := map[string]*framework.NodeInfo{}
 			nodes := make([]*v1.Node, len(test.nodeNames))

@@ -1777,6 +1800,7 @@ func TestPreempt(t *testing.T) {
 					node.ObjectMeta.Labels[labelKeys[i]] = label
 				}
 				node.Name = node.ObjectMeta.Labels["hostname"]
+				t.Logf("node is added: %v. labels: %#v", node.Name, node.ObjectMeta.Labels)
 				cache.AddNode(logger, node)
 				nodes[i] = node
 

@@ -1803,10 +1827,11 @@ func TestPreempt(t *testing.T) {
 				frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
 				frameworkruntime.WithExtenders(extenders),
 				frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
-				frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(test.pods, nodes)),
+				frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(testPods, nodes)),
 				frameworkruntime.WithInformerFactory(informerFactory),
 				frameworkruntime.WithWaitingPods(waitingPods),
 				frameworkruntime.WithLogger(logger),
+				frameworkruntime.WithPodActivator(&fakePodActivator{}),
 			)
 			if err != nil {
 				t.Fatal(err)

@@ -1814,7 +1839,7 @@ func TestPreempt(t *testing.T) {
 
 			state := framework.NewCycleState()
 			// Some tests rely on PreFilter plugin to compute its CycleState.
-			if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, test.pod); !s.IsSuccess() {
+			if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, testPod); !s.IsSuccess() {
 				t.Errorf("Unexpected preFilterStatus: %v", s)
 			}
 			// Call preempt and check the expected results.

@@ -1825,14 +1850,7 @@ func TestPreempt(t *testing.T) {
 				args:      *getDefaultDefaultPreemptionArgs(),
 			}
 
-			pe := preemption.Evaluator{
-				PluginName: names.DefaultPreemption,
-				Handler:    pl.fh,
-				PodLister:  pl.podLister,
-				PdbLister:  pl.pdbLister,
-				State:      state,
-				Interface:  &pl,
-			}
-
+			pe := preemption.NewEvaluator(names.DefaultPreemption, pl.fh, &pl, asyncPreemptionEnabled)
 			// so that these nodes are eligible for preemption, we set their status
 			// to Unschedulable.

@@ -1842,16 +1860,33 @@ func TestPreempt(t *testing.T) {
 				nodeToStatusMap.Set(n.Name, framework.NewStatus(framework.Unschedulable))
 			}
 
-			res, status := pe.Preempt(ctx, test.pod, nodeToStatusMap)
+			res, status := pe.Preempt(ctx, state, testPod, nodeToStatusMap)
 			if !status.IsSuccess() && !status.IsRejected() {
 				t.Errorf("unexpected error in preemption: %v", status.AsError())
 			}
 			if diff := cmp.Diff(test.want, res); diff != "" {
 				t.Errorf("Unexpected status (-want, +got):\n%s", diff)
 			}
-			if len(deletedPodNames) != len(test.expectedPods) {
-				t.Errorf("expected %v pods, got %v.", len(test.expectedPods), len(deletedPodNames))
+			if asyncPreemptionEnabled {
+				// Wait for the pod to be deleted.
+				if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+					mu.RLock()
+					defer mu.RUnlock()
+					return len(deletedPodNames) == len(test.expectedPods), nil
+				}); err != nil {
+					t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+				}
 			}
+			} else {
+				mu.RLock()
+				// If async preemption is disabled, the pod should be deleted immediately.
+				if len(deletedPodNames) != len(test.expectedPods) {
+					t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+				}
+				mu.RUnlock()
+			}
+
+			mu.RLock()
 			if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" {
 				t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff)
 			}

@@ -1884,20 +1919,21 @@ func TestPreempt(t *testing.T) {
 				}
 			}
 			if res != nil && res.NominatingInfo != nil {
-				test.pod.Status.NominatedNodeName = res.NominatedNodeName
+				testPod.Status.NominatedNodeName = res.NominatedNodeName
 			}
 
 			// Manually set the deleted Pods' deletionTimestamp to non-nil.
-			for _, pod := range test.pods {
+			for _, pod := range testPods {
 				if deletedPodNames.Has(pod.Name) {
 					now := metav1.Now()
 					pod.DeletionTimestamp = &now
 					deletedPodNames.Delete(pod.Name)
 				}
 			}
+			mu.RUnlock()
 
 			// Call preempt again and make sure it doesn't preempt any more pods.
-			res, status = pe.Preempt(ctx, test.pod, framework.NewDefaultNodeToStatus())
+			res, status = pe.Preempt(ctx, state, testPod, framework.NewDefaultNodeToStatus())
 			if !status.IsSuccess() && !status.IsRejected() {
 				t.Errorf("unexpected error in preemption: %v", status.AsError())
 			}

@@ -1907,3 +1943,9 @@ func TestPreempt(t *testing.T) {
 		})
 	}
 }
+}
+
+type fakePodActivator struct {
+}
+
+func (f *fakePodActivator) Activate(logger klog.Logger, pods map[string]*v1.Pod) {}
@@ -28,4 +28,5 @@ type Features struct {
 	EnableInPlacePodVerticalScaling bool
 	EnableSidecarContainers         bool
 	EnableSchedulingQueueHint       bool
+	EnableAsyncPreemption           bool
 }

@@ -54,6 +54,7 @@ func NewInTreeRegistry() runtime.Registry {
 		EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
 		EnableSidecarContainers:         feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
 		EnableSchedulingQueueHint:       feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
+		EnableAsyncPreemption:           feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
 	}
 
 	registry := runtime.Registry{

@@ -23,11 +23,15 @@ import (
 	"math"
 	"sync"
 	"sync/atomic"
+	"time"
 
 	v1 "k8s.io/api/core/v1"
 	policy "k8s.io/api/policy/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/apimachinery/pkg/util/sets"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	policylisters "k8s.io/client-go/listers/policy/v1"
 	corev1helpers "k8s.io/component-helpers/scheduling/corev1"

@@ -36,6 +40,7 @@ import (
 	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
+	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	"k8s.io/kubernetes/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/pkg/scheduler/util"
 )
@@ -125,10 +130,88 @@ type Evaluator struct {
 	Handler   framework.Handle
 	PodLister corelisters.PodLister
 	PdbLister policylisters.PodDisruptionBudgetLister
-	State     *framework.CycleState
+
+	enableAsyncPreemption bool
+	mu                    sync.RWMutex
+	// preempting is a set that records the pods that are currently triggering preemption asynchronously,
+	// which is used to prevent the pods from entering the scheduling cycle meanwhile.
+	preempting sets.Set[types.UID]
+
+	// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
+	// This is exposed to be replaced during tests.
+	PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
+
 	Interface
 }
 
+func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
+
+	ev := &Evaluator{
+		PluginName:            names.DefaultPreemption,
+		Handler:               fh,
+		PodLister:             podLister,
+		PdbLister:             pdbLister,
+		Interface:             i,
+		enableAsyncPreemption: enableAsyncPreemption,
+		preempting:            sets.New[types.UID](),
+	}
+
+	// PreemptPod actually makes API calls to preempt a specific Pod.
+	//
+	// We implement it here directly, rather than creating a separate method like ev.preemptPod(...)
+	// to prevent the misuse of the PreemptPod function.
+	ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
+		logger := klog.FromContext(ctx)
+
+		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
+		// Otherwise we should delete the victim.
+		if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
+			waitingPod.Reject(pluginName, "preempted")
+			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
+		} else {
+			condition := &v1.PodCondition{
+				Type:    v1.DisruptionTarget,
+				Status:  v1.ConditionTrue,
+				Reason:  v1.PodReasonPreemptionByScheduler,
+				Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
+			}
+			newStatus := victim.Status.DeepCopy()
+			updated := apipod.UpdatePodCondition(newStatus, condition)
+			if updated {
+				if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
+					logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+					return err
+				}
+			}
+			if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
+				if !apierrors.IsNotFound(err) {
+					logger.Error(err, "Tried to preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+					return err
+				}
+				logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+				return nil
+			}
+			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+		}
+
+		ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())
+
+		return nil
+	}
+
+	return ev
+}
+
+// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
+func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
+	ev.mu.RLock()
+	defer ev.mu.RUnlock()
+
+	return ev.preempting.Has(podUID)
+}
+
 // Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
 // The semantics of returned <PostFilterResult, Status> varies on different scenarios:
 //
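NewEvaluator wires the API-calling logic into the exported PreemptPod field precisely so tests can swap it out, as the comment above says. A sketch of such a stub (the import path is assumed to be the in-tree pkg/scheduler/framework/preemption package; hypothetical test helper, not from this PR):

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"

	"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
)

// stubPreemptPod replaces the Evaluator's API-calling hook with one that
// only records victim names, so a unit test can assert which victims were
// chosen without a real (or fake) API server round-trip.
func stubPreemptPod(ev *preemption.Evaluator) *[]string {
	var evicted []string
	ev.PreemptPod = func(ctx context.Context, c preemption.Candidate, preemptor, victim *v1.Pod, pluginName string) error {
		evicted = append(evicted, victim.Name)
		return nil
	}
	return &evicted
}
```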
@@ -145,7 +228,7 @@ type Evaluator struct {
 //
 // - <non-nil PostFilterResult, Success>. It's the regular happy path
 // and the non-empty nominatedNodeName will be applied to the preemptor pod.
-func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
+func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
 	logger := klog.FromContext(ctx)
 
 	// 0) Fetch the latest version of <pod>.

@@ -171,7 +254,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 	if err != nil {
 		return nil, framework.AsStatus(err)
 	}
-	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, allNodes, pod, m)
+	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
 	if err != nil && len(candidates) == 0 {
 		return nil, framework.AsStatus(err)
 	}

@@ -203,17 +286,23 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
 	}
 
+	logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))
+
 	// 5) Perform preparation work before nominating the selected candidate.
+	if ev.enableAsyncPreemption {
+		ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName)
+	} else {
 	if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
 		return nil, status
 	}
+	}
 
 	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
 }
 
 // FindCandidates calculates a slice of preemption candidates.
 // Each candidate is executable to make the given <pod> schedulable.
-func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
+func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
 	if len(allNodes) == 0 {
 		return nil, nil, errors.New("no nodes available")
 	}

@@ -239,7 +328,7 @@ func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.N
 	}
 
 	offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
-	return ev.DryRunPreemption(ctx, pod, potentialNodes, pdbs, offset, candidatesNum)
+	return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
 }
 
 // callExtenders calls given <extenders> to select the list of feasible candidates.
@@ -347,41 +436,11 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
 	defer cancel()
 	logger := klog.FromContext(ctx)
 	errCh := parallelize.NewErrorChannel()
-	preemptPod := func(index int) {
-		victim := c.Victims().Pods[index]
-		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
-		// Otherwise we should delete the victim.
-		if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil {
-			waitingPod.Reject(pluginName, "preempted")
-			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(pod), "waitingPod", klog.KObj(victim), "node", c.Name())
-		} else {
-			condition := &v1.PodCondition{
-				Type:    v1.DisruptionTarget,
-				Status:  v1.ConditionTrue,
-				Reason:  v1.PodReasonPreemptionByScheduler,
-				Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", pod.Spec.SchedulerName),
-			}
-			newStatus := victim.Status.DeepCopy()
-			updated := apipod.UpdatePodCondition(newStatus, condition)
-			if updated {
-				if err := util.PatchPodStatus(ctx, cs, victim, newStatus); err != nil {
-					logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-					errCh.SendErrorWithCancel(err, cancel)
-					return
-				}
-			}
-			if err := util.DeletePod(ctx, cs, victim); err != nil {
-				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-				errCh.SendErrorWithCancel(err, cancel)
-				return
-			}
-			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(pod), "victim", klog.KObj(victim), "node", c.Name())
-		}
-
-		fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", pod.UID, c.Name())
-	}
-
-	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), preemptPod, ev.PluginName)
+	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
+		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
+			errCh.SendErrorWithCancel(err, cancel)
+		}
+	}, ev.PluginName)
 	if err := errCh.ReceiveError(); err != nil {
 		return framework.AsStatus(err)
 	}
@ -401,6 +460,91 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
return nil
}

// prepareCandidateAsync triggers a goroutine for some preparation work:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
// The Pod won't be retried until the goroutine triggered here completes.
//
// See http://kep.k8s.io/4832 for how the async preemption works.
func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) {
metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))

// Intentionally create a new context rather than using the ctx from the scheduling cycle,
// because this process could continue even after this scheduling cycle finishes.
ctx, cancel := context.WithCancel(context.Background())
errCh := parallelize.NewErrorChannel()
preemptPod := func(index int) {
victim := c.Victims().Pods[index]
if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
errCh.SendErrorWithCancel(err, cancel)
}
}

ev.mu.Lock()
ev.preempting.Insert(pod.UID)
ev.mu.Unlock()

logger := klog.FromContext(ctx)
go func() {
startTime := time.Now()
result := metrics.GoroutineResultSuccess
defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
defer func() {
if result == metrics.GoroutineResultError {
// When an API call isn't successful, the Pod may get stuck in the unschedulable pod pool in the worst case.
// So, we should move the Pod to the activeQ.
ev.Handler.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
}
}()
defer cancel()
logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))

// Lower priority pods nominated to run on this node may no longer fit on
// this node, so we should remove their nomination. Removing their
// nomination updates these pods and moves them to the active queue. It
// lets the scheduler find another place for them.
nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field from lower priority pods on the same target node", "node", c.Name())
result = metrics.GoroutineResultError
// We do not return as this error is not critical.
}

if len(c.Victims().Pods) == 0 {
ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()

return
}

// We can evict all victims in parallel, except for the last one.
// We have to remove the pod from the preempting map before the last one is evicted
// because, otherwise, the pod removal might be notified to the scheduling queue before
// we remove this pod from the preempting map,
// and the pod could end up stuck in the unschedulable pod pool
// because all the pod removal events would be ignored.
ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}

ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()

if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}

logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result)
}()
}
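Editor's note: a minimal sketch (not part of the diff) of how the evaluator is meant to dispatch between the synchronous and asynchronous paths. The enableAsyncPreemption field and bestCandidate variable names are assumptions used only for illustration; the field is assumed to be wired from the SchedulerAsyncPreemption feature gate.

// Illustrative sketch only, not the merged code.
if ev.enableAsyncPreemption {
	// Fire-and-forget: the preemptor is kept gated until the goroutine finishes.
	ev.prepareCandidateAsync(bestCandidate, pod, pluginName)
} else {
	if status := ev.prepareCandidate(ctx, bestCandidate, pod, pluginName); !status.IsSuccess() {
		return nil, status
	}
}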
func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
if pdbLister != nil {
return pdbLister.List(labels.Everything())
@ -538,7 +682,7 @@ func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {

fh := ev.Handler
@ -557,7 +701,7 @@ func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentia
nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)

stateCopy := ev.State.Clone()
stateCopy := state.Clone()
pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
if status.IsSuccess() && len(pods) != 0 {
victims := extenderv1.Victims{
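Editor's note: with this signature change, callers pass the scheduling cycle's CycleState explicitly instead of reading it from the Evaluator. A minimal sketch of the updated call site, mirroring the test changes later in this diff:

// Sketch only: the CycleState of the current cycle is now an explicit argument.
candidates, nodeToStatus, err := ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)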
@ -20,18 +20,26 @@ import (
"context"
"errors"
"fmt"
"reflect"
"sort"
"sync"
"testing"
"time"

"github.com/google/go-cmp/cmp"

v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
clientsetfake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/kubernetes/scheme"
clienttesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/events"
"k8s.io/klog/v2"
"k8s.io/klog/v2/ktesting"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
@ -81,6 +89,19 @@ func (pl *FakePostFilterPlugin) OrderedScoreFuncs(ctx context.Context, nodesToVi
return nil
}

type fakePodActivator struct {
activatedPods map[string]*v1.Pod
mu *sync.RWMutex
}

func (f *fakePodActivator) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
f.mu.Lock()
defer f.mu.Unlock()
for name, pod := range pods {
f.activatedPods[name] = pod
}
}

type FakePreemptionScorePostFilterPlugin struct{}

func (pl *FakePreemptionScorePostFilterPlugin) SelectVictimsOnNode(
@ -243,9 +264,8 @@ func TestDryRunPreemption(t *testing.T) {
PluginName: "FakePostFilter",
Handler: fwk,
Interface: fakePostPlugin,
State: state,
}
got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
for i := range got {
victims := got[i].Victims().Pods
@ -344,9 +364,8 @@ func TestSelectCandidate(t *testing.T) {
PluginName: "FakePreemptionScorePostFilter",
Handler: fwk,
Interface: fakePreemptionScorePostFilterPlugin,
State: state,
}
candidates, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
candidates, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
s := pe.SelectCandidate(ctx, candidates)
if s == nil || len(s.Name()) == 0 {
t.Errorf("expect any node in %v, but no candidate selected", tt.expected)
@ -393,6 +412,11 @@ func TestPrepareCandidate(t *testing.T) {
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
Obj()

failVictim = st.MakePod().Name("fail-victim").UID("victim1").
Node(node1Name).SchedulerName(defaultSchedulerName).Priority(midPriority).
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
Obj()

victim2 = st.MakePod().Name("victim2").UID("victim2").
Node(node1Name).SchedulerName(defaultSchedulerName).Priority(50000).
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
@ -404,6 +428,12 @@ func TestPrepareCandidate(t *testing.T) {
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
Obj()

failVictim1WithMatchingCondition = st.MakePod().Name("fail-victim").UID("victim1").
Node(node1Name).SchedulerName(defaultSchedulerName).Priority(midPriority).
Conditions([]v1.PodCondition{condition}).
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
Obj()

preemptor = st.MakePod().Name("preemptor").UID("preemptor").
SchedulerName(defaultSchedulerName).Priority(highPriority).
Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}).
@ -416,10 +446,18 @@ func TestPrepareCandidate(t *testing.T) {
candidate *fakeCandidate
preemptor *v1.Pod
testPods []*v1.Pod
expectedDeletedPods []string
expectedDeletionError bool
expectedPatchError bool
// Only compared when async preemption is disabled.
expectedStatus *framework.Status
// Only compared when async preemption is enabled.
expectedPreemptingMap sets.Set[types.UID]
expectedActivatedPods map[string]*v1.Pod
}{
{
name: "no victims",

candidate: &fakeCandidate{
victims: &extenderv1.Victims{},
},
@ -429,9 +467,11 @@ func TestPrepareCandidate(t *testing.T) {
},
nodeNames: []string{node1Name},
expectedStatus: nil,
expectedPreemptingMap: sets.New(types.UID("preemptor")),
},
{
name: "one victim without condition",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
@ -445,10 +485,13 @@ func TestPrepareCandidate(t *testing.T) {
victim1,
},
nodeNames: []string{node1Name},
expectedDeletedPods: []string{"victim1"},
expectedStatus: nil,
expectedPreemptingMap: sets.New(types.UID("preemptor")),
},
{
name: "one victim with same condition",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
@ -462,10 +505,13 @@ func TestPrepareCandidate(t *testing.T) {
victim1WithMatchingCondition,
},
nodeNames: []string{node1Name},
expectedDeletedPods: []string{"victim1"},
expectedStatus: nil,
expectedPreemptingMap: sets.New(types.UID("preemptor")),
},
{
name: "one victim, but patch pod failed (not found victim1 pod)",
name: "one victim, not-found victim error is ignored when patching",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
@ -477,10 +523,32 @@ func TestPrepareCandidate(t *testing.T) {
preemptor: preemptor,
testPods: []*v1.Pod{},
nodeNames: []string{node1Name},
expectedStatus: framework.AsStatus(errors.New("patch pod status failed")),
expectedDeletedPods: []string{"victim1"},
expectedStatus: nil,
expectedPreemptingMap: sets.New(types.UID("preemptor")),
},
{
name: "one victim, but delete pod failed (not found victim1 pod)",
name: "one victim, but pod deletion failed",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
Pods: []*v1.Pod{
failVictim1WithMatchingCondition,
},
},
},
preemptor: preemptor,
testPods: []*v1.Pod{},
expectedDeletionError: true,
nodeNames: []string{node1Name},
expectedStatus: framework.AsStatus(errors.New("delete pod failed")),
expectedPreemptingMap: sets.New(types.UID("preemptor")),
expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
},
{
name: "one victim, not-found victim error is ignored when deleting",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
@ -492,15 +560,37 @@ func TestPrepareCandidate(t *testing.T) {
preemptor: preemptor,
testPods: []*v1.Pod{},
nodeNames: []string{node1Name},
expectedStatus: framework.AsStatus(errors.New("delete pod failed")),
expectedDeletedPods: []string{"victim1"},
expectedStatus: nil,
expectedPreemptingMap: sets.New(types.UID("preemptor")),
},
{
name: "two victims without condition, one passes successfully and the second fails (not found victim2 pod)",
name: "one victim, but patch pod failed",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
Pods: []*v1.Pod{
victim1,
failVictim,
},
},
},
preemptor: preemptor,
testPods: []*v1.Pod{},
expectedPatchError: true,
nodeNames: []string{node1Name},
expectedStatus: framework.AsStatus(errors.New("patch pod status failed")),
expectedPreemptingMap: sets.New(types.UID("preemptor")),
expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
},
{
name: "two victims without condition, one passes successfully and the second fails",

candidate: &fakeCandidate{
name: node1Name,
victims: &extenderv1.Victims{
Pods: []*v1.Pod{
failVictim,
victim2,
},
},
@ -510,12 +600,17 @@ func TestPrepareCandidate(t *testing.T) {
victim1,
},
nodeNames: []string{node1Name},
expectedPatchError: true,
expectedDeletedPods: []string{"victim2"},
expectedStatus: framework.AsStatus(errors.New("patch pod status failed")),
expectedPreemptingMap: sets.New(types.UID("preemptor")),
expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
},
}

for _, asyncPreemptionEnabled := range []bool{true, false} {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Run(fmt.Sprintf("%v (Async preemption enabled: %v)", tt.name, asyncPreemptionEnabled), func(t *testing.T) {
metrics.Register()
logger, ctx := ktesting.NewTestContext(t)
ctx, cancel := context.WithCancel(ctx)
@ -533,9 +628,42 @@ func TestPrepareCandidate(t *testing.T) {
for _, pod := range tt.testPods {
objs = append(objs, pod)
}

requestStopper := make(chan struct{})
mu := &sync.RWMutex{}
deletedPods := sets.New[string]()
deletionFailure := false // whether any request to delete pod failed
patchFailure := false // whether any request to patch pod status failed

cs := clientsetfake.NewClientset(objs...)
cs.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
<-requestStopper
mu.Lock()
defer mu.Unlock()
name := action.(clienttesting.DeleteAction).GetName()
if name == "fail-victim" {
deletionFailure = true
return true, nil, fmt.Errorf("delete pod failed")
}

deletedPods.Insert(name)
return true, nil, nil
})

cs.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
<-requestStopper
mu.Lock()
defer mu.Unlock()
if action.(clienttesting.PatchAction).GetName() == "fail-victim" {
patchFailure = true
return true, nil, fmt.Errorf("patch pod status failed")
}
return true, nil, nil
})

informerFactory := informers.NewSharedInformerFactory(cs, 0)
eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: cs.EventsV1()})
fakeActivator := &fakePodActivator{activatedPods: make(map[string]*v1.Pod), mu: mu}
fwk, err := tf.NewFramework(
ctx,
registeredPlugins, "",
@ -546,6 +674,7 @@ func TestPrepareCandidate(t *testing.T) {
frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.testPods, nodes)),
frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, "test-scheduler")),
frameworkruntime.WithPodActivator(fakeActivator),
)
if err != nil {
t.Fatal(err)
@ -553,13 +682,24 @@ func TestPrepareCandidate(t *testing.T) {
informerFactory.Start(ctx.Done())
informerFactory.WaitForCacheSync(ctx.Done())
fakePreemptionScorePostFilterPlugin := &FakePreemptionScorePostFilterPlugin{}
pe := Evaluator{
pe := NewEvaluator("FakePreemptionScorePostFilter", fwk, fakePreemptionScorePostFilterPlugin, asyncPreemptionEnabled)
PluginName: "FakePreemptionScorePostFilter",
Handler: fwk,
Interface: fakePreemptionScorePostFilterPlugin,
State: framework.NewCycleState(),
}

if asyncPreemptionEnabled {
pe.prepareCandidateAsync(tt.candidate, tt.preemptor, "test-plugin")
pe.mu.Lock()
// The preempting map should be registered synchronously
// so we don't need wait.Poll.
if !tt.expectedPreemptingMap.Equal(pe.preempting) {
t.Errorf("expected preempting map %v, got %v", tt.expectedPreemptingMap, pe.preempting)
close(requestStopper)
pe.mu.Unlock()
return
}
pe.mu.Unlock()
// make the requests complete
close(requestStopper)
} else {
close(requestStopper) // no need to stop requests
status := pe.prepareCandidate(ctx, tt.candidate, tt.preemptor, "test-plugin")
if tt.expectedStatus == nil {
if status != nil {
@ -570,11 +710,48 @@ func TestPrepareCandidate(t *testing.T) {
t.Errorf("expect status %v, but got nil", tt.expectedStatus)
} else if status.Code() != tt.expectedStatus.Code() {
t.Errorf("expect status code %v, but got %v", tt.expectedStatus.Code(), status.Code())
} else if status.Message() != tt.expectedStatus.Message() {
t.Errorf("expect status message %v, but got %v", tt.expectedStatus.Message(), status.Message())
}
}
}

var lastErrMsg string
if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
mu.RLock()
defer mu.RUnlock()
if !deletedPods.Equal(sets.New(tt.expectedDeletedPods...)) {
lastErrMsg = fmt.Sprintf("expected deleted pods %v, got %v", tt.expectedDeletedPods, deletedPods.UnsortedList())
return false, nil
}
if tt.expectedDeletionError != deletionFailure {
lastErrMsg = fmt.Sprintf("expected deletion error %v, got %v", tt.expectedDeletionError, deletionFailure)
return false, nil
}
if tt.expectedPatchError != patchFailure {
lastErrMsg = fmt.Sprintf("expected patch error %v, got %v", tt.expectedPatchError, patchFailure)
return false, nil
}

if asyncPreemptionEnabled {
if tt.expectedActivatedPods != nil && !reflect.DeepEqual(tt.expectedActivatedPods, fakeActivator.activatedPods) {
lastErrMsg = fmt.Sprintf("expected activated pods %v, got %v", tt.expectedActivatedPods, fakeActivator.activatedPods)
return false, nil
}
if tt.expectedActivatedPods == nil && len(fakeActivator.activatedPods) != 0 {
lastErrMsg = fmt.Sprintf("expected no activated pods, got %v", fakeActivator.activatedPods)
return false, nil
}
}

return true, nil
}); err != nil {
t.Fatal(lastErrMsg)
}
})
}
}
}

type fakeExtender struct {
ignorable bool
@ -812,7 +989,6 @@ func TestCallExtenders(t *testing.T) {
PluginName: "FakePreemptionScorePostFilter",
Handler: fwk,
Interface: fakePreemptionScorePostFilterPlugin,
State: framework.NewCycleState(),
}
gotCandidates, status := pe.callExtenders(logger, preemptor, tt.candidates)
if (tt.wantStatus == nil) != (status == nil) || status.Code() != tt.wantStatus.Code() {
@ -84,6 +84,7 @@ type frameworkImpl struct {

extenders []framework.Extender
framework.PodNominator
framework.PodActivator

parallelizer parallelize.Parallelizer
}
@ -131,6 +132,7 @@ type frameworkOptions struct {
snapshotSharedLister framework.SharedLister
metricsRecorder *metrics.MetricAsyncRecorder
podNominator framework.PodNominator
podActivator framework.PodActivator
extenders []framework.Extender
captureProfile CaptureProfile
parallelizer parallelize.Parallelizer
@ -200,6 +202,12 @@ func WithPodNominator(nominator framework.PodNominator) Option {
}
}

func WithPodActivator(activator framework.PodActivator) Option {
return func(o *frameworkOptions) {
o.podActivator = activator
}
}

// WithExtenders sets extenders for the scheduling frameworkImpl.
func WithExtenders(extenders []framework.Extender) Option {
return func(o *frameworkOptions) {
@ -279,6 +287,7 @@ func NewFramework(ctx context.Context, r Registry, profile *config.KubeScheduler
metricsRecorder: options.metricsRecorder,
extenders: options.extenders,
PodNominator: options.podNominator,
PodActivator: options.podActivator,
parallelizer: options.parallelizer,
logger: logger,
}
@ -427,6 +436,10 @@ func (f *frameworkImpl) SetPodNominator(n framework.PodNominator) {
f.PodNominator = n
}

func (f *frameworkImpl) SetPodActivator(a framework.PodActivator) {
f.PodActivator = a
}

// Close closes each plugin, when they implement io.Closer interface.
func (f *frameworkImpl) Close() error {
var errs []error
|
|||||||
|
|
||||||
// AllClusterEventLabels returns all possible cluster event labels given to the metrics.
|
// AllClusterEventLabels returns all possible cluster event labels given to the metrics.
|
||||||
func AllClusterEventLabels() []string {
|
func AllClusterEventLabels() []string {
|
||||||
labels := []string{EventUnschedulableTimeout.Label()}
|
labels := []string{UnschedulableTimeout, ForceActivate}
|
||||||
for _, r := range allResources {
|
for _, r := range allResources {
|
||||||
for _, a := range basicActionTypes {
|
for _, a := range basicActionTypes {
|
||||||
labels = append(labels, ClusterEvent{Resource: r, ActionType: a}.Label())
|
labels = append(labels, ClusterEvent{Resource: r, ActionType: a}.Label())
|
||||||
|
@ -40,6 +40,11 @@ const (
Binding = "binding"
)

const (
GoroutineResultSuccess = "success"
GoroutineResultError = "error"
)

// ExtentionPoints is a list of possible values for the extension_point label.
var ExtentionPoints = []string{
PreFilter,
@ -105,13 +110,20 @@ var (
FrameworkExtensionPointDuration *metrics.HistogramVec
PluginExecutionDuration *metrics.HistogramVec

// This is only available when the QHint feature gate is enabled.
queueingHintExecutionDuration *metrics.HistogramVec
SchedulerQueueIncomingPods *metrics.CounterVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec

// The below two are only available when the QHint feature gate is enabled.
queueingHintExecutionDuration *metrics.HistogramVec
SchedulerQueueIncomingPods *metrics.CounterVec

// The below two are only available when the async-preemption feature gate is enabled.
PreemptionGoroutinesDuration *metrics.HistogramVec
PreemptionGoroutinesExecutionTotal *metrics.CounterVec

// metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value.
metricsList []metrics.Registerable
)

@ -123,11 +135,14 @@ func Register() {
registerMetrics.Do(func() {
InitMetrics()
RegisterMetrics(metricsList...)
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
RegisterMetrics(queueingHintExecutionDuration)
RegisterMetrics(InFlightEvents)
}
volumebindingmetrics.RegisterVolumeSchedulingMetrics()

if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
RegisterMetrics(queueingHintExecutionDuration, InFlightEvents)
}
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption) {
RegisterMetrics(PreemptionGoroutinesDuration, PreemptionGoroutinesExecutionTotal)
}
})
}

@ -317,6 +332,25 @@ func InitMetrics() {
StabilityLevel: metrics.ALPHA,
}, []string{"plugin", "extension_point", "profile"})

PreemptionGoroutinesDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_duration_seconds",
Help: "Duration in seconds for running goroutines for the preemption.",
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})

PreemptionGoroutinesExecutionTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_execution_total",
Help: "Number of preemption goroutines executed.",
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})

metricsList = []metrics.Registerable{
scheduleAttempts,
schedulingLatency,
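Editor's note: a hedged sketch (not part of the diff) of recording a timed, labeled observation against these two collectors with the "result" label. Wrapping the observations in a deferred closure is a choice of this sketch so the label reflects the final result value; the metric and constant names come from the definitions above.

// Sketch only: observing one preemption goroutine run.
startTime := time.Now()
result := metrics.GoroutineResultSuccess
defer func() {
	metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
	metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
}()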
@ -355,6 +355,7 @@ func New(ctx context.Context,

for _, fwk := range profiles {
fwk.SetPodNominator(podQueue)
fwk.SetPodActivator(podQueue)
}

schedulerCache := internalcache.New(ctx, durationToExpireAssumedPod)
@ -310,6 +310,12 @@ func (p *PodWrapper) Name(s string) *PodWrapper {
return p
}

// GenerateName sets `s` as the generateName of the inner pod.
func (p *PodWrapper) GenerateName(s string) *PodWrapper {
p.SetGenerateName(s)
return p
}

// UID sets `s` as the UID of the inner pod.
func (p *PodWrapper) UID(s string) *PodWrapper {
p.SetUID(types.UID(s))
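Editor's note: a quick usage sketch of the new wrapper method; the builder calls used here all appear in the integration test later in this diff.

// Sketch only: building a pod spec with a generated-name prefix.
victim := st.MakePod().GenerateName("victim-").Node("node").Priority(1).Obj()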
@ -312,6 +312,10 @@ var (
// TODO: document the feature (owning SIG, when to use this feature for a test)
RegularResourceUsageTracking = framework.WithFeature(framework.ValidFeatures.Add("RegularResourceUsageTracking"))

// Owner: sig-scheduling
// Marks tests of the asynchronous preemption (KEP-4832) that require the `SchedulerAsyncPreemption` feature gate.
SchedulerAsyncPreemption = framework.WithFeature(framework.ValidFeatures.Add("SchedulerAsyncPreemption"))

// Owner: sig-network
// Marks tests that require a pod networking implementation that supports SCTP
// traffic between pods.
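Editor's note: an illustrative sketch of tagging an e2e spec with the new feature label, which is exactly how the SchedulerPreemption e2e test later in this diff uses it; the spec title here is made up.

// Sketch only: gating a spec on the SchedulerAsyncPreemption feature label.
framework.It("runs async preemption", feature.SchedulerAsyncPreemption, func(ctx context.Context) {
	// test body elided
})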
@ -43,6 +43,7 @@ import (
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/kubernetes/pkg/apis/scheduling"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/*
|
||||||
|
Release: v1.32
|
||||||
|
Testname: Scheduler runs the preemption with various priority classes expectedly
|
||||||
|
Description: When there are Pods with various priority classes running the preemption,
|
||||||
|
the scheduler must prioritize the Pods with the higher priority class.
|
||||||
|
*/
|
||||||
|
framework.It("validates various priority Pods preempt expectedly with the async preemption", feature.SchedulerAsyncPreemption, func(ctx context.Context) {
|
||||||
|
var podRes v1.ResourceList
|
||||||
|
// Create 10 pods per node that will eat up all the node's resources.
|
||||||
|
ginkgo.By("Create 10 low-priority pods on each node.")
|
||||||
|
lowPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
|
||||||
|
// Create pods in the cluster.
|
||||||
|
for i, node := range nodeList.Items {
|
||||||
|
// Update each node to advertise 3 available extended resources
|
||||||
|
e2enode.AddExtendedResource(ctx, cs, node.Name, testExtendedResource, resource.MustParse("10"))
|
||||||
|
|
||||||
|
// Create 10 low priority pods on each node, which will use up 10/10 of the node's resources.
|
||||||
|
for j := 0; j < 10; j++ {
|
||||||
|
// Request 1 of the available resources for the victim pods
|
||||||
|
podRes = v1.ResourceList{}
|
||||||
|
podRes[testExtendedResource] = resource.MustParse("1")
|
||||||
|
pausePod := createPausePod(ctx, f, pausePodConfig{
|
||||||
|
Name: fmt.Sprintf("pod%d-%d-%v", i, j, lowPriorityClassName),
|
||||||
|
PriorityClassName: lowPriorityClassName,
|
||||||
|
// This victim pod will be preempted by the high priority pod.
|
||||||
|
// But, the deletion will be blocked by the finalizer.
|
||||||
|
//
|
||||||
|
// The finalizer is needed to prevent the medium Pods from being scheduled instead of the high Pods,
|
||||||
|
// depending on when the scheduler notices the existence of all the high Pods we create.
|
||||||
|
Finalizers: []string{testFinalizer},
|
||||||
|
Resources: &v1.ResourceRequirements{
|
||||||
|
Requests: podRes,
|
||||||
|
Limits: podRes,
|
||||||
|
},
|
||||||
|
Affinity: &v1.Affinity{
|
||||||
|
NodeAffinity: &v1.NodeAffinity{
|
||||||
|
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
|
||||||
|
NodeSelectorTerms: []v1.NodeSelectorTerm{
|
||||||
|
{
|
||||||
|
MatchFields: []v1.NodeSelectorRequirement{
|
||||||
|
{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
lowPriorityPods = append(lowPriorityPods, pausePod)
|
||||||
|
framework.Logf("Created pod: %v", pausePod.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ginkgo.By("Wait for lower priority pods to be scheduled.")
|
||||||
|
for _, pod := range lowPriorityPods {
|
||||||
|
framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
|
||||||
|
}
|
||||||
|
|
||||||
|
highPriorityPods := make([]*v1.Pod, 0, 5*len(nodeList.Items))
|
||||||
|
mediumPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
|
||||||
|
|
||||||
|
ginkgo.By("Run high/medium priority pods that have same requirements as that of lower priority pod")
|
||||||
|
for i := range nodeList.Items {
|
||||||
|
// Create medium priority pods first
|
||||||
|
// to confirm the scheduler finally prioritize the high priority pods, ignoring the medium priority pods.
|
||||||
|
for j := 0; j < 10; j++ {
|
||||||
|
// 5 pods per node will be unschedulable
|
||||||
|
// because the node only has 10 resource, and high priority pods will use 5 resource.
|
||||||
|
p := createPausePod(ctx, f, pausePodConfig{
|
||||||
|
Name: fmt.Sprintf("pod%d-%d-%v", i, j, mediumPriorityClassName),
|
||||||
|
PriorityClassName: mediumPriorityClassName,
|
||||||
|
Resources: &v1.ResourceRequirements{
|
||||||
|
// Set the pod request to the low priority pod's resources
|
||||||
|
Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
|
||||||
|
Limits: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
mediumPriorityPods = append(mediumPriorityPods, p)
|
||||||
|
}
|
||||||
|
|
||||||
|
for j := 0; j < 5; j++ {
|
||||||
|
p := createPausePod(ctx, f, pausePodConfig{
|
||||||
|
Name: fmt.Sprintf("pod%d-%d-%v", i, j, highPriorityClassName),
|
||||||
|
PriorityClassName: highPriorityClassName,
|
||||||
|
Resources: &v1.ResourceRequirements{
|
||||||
|
// Set the pod request to the low priority pod's resources
|
||||||
|
Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
|
||||||
|
Limits: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
highPriorityPods = append(highPriorityPods, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// All low priority Pods should be the target of preemption.
|
||||||
|
// Those Pods have a finalizer and hence should not be deleted yet at this point.
|
||||||
|
ginkgo.By("Check all low priority pods to be about to preempted.")
|
||||||
|
for _, pod := range lowPriorityPods {
|
||||||
|
framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
|
||||||
|
preemptedPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return preemptedPod.DeletionTimestamp != nil, nil
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// All high priority Pods should be schedulable by removing the low priority Pods.
|
||||||
|
ginkgo.By("Wait for high priority pods to be ready for the preemption.")
|
||||||
|
for _, pod := range highPriorityPods {
|
||||||
|
framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
|
||||||
|
highPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return highPod.Status.NominatedNodeName != "", nil
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
ginkgo.By("Remove the finalizer from all low priority pods to proceed the preemption.")
|
||||||
|
for _, pod := range lowPriorityPods {
|
||||||
|
// Remove the finalizer so that the pod can be deleted by GC
|
||||||
|
e2epod.NewPodClient(f).RemoveFinalizer(ctx, pod.Name, testFinalizer)
|
||||||
|
}
|
||||||
|
|
||||||
|
ginkgo.By("Wait for high priority pods to be scheduled.")
|
||||||
|
for _, pod := range highPriorityPods {
|
||||||
|
framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
|
||||||
|
}
|
||||||
|
|
||||||
|
ginkgo.By("Wait for 5 medium priority pods to be scheduled.")
|
||||||
|
framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
|
||||||
|
scheduled := 0
|
||||||
|
for _, pod := range mediumPriorityPods {
|
||||||
|
medPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if medPod.Spec.NodeName != "" {
|
||||||
|
scheduled++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if scheduled > 5 {
|
||||||
|
return false, fmt.Errorf("expected 5 medium priority pods to be scheduled, but got %d", scheduled)
|
||||||
|
}
|
||||||
|
|
||||||
|
return scheduled == 5, nil
|
||||||
|
}))
|
||||||
|
})
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Release: v1.31
|
Release: v1.31
|
||||||
Testname: Verify the DisruptionTarget condition is added to the preempted pod
|
Testname: Verify the DisruptionTarget condition is added to the preempted pod
|
||||||
|
@ -1058,6 +1058,12 @@
lockToDefault: false
preRelease: Alpha
version: "1.29"
- name: SchedulerAsyncPreemption
versionedSpecs:
- default: false
lockToDefault: false
preRelease: Alpha
version: "1.32"
- name: SchedulerQueueingHints
versionedSpecs:
- default: false
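Editor's note: the new gate defaults to off as an alpha feature in v1.32. In-tree tests flip it per test case; a minimal sketch, mirroring how the integration test below does it:

// Sketch only: enabling the alpha gate inside a test.
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true)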
@ -33,23 +33,34 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-helpers/storage/volume"
"k8s.io/klog/v2"
configv1 "k8s.io/kube-scheduler/config/v1"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/apis/scheduling"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
configtesting "k8s.io/kubernetes/pkg/scheduler/apis/config/testing"
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption"
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
st "k8s.io/kubernetes/pkg/scheduler/testing"
"k8s.io/kubernetes/plugin/pkg/admission/priority"
testutils "k8s.io/kubernetes/test/integration/util"
"k8s.io/kubernetes/test/utils/ktesting"
"k8s.io/utils/pointer"
"k8s.io/utils/ptr"
)

// imported from testutils
|
|||||||
t.Fatalf("Error creating node: %v", err)
|
t.Fatalf("Error creating node: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, asyncPreemptionEnabled := range []bool{true, false} {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
t.Run(test.name, func(t *testing.T) {
|
t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
|
||||||
|
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled)
|
||||||
|
|
||||||
filter.Tokens = test.initTokens
|
filter.Tokens = test.initTokens
|
||||||
filter.EnablePreFilter = test.enablePreFilter
|
filter.EnablePreFilter = test.enablePreFilter
|
||||||
filter.Unresolvable = test.unresolvable
|
filter.Unresolvable = test.unresolvable
|
||||||
@ -485,12 +499,10 @@ func TestPreemption(t *testing.T) {
|
|||||||
if cond == nil {
|
if cond == nil {
|
||||||
t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
|
t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
|
||||||
}
|
}
|
||||||
} else {
|
} else if p.DeletionTimestamp != nil {
|
||||||
if p.DeletionTimestamp != nil {
|
|
||||||
t.Errorf("Didn't expect pod %v to get preempted.", p.Name)
|
t.Errorf("Didn't expect pod %v to get preempted.", p.Name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// Also check that the preemptor pod gets the NominatedNodeName field set.
|
// Also check that the preemptor pod gets the NominatedNodeName field set.
|
||||||
if len(test.preemptedPodIndexes) > 0 {
|
if len(test.preemptedPodIndexes) > 0 {
|
||||||
if err := waitForNominatedNodeName(cs, preemptor); err != nil {
|
if err := waitForNominatedNodeName(cs, preemptor); err != nil {
|
||||||
@ -504,6 +516,508 @@ func TestPreemption(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAsyncPreemption(t *testing.T) {
type createPod struct {
pod *v1.Pod
// count is the number of times the pod should be created by this action.
// i.e., if you use it, you have to use GenerateName.
// By default, it's 1.
count *int
}

type schedulePod struct {
podName string
expectSuccess bool
}

type scenario struct {
// name is this step's name, just for debugging purposes.
name string

// Only one of the following actions should be set.

// createPod creates a Pod.
createPod *createPod
// schedulePod schedules one Pod that is at the top of the activeQ.
// You should give a Pod name that is supposed to be scheduled.
schedulePod *schedulePod
// completePreemption completes the preemption that is currently ongoing.
// You should give a Pod name.
completePreemption string
// podGatedInQueue checks if the given Pod is in the scheduling queue and gated by the preemption.
// You should give a Pod name.
podGatedInQueue string
// podRunningPreemption checks if the given Pod is running preemption.
// You should give a Pod index representing the order of Pod creation.
// e.g., if you want to check the Pod created first in the test case, you should give 0.
podRunningPreemption *int
}

tests := []struct {
name string
// scenarios after the first attempt of scheduling the pod.
scenarios []scenario
}{
{
// Very basic test case: if it fails, the basic scenario is broken somewhere.
name: "basic: async preemption happens expectedly",
scenarios: []scenario{
{
name: "create scheduled Pod",
createPod: &createPod{
pod: st.MakePod().GenerateName("victim-").Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Node("node").Container("image").ZeroTerminationGracePeriod().Priority(1).Obj(),
count: ptr.To(2),
},
},
{
name: "create a preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("preemptor").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Priority(100).Obj(),
},
},
{
name: "schedule the preemptor Pod",
schedulePod: &schedulePod{
podName: "preemptor",
},
},
{
name: "check the pod is in the queue and gated",
podGatedInQueue: "preemptor",
},
{
name: "check the preemptor Pod making the preemption API calls",
podRunningPreemption: ptr.To(2),
},
{
name: "complete the preemption API calls",
completePreemption: "preemptor",
},
{
name: "schedule the preemptor Pod after the preemption",
schedulePod: &schedulePod{
podName: "preemptor",
expectSuccess: true,
},
},
},
},
{
name: "Lower priority Pod doesn't take over the place for higher priority Pod that is running the preemption",
scenarios: []scenario{
{
name: "create scheduled Pod",
createPod: &createPod{
pod: st.MakePod().GenerateName("victim-").Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Node("node").Container("image").ZeroTerminationGracePeriod().Priority(1).Obj(),
count: ptr.To(2),
},
},
{
name: "create a preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("preemptor-high-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Priority(100).Obj(),
},
},
{
name: "schedule the preemptor Pod",
schedulePod: &schedulePod{
podName: "preemptor-high-priority",
},
},
{
name: "check the pod is in the queue and gated",
podGatedInQueue: "preemptor-high-priority",
},
{
name: "check the preemptor Pod making the preemption API calls",
podRunningPreemption: ptr.To(2),
},
{
// This Pod is lower priority than the preemptor Pod.
// Given the preemptor Pod is nominated to the node, this Pod should be unschedulable.
name: "create a second Pod that is lower priority than the first preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("pod-mid-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Priority(50).Obj(),
},
},
{
name: "schedule the mid-priority Pod",
schedulePod: &schedulePod{
podName: "pod-mid-priority",
},
},
{
name: "complete the preemption API calls",
completePreemption: "preemptor-high-priority",
},
{
// the preemptor pod should be popped from the queue before the mid-priority pod.
name: "schedule the preemptor Pod again",
schedulePod: &schedulePod{
podName: "preemptor-high-priority",
expectSuccess: true,
},
},
{
name: "schedule the mid-priority Pod again",
schedulePod: &schedulePod{
podName: "pod-mid-priority",
},
},
},
},
{
name: "Higher priority Pod takes over the place for lower priority Pod that is running the preemption",
scenarios: []scenario{
{
name: "create scheduled Pod",
createPod: &createPod{
pod: st.MakePod().GenerateName("victim-").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Node("node").Container("image").ZeroTerminationGracePeriod().Priority(1).Obj(),
count: ptr.To(4),
},
},
{
name: "create a preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("preemptor-high-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Priority(100).Obj(),
},
},
{
name: "schedule the preemptor Pod",
schedulePod: &schedulePod{
podName: "preemptor-high-priority",
},
},
{
name: "check the pod is in the queue and gated",
podGatedInQueue: "preemptor-high-priority",
},
{
name: "check the preemptor Pod making the preemption API calls",
podRunningPreemption: ptr.To(4),
},
{
// This Pod is higher priority than the preemptor Pod.
// Even though the preemptor Pod is nominated to the node, this Pod can take over the place.
name: "create a second Pod that is higher priority than the first preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("preemptor-super-high-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Priority(200).Obj(),
},
},
{
name: "schedule the super-high-priority Pod",
schedulePod: &schedulePod{
podName: "preemptor-super-high-priority",
},
},
{
name: "check the super-high-priority Pod making the preemption API calls",
podRunningPreemption: ptr.To(5),
},
{
// the super-high-priority preemptor should enter the preemption
// and select the place that preemptor-high-priority selected.
// So, basically both goroutines are preempting the same Pods.
name: "check the super-high-priority pod is in the queue and gated",
podGatedInQueue: "preemptor-super-high-priority",
},
{
name: "complete the preemption API calls of super-high-priority",
completePreemption: "preemptor-super-high-priority",
},
{
name: "complete the preemption API calls of high-priority",
completePreemption: "preemptor-high-priority",
},
{
name: "schedule the super-high-priority Pod",
schedulePod: &schedulePod{
podName: "preemptor-super-high-priority",
expectSuccess: true,
},
},
{
name: "schedule the high-priority Pod",
schedulePod: &schedulePod{
podName: "preemptor-high-priority",
},
},
},
},
{
name: "Lower priority Pod can select the same place where the higher priority Pod is preempting if the node is big enough",
scenarios: []scenario{
{
name: "create scheduled Pod",
createPod: &createPod{
pod: st.MakePod().GenerateName("victim-").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Node("node").Container("image").ZeroTerminationGracePeriod().Priority(1).Obj(),
count: ptr.To(4),
},
},
{
// It will preempt two victims.
name: "create a preemptor Pod",
createPod: &createPod{
pod: st.MakePod().Name("preemptor-high-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Container("image").Priority(100).Obj(),
},
},
{
name: "schedule the preemptor Pod",
schedulePod: &schedulePod{
podName: "preemptor-high-priority",
},
},
{
name: "check the pod is in the queue and gated",
podGatedInQueue: "preemptor-high-priority",
},
{
name: "check the preemptor Pod making the preemption API calls",
podRunningPreemption: ptr.To(4),
},
{
// This Pod is lower priority than the preemptor Pod.
// Given the preemptor Pod is nominated to the node, this Pod should be unschedulable.
// This Pod will trigger the preemption to target the two victims that the first Pod doesn't target.
|
||||||
|
name: "create a second Pod that is lower priority than the first preemptor Pod",
|
||||||
|
createPod: &createPod{
|
||||||
|
pod: st.MakePod().Name("preemptor-mid-priority").Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Container("image").Priority(50).Obj(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "schedule the mid-priority Pod",
|
||||||
|
schedulePod: &schedulePod{
|
||||||
|
podName: "preemptor-mid-priority",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "check the mid-priority pod is in the queue and gated",
|
||||||
|
podGatedInQueue: "preemptor-mid-priority",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "check the mid-priority Pod making the preemption API calls",
|
||||||
|
podRunningPreemption: ptr.To(5),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "complete the preemption API calls",
|
||||||
|
completePreemption: "preemptor-mid-priority",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "complete the preemption API calls",
|
||||||
|
completePreemption: "preemptor-high-priority",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// the preemptor pod should be popped from the queue before the mid-priority pod.
|
||||||
|
name: "schedule the preemptor Pod again",
|
||||||
|
schedulePod: &schedulePod{
|
||||||
|
podName: "preemptor-high-priority",
|
||||||
|
expectSuccess: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "schedule the mid-priority Pod again",
|
||||||
|
schedulePod: &schedulePod{
|
||||||
|
podName: "preemptor-mid-priority",
|
||||||
|
expectSuccess: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
	// All test cases have the same node.
	node := st.MakeNode().Name("node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj()
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// We need to use a custom preemption plugin to test the async preemption behavior.
			delayedPreemptionPluginName := "delay-preemption"
			// Channels used to release the blocked preemption goroutines, keyed by the preemptor pod name.
			preemptionDoneChannels := make(map[string]chan struct{})
			defer func() {
				for _, ch := range preemptionDoneChannels {
					close(ch)
				}
			}()
			registry := make(frameworkruntime.Registry)
			var preemptionPlugin *defaultpreemption.DefaultPreemption
			err := registry.Register(delayedPreemptionPluginName, func(c context.Context, r runtime.Object, fh framework.Handle) (framework.Plugin, error) {
				p, err := frameworkruntime.FactoryAdapter(plfeature.Features{EnableAsyncPreemption: true}, defaultpreemption.New)(c, &config.DefaultPreemptionArgs{
					// Set default values to pass validation at initialization; they are not otherwise relevant to the test.
					MinCandidateNodesPercentage: 10,
					MinCandidateNodesAbsolute:   100,
				}, fh)
				if err != nil {
					return nil, fmt.Errorf("error creating default preemption plugin: %w", err)
				}

				var ok bool
				preemptionPlugin, ok = p.(*defaultpreemption.DefaultPreemption)
				if !ok {
					return nil, fmt.Errorf("unexpected plugin type %T", p)
				}

				preemptPodFn := preemptionPlugin.Evaluator.PreemptPod
				preemptionPlugin.Evaluator.PreemptPod = func(ctx context.Context, c preemption.Candidate, preemptor, victim *v1.Pod, pluginName string) error {
					// Block the preemption goroutine until the test case allows it to proceed.
					if ch, ok := preemptionDoneChannels[preemptor.Name]; ok {
						<-ch
					}
					return preemptPodFn(ctx, c, preemptor, victim, pluginName)
				}

				return preemptionPlugin, nil
			})
			if err != nil {
				t.Fatalf("Error registering the delayed preemption plugin: %v", err)
			}
			cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
				Profiles: []configv1.KubeSchedulerProfile{{
					SchedulerName: pointer.String(v1.DefaultSchedulerName),
					Plugins: &configv1.Plugins{
						MultiPoint: configv1.PluginSet{
							Enabled: []configv1.Plugin{
								{Name: delayedPreemptionPluginName},
							},
							Disabled: []configv1.Plugin{
								{Name: names.DefaultPreemption},
							},
						},
					},
				}},
			})

			// Initialize the scheduler, but don't start it;
			// the test triggers each scheduling cycle manually.
			testCtx := testutils.InitTestSchedulerWithOptions(t,
				testutils.InitTestAPIServer(t, "preemption", nil),
				0,
				scheduler.WithProfiles(cfg.Profiles...),
				scheduler.WithFrameworkOutOfTreeRegistry(registry),
				// Disable backoff.
				scheduler.WithPodMaxBackoffSeconds(0),
				scheduler.WithPodInitialBackoffSeconds(0),
			)
			testutils.SyncSchedulerInformerFactory(testCtx)
			cs := testCtx.ClientSet

			if preemptionPlugin == nil {
				t.Fatalf("the preemption plugin should be initialized")
			}

			logger, _ := ktesting.NewTestContext(t)
			featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true)

			createdPods := []*v1.Pod{}
			defer testutils.CleanupPods(testCtx.Ctx, cs, t, createdPods)

			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()

			if _, err := cs.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}); err != nil {
				t.Fatalf("Failed to create an initial Node %q: %v", node.Name, err)
			}
			defer func() {
				if err := cs.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil {
					t.Fatalf("Failed to delete the Node %q: %v", node.Name, err)
				}
			}()

			for _, scenario := range test.scenarios {
				t.Logf("Running scenario: %s", scenario.name)
				switch {
				case scenario.createPod != nil:
					if scenario.createPod.count == nil {
						scenario.createPod.count = ptr.To(1)
					}

					for i := 0; i < *scenario.createPod.count; i++ {
						pod, err := cs.CoreV1().Pods(testCtx.NS.Name).Create(ctx, scenario.createPod.pod, metav1.CreateOptions{})
						if err != nil {
							t.Fatalf("Failed to create a Pod %q: %v", pod.Name, err)
						}
						createdPods = append(createdPods, pod)
					}
				case scenario.schedulePod != nil:
					lastFailure := ""
					if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
						if len(testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()) == 0 {
							lastFailure = fmt.Sprintf("Expected the pod %s to be scheduled, but no pod arrives at the activeQ", scenario.schedulePod.podName)
							return false, nil
						}

						if testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name != scenario.schedulePod.podName {
							// Keep waiting: the queue may still receive another Pod with a higher priority than the current top Pod.
							lastFailure = fmt.Sprintf("The pod %s is expected to be scheduled, but the top Pod is %s", scenario.schedulePod.podName, testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name)
							return false, nil
						}

						return true, nil
					}); err != nil {
						t.Fatal(lastFailure)
					}

					preemptionDoneChannels[scenario.schedulePod.podName] = make(chan struct{})
					testCtx.Scheduler.ScheduleOne(testCtx.Ctx)
					if scenario.schedulePod.expectSuccess {
						if err := wait.PollUntilContextTimeout(testCtx.Ctx, 200*time.Millisecond, wait.ForeverTestTimeout, false, testutils.PodScheduled(cs, testCtx.NS.Name, scenario.schedulePod.podName)); err != nil {
							t.Fatalf("Expected the pod %s to be scheduled", scenario.schedulePod.podName)
						}
					} else {
						if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.schedulePod.podName) {
							t.Fatalf("Expected the pod %s to be in the queue after the scheduling attempt", scenario.schedulePod.podName)
						}
					}
				case scenario.completePreemption != "":
					if _, ok := preemptionDoneChannels[scenario.completePreemption]; !ok {
						t.Fatalf("The preemptor Pod %q is not running preemption", scenario.completePreemption)
					}

					close(preemptionDoneChannels[scenario.completePreemption])
					delete(preemptionDoneChannels, scenario.completePreemption)
				case scenario.podGatedInQueue != "":
					// Make sure the Pod is in the queue in the first place.
					if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) {
						t.Fatalf("Expected the pod %s to be in the queue", scenario.podGatedInQueue)
					}

					// Make sure this Pod is gated by the preemption plugin at the PreEnqueue extension point
					// by activating the Pod and checking that it is still in the unschedulable pod pool.
					testCtx.Scheduler.SchedulingQueue.Activate(logger, map[string]*v1.Pod{scenario.podGatedInQueue: st.MakePod().Namespace(testCtx.NS.Name).Name(scenario.podGatedInQueue).Obj()})
					if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) {
						t.Fatalf("Expected the pod %s to be in the queue even after the activation", scenario.podGatedInQueue)
					}
				case scenario.podRunningPreemption != nil:
					if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
						return preemptionPlugin.Evaluator.IsPodRunningPreemption(createdPods[*scenario.podRunningPreemption].GetUID()), nil
					}); err != nil {
						t.Fatalf("Expected the pod %s to be running preemption", createdPods[*scenario.podRunningPreemption].Name)
					}
				}
			}
		})
	}
}

// podInUnschedulablePodPool checks whether the given Pod is in the unschedulable pod pool.
func podInUnschedulablePodPool(t *testing.T, queue queue.SchedulingQueue, podName string) bool {
	t.Helper()
	// First, look for the pod in the activeQ.
	for _, pod := range queue.PodsInActiveQ() {
		if pod.Name == podName {
			return false
		}
	}

	pendingPods, _ := queue.PendingPods()
	for _, pod := range pendingPods {
		if pod.Name == podName {
			return true
		}
	}
	return false
}

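For readability, here is a rough sketch of the scenario table types that the steps above rely on. The real definitions live earlier in this test file; the field names and types below are inferred from how they are used, not copied from the source.

// Sketch only: inferred from usage, not the authoritative definitions.
package preemption

import v1 "k8s.io/api/core/v1"

type createPod struct {
	pod *v1.Pod
	// count creates the pod this many times (useful with GenerateName); nil means once.
	count *int
}

type schedulePod struct {
	podName       string
	expectSuccess bool
}

type scenario struct {
	name string
	// Exactly one of the following fields is set per step.
	createPod            *createPod
	schedulePod          *schedulePod
	completePreemption   string // preemptor pod whose blocked preemption goroutine gets released
	podGatedInQueue      string // pod expected to stay gated in the unschedulable pod pool
	podRunningPreemption *int   // index into createdPods expected to be running preemption
}
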
// TestNonPreemption tests NonPreempt option of PriorityClass of scheduler works as expected.
func TestNonPreemption(t *testing.T) {

@@ -554,8 +1068,10 @@ func TestNonPreemption(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Error creating nodes: %v", err)
 	}
+
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			defer testutils.CleanupPods(testCtx.Ctx, cs, t, []*v1.Pod{preemptor, victim})
 			preemptor.Spec.PreemptionPolicy = test.PreemptionPolicy
 			victimPod, err := createPausePod(cs, victim)
@@ -582,6 +1098,7 @@ func TestNonPreemption(t *testing.T) {
 		})
 	}
 }
+}
 
 // TestDisablePreemption tests disable pod preemption of scheduler works as expected.
 func TestDisablePreemption(t *testing.T) {
@@ -630,8 +1147,9 @@ func TestDisablePreemption(t *testing.T) {
 		t.Fatalf("Error creating nodes: %v", err)
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			pods := make([]*v1.Pod, len(test.existingPods))
 			// Create and run existingPods.
 			for i, p := range test.existingPods {
@@ -661,6 +1179,7 @@ func TestDisablePreemption(t *testing.T) {
 		})
 	}
 }
+}
 
 // This test verifies that system critical priorities are created automatically and resolved properly.
 func TestPodPriorityResolution(t *testing.T) {
@@ -736,9 +1255,9 @@ func TestPodPriorityResolution(t *testing.T) {
 	}
 
 	pods := make([]*v1.Pod, 0, len(tests))
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.Name, func(t *testing.T) {
-			t.Run(test.Name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.Name, asyncPreemptionEnabled), func(t *testing.T) {
 			pod, err := runPausePod(cs, test.Pod)
 			if err != nil {
 				if test.ExpectedError == nil {
@@ -757,10 +1276,10 @@ func TestPodPriorityResolution(t *testing.T) {
 			} else {
 				t.Errorf("Expected pod %v to have priority %v but was nil", pod.Name, test.PriorityClass)
 			}
-			})
+			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
 		})
 	}
-	testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
+	}
 	testutils.CleanupNodes(cs, t)
 }
 
@@ -824,8 +1343,9 @@ func TestPreemptionStarvation(t *testing.T) {
 		t.Fatalf("Error creating nodes: %v", err)
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			pendingPods := make([]*v1.Pod, test.numExpectedPending)
 			numRunningPods := test.numExistingPod - test.numExpectedPending
 			runningPods := make([]*v1.Pod, numRunningPods)
@@ -878,6 +1398,7 @@ func TestPreemptionStarvation(t *testing.T) {
 		})
 	}
 }
+}
 
 // TestPreemptionRaces tests that other scheduling events and operations do not
 // race with the preemption process.
@@ -924,8 +1445,9 @@ func TestPreemptionRaces(t *testing.T) {
 		t.Fatalf("Error creating nodes: %v", err)
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			if test.numRepetitions <= 0 {
 				test.numRepetitions = 1
 			}
@@ -992,6 +1514,7 @@ func TestPreemptionRaces(t *testing.T) {
 		})
 	}
 }
+}
 
 const (
 	alwaysFailPlugin = "alwaysFailPlugin"
@@ -1126,8 +1649,9 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 		},
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", tt.name, asyncPreemptionEnabled), func(t *testing.T) {
 			cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
 				Profiles: []configv1.KubeSchedulerProfile{{
 					SchedulerName: pointer.String(v1.DefaultSchedulerName),
@@ -1197,6 +1721,7 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 		})
 	}
 }
+}
 
 func mkMinAvailablePDB(name, namespace string, uid types.UID, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
 	intMinAvailable := intstr.FromInt32(int32(minAvailable))
@@ -1405,8 +1930,9 @@ func TestPDBInPreemption(t *testing.T) {
 		},
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			for i := 1; i <= test.nodeCnt; i++ {
 				nodeName := fmt.Sprintf("node-%v", i)
 				_, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(defaultNodeRes).Obj())
@@ -1482,6 +2008,7 @@ func TestPDBInPreemption(t *testing.T) {
 		})
 	}
 }
+}
 
 func initTestPreferNominatedNode(t *testing.T, nsPrefix string, opts ...scheduler.Option) *testutils.TestContext {
 	testCtx := testutils.InitTestSchedulerWithOptions(t, testutils.InitTestAPIServer(t, nsPrefix, nil), 0, opts...)
@@ -1563,8 +2090,9 @@ func TestPreferNominatedNode(t *testing.T) {
 		},
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			testCtx := initTestPreferNominatedNode(t, "perfer-nominated-node")
 			cs := testCtx.ClientSet
 			nsName := testCtx.NS.Name
@@ -1611,6 +2139,7 @@ func TestPreferNominatedNode(t *testing.T) {
 		})
 	}
 }
+}
 
 // TestReadWriteOncePodPreemption tests preemption scenarios for pods with
 // ReadWriteOncePod PVCs.
@@ -1912,8 +2441,9 @@ func TestReadWriteOncePodPreemption(t *testing.T) {
 		t.Fatalf("Error creating node: %v", err)
 	}
 
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
 	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
+		t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
 			if err := test.init(); err != nil {
 				t.Fatalf("Error while initializing test: %v", err)
 			}
@@ -1961,3 +2491,4 @@ func TestReadWriteOncePodPreemption(t *testing.T) {
 		})
 	}
 }
+}
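The hunks above all apply the same mechanical change: every pre-existing preemption test table now runs twice, once with the SchedulerAsyncPreemption feature gate enabled and once with it disabled. The lines elided by the hunk context are assumed to flip the gate per subtest with featuregatetesting.SetFeatureGateDuringTest, the same helper the new async test uses above; the following is a minimal self-contained sketch of that pattern, not a copy of the upstream code.

package preemption

import (
	"fmt"
	"testing"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/kubernetes/pkg/features"
)

// TestGateLoopSketch mirrors the wrapping applied to the existing preemption tests:
// every table entry runs once per SchedulerAsyncPreemption value.
func TestGateLoopSketch(t *testing.T) {
	tests := []struct{ name string }{{name: "example"}}

	for _, asyncPreemptionEnabled := range []bool{true, false} {
		for _, test := range tests {
			t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
				// Assumed: the gate is flipped for the duration of this subtest only
				// and restored automatically when the subtest finishes.
				featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled)
				// ... the original test body runs here unchanged ...
			})
		}
	}
}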
@@ -354,10 +354,20 @@
       initPods: 2000
       measurePods: 500
   - name: 5000Nodes
-    featureGates:
-      SchedulerQueueingHints: false
     labels: [performance]
     threshold: 200
+    featureGates:
+      SchedulerQueueingHints: false
+      SchedulerAsyncPreemption: false
+    params:
+      initNodes: 5000
+      initPods: 20000
+      measurePods: 5000
+  - name: 5000Nodes_AsyncPreemptionEnabled
+    threshold: 200
+    labels: [performance]
+    featureGates:
+      SchedulerAsyncPreemption: true
     params:
       initNodes: 5000
       initPods: 20000
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  generateName: pod-
+  generateName: pod-high-priority-
 spec:
   priority: 10
   containers: