diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index 292e72916f6..9f6a592eccf 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -47,6 +47,7 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" "k8s.io/kubernetes/pkg/scheduler/internal/heap" "k8s.io/kubernetes/pkg/scheduler/metrics" "k8s.io/kubernetes/pkg/scheduler/util" @@ -1193,11 +1194,25 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn activated := false for _, pInfo := range podInfoList { - // Since there may be many gated pods and they will not move from the - // unschedulable pool, we skip calling the expensive isPodWorthRequeueing. - if pInfo.Gated { + // When handling events takes time, a scheduling throughput gets impacted negatively + // because of a shared lock within PriorityQueue, which Pop() also requires. + // + // Scheduling-gated Pods never get schedulable with any events, + // except the Pods themselves got updated, which isn't handled by movePodsToActiveOrBackoffQueue. + // So, we can skip them early here so that they don't go through isPodWorthRequeuing, + // which isn't fast enough to keep a sufficient scheduling throughput + // when the number of scheduling-gated Pods in unschedulablePods is large. + // https://github.com/kubernetes/kubernetes/issues/124384 + // This is a hotfix for this issue, which might be changed + // once we have a better general solution for the shared lock issue. + // + // Note that we cannot skip all pInfo.Gated Pods here + // because PreEnqueue plugins apart from the scheduling gate plugin may change the gating status + // with these events. + if pInfo.Gated && pInfo.UnschedulablePlugins.Has(names.SchedulingGates) { continue } + schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj) if schedulingHint == queueSkip { // QueueingHintFn determined that this Pod isn't worth putting to activeQ or backoffQ by this event. diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go index c2ad3d8ce67..b74d197d6b5 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue_test.go +++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go @@ -44,6 +44,7 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework" plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates" "k8s.io/kubernetes/pkg/scheduler/metrics" @@ -1499,13 +1500,19 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. expectedQ: unschedulablePods, }, { - name: "QueueHintFunction is not called when Pod is gated", - podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), + name: "QueueHintFunction is not called when Pod is gated by SchedulingGates plugin", + podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New(names.SchedulingGates, "foo")}), hint: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { return framework.Queue, fmt.Errorf("QueueingHintFn should not be called as pod is gated") }, expectedQ: unschedulablePods, }, + { + name: "QueueHintFunction is called when Pod is gated by a plugin other than SchedulingGates", + podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), + hint: queueHintReturnQueue, + expectedQ: activeQ, + }, } for _, test := range tests { @@ -1518,14 +1525,13 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. QueueingHintFn: test.hint, }, } - test.podInfo.UnschedulablePlugins = sets.New("foo") cl := testingclock.NewFakeClock(now) q := NewTestQueue(ctx, newDefaultQueueSort(), WithQueueingHintMapPerProfile(m), WithClock(cl)) - // add to unsched pod pool q.activeQ.Add(q.newQueuedPodInfo(test.podInfo.Pod)) if p, err := q.Pop(logger); err != nil || p.Pod != test.podInfo.Pod { t.Errorf("Expected: %v after Pop, but got: %v", test.podInfo.Pod.Name, p.Pod.Name) } + // add to unsched pod pool err := q.AddUnschedulableIfNotPresent(logger, test.podInfo, q.SchedulingCycle()) if err != nil { t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) diff --git a/test/integration/scheduler/plugins/plugins_test.go b/test/integration/scheduler/plugins/plugins_test.go index 7f2c3d53fc0..ce0a5a87b18 100644 --- a/test/integration/scheduler/plugins/plugins_test.go +++ b/test/integration/scheduler/plugins/plugins_test.go @@ -2651,8 +2651,8 @@ func (pl *SchedulingGatesPluginWOEvents) EventsToRegister() []framework.ClusterE return nil } -// This test helps to verify registering nil events for schedulingGates plugin works as expected. -func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { +// This test helps to verify registering nil events for PreEnqueue plugin works as expected. +func TestPreEnqueuePluginEventsToRegister(t *testing.T) { testContext := testutils.InitTestAPIServer(t, "preenqueue-plugin", nil) num := func(pl framework.Plugin) int { @@ -2668,8 +2668,9 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { } tests := []struct { - name string - withEvents bool + name string + withEvents bool + // count is the expected number of calls to PreEnqueue(). count int queueHintEnabled []bool expectedScheduled []bool @@ -2686,7 +2687,7 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { { name: "preEnqueue plugin with event registered", withEvents: true, - count: 2, + count: 3, queueHintEnabled: []bool{false, true}, expectedScheduled: []bool{true, true}, }, @@ -2700,7 +2701,7 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { t.Run(tt.name+fmt.Sprintf(" queueHint(%v)", queueHintEnabled), func(t *testing.T) { featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, queueHintEnabled) - // new plugin every time to clear counts + // use new plugin every time to clear counts var plugin framework.PreEnqueuePlugin if tt.withEvents { plugin = &SchedulingGatesPluginWithEvents{SchedulingGates: schedulinggates.SchedulingGates{}}