From dd3af9a85b887633af4d3bb3acedc09e21808a1d Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Sat, 15 Jun 2024 22:36:42 +0000 Subject: [PATCH 1/6] fix: skip isPodWorthRequeuing only when SchedulingGates gates the pod --- pkg/scheduler/internal/queue/scheduling_queue.go | 4 +++- .../internal/queue/scheduling_queue_test.go | 16 ++++++++++++---- .../scheduler/plugins/plugins_test.go | 9 +++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index 292e72916f6..6788d04d282 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -47,6 +47,7 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" "k8s.io/kubernetes/pkg/scheduler/internal/heap" "k8s.io/kubernetes/pkg/scheduler/metrics" "k8s.io/kubernetes/pkg/scheduler/util" @@ -1195,9 +1196,10 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn for _, pInfo := range podInfoList { // Since there may be many gated pods and they will not move from the // unschedulable pool, we skip calling the expensive isPodWorthRequeueing. - if pInfo.Gated { + if pInfo.Gated && pInfo.UnschedulablePlugins.Has(names.SchedulingGates) { continue } + schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj) if schedulingHint == queueSkip { // QueueingHintFn determined that this Pod isn't worth putting to activeQ or backoffQ by this event. diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go index c2ad3d8ce67..c12fd1f6726 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue_test.go +++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go @@ -44,6 +44,7 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework" plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates" "k8s.io/kubernetes/pkg/scheduler/metrics" @@ -1499,13 +1500,21 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. expectedQ: unschedulablePods, }, { - name: "QueueHintFunction is not called when Pod is gated", - podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), + name: "QueueHintFunction is not called when Pod is gated by SchedulingGates plugin", + podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New(names.SchedulingGates, "foo")}), hint: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { return framework.Queue, fmt.Errorf("QueueingHintFn should not be called as pod is gated") }, expectedQ: unschedulablePods, }, + { + name: "QueueHintFunction is called when Pod is gated by a plugin other than SchedulingGates", + podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), + hint: queueHintReturnQueue, + // FIXME: This should be backoffQ. + // https://github.com/kubernetes/kubernetes/issues/125538 + expectedQ: activeQ, + }, } for _, test := range tests { @@ -1518,14 +1527,13 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. QueueingHintFn: test.hint, }, } - test.podInfo.UnschedulablePlugins = sets.New("foo") cl := testingclock.NewFakeClock(now) q := NewTestQueue(ctx, newDefaultQueueSort(), WithQueueingHintMapPerProfile(m), WithClock(cl)) - // add to unsched pod pool q.activeQ.Add(q.newQueuedPodInfo(test.podInfo.Pod)) if p, err := q.Pop(logger); err != nil || p.Pod != test.podInfo.Pod { t.Errorf("Expected: %v after Pop, but got: %v", test.podInfo.Pod.Name, p.Pod.Name) } + // add to unsched pod pool err := q.AddUnschedulableIfNotPresent(logger, test.podInfo, q.SchedulingCycle()) if err != nil { t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) diff --git a/test/integration/scheduler/plugins/plugins_test.go b/test/integration/scheduler/plugins/plugins_test.go index 7f2c3d53fc0..306a8d398fc 100644 --- a/test/integration/scheduler/plugins/plugins_test.go +++ b/test/integration/scheduler/plugins/plugins_test.go @@ -2668,8 +2668,9 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { } tests := []struct { - name string - withEvents bool + name string + withEvents bool + // count is the expected number of calls to PreEnqueue(). count int queueHintEnabled []bool expectedScheduled []bool @@ -2686,7 +2687,7 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { { name: "preEnqueue plugin with event registered", withEvents: true, - count: 2, + count: 3, queueHintEnabled: []bool{false, true}, expectedScheduled: []bool{true, true}, }, @@ -2700,7 +2701,7 @@ func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { t.Run(tt.name+fmt.Sprintf(" queueHint(%v)", queueHintEnabled), func(t *testing.T) { featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, queueHintEnabled) - // new plugin every time to clear counts + // use new plugin every time to clear counts var plugin framework.PreEnqueuePlugin if tt.withEvents { plugin = &SchedulingGatesPluginWithEvents{SchedulingGates: schedulinggates.SchedulingGates{}} From 2c4dc6b65b7c04abce2abef0c6de4af14ecfb96b Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 20 Jun 2024 23:36:05 +0000 Subject: [PATCH 2/6] elaborate comments --- pkg/scheduler/internal/queue/scheduling_queue.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index 6788d04d282..4f584d69c66 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -1194,8 +1194,14 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn activated := false for _, pInfo := range podInfoList { - // Since there may be many gated pods and they will not move from the - // unschedulable pool, we skip calling the expensive isPodWorthRequeueing. + // Scheduling-gated Pods never get schedulable with any events, + // except the Pods themselves got updated, which isn't handled by movePodsToActiveOrBackoffQueue. + // So, here we skip them early here so that they don't go through isPodWorthRequeuing, + // which isn't fast enough when the number of scheduling-gated Pods in unschedulablePods is large. + // + // Note that we cannot skip all pInfo.Gated Pods here + // because PreEnqueue plugins apart from the scheduling gate plugin may change the gating status + // with these events. if pInfo.Gated && pInfo.UnschedulablePlugins.Has(names.SchedulingGates) { continue } From fa8da84835977892881ec7e0430d010251e91209 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 20 Jun 2024 23:36:25 +0000 Subject: [PATCH 3/6] remove fixme comment --- pkg/scheduler/internal/queue/scheduling_queue_test.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go index c12fd1f6726..b74d197d6b5 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue_test.go +++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go @@ -1508,11 +1508,9 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. expectedQ: unschedulablePods, }, { - name: "QueueHintFunction is called when Pod is gated by a plugin other than SchedulingGates", - podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), - hint: queueHintReturnQueue, - // FIXME: This should be backoffQ. - // https://github.com/kubernetes/kubernetes/issues/125538 + name: "QueueHintFunction is called when Pod is gated by a plugin other than SchedulingGates", + podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}), + hint: queueHintReturnQueue, expectedQ: activeQ, }, } From 2304806cbee4ba43018cf1dabcaed2bfc3d2a1e7 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 20 Jun 2024 23:43:41 +0000 Subject: [PATCH 4/6] elaborate comment more --- pkg/scheduler/internal/queue/scheduling_queue.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index 4f584d69c66..ba3a8b50d5a 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -1194,10 +1194,15 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn activated := false for _, pInfo := range podInfoList { + // When handling events takes time, a scheduling throughput gets impacted negatively + // because of a shared lock within PriorityQueue, which Pop() also requires. + // // Scheduling-gated Pods never get schedulable with any events, // except the Pods themselves got updated, which isn't handled by movePodsToActiveOrBackoffQueue. - // So, here we skip them early here so that they don't go through isPodWorthRequeuing, + // So, we can skip them early here so that they don't go through isPodWorthRequeuing, // which isn't fast enough when the number of scheduling-gated Pods in unschedulablePods is large. + // This is a hotfix, which might be changed + // once we have a better general solution for the shared lock issue. // // Note that we cannot skip all pInfo.Gated Pods here // because PreEnqueue plugins apart from the scheduling gate plugin may change the gating status From afed31193bf81d14a96f056a4e507e30a3d884ba Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 20 Jun 2024 23:46:17 +0000 Subject: [PATCH 5/6] update a test name and comment --- test/integration/scheduler/plugins/plugins_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/scheduler/plugins/plugins_test.go b/test/integration/scheduler/plugins/plugins_test.go index 306a8d398fc..ce0a5a87b18 100644 --- a/test/integration/scheduler/plugins/plugins_test.go +++ b/test/integration/scheduler/plugins/plugins_test.go @@ -2651,8 +2651,8 @@ func (pl *SchedulingGatesPluginWOEvents) EventsToRegister() []framework.ClusterE return nil } -// This test helps to verify registering nil events for schedulingGates plugin works as expected. -func TestSchedulingGatesPluginEventsToRegister(t *testing.T) { +// This test helps to verify registering nil events for PreEnqueue plugin works as expected. +func TestPreEnqueuePluginEventsToRegister(t *testing.T) { testContext := testutils.InitTestAPIServer(t, "preenqueue-plugin", nil) num := func(pl framework.Plugin) int { From 98a3182398c1bb42df3a3d8addcb661b00319597 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 20 Jun 2024 23:48:42 +0000 Subject: [PATCH 6/6] correct comment --- pkg/scheduler/internal/queue/scheduling_queue.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index ba3a8b50d5a..9f6a592eccf 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -1200,8 +1200,10 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn // Scheduling-gated Pods never get schedulable with any events, // except the Pods themselves got updated, which isn't handled by movePodsToActiveOrBackoffQueue. // So, we can skip them early here so that they don't go through isPodWorthRequeuing, - // which isn't fast enough when the number of scheduling-gated Pods in unschedulablePods is large. - // This is a hotfix, which might be changed + // which isn't fast enough to keep a sufficient scheduling throughput + // when the number of scheduling-gated Pods in unschedulablePods is large. + // https://github.com/kubernetes/kubernetes/issues/124384 + // This is a hotfix for this issue, which might be changed // once we have a better general solution for the shared lock issue. // // Note that we cannot skip all pInfo.Gated Pods here