From cb5dc46edf705582cf597123cbb3e9da29693a31 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Thu, 19 Oct 2023 11:02:11 +0000 Subject: [PATCH] feature(scheduler): simplify QueueingHint by introducing new statuses --- pkg/scheduler/framework/interface.go | 37 ++- pkg/scheduler/framework/interface_test.go | 1 + .../dynamicresources/dynamicresources.go | 35 ++- .../dynamicresources/dynamicresources_test.go | 16 +- .../plugins/nodeaffinity/node_affinity.go | 8 +- .../nodeaffinity/node_affinity_test.go | 10 +- .../framework/plugins/nodeports/node_ports.go | 8 +- .../plugins/nodeports/node_ports_test.go | 4 +- .../nodeunschedulable/node_unschedulable.go | 4 +- .../node_unschedulable_test.go | 8 +- pkg/scheduler/framework/types.go | 54 ++-- .../internal/queue/scheduling_queue.go | 153 ++++++---- .../internal/queue/scheduling_queue_test.go | 271 +++++++++--------- pkg/scheduler/schedule_one.go | 25 +- pkg/scheduler/scheduler.go | 4 +- test/integration/scheduler/queue_test.go | 2 +- .../scheduler/rescheduling_test.go | 4 +- 17 files changed, 360 insertions(+), 284 deletions(-) diff --git a/pkg/scheduler/framework/interface.go b/pkg/scheduler/framework/interface.go index 6fc1564e387..cf743e53956 100644 --- a/pkg/scheduler/framework/interface.go +++ b/pkg/scheduler/framework/interface.go @@ -77,18 +77,30 @@ const ( // Success means that plugin ran correctly and found pod schedulable. // NOTE: A nil status is also considered as "Success". Success Code = iota - // Error is used for internal plugin errors, unexpected input, etc. + // Error is one of the failures, used for internal plugin errors, unexpected input, etc. + // Plugin shouldn't return this code for expected failures, like Unschedulable. + // Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins. + // Meaning, the Pod will be requeued to activeQ/backoffQ soon. Error - // Unschedulable is used when a plugin finds a pod unschedulable. 
The scheduler might attempt to + // Unschedulable is one of the failures, used when a plugin finds a pod unschedulable. + // If it's returned from PreFilter or Filter, the scheduler might attempt to // run other postFilter plugins like preemption to get this pod scheduled. // Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins. // The accompanying status message should explain why the pod is unschedulable. + // + // We regard the backoff as a penalty of wasting the scheduling cycle. + // When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling, + // the Pod goes through backoff. Unschedulable // UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and // other postFilter plugins like preemption would not change anything. // Plugins should return Unschedulable if it is possible that the pod can get scheduled // after running other postFilter plugins. // The accompanying status message should explain why the pod is unschedulable. + // + // We regard the backoff as a penalty of wasting the scheduling cycle. + // When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling, + // the Pod goes through backoff. UnschedulableAndUnresolvable // Wait is used when a Permit plugin finds a pod scheduling should wait. Wait @@ -97,10 +109,27 @@ const ( // - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped. // - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped. Skip + // Pending means that the scheduling process is finished successfully, + // but the plugin wants to abort the scheduling cycle/binding cycle here. + // + // For example, the DRA plugin sometimes needs to wait for the external device driver + // to provision the resource for the Pod. 
+ // It's different from when to return Unschedulable/UnschedulableAndUnresolvable, + // because in this case, the scheduler decides where the Pod can go successfully, + // but we need to wait for the external component to do something based on that scheduling result. + // + // We regard the backoff as a penalty of wasting the scheduling cycle. + // In the case of returning Pending, we cannot say the scheduling cycle is wasted + // because the scheduling result is used to proceed the Pod's scheduling forward, + // that particular scheduling cycle is failed though. + // So, Pods rejected by such reasons don't need to suffer a penalty (backoff). + // When the scheduling queue requeues Pods, which was rejected with Pending in the last scheduling, + // the Pod goes to activeQ directly ignoring backoff. + Pending ) // This list should be exactly the same as the codes iota defined above in the same order. -var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip"} +var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"} func (c Code) String() string { return codes[c] @@ -151,7 +180,7 @@ type Status struct { reasons []string err error // failedPlugin is an optional field that records the plugin name a Pod failed by. - // It's set by the framework when code is Error, Unschedulable or UnschedulableAndUnresolvable. + // It's set by the framework when code is Unschedulable or UnschedulableAndUnresolvable. 
failedPlugin string } diff --git a/pkg/scheduler/framework/interface_test.go b/pkg/scheduler/framework/interface_test.go index cebab3bc2ed..a3b6a31fcc0 100644 --- a/pkg/scheduler/framework/interface_test.go +++ b/pkg/scheduler/framework/interface_test.go @@ -131,6 +131,7 @@ func TestStatusCodes(t *testing.T) { assertStatusCode(t, UnschedulableAndUnresolvable, 3) assertStatusCode(t, Wait, 4) assertStatusCode(t, Skip, 5) + assertStatusCode(t, Pending, 6) } func assertStatusCode(t *testing.T, code Code, value int) { diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go index 7581b66b317..085b2dab54c 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go @@ -315,7 +315,7 @@ func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, po originalClaim, modifiedClaim, err := schedutil.As[*resourcev1alpha2.ResourceClaim](oldObj, newObj) if err != nil { // Shouldn't happen. - return framework.QueueAfterBackoff, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err) + return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err) } usesClaim := false @@ -339,7 +339,7 @@ func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, po if originalClaim == nil { logger.V(4).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim)) - return framework.QueueImmediately, nil + return framework.Queue, nil } // Modifications may or may not be relevant. 
If the entire @@ -357,7 +357,7 @@ func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, po } logger.V(4).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim)) - return framework.QueueImmediately, nil + return framework.Queue, nil } // isSchedulableAfterPodSchedulingContextChange is invoked for all @@ -376,7 +376,7 @@ func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger oldPodScheduling, newPodScheduling, err := schedutil.As[*resourcev1alpha2.PodSchedulingContext](oldObj, newObj) if err != nil { // Shouldn't happen. - return framework.QueueAfterBackoff, fmt.Errorf("unexpected object in isSchedulableAfterPodSchedulingContextChange: %w", err) + return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodSchedulingContextChange: %w", err) } podScheduling := newPodScheduling // Never nil because deletes are handled above. @@ -423,7 +423,7 @@ func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger len(oldPodScheduling.Status.ResourceClaims) < len(podScheduling.Status.ResourceClaims) /* new information and not incomplete (checked above) */ { // This definitely is new information for the scheduler. Try again immediately. 
logger.V(4).Info("PodSchedulingContext for pod has all required information, schedule immediately", "pod", klog.KObj(pod)) - return framework.QueueImmediately, nil + return framework.Queue, nil } // The other situation where the scheduler needs to do @@ -448,7 +448,7 @@ func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger for _, claimStatus := range podScheduling.Status.ResourceClaims { if sliceContains(claimStatus.UnsuitableNodes, podScheduling.Spec.SelectedNode) { logger.V(5).Info("PodSchedulingContext has unsuitable selected node, schedule immediately", "pod", klog.KObj(pod), "selectedNode", podScheduling.Spec.SelectedNode, "podResourceName", claimStatus.Name) - return framework.QueueImmediately, nil + return framework.Queue, nil } } } @@ -463,7 +463,7 @@ func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger // Once we get here, all changes which are known to require special responses // have been checked for. Whatever the change was, we don't know exactly how - // to handle it and thus return QueueAfterBackoff. This will cause the + // to handle it and thus return Queue. This will cause the // scheduler to treat the event as if no event hint callback had been provided. // Developers who want to investigate this can enable a diff at log level 6. 
if loggerV := logger.V(6); loggerV.Enabled() { @@ -471,7 +471,7 @@ func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger } else { logger.V(5).Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod)) } - return framework.QueueAfterBackoff, nil + return framework.Queue, nil } @@ -961,7 +961,7 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil { return statusError(logger, err) } - return statusUnschedulable(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}) + return statusPending(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}) } } @@ -977,7 +977,7 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat // provisioning? On the one hand, volume provisioning is currently // irreversible, so it better should come last. On the other hand, // triggering both in parallel might be faster. - return statusUnschedulable(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod)) + return statusPending(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod)) } func containsNode(hay []string, needle string) bool { @@ -1073,6 +1073,21 @@ func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) * return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason) } +// statusPending ensures that there is a log message associated with the +// line where the status originated. 
+func statusPending(logger klog.Logger, reason string, kv ...interface{}) *framework.Status { + if loggerV := logger.V(5); loggerV.Enabled() { + helper, loggerV := loggerV.WithCallStackHelper() + helper() + kv = append(kv, "reason", reason) + // nolint: logcheck // warns because it cannot check key/values + loggerV.Info("pod waiting for external component", kv...) + } + + // When we return Pending, we want to block the Pod at the same time. + return framework.NewStatus(framework.Pending, reason) +} + // statusError ensures that there is a log message associated with the // line where the error originated. func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status { diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go index a1a2a788873..42dfe066a27 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go @@ -346,7 +346,7 @@ func TestPlugin(t *testing.T) { classes: []*resourcev1alpha2.ResourceClass{resourceClass}, want: want{ reserve: result{ - status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `waiting for resource driver to allocate resource`), + status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`), added: []metav1.Object{schedulingSelectedPotential}, }, }, @@ -360,7 +360,7 @@ func TestPlugin(t *testing.T) { classes: []*resourcev1alpha2.ResourceClass{resourceClass}, want: want{ reserve: result{ - status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `waiting for resource driver to provide information`), + status: framework.NewStatus(framework.Pending, `waiting for resource driver to provide information`), added: []metav1.Object{schedulingPotential}, }, }, @@ -374,7 +374,7 @@ func TestPlugin(t *testing.T) { classes: 
[]*resourcev1alpha2.ResourceClass{resourceClass}, want: want{ reserve: result{ - status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `waiting for resource driver to allocate resource`), + status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`), changes: change{ scheduling: func(in *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext { return st.FromPodSchedulingContexts(in). @@ -925,7 +925,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) { "queue-on-add": { pod: podWithClaimName, newObj: pendingImmediateClaim, - expectedHint: framework.QueueImmediately, + expectedHint: framework.Queue, }, "backoff-wrong-old-object": { pod: podWithClaimName, @@ -953,7 +953,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) { claim.Status.Allocation = &resourcev1alpha2.AllocationResult{} return claim }(), - expectedHint: framework.QueueImmediately, + expectedHint: framework.Queue, }, } @@ -1051,7 +1051,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) { claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim}, oldObj: scheduling, newObj: schedulingInfo, - expectedHint: framework.QueueImmediately, + expectedHint: framework.Queue, }, "queue-bad-selected-node": { pod: podWithClaimTemplateInStatus, @@ -1067,7 +1067,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) { scheduling.Status.ResourceClaims[0].UnsuitableNodes = append(scheduling.Status.ResourceClaims[0].UnsuitableNodes, scheduling.Spec.SelectedNode) return scheduling }(), - expectedHint: framework.QueueImmediately, + expectedHint: framework.Queue, }, "skip-spec-changes": { pod: podWithClaimTemplateInStatus, @@ -1089,7 +1089,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) { scheduling.Finalizers = append(scheduling.Finalizers, "foo") return scheduling }(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, } diff --git 
a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go index 41527583a9e..9dcc65683c9 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go @@ -94,7 +94,7 @@ func (pl *NodeAffinity) EventsToRegister() []framework.ClusterEventWithHint { func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) if err != nil { - return framework.QueueAfterBackoff, err + return framework.Queue, err } if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(modifiedNode) { @@ -105,7 +105,7 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1 requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod) isMatched, err := requiredNodeAffinity.Match(modifiedNode) if err != nil { - return framework.QueueAfterBackoff, err + return framework.Queue, err } if !isMatched { logger.V(4).Info("node was created or updated, but doesn't matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) @@ -116,14 +116,14 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1 if originalNode != nil { wasMatched, err = requiredNodeAffinity.Match(originalNode) if err != nil { - return framework.QueueAfterBackoff, err + return framework.Queue, err } } if !wasMatched { // This modification makes this Node match with Pod's NodeAffinity. 
logger.V(4).Info("node was created or updated, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) - return framework.QueueAfterBackoff, nil + return framework.Queue, nil } logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) diff --git a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go index d9f3d1b9ac1..9de8397df91 100644 --- a/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go +++ b/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go @@ -1189,7 +1189,7 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { args: &config.NodeAffinityArgs{}, pod: podWithNodeAffinity.Obj(), newObj: "not-a-node", - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, expectedErr: true, }, "backoff-wrong-old-object": { @@ -1197,7 +1197,7 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { pod: podWithNodeAffinity.Obj(), oldObj: "not-a-node", newObj: st.MakeNode().Obj(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, expectedErr: true, }, "skip-queue-on-add": { @@ -1210,7 +1210,7 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { args: &config.NodeAffinityArgs{}, pod: podWithNodeAffinity.Obj(), newObj: st.MakeNode().Label("foo", "bar").Obj(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, "skip-unrelated-changes": { args: &config.NodeAffinityArgs{}, @@ -1245,7 +1245,7 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) { pod: podWithNodeAffinity.Obj(), oldObj: st.MakeNode().Obj(), newObj: st.MakeNode().Label("foo", "bar").Obj(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, "skip-queue-on-add-scheduler-enforced-node-affinity": { args: &config.NodeAffinityArgs{ @@ -1289,7 +1289,7 @@ func 
Test_isSchedulableAfterNodeChange(t *testing.T) { }, pod: podWithNodeAffinity.Obj(), newObj: st.MakeNode().Label("foo", "bar").Obj(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, } diff --git a/pkg/scheduler/framework/plugins/nodeports/node_ports.go b/pkg/scheduler/framework/plugins/nodeports/node_ports.go index 96b32b087e2..79a80ce3804 100644 --- a/pkg/scheduler/framework/plugins/nodeports/node_ports.go +++ b/pkg/scheduler/framework/plugins/nodeports/node_ports.go @@ -120,7 +120,7 @@ func (pl *NodePorts) EventsToRegister() []framework.ClusterEventWithHint { // See: https://github.com/kubernetes/kubernetes/issues/109437 // And, we can remove NodeUpdated event once https://github.com/kubernetes/kubernetes/issues/110175 is solved. // We don't need the QueueingHintFn here because the scheduling of Pods will be always retried with backoff when this Event happens. - // (the same as QueueAfterBackoff) + // (the same as Queue) {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Update}}, } } @@ -130,7 +130,7 @@ func (pl *NodePorts) EventsToRegister() []framework.ClusterEventWithHint { func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { deletedPod, _, err := util.As[*v1.Pod](oldObj, nil) if err != nil { - return framework.QueueAfterBackoff, err + return framework.Queue, err } // If the deleted pod is unscheduled, it doesn't make the target pod schedulable. 
@@ -163,8 +163,8 @@ func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Po return framework.QueueSkip, nil } - logger.V(4).Info("the deleted pod and the target pod have any common port(s), returning QueueAfterBackoff as deleting this Pod may make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod)) - return framework.QueueAfterBackoff, nil + logger.V(4).Info("the deleted pod and the target pod have any common port(s), returning Queue as deleting this Pod may make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod)) + return framework.Queue, nil } // Filter invoked at the filter extension point. diff --git a/pkg/scheduler/framework/plugins/nodeports/node_ports_test.go b/pkg/scheduler/framework/plugins/nodeports/node_ports_test.go index 00774ae8db2..e3649de039b 100644 --- a/pkg/scheduler/framework/plugins/nodeports/node_ports_test.go +++ b/pkg/scheduler/framework/plugins/nodeports/node_ports_test.go @@ -313,7 +313,7 @@ func Test_isSchedulableAfterPodDeleted(t *testing.T) { "backoff-wrong-old-object": { pod: podWithHostPort.Obj(), oldObj: "not-a-pod", - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, expectedErr: true, }, "skip-queue-on-unscheduled": { @@ -334,7 +334,7 @@ func Test_isSchedulableAfterPodDeleted(t *testing.T) { "queue-on-released-hostport": { pod: podWithHostPort.Obj(), oldObj: st.MakePod().Node("fake-node").HostPort(8080).Obj(), - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, } diff --git a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go index f437405711e..674c9390b48 100644 --- a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go +++ b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go @@ -61,7 +61,7 @@ func (pl *NodeUnschedulable) isSchedulableAfterNodeChange(logger 
klog.Logger, po originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj) if err != nil { logger.Error(err, "unexpected objects in isSchedulableAfterNodeChange", "oldObj", oldObj, "newObj", newObj) - return framework.QueueAfterBackoff, err + return framework.Queue, err } originalNodeSchedulable, modifiedNodeSchedulable := false, !modifiedNode.Spec.Unschedulable @@ -71,7 +71,7 @@ func (pl *NodeUnschedulable) isSchedulableAfterNodeChange(logger klog.Logger, po if !originalNodeSchedulable && modifiedNodeSchedulable { logger.V(4).Info("node was created or updated, pod may be schedulable now", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) - return framework.QueueAfterBackoff, nil + return framework.Queue, nil } logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode)) diff --git a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go index 5abd2581403..f66bb5611d6 100644 --- a/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go +++ b/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go @@ -98,7 +98,7 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { name: "backoff-wrong-new-object", pod: &v1.Pod{}, newObj: "not-a-node", - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, expectedErr: true, }, { @@ -110,7 +110,7 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { }, }, oldObj: "not-a-node", - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, expectedErr: true, }, { @@ -131,7 +131,7 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { Unschedulable: false, }, }, - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, { name: "skip-unrelated-change", @@ -167,7 +167,7 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) { 
Unschedulable: true, }, }, - expectedHint: framework.QueueAfterBackoff, + expectedHint: framework.Queue, }, } diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 6431211cbc8..3dc818541aa 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -85,15 +85,15 @@ type ClusterEventWithHint struct { // and filters out events to reduce useless retry of Pod's scheduling. // It's an optional field. If not set, // the scheduling of Pods will be always retried with backoff when this Event happens. - // (the same as QueueAfterBackoff) + // (the same as Queue) QueueingHintFn QueueingHintFn } // QueueingHintFn returns a hint that signals whether the event can make a Pod, // which was rejected by this plugin in the past scheduling cycle, schedulable or not. // It's called before a Pod gets moved from unschedulableQ to backoffQ or activeQ. -// If it returns an error, we'll take the returned QueueingHint as `QueueAfterBackoff` at the caller whatever we returned here so that -// we can prevent the Pod from stucking in the unschedulable pod pool. +// If it returns an error, we'll take the returned QueueingHint as `Queue` at the caller whatever we returned here so that +// we can prevent the Pod from being stuck in the unschedulable pod pool. // // - `pod`: the Pod to be enqueued, which is rejected by this plugin in the past. // - `oldObj` `newObj`: the object involved in that event. @@ -109,29 +109,16 @@ const ( // scheduling of the pod. QueueSkip QueueingHint = iota - // QueueAfterBackoff implies that the Pod may be schedulable by the event, - // and worth retrying the scheduling again after backoff. - QueueAfterBackoff - - // QueueImmediately is returned only when it is highly possible that the Pod gets scheduled in the next scheduling. - // You should only return QueueImmediately when there is a high chance that the Pod gets scheduled in the next scheduling. 
- // Otherwise, it's detrimental to scheduling throughput. - // For example, when the Pod was rejected as waiting for an external resource to be provisioned, that is directly tied to the Pod, - // and the event is that the resource is provisioned, then you can return QueueImmediately. - // As a counterexample, when the Pod was rejected due to insufficient memory resource, - // and the event is that more memory on Node is available, then you should return QueueAfterBackoff instead of QueueImmediately - // because other Pods may be waiting for the same resources and only a few of them would schedule in the next scheduling cycle. - QueueImmediately + // Queue implies that the Pod may be schedulable by the event. + Queue ) func (s QueueingHint) String() string { switch s { case QueueSkip: return "QueueSkip" - case QueueAfterBackoff: - return "QueueAfterBackoff" - case QueueImmediately: - return "QueueImmediately" + case Queue: + return "Queue" } return "" } @@ -179,9 +166,11 @@ type QueuedPodInfo struct { // It shouldn't be updated once initialized. It's used to record the e2e scheduling // latency for a pod. InitialAttemptTimestamp *time.Time - // If a Pod failed in a scheduling cycle, record the plugin names it failed by. + // UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status. // It's registered only when the Pod is rejected in PreFilter, Filter, Reserve, or Permit (WaitOnPermit). UnschedulablePlugins sets.Set[string] + // PendingPlugins records the plugin names that the Pod failed with Pending status. + PendingPlugins sets.Set[string] // Whether the Pod is scheduling gated (by PreEnqueuePlugins) or not. Gated bool } @@ -292,8 +281,11 @@ type WeightedAffinityTerm struct { // Diagnosis records the details to diagnose a scheduling failure. 
type Diagnosis struct { - NodeToStatusMap NodeToStatusMap + NodeToStatusMap NodeToStatusMap + // UnschedulablePlugins are plugins that return Unschedulable or UnschedulableAndUnresolvable. UnschedulablePlugins sets.Set[string] + // PendingPlugins are plugins that return Pending. + PendingPlugins sets.Set[string] // PreFilterMsg records the messages returned from PreFilter plugins. PreFilterMsg string // PostFilterMsg records the messages returned from PostFilter plugins. @@ -312,6 +304,24 @@ const ( NoNodeAvailableMsg = "0/%v nodes are available" ) +func (d *Diagnosis) SetFailedPlugin(sts *Status) { + if sts.FailedPlugin() == "" { + return + } + if sts.IsUnschedulable() { + if d.UnschedulablePlugins == nil { + d.UnschedulablePlugins = sets.New[string]() + } + d.UnschedulablePlugins.Insert(sts.FailedPlugin()) + } + if sts.Code() == Pending { + if d.PendingPlugins == nil { + d.PendingPlugins = sets.New[string]() + } + d.PendingPlugins.Insert(sts.FailedPlugin()) + } +} + // Error returns detailed information of why the pod failed to fit on each node. // A message format is "0/X nodes are available: . . ." func (f *FitError) Error() string { diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index 0c265bcf693..b0c970dcaca 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -397,72 +397,97 @@ func (p *PriorityQueue) Run(logger klog.Logger) { }, 30*time.Second, p.stop) } -// isPodWorthRequeuing calls QueueingHintFn of only plugins registered in pInfo.unschedulablePlugins. -// If any QueueingHintFn returns QueueImmediately, the scheduling queue is supposed to enqueue this Pod to activeQ. -// If no QueueingHintFn returns QueueImmediately, but some return QueueAfterBackoff, +// queueingStrategy indicates how the scheduling queue should enqueue the Pod from unschedulable pod pool.
+type queueingStrategy int + +const ( + // queueSkip indicates that the scheduling queue should skip requeuing the Pod to activeQ/backoffQ. + queueSkip queueingStrategy = iota + // queueAfterBackoff indicates that the scheduling queue should requeue the Pod after backoff is completed. + queueAfterBackoff + // queueImmediately indicates that the scheduling queue should skip backoff and requeue the Pod immediately to activeQ. + queueImmediately +) + +// isPodWorthRequeuing calls QueueingHintFn of only plugins registered in pInfo.unschedulablePlugins and pInfo.PendingPlugins. +// +// If any of pInfo.PendingPlugins return Queue, +// the scheduling queue is supposed to enqueue this Pod to activeQ, skipping backoffQ. +// If any of pInfo.unschedulablePlugins return Queue, // the scheduling queue is supposed to enqueue this Pod to activeQ/backoffQ depending on the remaining backoff time of the Pod. -// If all QueueingHintFn returns QueueSkip, the scheduling queue enqueues the Pod back to unschedulable Pod pool +// If all QueueingHintFns returns Skip, the scheduling queue enqueues the Pod back to unschedulable Pod pool // because no plugin changes the scheduling result via the event. 
-func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) framework.QueueingHint { - if pInfo.UnschedulablePlugins.Len() == 0 { - logger.V(6).Info("Worth requeuing because no unschedulable plugins", "pod", klog.KObj(pInfo.Pod)) - return framework.QueueAfterBackoff +func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) queueingStrategy { + failedPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) + if failedPlugins.Len() == 0 { + logger.V(6).Info("Worth requeuing because no failed plugins", "pod", klog.KObj(pInfo.Pod)) + return queueAfterBackoff } if event.IsWildCard() { + // If the wildcard event is special one as someone wants to force all Pods to move to activeQ/backoffQ. + // We return queueAfterBackoff in this case, while resetting all blocked plugins. logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod)) - return framework.QueueAfterBackoff + return queueAfterBackoff } hintMap, ok := p.queueingHintMap[pInfo.Pod.Spec.SchedulerName] if !ok { // shouldn't reach here unless bug. logger.Error(nil, "No QueueingHintMap is registered for this profile", "profile", pInfo.Pod.Spec.SchedulerName, "pod", klog.KObj(pInfo.Pod)) - return framework.QueueAfterBackoff + return queueAfterBackoff } pod := pInfo.Pod - queueHint := framework.QueueSkip + queueStrategy := queueSkip for eventToMatch, hintfns := range hintMap { if eventToMatch.Resource != event.Resource || eventToMatch.ActionType&event.ActionType == 0 { continue } for _, hintfn := range hintfns { - if !pInfo.UnschedulablePlugins.Has(hintfn.PluginName) { + if !failedPlugins.Has(hintfn.PluginName) { + // skip if it's not hintfn from failedPlugins. 
continue } h, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj) if err != nil { - // If the QueueingHintFn returned an error, we should treat the event as QueueAfterBackoff so that we can prevent - // the Pod from stucking in the unschedulable pod pool. + // If the QueueingHintFn returned an error, we should treat the event as Queue so that we can prevent + // the Pod from being stuck in the unschedulable pod pool. oldObjMeta, newObjMeta, asErr := util.As[klog.KMetadata](oldObj, newObj) if asErr != nil { logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod)) } else { logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod), "oldObj", klog.KObj(oldObjMeta), "newObj", klog.KObj(newObjMeta)) } - h = framework.QueueAfterBackoff + h = framework.Queue } - - switch h { - case framework.QueueSkip: + if h == framework.QueueSkip { continue - case framework.QueueImmediately: - return h - case framework.QueueAfterBackoff: - // replace queueHint with the returned value, - // but continue to other queueHintFn to check because other plugins may want to return QueueImmediately. - queueHint = h } + if pInfo.PendingPlugins.Has(hintfn.PluginName) { + // interprets Queue from the Pending plugin as queueImmediately. + // We can return immediately because queueImmediately is the highest priority. + return queueImmediately + } + + // interprets Queue from the unschedulable plugin as queueAfterBackoff. + + if pInfo.PendingPlugins.Len() == 0 { + // We can return immediately because no Pending plugins, which only can make queueImmediately, registered in this Pod, + // and queueAfterBackoff is the second highest priority. + return queueAfterBackoff + } + + // We can't return immediately because there are some Pending plugins registered in this Pod. + // We need to check if those plugins return Queue or not and if they do, we return queueImmediately. 
+ queueStrategy = queueAfterBackoff } } - // No queueing hint function is registered for this event - // or no queueing hint fn returns the value other than QueueSkip. - return queueHint + return queueStrategy } // runPreEnqueuePlugins iterates PreEnqueue function in each registered PreEnqueuePlugin. @@ -626,7 +651,7 @@ func (p *PriorityQueue) SchedulingCycle() int64 { // determineSchedulingHintForInFlightPod looks at the unschedulable plugins of the given Pod // and determines the scheduling hint for this Pod while checking the events that happened during in-flight. -func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) framework.QueueingHint { +func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) queueingStrategy { logger.V(5).Info("Checking events for in-flight pod", "pod", klog.KObj(pInfo.Pod), "unschedulablePlugins", pInfo.UnschedulablePlugins, "inFlightEventsSize", p.inFlightEvents.Len(), "inFlightPodsSize", len(p.inFlightPods)) // AddUnschedulableIfNotPresent is called with the Pod at the end of scheduling or binding. @@ -638,48 +663,50 @@ func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger // be empty. If it is not, we may have a problem. if len(pInfo.UnschedulablePlugins) != 0 { logger.Error(nil, "In flight Pod isn't found in the scheduling queue. If you see this error log, it's likely a bug in the scheduler.", "pod", klog.KObj(pInfo.Pod)) - return framework.QueueAfterBackoff + return queueAfterBackoff } if p.inFlightEvents.Len() > len(p.inFlightPods) { - return framework.QueueAfterBackoff + return queueAfterBackoff } - return framework.QueueSkip + return queueSkip } - if len(pInfo.UnschedulablePlugins) == 0 { - // No unschedulable plugins are associated with this Pod. 
+ failedPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) + if len(failedPlugins) == 0 { + // No failed plugins are associated with this Pod. // Meaning something unusual (a temporal failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue. // In this case, we should retry scheduling it because this Pod may not be retried until the next flush. - return framework.QueueAfterBackoff + return queueAfterBackoff } // check if there is an event that makes this Pod schedulable based on pInfo.UnschedulablePlugins. - schedulingHint := framework.QueueSkip + queueingStrategy := queueSkip for event := inFlightPod.Next(); event != nil; event = event.Next() { e, ok := event.Value.(*clusterEvent) if !ok { - // Must be another pod. Can be ignored. + // Must be another in-flight Pod (*v1.Pod). Can be ignored. continue } logger.V(5).Info("Checking event for in-flight pod", "pod", klog.KObj(pInfo.Pod), "event", e.event.Label) - hint := p.isPodWorthRequeuing(logger, pInfo, e.event, e.oldObj, e.newObj) - if hint == framework.QueueSkip { + switch p.isPodWorthRequeuing(logger, pInfo, e.event, e.oldObj, e.newObj) { + case queueSkip: continue - } - - if hint == framework.QueueImmediately { - // QueueImmediately is the strongest opinion, we don't need to check other events. - schedulingHint = framework.QueueImmediately - break - } - if hint == framework.QueueAfterBackoff { - // replace schedulingHint with QueueAfterBackoff, - // but continue to check other events because we may find it QueueImmediately with other events. - schedulingHint = framework.QueueAfterBackoff + case queueImmediately: + // queueImmediately is the highest priority. + // No need to go through the rest of the events. 
+ return queueImmediately + case queueAfterBackoff: + // replace schedulingHint with queueAfterBackoff + queueingStrategy = queueAfterBackoff + if pInfo.PendingPlugins.Len() == 0 { + // We can return immediately because no Pending plugins, which only can make queueImmediately, registered in this Pod, + // and queueAfterBackoff is the second highest priority. + return queueAfterBackoff + } } } - return schedulingHint + return queueingStrategy } // addUnschedulableIfNotPresentWithoutQueueingHint inserts a pod that cannot be scheduled into @@ -693,12 +720,16 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, // Refresh the timestamp since the pod is re-added. pInfo.Timestamp = p.clock.Now() + // When the queueing hint is enabled, they are used differently. + // But, we use all of them as UnschedulablePlugins when the queueing hint isn't enabled so that we don't break the old behaviour. + failedPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) + // If a move request has been received, move it to the BackoffQ, otherwise move // it to unschedulablePods. - for plugin := range pInfo.UnschedulablePlugins { + for plugin := range failedPlugins { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc() } - if p.moveRequestCycle >= podSchedulingCycle || len(pInfo.UnschedulablePlugins) == 0 { + if p.moveRequestCycle >= podSchedulingCycle || len(failedPlugins) == 0 { // Two cases to move a Pod to the active/backoff queue: // - The Pod is rejected by some plugins, but a move request is received after this Pod's scheduling cycle is started. // In this case, the received event may be make Pod schedulable and we should retry scheduling it. @@ -753,7 +784,8 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo * // If a move request has been received, move it to the BackoffQ, otherwise move // it to unschedulablePods. 
- for plugin := range pInfo.UnschedulablePlugins { + failedPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) + for plugin := range failedPlugins { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc() } @@ -762,7 +794,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo * // In this case, we try to requeue this Pod to activeQ/backoffQ. queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, ScheduleAttemptFailure) - logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint) + logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", failedPlugins) if queue == activeQ { // When the Pod is moved to activeQ, need to let p.cond know so that the Pod will be pop()ed out. p.cond.Broadcast() @@ -853,10 +885,11 @@ func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) { } // Update metrics and reset the set of unschedulable plugins for the next attempt. - for plugin := range pInfo.UnschedulablePlugins { + for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec() } pInfo.UnschedulablePlugins.Clear() + pInfo.PendingPlugins.Clear() return pInfo, nil } @@ -1067,15 +1100,15 @@ func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event // It returns the queue name Pod goes. 
// // NOTE: this function assumes lock has been acquired in caller -func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, schedulingHint framework.QueueingHint, event string) string { - if schedulingHint == framework.QueueSkip { +func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string { + if strategy == queueSkip { p.unschedulablePods.addOrUpdate(pInfo) metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc() return unschedulablePods } pod := pInfo.Pod - if schedulingHint == framework.QueueAfterBackoff && p.isPodBackingoff(pInfo) { + if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) { if err := p.podBackoffQ.Add(pInfo); err != nil { logger.Error(err, "Error adding pod to the backoff queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod)) p.unschedulablePods.addOrUpdate(pInfo) @@ -1086,7 +1119,7 @@ func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *fra return backoffQ } - // Reach here if schedulingHint is QueueImmediately, or schedulingHint is QueueAfterBackoff but the pod is not backing off. + // Reach here if schedulingHint is QueueImmediately, or schedulingHint is Queue but the pod is not backing off. added, err := p.addToActiveQ(logger, pInfo) if err != nil { @@ -1111,7 +1144,7 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn activated := false for _, pInfo := range podInfoList { schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj) - if schedulingHint == framework.QueueSkip { + if schedulingHint == queueSkip { // QueueingHintFn determined that this Pod isn't worth putting to activeQ or backoffQ by this event. 
logger.V(5).Info("Event is not making pod schedulable", "pod", klog.KObj(pInfo.Pod), "event", event.Label) continue diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go index a84fb742cc7..5a35c197a81 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue_test.go +++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go @@ -87,13 +87,10 @@ var ( cmpopts.IgnoreFields(nominator{}, "podLister", "lock"), } - queueHintReturnQueueAfterBackoff = func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - return framework.QueueAfterBackoff, nil + queueHintReturnQueue = func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { + return framework.Queue, nil } - queueHintReturnQueueImmediately = func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - return framework.QueueImmediately, nil - } - queueHintReturnQueueSkip = func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { + queueHintReturnSkip = func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { return framework.QueueSkip, nil } ) @@ -244,7 +241,7 @@ func Test_InFlightPods(t *testing.T) { AssignedPodAdd: { { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, }, }, @@ -340,7 +337,7 @@ func Test_InFlightPods(t *testing.T) { AssignedPodAdd: { { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, }, }, @@ -364,7 +361,7 @@ func Test_InFlightPods(t *testing.T) { AssignedPodAdd: { { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, @@ -389,14 +386,14 @@ func Test_InFlightPods(t *testing.T) { { // It will be ignored because the event is not 
NodeAdd. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "pod is enqueued to unschedulable pod pool because the failed plugin has a hint fn but it returns QueueSkip", + name: "pod is enqueued to unschedulable pod pool because the failed plugin has a hint fn but it returns Skip", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod}, actions: []action{ @@ -414,20 +411,24 @@ func Test_InFlightPods(t *testing.T) { AssignedPodAdd: { { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, }, }, }, }, { - name: "pod is enqueued to activeQ because the failed plugin has a hint fn and it returns QueueImmediately", + name: "pod is enqueued to activeQ because the Pending plugins has a hint fn and it returns Queue", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod}, actions: []action{ {podPopped: pod}, {eventHappens: &AssignedPodAdd}, - {podEnqueued: newQueuedPodInfoForLookup(pod, "fooPlugin1", "fooPlugin2", "fooPlugin3")}, + {podEnqueued: &framework.QueuedPodInfo{ + PodInfo: mustNewPodInfo(pod), + UnschedulablePlugins: sets.New("fooPlugin2", "fooPlugin3"), + PendingPlugins: sets.New("fooPlugin1"), + }}, }, wantActiveQPodNames: []string{"targetpod"}, wantInFlightPods: nil, @@ -436,26 +437,24 @@ func Test_InFlightPods(t *testing.T) { "": { AssignedPodAdd: { { - // it will be ignored because the hint fn returns QueueSkip that is weaker than queueHintReturnQueueImmediately from fooPlugin1. PluginName: "fooPlugin3", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, { - // it will be ignored because the hint fn returns QueueAfterBackoff that is weaker than queueHintReturnQueueImmediately from fooPlugin1. 
PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, { // The hint fn tells that this event makes a Pod scheudlable immediately. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "pod is enqueued to backoffQ because the failed plugin has a hint fn and it returns QueueAfterBackoff", + name: "pod is enqueued to backoffQ because the failed plugin has a hint fn and it returns Queue", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod}, actions: []action{ @@ -470,21 +469,21 @@ func Test_InFlightPods(t *testing.T) { "": { AssignedPodAdd: { { - // it will be ignored because the hint fn returns QueueSkip that is weaker than queueHintReturnQueueAfterBackoff from fooPlugin1. + // it will be ignored because the hint fn returns Skip that is weaker than queueHintReturnQueue from fooPlugin1. PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, { - // The hint fn tells that this event makes a Pod scheudlable. + // The hint fn tells that this event makes a Pod schedulable. 
PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "pod is enqueued to activeQ because the failed plugin has a hint fn and it returns QueueImmediately for a concurrent event that was received while some other pod was in flight", + name: "pod is enqueued to activeQ because the failed plugin has a hint fn and it returns Queue for a concurrent event that was received while some other pod was in flight", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod, pod2}, actions: []action{ @@ -501,7 +500,8 @@ func Test_InFlightPods(t *testing.T) { }}, {callback: func(t *testing.T, q *PriorityQueue) { logger, _ := ktesting.NewTestContext(t) - poppedPod2.UnschedulablePlugins = sets.New("fooPlugin1", "fooPlugin2", "fooPlugin3") + poppedPod2.UnschedulablePlugins = sets.New("fooPlugin2", "fooPlugin3") + poppedPod2.PendingPlugins = sets.New("fooPlugin1") err := q.AddUnschedulableIfNotPresent(logger, poppedPod2, q.SchedulingCycle()) if err != nil { t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) } @@ -517,32 +517,34 @@ func Test_InFlightPods(t *testing.T) { { // it will be ignored because the hint fn returns QueueSkip that is weaker than queueHintReturnQueueImmediately from fooPlugin1. PluginName: "fooPlugin3", - QueueingHintFn: queueHintReturnQueueSkip, + QueueingHintFn: queueHintReturnSkip, }, { - // it will be ignored because the hint fn returns QueueAfterBackoff that is weaker than queueHintReturnQueueImmediately from fooPlugin1. + // it will be ignored because fooPlugin2 is registered in UnschedulablePlugins and it's interpreted as Queue that is weaker than QueueImmediately from fooPlugin1. PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, { - // The hint fn tells that this event makes a Pod scheudlable immediately. + // The hint fn tells that this event makes a Pod schedulable.
+ // Given fooPlugin1 is registered in PendingPlugins, we interpret Queue as queueImmediately. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "popped pod must have empty UnschedulablePlugins", + name: "popped pod must have empty UnschedulablePlugins and PendingPlugins", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod}, actions: []action{ {callback: func(t *testing.T, q *PriorityQueue) { poppedPod = popPod(t, q, pod) }}, {callback: func(t *testing.T, q *PriorityQueue) { logger, _ := ktesting.NewTestContext(t) - // Unschedulable. - poppedPod.UnschedulablePlugins = sets.New("fooPlugin1") + // Unschedulable due to PendingPlugins. + poppedPod.PendingPlugins = sets.New("fooPlugin1") + poppedPod.UnschedulablePlugins = sets.New("fooPlugin2") if err := q.AddUnschedulableIfNotPresent(logger, poppedPod, q.SchedulingCycle()); err != nil { t.Errorf("Unexpected error from AddUnschedulableIfNotPresent: %v", err) } @@ -568,7 +570,7 @@ func Test_InFlightPods(t *testing.T) { { // The hint fn tells that this event makes a Pod scheudlable immediately. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, @@ -694,9 +696,9 @@ func TestPop(t *testing.T) { "": { PvAdd: { { - // The hint fn tells that this event makes a Pod scheudlable immediately. + // The hint fn tells that this event makes a Pod schedulable. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, @@ -713,7 +715,8 @@ func TestPop(t *testing.T) { // Simulate failed attempt that makes the pod unschedulable. poppedPod := popPod(t, q, pod) - poppedPod.UnschedulablePlugins = sets.New("fooPlugin1") + // We register the plugin in PendingPlugins so that it's interpreted as queueImmediately and skip backoff.
+ poppedPod.PendingPlugins = sets.New("fooPlugin1") if err := q.AddUnschedulableIfNotPresent(logger, poppedPod, q.SchedulingCycle()); err != nil { t.Errorf("Unexpected error from AddUnschedulableIfNotPresent: %v", err) } @@ -723,8 +726,8 @@ func TestPop(t *testing.T) { // Now check result of Pop. poppedPod = popPod(t, q, pod) - if len(poppedPod.UnschedulablePlugins) > 0 { - t.Errorf("QueuedPodInfo from Pop should have empty UnschedulablePlugins, got instead: %+v", poppedPod) + if len(poppedPod.PendingPlugins) > 0 { + t.Errorf("QueuedPodInfo from Pop should have empty PendingPlugins, got instead: %+v", poppedPod) } }) } @@ -1221,7 +1224,7 @@ func BenchmarkMoveAllToActiveOrBackoffQueue(b *testing.B) { if (k+1)%(j+1) == 0 { m[""][events[j]] = append(m[""][events[j]], &QueueingHintFunction{ PluginName: plugins[k], - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }) } } @@ -1286,28 +1289,28 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing. 
expectedQ string }{ { - name: "QueueImmediately queues pod to activeQ", - podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p)}, - hint: queueHintReturnQueueImmediately, + name: "Queue queues pod to activeQ", + podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), PendingPlugins: sets.New("foo")}, + hint: queueHintReturnQueue, expectedQ: activeQ, }, { - name: "QueueAfterBackoff queues pod to backoffQ if Pod is backing off", - podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p)}, - hint: queueHintReturnQueueAfterBackoff, + name: "Queue queues pod to backoffQ if Pod is backing off", + podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}, + hint: queueHintReturnQueue, expectedQ: backoffQ, }, { - name: "QueueAfterBackoff queues pod to activeQ if Pod is not backing off", - podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p)}, - hint: queueHintReturnQueueAfterBackoff, + name: "Queue queues pod to activeQ if Pod is not backing off", + podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}, + hint: queueHintReturnQueue, duration: DefaultPodInitialBackoffDuration, // backoff is finished expectedQ: activeQ, }, { - name: "QueueSkip queues pod to unschedulablePods", - podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p)}, - hint: queueHintReturnQueueSkip, + name: "Skip queues pod to unschedulablePods", + podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")}, + hint: queueHintReturnSkip, expectedQ: unschedulablePods, }, } @@ -1362,7 +1365,7 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueue(t *testing.T) { m[""][NodeAdd] = []*QueueingHintFunction{ { PluginName: "fooPlugin", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, } q := NewTestQueue(ctx, newDefaultQueueSort(), WithClock(c), WithQueueingHintMapPerProfile(m)) @@ -1544,7 +1547,7 @@ func 
TestPriorityQueue_AssignedPodAdded(t *testing.T) { m[""][AssignedPodAdd] = []*QueueingHintFunction{ { PluginName: "fakePlugin", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, } q := NewTestQueue(ctx, newDefaultQueueSort(), WithClock(c), WithQueueingHintMapPerProfile(m)) @@ -2139,7 +2142,7 @@ func TestHighPriorityFlushUnschedulablePodsLeftover(t *testing.T) { m[""][NodeAdd] = []*QueueingHintFunction{ { PluginName: "fakePlugin", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, } logger, ctx := ktesting.NewTestContext(t) @@ -3180,18 +3183,14 @@ func mustNewPodInfo(pod *v1.Pod) *framework.PodInfo { // Test_isPodWorthRequeuing tests isPodWorthRequeuing function. func Test_isPodWorthRequeuing(t *testing.T) { count := 0 - queueHintReturnQueueImmediately := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { + queueHintReturnQueue := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { count++ - return framework.QueueImmediately, nil + return framework.Queue, nil } - queueHintReturnQueueSkip := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { + queueHintReturnSkip := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { count++ return framework.QueueSkip, nil } - queueHintReturnQueueAfterBackoff := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - count++ - return framework.QueueAfterBackoff, nil - } queueHintReturnErr := func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { count++ return framework.QueueSkip, fmt.Errorf("unexpected error") @@ -3203,12 +3202,12 @@ func Test_isPodWorthRequeuing(t *testing.T) { event framework.ClusterEvent oldObj interface{} newObj interface{} - expected 
framework.QueueingHint + expected queueingStrategy expectedExecutionCount int // expected total execution count of queueing hint function queueingHintMap QueueingHintMapPerProfile }{ { - name: "return QueueAfterBackoff when no queueing hint function is registered for the event", + name: "return Queue when no queueing hint function is registered for the event", podInfo: &framework.QueuedPodInfo{ UnschedulablePlugins: sets.New("fooPlugin1"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), @@ -3216,7 +3215,7 @@ func Test_isPodWorthRequeuing(t *testing.T) { event: NodeAdd, oldObj: nil, newObj: st.MakeNode().Obj(), - expected: framework.QueueSkip, + expected: queueSkip, expectedExecutionCount: 0, queueingHintMap: QueueingHintMapPerProfile{ "": { @@ -3225,14 +3224,14 @@ func Test_isPodWorthRequeuing(t *testing.T) { { // It will be ignored because the event is not NodeAdd. PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueImmediately, + QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "Treat the event as QueueAfterBackoff when QueueHintFn returns error", + name: "Treat the event as Queue when QueueHintFn returns error", podInfo: &framework.QueuedPodInfo{ UnschedulablePlugins: sets.New("fooPlugin1"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), @@ -3240,7 +3239,7 @@ func Test_isPodWorthRequeuing(t *testing.T) { event: NodeAdd, oldObj: nil, newObj: st.MakeNode().Obj(), - expected: framework.QueueAfterBackoff, + expected: queueAfterBackoff, expectedExecutionCount: 1, queueingHintMap: QueueingHintMapPerProfile{ "": { @@ -3254,7 +3253,7 @@ func Test_isPodWorthRequeuing(t *testing.T) { }, }, { - name: "return QueueAfterBackoff when the event is wildcard", + name: "return Queue when the event is wildcard", podInfo: &framework.QueuedPodInfo{ UnschedulablePlugins: sets.New("fooPlugin1"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), @@ 
-3262,78 +3261,40 @@ func Test_isPodWorthRequeuing(t *testing.T) { event: WildCardEvent, oldObj: nil, newObj: st.MakeNode().Obj(), - expected: framework.QueueAfterBackoff, + expected: queueAfterBackoff, expectedExecutionCount: 0, queueingHintMap: QueueingHintMapPerProfile{}, }, { - name: "QueueImmediately is the highest priority", + name: "interprets Queue from the Pending plugin as queueImmediately", podInfo: &framework.QueuedPodInfo{ - UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin2", "fooPlugin3", "fooPlugin4", "fooPlugin5"), + UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin3"), + PendingPlugins: sets.New("fooPlugin2"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), }, event: NodeAdd, oldObj: nil, - newObj: st.MakeNode().Obj(), - expected: framework.QueueImmediately, - expectedExecutionCount: 3, + newObj: st.MakeNode().Node, + expected: queueImmediately, + expectedExecutionCount: 2, queueingHintMap: QueueingHintMapPerProfile{ "": { NodeAdd: { { - // executed - PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + PluginName: "fooPlugin1", + // It returns Queue and it's interpreted as queueAfterBackoff. + // But, the function continues to run other hints because the Pod has PendingPlugins, which can result in queueImmediately. + QueueingHintFn: queueHintReturnQueue, }, { - // executed - PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnErr, - }, - { - // executed - // But, no more queueing hint function is executed - // because the highest priority is QueueImmediately. 
- PluginName: "fooPlugin3", - QueueingHintFn: queueHintReturnQueueImmediately, - }, - { - PluginName: "fooPlugin4", - QueueingHintFn: queueHintReturnQueueAfterBackoff, - }, - { - PluginName: "fooPlugin5", - QueueingHintFn: queueHintReturnQueueSkip, - }, - }, - }, - }, - }, - { - name: "QueueSkip is the lowest priority", - podInfo: &framework.QueuedPodInfo{ - UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin2", "fooPlugin3", "fooPlugin4"), - PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), - }, - event: NodeAdd, - oldObj: nil, - newObj: st.MakeNode().Obj(), - expected: framework.QueueAfterBackoff, - expectedExecutionCount: 4, - queueingHintMap: QueueingHintMapPerProfile{ - "": { - NodeAdd: { - { - PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, - }, - { - PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueSkip, + PluginName: "fooPlugin2", + // It's interpreted as queueImmediately. + // The function doesn't run other hints because queueImmediately is the highest priority. 
+ QueueingHintFn: queueHintReturnQueue, }, { PluginName: "fooPlugin3", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, { PluginName: "fooPlugin4", @@ -3344,7 +3305,7 @@ func Test_isPodWorthRequeuing(t *testing.T) { }, }, { - name: "Queueing hint function that isn't from the plugin, that is in the UnschedulablePlugins, is ignored", + name: "interprets Queue from the Unschedulable plugin as queueAfterBackoff", podInfo: &framework.QueuedPodInfo{ UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin2"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), @@ -3352,29 +3313,57 @@ func Test_isPodWorthRequeuing(t *testing.T) { event: NodeAdd, oldObj: nil, newObj: st.MakeNode().Obj(), - expected: framework.QueueAfterBackoff, + expected: queueAfterBackoff, expectedExecutionCount: 2, queueingHintMap: QueueingHintMapPerProfile{ "": { NodeAdd: { { + // Skip will be ignored PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnSkip, }, { + // Queue is interpreted as queueAfterBackoff because fooPlugin2 is in UnschedulablePlugins, not in PendingPlugins PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueSkip, - }, - { - PluginName: "fooPlugin3", - QueueingHintFn: queueHintReturnQueueImmediately, // It'll be ignored.
+ QueueingHintFn: queueHintReturnQueue, }, }, }, }, }, { - name: "If event is specific Node update event, queueing hint function for NodeUpdate/UpdateNodeLabel is executed", + name: "Queueing hint function that isn't from the plugin in UnschedulablePlugins/PendingPlugins is ignored", + podInfo: &framework.QueuedPodInfo{ + UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin2"), + PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), + }, + event: NodeAdd, + oldObj: nil, + newObj: st.MakeNode().Node, + expected: queueSkip, + expectedExecutionCount: 2, + queueingHintMap: QueueingHintMapPerProfile{ + "": { + NodeAdd: { + { + PluginName: "fooPlugin1", + QueueingHintFn: queueHintReturnSkip, + }, + { + PluginName: "fooPlugin2", + QueueingHintFn: queueHintReturnSkip, + }, + { + PluginName: "fooPlugin3", + QueueingHintFn: queueHintReturnQueue, // It'll be ignored. + }, + }, + }, + }, + }, + { + name: "If event is specific Node update event, queueing hint function for NodeUpdate/UpdateNodeLabel is also executed", podInfo: &framework.QueuedPodInfo{ UnschedulablePlugins: sets.New("fooPlugin1", "fooPlugin2"), PodInfo: mustNewPodInfo(st.MakePod().Name("pod1").Namespace("ns1").UID("1").Obj()), @@ -3382,30 +3371,32 @@ func Test_isPodWorthRequeuing(t *testing.T) { event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}, oldObj: nil, newObj: st.MakeNode().Obj(), - expected: framework.QueueAfterBackoff, - expectedExecutionCount: 3, + expected: queueAfterBackoff, + expectedExecutionCount: 1, queueingHintMap: QueueingHintMapPerProfile{ "": { framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeLabel}: { { - PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + PluginName: "fooPlugin1", + // It's only executed and interpreted as queueAfterBackoff. + // The function doesn't run other hints because this Pod doesn't have PendingPlugins. 
+ QueueingHintFn: queueHintReturnQueue, }, { PluginName: "fooPlugin2", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, }, framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Update}: { { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, }, NodeAdd: { // not executed because NodeAdd is unrelated. { PluginName: "fooPlugin1", - QueueingHintFn: queueHintReturnQueueAfterBackoff, + QueueingHintFn: queueHintReturnQueue, }, }, }, diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 89a4e0ae1c1..33a1a7a89c6 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -211,10 +211,10 @@ func (sched *Scheduler) schedulingCycle( NumAllNodes: 1, Pod: pod, Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts}, - UnschedulablePlugins: sets.New(sts.FailedPlugin()), + NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts}, }, } + fitErr.Diagnosis.SetFailedPlugin(sts) return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(sts.Code()).WithError(fitErr) } return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts @@ -234,10 +234,10 @@ func (sched *Scheduler) schedulingCycle( NumAllNodes: 1, Pod: pod, Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus}, - UnschedulablePlugins: sets.New(runPermitStatus.FailedPlugin()), + NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus}, }, } + fitErr.Diagnosis.SetFailedPlugin(runPermitStatus) return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(runPermitStatus.Code()).WithError(fitErr) } @@ -435,8 +435,7 @@ func (sched *Scheduler) schedulePod(ctx context.Context, fwk 
framework.Framework func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) { logger := klog.FromContext(ctx) diagnosis := framework.Diagnosis{ - NodeToStatusMap: make(framework.NodeToStatusMap), - UnschedulablePlugins: sets.New[string](), + NodeToStatusMap: make(framework.NodeToStatusMap), } allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List() @@ -459,10 +458,7 @@ func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.F msg := s.Message() diagnosis.PreFilterMsg = msg logger.V(5).Info("Status after running PreFilter plugins for pod", "pod", klog.KObj(pod), "status", msg) - // Status satisfying IsUnschedulable() gets injected into diagnosis.UnschedulablePlugins. - if s.FailedPlugin() != "" { - diagnosis.UnschedulablePlugins.Insert(s.FailedPlugin()) - } + diagnosis.SetFailedPlugin(s) return nil, diagnosis, nil } @@ -490,7 +486,7 @@ func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.F nodes = append(nodes, nInfo) } } - feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, nodes) + feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, nodes) // always try to update the sched.nextStartNodeIndex regardless of whether an error has occurred // this is helpful to make sure that all the nodes have a chance to be searched processedNodes := len(feasibleNodes) + len(diagnosis.NodeToStatusMap) @@ -513,7 +509,7 @@ func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, return nil, err } node := []*framework.NodeInfo{nodeInfo} - feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, node) + feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, node) if err != nil { return nil, err } @@ -532,7 +528,7 @@ func (sched *Scheduler) findNodesThatPassFilters( fwk 
framework.Framework, state *framework.CycleState, pod *v1.Pod, - diagnosis framework.Diagnosis, + diagnosis *framework.Diagnosis, nodes []*framework.NodeInfo) ([]*v1.Node, error) { numAllNodes := len(nodes) numNodesToFind := sched.numFeasibleNodesToFind(fwk.PercentageOfNodesToScore(), int32(numAllNodes)) @@ -573,7 +569,7 @@ func (sched *Scheduler) findNodesThatPassFilters( } else { statusesLock.Lock() diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status - diagnosis.UnschedulablePlugins.Insert(status.FailedPlugin()) + diagnosis.SetFailedPlugin(status) statusesLock.Unlock() } } @@ -982,6 +978,7 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod)) } else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently. podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins + podInfo.PendingPlugins = fitError.Diagnosis.PendingPlugins logger.V(2).Info("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", errMsg) } else if apierrors.IsNotFound(err) { logger.V(2).Info("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", errMsg) diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 635335855d5..d7752001ec1 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -363,9 +363,9 @@ func New(ctx context.Context, } // defaultQueueingHintFn is the default queueing hint function. -// It always returns QueueAfterBackoff as the queueing hint. +// It always returns Queue as the queueing hint. 
var defaultQueueingHintFn = func(_ klog.Logger, _ *v1.Pod, _, _ interface{}) (framework.QueueingHint, error) { - return framework.QueueAfterBackoff, nil + return framework.Queue, nil } func buildQueueingHintMap(es []framework.EnqueueExtensions) internalqueue.QueueingHintMap { diff --git a/test/integration/scheduler/queue_test.go b/test/integration/scheduler/queue_test.go index 88c96eda786..789dc1ce217 100644 --- a/test/integration/scheduler/queue_test.go +++ b/test/integration/scheduler/queue_test.go @@ -544,7 +544,7 @@ func TestRequeueByPermitRejection(t *testing.T) { frameworkHandler: fh, schedulingHint: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { queueingHintCalledCounter++ - return framework.QueueImmediately, nil + return framework.Queue, nil }, } return fakePermit, nil diff --git a/test/integration/scheduler/rescheduling_test.go b/test/integration/scheduler/rescheduling_test.go index 049cde2ea14..f2abaffab59 100644 --- a/test/integration/scheduler/rescheduling_test.go +++ b/test/integration/scheduler/rescheduling_test.go @@ -71,7 +71,7 @@ func (rp *ReservePlugin) EventsToRegister() []framework.ClusterEventWithHint { { Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add}, QueueingHintFn: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - return framework.QueueImmediately, nil + return framework.Queue, nil }, }, } @@ -108,7 +108,7 @@ func (pp *PermitPlugin) EventsToRegister() []framework.ClusterEventWithHint { { Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add}, QueueingHintFn: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { - return framework.QueueImmediately, nil + return framework.Queue, nil }, }, }