diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index 56d4ec20bab..9e787d1ed53 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -642,20 +642,19 @@ func (p *PriorityQueue) SchedulingCycle() int64 { // determineSchedulingHintForInFlightPod looks at the unschedulable plugins of the given Pod // and determines the scheduling hint for this Pod while checking the events that happened during in-flight. func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) queueingStrategy { - events, err := p.activeQ.clusterEventsForPod(logger, pInfo) - if err != nil { - logger.Error(err, "Error getting cluster events for pod", "pod", klog.KObj(pInfo.Pod)) - return queueAfterBackoff - } - - rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) - if len(rejectorPlugins) == 0 { + if len(pInfo.UnschedulablePlugins) == 0 && len(pInfo.PendingPlugins) == 0 { // No failed plugins are associated with this Pod. // Meaning something unusual (a temporal failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue. // In this case, we should retry scheduling it because this Pod may not be retried until the next flush. return queueAfterBackoff } + events, err := p.activeQ.clusterEventsForPod(logger, pInfo) + if err != nil { + logger.Error(err, "Error getting cluster events for pod", "pod", klog.KObj(pInfo.Pod)) + return queueAfterBackoff + } + // check if there is an event that makes this Pod schedulable based on pInfo.UnschedulablePlugins. 
queueingStrategy := queueSkip for _, e := range events { diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 4743306ca63..fd7cb9c3515 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -239,8 +239,12 @@ type QueuedPodInfo struct { // It shouldn't be updated once initialized. It's used to record the e2e scheduling // latency for a pod. InitialAttemptTimestamp *time.Time - // UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status. - // It's registered only when the Pod is rejected in PreFilter, Filter, Reserve, PreBind or Permit (WaitOnPermit). + // UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status + // at specific extension points: PreFilter, Filter, Reserve, Permit (WaitOnPermit), or PreBind. + // If Pods are rejected at other extension points, + // they're assumed to be unexpected errors (e.g., temporary network issue, plugin implementation issue, etc.) + // and retried soon after a backoff period. + // That is because such failures could be solved regardless of incoming cluster events (registered in EventsToRegister). UnschedulablePlugins sets.Set[string] // PendingPlugins records the plugin names that the Pod failed with Pending status. PendingPlugins sets.Set[string] diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 847336def49..513fe5e029a 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -126,9 +126,6 @@ func (sched *Scheduler) ScheduleOne(ctx context.Context) { sched.handleBindingCycleError(bindingCycleCtx, state, fwk, assumedPodInfo, start, scheduleResult, status) return } - // Usually, DonePod is called inside the scheduling queue, - // but in this case, we need to call it here because this Pod won't go back to the scheduling queue. 
- sched.SchedulingQueue.Done(assumedPodInfo.Pod.UID) }() } @@ -309,6 +306,13 @@ func (sched *Scheduler) bindingCycle( return status } + // Any failures after this point cannot lead to the Pod being considered unschedulable. + // We define the Pod as "unschedulable" only when Pods are rejected at specific extension points, and PreBind is the last one in the scheduling/binding cycle. + // + // We can call Done() here because + // we can free the cluster events stored in the scheduling queue sooner, which is worthwhile for busy clusters in terms of memory consumption. + sched.SchedulingQueue.Done(assumedPod.UID) + // Run "bind" plugins. if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() { return status