Merge pull request #50106 from julia-stripe/improve-scheduler-error-handling

Automatic merge from submit-queue Retry scheduling pods after errors more consistently in scheduler **What this PR does / why we need it**: This fixes 2 places in the scheduler where pods can get stuck in Pending forever. In both these places, errors happen and `sched.config.Error` is not called afterwards. This is a problem because `sched.config.Error` is responsible for requeuing pods to retry scheduling when there are issues (see [here](2540b333b2/plugin/pkg/scheduler/factory/factory.go (L958))), so if we don't call `sched.config.Error` then the pod will never get scheduled (unless the scheduler is restarted). One of these (where it returns when `ForgetPod` fails instead of continuing and reporting an error) is a regression from [this refactor](https://github.com/kubernetes/kubernetes/commit/ecb962e6585#diff-67f2b61521299ca8d8687b0933bbfb19L234), and with the [old behavior](80f26fa8a8/plugin/pkg/scheduler/scheduler.go (L233-L237)) the error was reported correctly. As far as I can tell changing the error handling in that refactor wasn't intentional. When AssumePod fails there's never been an error reported but I think adding this will help the scheduler recover when something goes wrong instead of letting pods possibly never get scheduled. This will help prevent issues like https://github.com/kubernetes/kubernetes/issues/49314 in the future. **Release note**: ```release-note Fix incorrect retry logic in scheduler ```
2025-07-29 14:37:00 +00:00 · 2017-08-07 01:35:17 -07:00 · 2017-08-07 01:35:17 -07:00 · bc7ccfe93b
commit bc7ccfe93b
parent c75d3028dd 2d9c6dfae8
1 changed files with 18 additions and 12 deletions
--- a/plugin/pkg/scheduler/scheduler.go
+++ b/plugin/pkg/scheduler/scheduler.go
@ -17,7 +17,6 @@ limitations under the License.
 package scheduler

 import (
-	"fmt"
 	"time"

 	"k8s.io/api/core/v1"
@ -194,12 +193,20 @@ func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
 	assumed.Spec.NodeName = host
 	if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil {
 		glog.Errorf("scheduler cache AssumePod failed: %v", err)
-		// TODO: This means that a given pod is already in cache (which means it
-		// is either assumed or already added). This is most probably result of a
-		// BUG in retrying logic. As a temporary workaround (which doesn't fully
-		// fix the problem, but should reduce its impact), we simply return here,
-		// as binding doesn't make sense anyway.
-		// This should be fixed properly though.
+
+		// This is most probably result of a BUG in retrying logic.
+		// We report an error here so that pod scheduling can be retried.
+		// This relies on the fact that Error will check if the pod has been bound
+		// to a node and if so will not add it back to the unscheduled pods queue
+		// (otherwise this would cause an infinite loop).
+		sched.config.Error(assumed, err)
+		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err)
+		sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
+			Type:    v1.PodScheduled,
+			Status:  v1.ConditionFalse,
+			Reason:  "SchedulerError",
+			Message: err.Error(),
+		})
 		return err
 	}

@ -219,10 +226,13 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 	// If binding succeeded then PodScheduled condition will be updated in apiserver so that
 	// it's atomic with setting host.
 	err := sched.config.Binder.Bind(b)
+	if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
+		glog.Errorf("scheduler cache FinishBinding failed: %v", err)
+	}
 	if err != nil {
 		glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name)
 		if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil {
-			return fmt.Errorf("scheduler cache ForgetPod failed: %v", err)
+			glog.Errorf("scheduler cache ForgetPod failed: %v", err)
 		}
 		sched.config.Error(assumed, err)
 		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
@ -234,10 +244,6 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 		return err
 	}

-	if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
-		return fmt.Errorf("scheduler cache FinishBinding failed: %v", err)
-	}
-
 	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
 	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", assumed.Name, b.Target.Name)
 	return nil