Merge pull request #50106 from julia-stripe/improve-scheduler-error-handling

Automatic merge from submit-queue Retry scheduling pods after errors more consistently in scheduler **What this PR does / why we need it**: This fixes 2 places in the scheduler where pods can get stuck in Pending forever. In both these places, errors happen and `sched.config.Error` is not called afterwards. This is a problem because `sched.config.Error` is responsible for requeuing pods to retry scheduling when there are issues (see [here](2540b333b2/plugin/pkg/scheduler/factory/factory.go (L958))), so if we don't call `sched.config.Error` then the pod will never get scheduled (unless the scheduler is restarted). One of these (where it returns when `ForgetPod` fails instead of continuing and reporting an error) is a regression from [this refactor](https://github.com/kubernetes/kubernetes/commit/ecb962e6585#diff-67f2b61521299ca8d8687b0933bbfb19L234), and with the [old behavior](80f26fa8a8/plugin/pkg/scheduler/scheduler.go (L233-L237)) the error was reported correctly. As far as I can tell changing the error handling in that refactor wasn't intentional. When AssumePod fails there's never been an error reported but I think adding this will help the scheduler recover when something goes wrong instead of letting pods possibly never get scheduled. This will help prevent issues like https://github.com/kubernetes/kubernetes/issues/49314 in the future. **Release note**: ```release-note Fix incorrect retry logic in scheduler ```
2025-07-23 11:50:44 +00:00 · 2017-08-07 01:35:17 -07:00 · 2017-08-07 01:35:17 -07:00 · bc7ccfe93b
commit bc7ccfe93b
parent c75d3028dd 2d9c6dfae8
1 changed files with 18 additions and 12 deletions
--- a/plugin/pkg/scheduler/scheduler.go
+++ b/plugin/pkg/scheduler/scheduler.go
@ -17,7 +17,6 @@ limitations under the License.
 package scheduler
 import (
 	"fmt"
 	"time"
 	"k8s.io/api/core/v1"
@ -194,12 +193,20 @@ func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
 	assumed.Spec.NodeName = host
 	if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil {
 		glog.Errorf("scheduler cache AssumePod failed: %v", err)
-		// TODO: This means that a given pod is already in cache (which means it
+
-		// is either assumed or already added). This is most probably result of a
+		// This is most probably result of a BUG in retrying logic.
-		// BUG in retrying logic. As a temporary workaround (which doesn't fully
+		// We report an error here so that pod scheduling can be retried.
-		// fix the problem, but should reduce its impact), we simply return here,
+		// This relies on the fact that Error will check if the pod has been bound
-		// as binding doesn't make sense anyway.
+		// to a node and if so will not add it back to the unscheduled pods queue
-		// This should be fixed properly though.
+		// (otherwise this would cause an infinite loop).
 		sched.config.Error(assumed, err)
 		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err)
 		sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
 			Type:    v1.PodScheduled,
 			Status:  v1.ConditionFalse,
 			Reason:  "SchedulerError",
 			Message: err.Error(),
 		})
 		return err
 	}
@ -219,10 +226,13 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 	// If binding succeeded then PodScheduled condition will be updated in apiserver so that
 	// it's atomic with setting host.
 	err := sched.config.Binder.Bind(b)
 	if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
 		glog.Errorf("scheduler cache FinishBinding failed: %v", err)
 	}
 	if err != nil {
 		glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name)
 		if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil {
-			return fmt.Errorf("scheduler cache ForgetPod failed: %v", err)
+			glog.Errorf("scheduler cache ForgetPod failed: %v", err)
 		}
 		sched.config.Error(assumed, err)
 		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
@ -234,10 +244,6 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 		return err
 	}
 	if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
 		return fmt.Errorf("scheduler cache FinishBinding failed: %v", err)
 	}
 	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
 	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", assumed.Name, b.Target.Name)
 	return nil