Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-23 11:50:44 +00:00
Fix the job's backoff limit for restart policy Never: rely on the number of pod failures instead of the queue's NumRequeues count
This commit is contained in:
parent
6ce9e71cd5
commit
ca420ddada
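The gist of the change: with RestartPolicy Never, every Job retry surfaces as a new failed pod, so the failed-pod count recorded in the Job status tracks retries directly. The workqueue's NumRequeues counter, by contrast, can change for reasons other than pod failures (it grows on any rate-limited requeue and resets when the key is forgotten), so it is an unreliable retry count. Below is a minimal runnable sketch of the condition this commit moves to; the names follow the diff, but the surrounding controller state is stubbed out with plain values:

	package main

	import "fmt"

	// exceedsBackoffLimit mirrors the check introduced in syncJob: compare the
	// number of failed pods directly against the job's BackoffLimit instead of
	// the workqueue's requeue counter. In the real controller these values come
	// from the pod store and the job spec; here they are passed in directly.
	func exceedsBackoffLimit(jobHaveNewFailure bool, active, parallelism, failed, backoffLimit int32) bool {
		return jobHaveNewFailure && (active != parallelism) && (failed >= backoffLimit)
	}

	func main() {
		// Mirrors the "too many failures with backoffLimit 1 - single pod" test
		// case below: one failed pod against backoffLimit=1 fails the job.
		fmt.Println(exceedsBackoffLimit(true, 0, 1, 1, 1)) // true

		// One pod still running and no failures yet: the job stays under the limit.
		fmt.Println(exceedsBackoffLimit(false, 1, 1, 0, 1)) // false
	}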
@@ -467,9 +467,6 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return true, nil
 	}
 
-	// retrieve the previous number of retry
-	previousRetry := jm.queue.NumRequeues(key)
-
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -506,7 +503,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 	// is different than parallelism, otherwise the previous controller loop
 	// failed updating status so even if we pick up failure it is not a new one
 	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
-		(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
+		(failed >= *job.Spec.BackoffLimit)
 
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
@@ -1521,3 +1521,107 @@ func TestJobBackoffForOnFailure(t *testing.T) {
 		})
 	}
 }
+
+func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
+	jobConditionFailed := batch.JobFailed
+
+	testCases := map[string]struct {
+		// job setup
+		parallelism  int32
+		completions  int32
+		backoffLimit int32
+
+		// pod setup
+		activePodsPhase v1.PodPhase
+		activePods      int32
+		failedPods      int32
+
+		// expectations
+		isExpectingAnError      bool
+		jobKeyForget            bool
+		expectedActive          int32
+		expectedSucceeded       int32
+		expectedFailed          int32
+		expectedCondition       *batch.JobConditionType
+		expectedConditionReason string
+	}{
+		"not enough failures with backoffLimit 0 - single pod": {
+			1, 1, 0,
+			v1.PodRunning, 1, 0,
+			false, true, 1, 0, 0, nil, "",
+		},
+		"not enough failures with backoffLimit 1 - single pod": {
+			1, 1, 1,
+			v1.PodPending, 1, 0,
+			false, true, 1, 0, 0, nil, "",
+		},
+		"too many failures with backoffLimit 1 - single pod": {
+			1, 1, 1,
+			"", 0, 1,
+			false, true, 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded",
+		},
+		"not enough failures with backoffLimit 6 - multiple pods": {
+			2, 2, 6,
+			v1.PodRunning, 1, 5,
+			true, false, 2, 0, 5, nil, "",
+		},
+		"too many failures with backoffLimit 6 - multiple pods": {
+			2, 2, 6,
+			"", 0, 6,
+			false, true, 0, 0, 6, &jobConditionFailed, "BackoffLimitExceeded",
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			// job manager setup
+			clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
+			manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
+			fakePodControl := controller.FakePodControl{}
+			manager.podControl = &fakePodControl
+			manager.podStoreSynced = alwaysReady
+			manager.jobStoreSynced = alwaysReady
+			var actual *batch.Job
+			manager.updateHandler = func(job *batch.Job) error {
+				actual = job
+				return nil
+			}
+
+			// job & pods setup
+			job := newJob(tc.parallelism, tc.completions, tc.backoffLimit)
+			job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
+			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
+			podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer()
+			for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) {
+				podIndexer.Add(&pod)
+			}
+			for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) {
+				podIndexer.Add(&pod)
+			}
+
+			// run
+			forget, err := manager.syncJob(testutil.GetKey(job, t))
+
+			if (err != nil) != tc.isExpectingAnError {
+				t.Errorf("unexpected error syncing job. Got %#v, isExpectingAnError: %v\n", err, tc.isExpectingAnError)
+			}
+			if forget != tc.jobKeyForget {
+				t.Errorf("unexpected forget value. Expected %v, saw %v\n", tc.jobKeyForget, forget)
+			}
+			// validate status
+			if actual.Status.Active != tc.expectedActive {
+				t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active)
+			}
+			if actual.Status.Succeeded != tc.expectedSucceeded {
+				t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded)
+			}
+			if actual.Status.Failed != tc.expectedFailed {
+				t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed)
+			}
+			// validate conditions
+			if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) {
+				t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions)
+			}
+		})
+	}
+}
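A note on the design: the OnFailure path is still handled by pastBackoffLimitOnFailure, which the second hunk keeps and which inspects container restart counts, while the new failed >= BackoffLimit comparison covers Never, where each retry is a distinct failed pod. For context, here is a sketch of how a failed-pod count can be tallied from the informer's pod list, matching how the test seeds pods via newPodList; countPodsInPhase is an illustrative helper and not code from this commit:

	package main

	import (
		"fmt"

		v1 "k8s.io/api/core/v1"
	)

	// countPodsInPhase tallies pods whose status matches the given phase. The
	// real controller derives its failed count from the pod store in a similar
	// way; this helper only illustrates the idea.
	func countPodsInPhase(pods []v1.Pod, phase v1.PodPhase) int32 {
		var n int32
		for _, p := range pods {
			if p.Status.Phase == phase {
				n++
			}
		}
		return n
	}

	func main() {
		pods := []v1.Pod{
			{Status: v1.PodStatus{Phase: v1.PodFailed}},
			{Status: v1.PodStatus{Phase: v1.PodRunning}},
		}
		fmt.Println(countPodsInPhase(pods, v1.PodFailed)) // 1
	}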