Merge pull request #93779 from yodarshafrir1/fix_restart_job_failure_with_restart_policy_never

Fix job backoff limit for restart policy Never
Commit f30d6a463d by Kubernetes Prow Robot, 2020-10-06 10:02:44 -07:00 (committed by GitHub)
3 changed files with 106 additions and 11 deletions
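
For context, a minimal sketch (not part of the diff; the name, image, and command are illustrative) of the kind of Job this change targets: restartPolicy Never with a finite backoffLimit, where every failed attempt produces a new failed pod rather than an in-place container restart.

// Illustrative example, not code from this PR.
package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func int32Ptr(i int32) *int32 { return &i }

func main() {
	backoffLimit := int32(1)
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "backoff-never-demo"},
		Spec: batchv1.JobSpec{
			Parallelism:  int32Ptr(1),
			Completions:  int32Ptr(1),
			BackoffLimit: &backoffLimit,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{{
						Name:    "fail",
						Image:   "busybox",
						Command: []string{"sh", "-c", "exit 1"}, // always fails
					}},
				},
			},
		},
	}
	// Before this fix the controller compared the work queue's requeue count
	// against backoffLimit; afterwards it compares the number of failed pods,
	// so a job like this should be marked Failed after backoffLimit+1 pod failures.
	fmt.Printf("%s should fail after %d failed pods\n", job.Name, backoffLimit+1)
}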

pkg/controller/job/job_controller.go

@@ -467,9 +467,6 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return true, nil
 	}
-	// retrieve the previous number of retry
-	previousRetry := jm.queue.NumRequeues(key)
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -506,7 +503,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 	// is different than parallelism, otherwise the previous controller loop
 	// failed updating status so even if we pick up failure it is not a new one
 	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
-		(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
+		(failed > *job.Spec.BackoffLimit)
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
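
With restartPolicy Never, a failed attempt is never restarted in place; every retry surfaces as an additional failed pod, so the failed-pod count is a direct measure of how many retries the job has consumed. The old check derived that count from the work queue's requeue counter, which is not stable across status-update failures and resyncs. A rough standalone restatement of the new check, using simplified names that are not part of the controller:

// exceedsBackoffLimitForNever is an illustrative restatement of the
// comparison above, not a helper that exists in the controller.
func exceedsBackoffLimitForNever(failedPods, active, parallelism, backoffLimit int32, haveNewFailure bool) bool {
	// Only treat the failure as new while the number of active pods differs
	// from the requested parallelism; otherwise the previous controller loop
	// merely failed to update status and the failure is not a new one.
	return haveNewFailure && active != parallelism && failedPods > backoffLimit
}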

pkg/controller/job/job_controller_test.go

@@ -1521,3 +1521,107 @@ func TestJobBackoffForOnFailure(t *testing.T) {
		})
	}
}

func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
	jobConditionFailed := batch.JobFailed

	testCases := map[string]struct {
		// job setup
		parallelism  int32
		completions  int32
		backoffLimit int32

		// pod setup
		activePodsPhase v1.PodPhase
		activePods      int32
		failedPods      int32

		// expectations
		isExpectingAnError      bool
		jobKeyForget            bool
		expectedActive          int32
		expectedSucceeded       int32
		expectedFailed          int32
		expectedCondition       *batch.JobConditionType
		expectedConditionReason string
	}{
		"not enough failures with backoffLimit 0 - single pod": {
			1, 1, 0,
			v1.PodRunning, 1, 0,
			false, true, 1, 0, 0, nil, "",
		},
		"not enough failures with backoffLimit 1 - single pod": {
			1, 1, 1,
			"", 0, 1,
			true, false, 1, 0, 1, nil, "",
		},
		"too many failures with backoffLimit 1 - single pod": {
			1, 1, 1,
			"", 0, 2,
			false, true, 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded",
		},
		"not enough failures with backoffLimit 6 - multiple pods": {
			2, 2, 6,
			v1.PodRunning, 1, 6,
			true, false, 2, 0, 6, nil, "",
		},
		"too many failures with backoffLimit 6 - multiple pods": {
			2, 2, 6,
			"", 0, 7,
			false, true, 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded",
		},
	}

	for name, tc := range testCases {
		t.Run(name, func(t *testing.T) {
			// job manager setup
			clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
			manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
			fakePodControl := controller.FakePodControl{}
			manager.podControl = &fakePodControl
			manager.podStoreSynced = alwaysReady
			manager.jobStoreSynced = alwaysReady
			var actual *batch.Job
			manager.updateHandler = func(job *batch.Job) error {
				actual = job
				return nil
			}

			// job & pods setup
			job := newJob(tc.parallelism, tc.completions, tc.backoffLimit)
			job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
			podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer()
			for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) {
				podIndexer.Add(&pod)
			}
			for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) {
				podIndexer.Add(&pod)
			}

			// run
			forget, err := manager.syncJob(testutil.GetKey(job, t))

			if (err != nil) != tc.isExpectingAnError {
				t.Errorf("unexpected error syncing job. Got %#v, isExpectingAnError: %v\n", err, tc.isExpectingAnError)
			}
			if forget != tc.jobKeyForget {
				t.Errorf("unexpected forget value. Expected %v, saw %v\n", tc.jobKeyForget, forget)
			}
			// validate status
			if actual.Status.Active != tc.expectedActive {
				t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active)
			}
			if actual.Status.Succeeded != tc.expectedSucceeded {
				t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded)
			}
			if actual.Status.Failed != tc.expectedFailed {
				t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed)
			}
			// validate conditions
			if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) {
				t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions)
			}
		})
	}
}
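
When the limit is exceeded, the cases above expect the controller to record a Job condition of type Failed with reason BackoffLimitExceeded, which is what the getCondition helper matches on. Roughly, the condition has this shape (a sketch using the same import aliases as this test file; field values other than Type and Reason are illustrative, not asserted by the test):

// Sketch of the condition the test looks for; Status and Message are illustrative.
expected := batch.JobCondition{
	Type:    batch.JobFailed,
	Status:  v1.ConditionTrue,
	Reason:  "BackoffLimitExceeded",
	Message: "Job has reached the specified backoff limit",
}
_ = expected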

test/e2e/apps/job.go

@@ -246,13 +246,7 @@ var _ = SIGDescribe("Job", func() {
 		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
 		pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
 		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
-		// gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
-		// due to NumRequeus not being stable enough, especially with failed status
-		// updates we need to allow more than backoff+1
-		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
-		if len(pods.Items) < backoff+1 {
-			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
-		}
+		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
 		for _, pod := range pods.Items {
 			framework.ExpectEqual(pod.Status.Phase, v1.PodFailed)
 		}