diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go
index 02a1cf3a895..70c7504834b 100644
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -467,9 +467,6 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return true, nil
 	}
 
-	// retrieve the previous number of retry
-	previousRetry := jm.queue.NumRequeues(key)
-
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -506,7 +503,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 	// is different than parallelism, otherwise the previous controller loop
 	// failed updating status so even if we pick up failure it is not a new one
 	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
-		(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
+		(failed > *job.Spec.BackoffLimit)
 
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
diff --git a/pkg/controller/job/job_controller_test.go b/pkg/controller/job/job_controller_test.go
index a6544e79092..62c5ad611e1 100644
--- a/pkg/controller/job/job_controller_test.go
+++ b/pkg/controller/job/job_controller_test.go
@@ -1521,3 +1521,107 @@ func TestJobBackoffForOnFailure(t *testing.T) {
 		})
 	}
 }
+
+func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
+	jobConditionFailed := batch.JobFailed
+
+	testCases := map[string]struct {
+		// job setup
+		parallelism  int32
+		completions  int32
+		backoffLimit int32
+
+		// pod setup
+		activePodsPhase v1.PodPhase
+		activePods      int32
+		failedPods      int32
+
+		// expectations
+		isExpectingAnError      bool
+		jobKeyForget            bool
+		expectedActive          int32
+		expectedSucceeded       int32
+		expectedFailed          int32
+		expectedCondition       *batch.JobConditionType
+		expectedConditionReason string
+	}{
+		"not enough failures with backoffLimit 0 - single pod": {
+			1, 1, 0,
+			v1.PodRunning, 1, 0,
+			false, true, 1, 0, 0, nil, "",
+		},
+		"not enough failures with backoffLimit 1 - single pod": {
+			1, 1, 1,
+			"", 0, 1,
+			true, false, 1, 0, 1, nil, "",
+		},
+		"too many failures with backoffLimit 1 - single pod": {
+			1, 1, 1,
+			"", 0, 2,
+			false, true, 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded",
+		},
+		"not enough failures with backoffLimit 6 - multiple pods": {
+			2, 2, 6,
+			v1.PodRunning, 1, 6,
+			true, false, 2, 0, 6, nil, "",
+		},
+		"too many failures with backoffLimit 6 - multiple pods": {
+			2, 2, 6,
+			"", 0, 7,
+			false, true, 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded",
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			// job manager setup
+			clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
+			manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
+			fakePodControl := controller.FakePodControl{}
+			manager.podControl = &fakePodControl
+			manager.podStoreSynced = alwaysReady
+			manager.jobStoreSynced = alwaysReady
+			var actual *batch.Job
+			manager.updateHandler = func(job *batch.Job) error {
+				actual = job
+				return nil
+			}
+
+			// job & pods setup
+			job := newJob(tc.parallelism, tc.completions, tc.backoffLimit)
+			job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
+			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
+			podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer()
+			for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) {
+				podIndexer.Add(&pod)
+			}
+			for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) {
+				podIndexer.Add(&pod)
+			}
+
+			// run
+			forget, err := manager.syncJob(testutil.GetKey(job, t))
+
+			if (err != nil) != tc.isExpectingAnError {
+				t.Errorf("unexpected error syncing job. Got %#v, isExpectingAnError: %v\n", err, tc.isExpectingAnError)
+			}
+			if forget != tc.jobKeyForget {
+				t.Errorf("unexpected forget value. Expected %v, saw %v\n", tc.jobKeyForget, forget)
+			}
+			// validate status
+			if actual.Status.Active != tc.expectedActive {
+				t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active)
+			}
+			if actual.Status.Succeeded != tc.expectedSucceeded {
+				t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded)
+			}
+			if actual.Status.Failed != tc.expectedFailed {
+				t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed)
+			}
+			// validate conditions
+			if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) {
+				t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions)
+			}
+		})
+	}
+}
diff --git a/test/e2e/apps/job.go b/test/e2e/apps/job.go
index 9c9c23fe6ea..352da6907b1 100644
--- a/test/e2e/apps/job.go
+++ b/test/e2e/apps/job.go
@@ -246,13 +246,7 @@ var _ = SIGDescribe("Job", func() {
 		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
 		pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
 		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
-		// gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
-		// due to NumRequeus not being stable enough, especially with failed status
-		// updates we need to allow more than backoff+1
-		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
-		if len(pods.Items) < backoff+1 {
-			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
-		}
+		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
 		for _, pod := range pods.Items {
 			framework.ExpectEqual(pod.Status.Phase, v1.PodFailed)
 		}
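
For reference, a minimal standalone sketch of the check this patch changes; the helper below and its scalar parameters are illustrative only, not the controller's actual signature. With restartPolicy: Never, the controller now fails the Job once the observed count of failed pods exceeds spec.backoffLimit, instead of deriving the retry count from the workqueue's NumRequeues, which was not stable across failed status updates (the e2e workaround removed above references kubernetes/kubernetes#64787).

package main

import "fmt"

// exceedsBackoffLimit mirrors the updated condition in syncJob.
// jobHaveNewFailure, active, failed, parallelism, and backoffLimit
// correspond to the controller's local variables and spec fields,
// passed here as plain values purely for illustration.
func exceedsBackoffLimit(jobHaveNewFailure bool, active, parallelism, failed, backoffLimit int32) bool {
	return jobHaveNewFailure && (active != parallelism) &&
		(failed > backoffLimit)
}

func main() {
	// backoffLimit=1, two failed pods: limit exceeded, matching the
	// "too many failures with backoffLimit 1 - single pod" test case.
	fmt.Println(exceedsBackoffLimit(true, 0, 1, 2, 1)) // true
	// backoffLimit=1, one failed pod: still within the limit.
	fmt.Println(exceedsBackoffLimit(true, 0, 1, 1, 1)) // false
}

Because failed comes from counting pods in the informer cache rather than from requeue bookkeeping, the e2e test above can once again assert an exact pod count of backoff+1.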