diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go
index 59206323e33..70c7504834b 100644
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -503,7 +503,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 	// is different than parallelism, otherwise the previous controller loop
 	// failed updating status so even if we pick up failure it is not a new one
 	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
-		(failed >= *job.Spec.BackoffLimit)
+		(failed > *job.Spec.BackoffLimit)
 
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
diff --git a/pkg/controller/job/job_controller_test.go b/pkg/controller/job/job_controller_test.go
index ebf3041a715..62c5ad611e1 100644
--- a/pkg/controller/job/job_controller_test.go
+++ b/pkg/controller/job/job_controller_test.go
@@ -1552,23 +1552,23 @@ func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
 		},
 		"not enough failures with backoffLimit 1 - single pod": {
 			1, 1, 1,
-			v1.PodPending, 1, 0,
-			false, true, 1, 0, 0, nil, "",
+			"", 0, 1,
+			true, false, 1, 0, 1, nil, "",
 		},
 		"too many failures with backoffLimit 1 - single pod": {
 			1, 1, 1,
-			"", 0, 1,
-			false, true, 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded",
+			"", 0, 2,
+			false, true, 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded",
 		},
 		"not enough failures with backoffLimit 6 - multiple pods": {
 			2, 2, 6,
-			v1.PodRunning, 1, 5,
-			true, false, 2, 0, 5, nil, "",
+			v1.PodRunning, 1, 6,
+			true, false, 2, 0, 6, nil, "",
 		},
 		"too many failures with backoffLimit 6 - multiple pods": {
 			2, 2, 6,
-			"", 0, 6,
-			false, true, 0, 0, 6, &jobConditionFailed, "BackoffLimitExceeded",
+			"", 0, 7,
+			false, true, 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded",
 		},
 	}
diff --git a/test/e2e/apps/job.go b/test/e2e/apps/job.go
index 9c9c23fe6ea..352da6907b1 100644
--- a/test/e2e/apps/job.go
+++ b/test/e2e/apps/job.go
@@ -246,13 +246,7 @@ var _ = SIGDescribe("Job", func() {
 		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
 		pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
 		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
-		// gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
-		// due to NumRequeus not being stable enough, especially with failed status
-		// updates we need to allow more than backoff+1
-		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
-		if len(pods.Items) < backoff+1 {
-			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
-		}
+		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
 		for _, pod := range pods.Items {
 			framework.ExpectEqual(pod.Status.Phase, v1.PodFailed)
 		}
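For readers skimming the diff, here is a minimal, standalone Go sketch (not part of the patch) of the boundary this one-character change moves. The `exceedsBackoffLimit` helper and the hard-coded inputs are illustrative assumptions only; the real check in `syncJob` additionally gates on `jobHaveNewFailure` and `active != *job.Spec.Parallelism`.

```go
// Illustrative only: isolates just the comparison touched by this patch.
package main

import "fmt"

// exceedsBackoffLimit is a hypothetical helper, not a function in the
// controller. With the new ">" comparison, the Job is considered past its
// limit only when the failed-pod count is strictly greater than
// spec.backoffLimit.
func exceedsBackoffLimit(failed, backoffLimit int32) bool {
	return failed > backoffLimit
}

func main() {
	backoffLimit := int32(6)
	for failed := int32(5); failed <= 8; failed++ {
		fmt.Printf("failed=%d backoffLimit=%d exceeds=%v\n",
			failed, backoffLimit, exceedsBackoffLimit(failed, backoffLimit))
	}
	// With the old ">=" check, failed=6 would already have counted as
	// exceeding the limit; with ">" it does not, which is why the test
	// fixtures above shift their failed counts (1 -> 2 and 6 -> 7) to
	// trigger the BackoffLimitExceeded condition.
}
```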