From c6e8bd62ad06203cd5f25631f2e266105934a3d8 Mon Sep 17 00:00:00 2001 From: cedric lamoriniere Date: Mon, 5 Feb 2018 10:06:19 +0100 Subject: [PATCH] Improves backoff policy in JobController issues: https://github.com/kubernetes/kubernetes/issues/56853 Add check if the number of pods succeeded increased since the last check. If yes the backoff delay is cleared. This logic improves the Job backoff policy when parallelism > 1 and few pods's Job failed but others succeed. --- pkg/controller/job/job_controller.go | 12 ++++++++++-- pkg/controller/job/job_controller_test.go | 9 +++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index a12cbed1c46..14674a6b898 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) { } forget := false + // Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true + // This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to + // improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed. + // In this case, we should clear the backoff delay. + if job.Status.Succeeded < succeeded { + forget = true + } + // no need to update the job if the status hasn't changed since last time if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions { job.Status.Active = active @@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) { job.Status.Failed = failed if err := jm.updateHandler(&job); err != nil { - return false, err + return forget, err } if jobHaveNewFailure && !IsJobFinished(&job) { // returning an error will re-enqueue Job after the backoff period - return false, fmt.Errorf("failed pod(s) detected for job key %q", key) + return forget, fmt.Errorf("failed pod(s) detected for job key %q", key) } forget = true diff --git a/pkg/controller/job/job_controller_test.go b/pkg/controller/job/job_controller_test.go index 21a2b8f8d25..ec9aabf3fc9 100644 --- a/pkg/controller/job/job_controller_test.go +++ b/pkg/controller/job/job_controller_test.go @@ -218,11 +218,16 @@ func TestControllerSyncJob(t *testing.T) { fmt.Errorf("Fake error"), true, 0, 3, 0, 0, 0, 1, 3, 0, 0, nil, "", }, - "failed pod": { + "failed + succeed pods: reset backoff delay": { 2, 5, 6, false, 0, - fmt.Errorf("Fake error"), false, 0, 1, 1, 1, + fmt.Errorf("Fake error"), true, 0, 1, 1, 1, 1, 0, 1, 1, 1, nil, "", }, + "only new failed pod": { + 2, 5, 6, false, 0, + fmt.Errorf("Fake error"), false, 0, 1, 0, 1, + 1, 0, 1, 0, 1, nil, "", + }, "job finish": { 2, 5, 6, false, 0, nil, true, 0, 0, 5, 0,