Improves backoff policy in JobController

issues: https://github.com/kubernetes/kubernetes/issues/56853

Add check if the number of pods succeeded increased since the last
check. If yes the backoff delay is cleared. This logic improves the Job
backoff policy when parallelism > 1 and few pods's Job failed but others
succeed.
This commit is contained in:
cedric lamoriniere 2018-02-05 10:06:19 +01:00
parent 551f73d324
commit c6e8bd62ad
2 changed files with 17 additions and 4 deletions

View File

@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) {
} }
forget := false forget := false
// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
// In this case, we should clear the backoff delay.
if job.Status.Succeeded < succeeded {
forget = true
}
// no need to update the job if the status hasn't changed since last time // no need to update the job if the status hasn't changed since last time
if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions { if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
job.Status.Active = active job.Status.Active = active
@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) {
job.Status.Failed = failed job.Status.Failed = failed
if err := jm.updateHandler(&job); err != nil { if err := jm.updateHandler(&job); err != nil {
return false, err return forget, err
} }
if jobHaveNewFailure && !IsJobFinished(&job) { if jobHaveNewFailure && !IsJobFinished(&job) {
// returning an error will re-enqueue Job after the backoff period // returning an error will re-enqueue Job after the backoff period
return false, fmt.Errorf("failed pod(s) detected for job key %q", key) return forget, fmt.Errorf("failed pod(s) detected for job key %q", key)
} }
forget = true forget = true

View File

@ -218,11 +218,16 @@ func TestControllerSyncJob(t *testing.T) {
fmt.Errorf("Fake error"), true, 0, 3, 0, 0, fmt.Errorf("Fake error"), true, 0, 3, 0, 0,
0, 1, 3, 0, 0, nil, "", 0, 1, 3, 0, 0, nil, "",
}, },
"failed pod": { "failed + succeed pods: reset backoff delay": {
2, 5, 6, false, 0, 2, 5, 6, false, 0,
fmt.Errorf("Fake error"), false, 0, 1, 1, 1, fmt.Errorf("Fake error"), true, 0, 1, 1, 1,
1, 0, 1, 1, 1, nil, "", 1, 0, 1, 1, 1, nil, "",
}, },
"only new failed pod": {
2, 5, 6, false, 0,
fmt.Errorf("Fake error"), false, 0, 1, 0, 1,
1, 0, 1, 0, 1, nil, "",
},
"job finish": { "job finish": {
2, 5, 6, false, 0, 2, 5, 6, false, 0,
nil, true, 0, 0, 5, 0, nil, true, 0, 0, 5, 0,