mirror of
https://github.com/k3s-io/kubernetes.git
synced 2026-01-13 11:25:19 +00:00
Improves backoff policy in JobController
issues: https://github.com/kubernetes/kubernetes/issues/56853 Add check if the number of pods succeeded increased since the last check. If yes the backoff delay is cleared. This logic improves the Job backoff policy when parallelism > 1 and few pods's Job failed but others succeed.
This commit is contained in:
@@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) {
|
||||
}
|
||||
|
||||
forget := false
|
||||
// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
|
||||
// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
|
||||
// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
|
||||
// In this case, we should clear the backoff delay.
|
||||
if job.Status.Succeeded < succeeded {
|
||||
forget = true
|
||||
}
|
||||
|
||||
// no need to update the job if the status hasn't changed since last time
|
||||
if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
|
||||
job.Status.Active = active
|
||||
@@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) {
|
||||
job.Status.Failed = failed
|
||||
|
||||
if err := jm.updateHandler(&job); err != nil {
|
||||
return false, err
|
||||
return forget, err
|
||||
}
|
||||
|
||||
if jobHaveNewFailure && !IsJobFinished(&job) {
|
||||
// returning an error will re-enqueue Job after the backoff period
|
||||
return false, fmt.Errorf("failed pod(s) detected for job key %q", key)
|
||||
return forget, fmt.Errorf("failed pod(s) detected for job key %q", key)
|
||||
}
|
||||
|
||||
forget = true
|
||||
|
||||
Reference in New Issue
Block a user