From 35d0af9243cb38e44bcd932db96746a848c441a8 Mon Sep 17 00:00:00 2001 From: Michal Wozniak Date: Wed, 19 Jul 2023 12:38:54 +0200 Subject: [PATCH] Include ignored pods when computing backoff delay for Job pod failures --- pkg/controller/job/job_controller.go | 25 +++++++----- pkg/controller/job/job_controller_test.go | 47 +++++++++++++++++++++++ 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index 078ef2a7e6c..f6bce071742 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -794,7 +794,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { active := int32(len(jobCtx.activePods)) newSucceededPods, newFailedPods := getNewFinishedPods(jobCtx) jobCtx.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(jobCtx.uncounted.succeeded)) - failed := job.Status.Failed + int32(len(newFailedPods)) + int32(len(jobCtx.uncounted.failed)) + failed := job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed)) var ready *int32 if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) { ready = pointer.Int32(countReadyPods(jobCtx.activePods)) @@ -951,6 +951,19 @@ func (jm *Controller) deleteActivePods(ctx context.Context, job *batch.Job, pods return successfulDeletes, errorFromChannel(errCh) } +func nonIgnoredFailedPodsCount(jobCtx *syncJobCtx, failedPods []*v1.Pod) int { + result := len(failedPods) + if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { + for _, p := range failedPods { + _, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p) + if !countFailed { + result-- + } + } + } + return result +} + // deleteJobPods deletes the pods, returns the number of successful removals // and any error. func (jm *Controller) deleteJobPods(ctx context.Context, job *batch.Job, jobKey string, pods []*v1.Pod) (int32, error) { @@ -1406,15 +1419,7 @@ func getNewFinishedPods(jobCtx *syncJobCtx) (succeededPods, failedPods []*v1.Pod return p.Status.Phase == v1.PodSucceeded }) failedPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Failed(), func(p *v1.Pod) bool { - if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { - if !isPodFailed(p, jobCtx.job) { - return false - } - _, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p) - return countFailed - } else { - return isPodFailed(p, jobCtx.job) - } + return isPodFailed(p, jobCtx.job) }) return succeededPods, failedPods } diff --git a/pkg/controller/job/job_controller_test.go b/pkg/controller/job/job_controller_test.go index e533300e3de..5b1f0338265 100644 --- a/pkg/controller/job/job_controller_test.go +++ b/pkg/controller/job/job_controller_test.go @@ -3019,6 +3019,53 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) { wantStatusFailed: 0, wantStatusSucceeded: 0, }, + "ignore pod failure based on OnPodConditions, ignored failures delays pod recreation": { + enableJobPodFailurePolicy: true, + job: batch.Job{ + TypeMeta: metav1.TypeMeta{Kind: "Job"}, + ObjectMeta: validObjectMeta, + Spec: batch.JobSpec{ + Selector: validSelector, + Template: validTemplate, + Parallelism: pointer.Int32(1), + Completions: pointer.Int32(1), + BackoffLimit: pointer.Int32(0), + PodFailurePolicy: &batch.PodFailurePolicy{ + Rules: []batch.PodFailurePolicyRule{ + { + Action: batch.PodFailurePolicyActionIgnore, + OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ + { + Type: v1.DisruptionTarget, + Status: v1.ConditionTrue, + }, + }, + }, + }, + }, + }, + }, + pods: []v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + DeletionTimestamp: &now, + }, + Status: v1.PodStatus{ + Phase: v1.PodFailed, + Conditions: []v1.PodCondition{ + { + Type: v1.DisruptionTarget, + Status: v1.ConditionTrue, + }, + }, + }, + }, + }, + wantConditions: nil, + wantStatusActive: 0, + wantStatusFailed: 0, + wantStatusSucceeded: 0, + }, "fail job based on OnPodConditions": { enableJobPodFailurePolicy: true, job: batch.Job{