From f7a1fb76f4fa410c01fe382ae9060f2079bb9f7d Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor Date: Fri, 7 Jul 2023 14:08:19 -0400 Subject: [PATCH] Only declare job as finished after removing all finalizers Change-Id: Id4b01b0e6fabe24134e57e687356e0fc613cead4 --- pkg/controller/job/job_controller.go | 11 ++++------- test/integration/job/job_test.go | 3 +++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index 49c186ecc8a..4205ae4c16b 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -792,12 +792,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { var manageJobErr error var finishedCondition *batch.JobCondition - jobHasNewFailure := failed > job.Status.Failed - // new failures happen when status does not reflect the failures and active - // is different than parallelism, otherwise the previous controller loop - // failed updating status so even if we pick up failure it is not a new one - exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) && - (failed > *job.Spec.BackoffLimit) + exceedsBackoffLimit := failed > *job.Spec.BackoffLimit if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil { @@ -999,6 +994,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job needsFlush = true } podFailureCountByPolicyAction := map[string]int{} + reachedMaxUncountedPods := false for _, pod := range pods { if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) { // This pod was processed in a previous sync. @@ -1049,6 +1045,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job // // The job will be synced again because the Job status and Pod updates // will put the Job back to the work queue. + reachedMaxUncountedPods = true break } } @@ -1077,7 +1074,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil { return err } - jobFinished := jm.enactJobFinished(job, finishedCond) + jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond) if jobFinished { needsFlush = true } diff --git a/test/integration/job/job_test.go b/test/integration/job/job_test.go index 215c369a082..f3cfeebbf57 100644 --- a/test/integration/job/job_test.go +++ b/test/integration/job/job_test.go @@ -1341,6 +1341,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) { } func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) { + // Set a maximum number of uncounted pods below parallelism, to ensure it + // doesn't affect the termination of pods. + t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50)) closeFn, restConfig, clientSet, ns := setup(t, "simple") defer closeFn() ctx, cancel := startJobControllerAndWaitForCaches(restConfig)