From f7a1fb76f4fa410c01fe382ae9060f2079bb9f7d Mon Sep 17 00:00:00 2001
From: Aldo Culquicondor <acondor@google.com>
Date: Fri, 7 Jul 2023 14:08:19 -0400
Subject: [PATCH] Only declare job as finished after removing all finalizers

Change-Id: Id4b01b0e6fabe24134e57e687356e0fc613cead4
---
 pkg/controller/job/job_controller.go | 11 ++++-------
 test/integration/job/job_test.go     |  3 +++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go
index 49c186ecc8a..4205ae4c16b 100644
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -792,12 +792,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 	var manageJobErr error
 	var finishedCondition *batch.JobCondition
 
-	jobHasNewFailure := failed > job.Status.Failed
-	// new failures happen when status does not reflect the failures and active
-	// is different than parallelism, otherwise the previous controller loop
-	// failed updating status so even if we pick up failure it is not a new one
-	exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
-		(failed > *job.Spec.BackoffLimit)
+	exceedsBackoffLimit := failed > *job.Spec.BackoffLimit
 
 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
 		if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
@@ -999,6 +994,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 		needsFlush = true
 	}
 	podFailureCountByPolicyAction := map[string]int{}
+	reachedMaxUncountedPods := false
 	for _, pod := range pods {
 		if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) {
 			// This pod was processed in a previous sync.
@@ -1049,6 +1045,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 			//
 			// The job will be synced again because the Job status and Pod updates
 			// will put the Job back to the work queue.
+			reachedMaxUncountedPods = true
 			break
 		}
 	}
@@ -1077,7 +1074,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil {
 		return err
 	}
-	jobFinished := jm.enactJobFinished(job, finishedCond)
+	jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond)
 	if jobFinished {
 		needsFlush = true
 	}
diff --git a/test/integration/job/job_test.go b/test/integration/job/job_test.go
index 215c369a082..f3cfeebbf57 100644
--- a/test/integration/job/job_test.go
+++ b/test/integration/job/job_test.go
@@ -1341,6 +1341,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
 }
 
 func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) {
+	// Set the maximum number of uncounted pods below the Job's parallelism, to
+	// ensure that reaching this limit doesn't affect the termination of pods.
+	t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50))
 	closeFn, restConfig, clientSet, ns := setup(t, "simple")
 	defer closeFn()
 	ctx, cancel := startJobControllerAndWaitForCaches(restConfig)