Merge pull request #120175 from kannon92/move-pod-failure-policy-constant

move reasons to api package for job controller
Kubernetes Prow Robot authored 2023-09-20 03:06:00 -07:00, committed by GitHub
commit fd5f36e6a0
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 32 deletions
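With this change, the Job failure-condition reasons the controller writes are exported from k8s.io/api/batch/v1 rather than kept as private strings in the controller package. As a minimal sketch of what that enables for API consumers (assuming a k8s.io/api that includes this commit; the helper jobFailedBecause below is hypothetical and not part of the change):

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// jobFailedBecause reports whether the Job carries a Failed=True condition
// whose Reason matches the given value.
func jobFailedBecause(job *batchv1.Job, reason string) bool {
	for _, c := range job.Status.Conditions {
		if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue && c.Reason == reason {
			return true
		}
	}
	return false
}

func main() {
	job := &batchv1.Job{} // in practice, fetched from the API server
	// Compare against the exported constant instead of the hard-coded string "PodFailurePolicy".
	fmt.Println(jobFailedBecause(job, batchv1.JobReasonPodFailurePolicy))
}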

View File

@@ -56,12 +56,6 @@ import (
"k8s.io/utils/pointer"
)
const (
// PodFailurePolicy reason indicates a job failure condition is added due to
// a failed pod matching a pod failure policy rule
jobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)
// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
@@ -85,6 +79,15 @@ var (
MaxPodCreateDeletePerSync = 500
)
const (
// jobReasonMaxFailedIndexesExceeded indicates that the Job has exceeded its
// specified maximal number of failed indexes.
// https://kep.k8s.io/3850
// Once the feature graduates to Beta, this reason should be moved to the API package in staging.
jobReasonMaxFailedIndexesExceeded string = "MaxFailedIndexesExceeded"
// jobReasonFailedIndexes means the Job has failed indexes.
jobReasonFailedIndexes string = "FailedIndexes"
)
// Controller ensures that all Job objects have corresponding pods to
// run their configured workload.
type Controller struct {
@@ -816,16 +819,16 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
jobCtx.finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now())
} else if failJobMessage := getFailJobMessage(&job, pods); failJobMessage != nil {
// Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed.
jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, batch.JobReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
}
}
if jobCtx.finishedCondition == nil {
if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
// check if the number of pod restarts exceeds backoff (for restart OnFailure only)
// OR if the number of failed pods increased since the last syncJob
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonBackoffLimitExceeded, "Job has reached the specified backoff limit", jm.clock.Now())
} else if jm.pastActiveDeadline(&job) {
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded, "Job was active longer than specified deadline", jm.clock.Now())
} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - jm.clock.Since(job.Status.StartTime.Time)
logger.V(2).Info("Job has activeDeadlineSeconds configuration. Will sync this job again", "key", key, "nextSyncIn", syncDuration)
@@ -840,9 +843,9 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods)
if jobCtx.finishedCondition == nil {
if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) {
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "MaxFailedIndexesExceeded", "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
} else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) {
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "FailedIndexes", "Job has failed indexes", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now())
}
}
jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx)

View File

@@ -1909,7 +1909,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
expectedDeletions: 1,
expectedFailed: 1,
expectedCondition: batch.JobFailed,
expectedConditionReason: "DeadlineExceeded",
expectedConditionReason: batch.JobReasonDeadlineExceeded,
},
"activeDeadlineSeconds bigger than single pod execution": {
parallelism: 1,
@@ -1923,7 +1923,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
expectedSucceeded: 1,
expectedFailed: 1,
expectedCondition: batch.JobFailed,
expectedConditionReason: "DeadlineExceeded",
expectedConditionReason: batch.JobReasonDeadlineExceeded,
},
"activeDeadlineSeconds times-out before any pod starts": {
parallelism: 1,
@@ -1932,7 +1932,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
startTime: 10,
backoffLimit: 6,
expectedCondition: batch.JobFailed,
expectedConditionReason: "DeadlineExceeded",
expectedConditionReason: batch.JobReasonDeadlineExceeded,
},
"activeDeadlineSeconds with backofflimit reach": {
parallelism: 1,
@@ -1942,7 +1942,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
failedPods: 1,
expectedFailed: 1,
expectedCondition: batch.JobFailed,
expectedConditionReason: "BackoffLimitExceeded",
expectedConditionReason: batch.JobReasonBackoffLimitExceeded,
},
"activeDeadlineSeconds is not triggered when Job is suspended": {
suspend: true,
@@ -2098,7 +2098,7 @@ func TestPastDeadlineJobFinished(t *testing.T) {
if err != nil {
return false, nil
}
if getCondition(j, batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded") {
if getCondition(j, batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded) {
if manager.clock.Since(j.Status.StartTime.Time) < time.Duration(*j.Spec.ActiveDeadlineSeconds)*time.Second {
return true, errors.New("Job contains DeadlineExceeded condition earlier than expected")
}
@@ -2397,7 +2397,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2425,7 +2425,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2452,7 +2452,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2480,7 +2480,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2507,7 +2507,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2596,7 +2596,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2642,7 +2642,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2695,7 +2695,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0",
},
},
@@ -2797,7 +2797,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
@@ -2924,7 +2924,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "BackoffLimitExceeded",
Reason: batch.JobReasonBackoffLimitExceeded,
Message: "Job has reached the specified backoff limit",
},
},
@@ -3185,7 +3185,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0",
},
},
@@ -3571,13 +3571,13 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
},
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Reason: batch.JobReasonPodFailurePolicy,
Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
},
},
@@ -3660,7 +3660,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "BackoffLimitExceeded",
Reason: batch.JobReasonBackoffLimitExceeded,
Message: "Job has reached the specified backoff limit",
},
},
@@ -3695,7 +3695,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "FailedIndexes",
Reason: jobReasonFailedIndexes,
Message: "Job has failed indexes",
},
},
@@ -3733,7 +3733,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "MaxFailedIndexesExceeded",
Reason: jobReasonMaxFailedIndexesExceeded,
Message: "Job has exceeded the specified maximal number of failed indexes",
},
},

View File

@@ -535,6 +535,19 @@ const (
JobFailureTarget JobConditionType = "FailureTarget"
)
const (
// JobReasonPodFailurePolicy reason indicates a job failure condition is added due to
// a failed pod matching a pod failure policy rule
// https://kep.k8s.io/3329
// The pod failure policy feature backing this reason is currently in beta.
JobReasonPodFailurePolicy string = "PodFailurePolicy"
// JobReasonBackoffLimitExceeded reason indicates that pods within a job have failed a number of
// times higher than the specified backoffLimit.
JobReasonBackoffLimitExceeded string = "BackoffLimitExceeded"
// JobReasonDeadlineExceeded means the job duration is past activeDeadlineSeconds
JobReasonDeadlineExceeded string = "DeadlineExceeded"
)
// JobCondition describes current state of a job.
type JobCondition struct {
// Type of job condition, Complete or Failed.
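To make the new constants concrete, here is the condition the controller appends when a Job runs past activeDeadlineSeconds, as seen in the syncJob hunk above, written out as a struct literal (the surrounding package, main function, and printing are illustrative only; timestamps are omitted):

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Mirrors newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded, ...)
	// from the controller change above.
	cond := batchv1.JobCondition{
		Type:    batchv1.JobFailed,
		Status:  corev1.ConditionTrue,
		Reason:  batchv1.JobReasonDeadlineExceeded, // previously the literal "DeadlineExceeded"
		Message: "Job was active longer than specified deadline",
	}
	fmt.Printf("%s=%s reason=%s\n", cond.Type, cond.Status, cond.Reason)
}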