diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index 42c5aa75a47..9ab4678ee71 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -1031,6 +1031,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) { needsFlush = true } + podFailureCountByPolicyAction := map[string]int{} for _, pod := range pods { if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) { continue @@ -1061,7 +1062,10 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job ix := getCompletionIndex(pod.Annotations) if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) { if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil { - _, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod) + _, countFailed, action := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod) + if action != nil { + podFailureCountByPolicyAction[string(*action)] += 1 + } if countFailed { needsFlush = true uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID) @@ -1102,7 +1106,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job } } var err error - if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil { + if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush); err != nil { return err } jobFinished := jm.enactJobFinished(job, finishedCond) @@ -1132,7 +1136,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job // // Returns whether there are pending changes in the Job status that need to be // flushed in subsequent calls. 
-func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, needsFlush bool) (*batch.Job, bool, error) { +func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool) (*batch.Job, bool, error) { var err error if needsFlush { if job, err = jm.updateStatusHandler(ctx, job); err != nil { @@ -1143,6 +1147,8 @@ func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *oldCounters = job.Status needsFlush = false } + recordJobPodFailurePolicyActions(job, podFailureCountByPolicyAction) + jobKey, err := controller.KeyFunc(job) if err != nil { return job, needsFlush, fmt.Errorf("getting job key: %w", err) @@ -1263,10 +1269,10 @@ func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobC jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached") } jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed") - metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc() + metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc() } else { jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message) - metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc() + metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc() } return true } @@ -1345,7 +1351,7 @@ func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *s } for _, p := range pods { if isPodFailed(p, uncounted != nil) { - jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p) + jobFailureMessage, _, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p) if jobFailureMessage != nil { return jobFailureMessage } @@ -1369,7 +1375,7 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod if !isPodFailed(p, uncounted != nil) { return false } - _, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p) + _, countFailed, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p) return countFailed } else { return isPodFailed(p, uncounted != nil) @@ -1768,6 +1774,12 @@ func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) { metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff)) } +func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) { + for action, count := range podFailureCountByPolicyAction { + metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count)) + } +} + func countReadyPods(pods []*v1.Pod) int32 { cnt := int32(0) for _, p := range pods { diff --git a/pkg/controller/job/metrics/metrics.go b/pkg/controller/job/metrics/metrics.go index 0c0fb0cbd74..87f042c7d0d 100644 --- a/pkg/controller/job/metrics/metrics.go +++ b/pkg/controller/job/metrics/metrics.go @@ -55,10 +55,12 @@ var ( }, []string{"completion_mode", "result", "action"}, ) - // JobFinishedNum tracks the number of Jobs that finish. Possible label - // values: + // JobFinishedNum tracks the number of Jobs that finish. Empty reason label + // is used to count successful jobs. 
+ // Possible label values: // completion_mode: Indexed, NonIndexed // result: failed, succeeded + // reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "" JobFinishedNum = metrics.NewCounterVec( &metrics.CounterOpts{ Subsystem: JobControllerSubsystem, @@ -66,7 +68,7 @@ var ( Help: "The number of finished job", StabilityLevel: metrics.ALPHA, }, - []string{"completion_mode", "result"}, + []string{"completion_mode", "result", "reason"}, ) // JobPodsFinished records the number of finished Pods that the job controller @@ -84,6 +86,22 @@ var ( }, []string{"completion_mode", "result"}) + // PodFailuresHandledByFailurePolicy records the number of finished Pods + // handled by pod failure policy. + // Possible label values: + // action: FailJob, Ignore, Count + PodFailuresHandledByFailurePolicy = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: JobControllerSubsystem, + Name: "pod_failures_handled_by_failure_policy_total", + Help: `The number of failed Pods handled by failure policy with + respect to the failure policy action applied based on the matched + rule. Possible values of the action label correspond to the + possible values for the failure policy rule action, which are: + "FailJob", "Ignore" and "Count".`, + }, + []string{"action"}) + // TerminatedPodsWithTrackingFinalizer records the addition and removal of // terminated pods that have the finalizer batch.kubernetes.io/job-tracking, // regardless of whether they are owned by a Job. @@ -137,6 +155,7 @@ func Register() { legacyregistry.MustRegister(JobSyncNum) legacyregistry.MustRegister(JobFinishedNum) legacyregistry.MustRegister(JobPodsFinished) + legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy) legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal) }) } diff --git a/pkg/controller/job/pod_failure_policy.go b/pkg/controller/job/pod_failure_policy.go index 642157882f1..a49030a4c3a 100644 --- a/pkg/controller/job/pod_failure_policy.go +++ b/pkg/controller/job/pod_failure_policy.go @@ -26,42 +26,46 @@ import ( // matchPodFailurePolicy returns information about matching a given failed pod // against the pod failure policy rules. The information is represented as an // optional job failure message (present in case the pod matched a 'FailJob' -// rule) and a boolean indicating if the failure should be counted towards -// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule). -func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) { +// rule), a boolean indicating if the failure should be counted towards +// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule), +// and a pointer to the matched pod failure policy action. 
+func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) { if podFailurePolicy == nil { - return nil, true + return nil, true, nil } + ignore := batch.PodFailurePolicyActionIgnore + failJob := batch.PodFailurePolicyActionFailJob + count := batch.PodFailurePolicyActionCount for index, podFailurePolicyRule := range podFailurePolicy.Rules { if podFailurePolicyRule.OnExitCodes != nil { if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil { switch podFailurePolicyRule.Action { case batch.PodFailurePolicyActionIgnore: - return nil, false + return nil, false, &ignore case batch.PodFailurePolicyActionCount: - return nil, true + return nil, true, &count case batch.PodFailurePolicyActionFailJob: msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d", containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index) - return &msg, true + return &msg, true, &failJob } } } else if podFailurePolicyRule.OnPodConditions != nil { if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil { switch podFailurePolicyRule.Action { case batch.PodFailurePolicyActionIgnore: - return nil, false + return nil, false, &ignore case batch.PodFailurePolicyActionCount: - return nil, true + return nil, true, &count case batch.PodFailurePolicyActionFailJob: msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d", failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index) - return &msg, true + return &msg, true, &failJob } } } } - return nil, true + return nil, true, nil } func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus { diff --git a/pkg/controller/job/pod_failure_policy_test.go b/pkg/controller/job/pod_failure_policy_test.go index d3c40323c20..3d872b48d29 100644 --- a/pkg/controller/job/pod_failure_policy_test.go +++ b/pkg/controller/job/pod_failure_policy_test.go @@ -19,6 +19,7 @@ package job import ( "testing" + "github.com/google/go-cmp/cmp" batch "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -31,12 +32,16 @@ func TestMatchPodFailurePolicy(t *testing.T) { Namespace: "default", Name: "mypod", } + ignore := batch.PodFailurePolicyActionIgnore + failJob := batch.PodFailurePolicyActionFailJob + count := batch.PodFailurePolicyActionCount testCases := map[string]struct { podFailurePolicy *batch.PodFailurePolicy failedPod *v1.Pod wantJobFailureMessage *string wantCountFailed bool + wantAction *batch.PodFailurePolicyAction }{ "unknown action for rule matching by exit codes - skip rule with unknown action": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -75,6 +80,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"), wantCountFailed: true, + wantAction: &failJob, }, "unknown action for rule matching by pod conditions - skip rule with unknown action": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -113,6 +119,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: false, + wantAction: &ignore, }, "unknown operator - rule with unknown action is skipped 
for onExitCodes": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -151,6 +158,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"), wantCountFailed: true, + wantAction: &failJob, }, "no policy rules": { podFailurePolicy: nil, @@ -201,6 +209,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: false, + wantAction: &ignore, }, "FailJob rule matched for exit codes": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -232,6 +241,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"), wantCountFailed: true, + wantAction: &failJob, }, "successful containers are skipped by the rules": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -320,6 +330,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"), wantCountFailed: true, + wantAction: &failJob, }, "second jobfail rule matched for exit codes": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -358,6 +369,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"), wantCountFailed: true, + wantAction: &failJob, }, "count rule matched for exit codes": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -389,6 +401,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: true, + wantAction: &count, }, "ignore rule matched for pod conditions": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -418,6 +431,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: false, + wantAction: &ignore, }, "ignore rule matches by the status=False": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -447,6 +461,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: false, + wantAction: &ignore, }, "ignore rule matches by the status=Unknown": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -476,6 +491,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: false, + wantAction: &ignore, }, "ignore rule does not match when status for pattern is False, but actual True": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -592,6 +608,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"), wantCountFailed: true, + wantAction: &failJob, }, "count rule matched for pod conditions": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -621,6 +638,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: true, + wantAction: &count, }, "no rule matched": { podFailurePolicy: &batch.PodFailurePolicy{ @@ -683,25 +701,21 @@ func TestMatchPodFailurePolicy(t *testing.T) { }, wantJobFailureMessage: nil, wantCountFailed: true, + wantAction: &count, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod) - if tc.wantJobFailureMessage == nil { - if 
jobFailMessage != nil { - t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage) - } - } else { - if jobFailMessage == nil { - t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage) - } else if *tc.wantJobFailureMessage != *jobFailMessage { - t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage) - } + jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod) + if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" { + t.Errorf("Unexpected job failure message: %s", diff) } if tc.wantCountFailed != countFailed { t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed) } + if diff := cmp.Diff(tc.wantAction, action); diff != "" { + t.Errorf("Unexpected failure policy action: %s", diff) + } }) } } diff --git a/test/integration/job/job_test.go b/test/integration/job/job_test.go index f3ba4314b7a..89fcf711036 100644 --- a/test/integration/job/job_test.go +++ b/test/integration/job/job_test.go @@ -68,7 +68,7 @@ type metricLabelsWithValue struct { Value int } -func TestMetrics(t *testing.T) { +func TestMetricsOnSuccesses(t *testing.T) { nonIndexedCompletion := batchv1.NonIndexedCompletion indexedCompletion := batchv1.IndexedCompletion wFinalizers := true @@ -94,7 +94,7 @@ func TestMetrics(t *testing.T) { }, }, wantJobFinishedNumMetric: metricLabelsWithValue{ - Labels: []string{"NonIndexed", "succeeded"}, + Labels: []string{"NonIndexed", "succeeded", ""}, Value: 1, }, wantJobPodsFinishedMetric: metricLabelsWithValue{ @@ -111,7 +111,7 @@ func TestMetrics(t *testing.T) { }, }, wantJobFinishedNumMetric: metricLabelsWithValue{ - Labels: []string{"Indexed", "succeeded"}, + Labels: []string{"Indexed", "succeeded", ""}, Value: 1, }, wantJobPodsFinishedMetric: metricLabelsWithValue{ @@ -149,6 +149,146 @@ func TestMetrics(t *testing.T) { } } +func TestJobFinishedNumReasonMetric(t *testing.T) { + wFinalizers := true + // setup the job controller + closeFn, restConfig, clientSet, ns := setup(t, "simple") + defer closeFn() + ctx, cancel := startJobControllerAndWaitForCaches(restConfig) + defer cancel() + + testCases := map[string]struct { + job batchv1.Job + podStatus v1.PodStatus + enableJobPodFailurePolicy bool + wantJobFinishedNumMetric metricLabelsWithValue + }{ + "non-indexed job; failed pod handled by FailJob action; JobPodFailurePolicy enabled": { + enableJobPodFailurePolicy: true, + job: batchv1.Job{ + Spec: batchv1.JobSpec{ + Completions: pointer.Int32(1), + Parallelism: pointer.Int32(1), + BackoffLimit: pointer.Int32(1), + PodFailurePolicy: &batchv1.PodFailurePolicy{ + Rules: []batchv1.PodFailurePolicyRule{ + { + Action: batchv1.PodFailurePolicyActionFailJob, + OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ + Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, + Values: []int32{5}, + }, + }, + }, + }, + }, + }, + podStatus: v1.PodStatus{ + Phase: v1.PodFailed, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 5, + }, + }, + }, + }, + }, + wantJobFinishedNumMetric: metricLabelsWithValue{ + Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"}, + Value: 1, + }, + }, + "non-indexed job; failed pod handled by Count action; JobPodFailurePolicy enabled": { + enableJobPodFailurePolicy: true, + job: batchv1.Job{ + Spec: batchv1.JobSpec{ + Completions: pointer.Int32(1), + Parallelism: pointer.Int32(1), + BackoffLimit: pointer.Int32(0), + 
PodFailurePolicy: &batchv1.PodFailurePolicy{ + Rules: []batchv1.PodFailurePolicyRule{ + { + Action: batchv1.PodFailurePolicyActionCount, + OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ + Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, + Values: []int32{5}, + }, + }, + }, + }, + }, + }, + podStatus: v1.PodStatus{ + Phase: v1.PodFailed, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 5, + }, + }, + }, + }, + }, + wantJobFinishedNumMetric: metricLabelsWithValue{ + Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"}, + Value: 1, + }, + }, + "non-indexed job; failed": { + job: batchv1.Job{ + Spec: batchv1.JobSpec{ + Completions: pointer.Int32(1), + Parallelism: pointer.Int32(1), + BackoffLimit: pointer.Int32(0), + }, + }, + podStatus: v1.PodStatus{ + Phase: v1.PodFailed, + }, + wantJobFinishedNumMetric: metricLabelsWithValue{ + Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"}, + Value: 1, + }, + }, + } + job_index := 0 // job index to avoid collisions between job names created by different test cases + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)() + defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() + resetMetrics() + // create a single job and wait for its completion + job := tc.job.DeepCopy() + job.Name = fmt.Sprintf("job-%v", job_index) + job_index++ + jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, job) + if err != nil { + t.Fatalf("Failed to create Job: %v", err) + } + validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ + Active: int(*jobObj.Spec.Parallelism), + Ready: pointer.Int32(0), + }, wFinalizers) + + op := func(p *v1.Pod) bool { + p.Status = tc.podStatus + return true + } + if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil { + t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name) + } + + validateJobFailed(ctx, t, clientSet, jobObj) + + // verify metric values after the job is finished + validateCounterMetric(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric) + }) + } +} + func validateCounterMetric(t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) { t.Helper() var cmpErr error @@ -351,6 +491,13 @@ func TestJobPodFailurePolicy(t *testing.T) { }, }, }, + { + Action: batchv1.PodFailurePolicyActionCount, + OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ + Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, + Values: []int32{10}, + }, + }, { Action: batchv1.PodFailurePolicyActionFailJob, OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ @@ -375,6 +522,19 @@ func TestJobPodFailurePolicy(t *testing.T) { }, }, } + podStatusMatchingOnExitCodesCountRule := v1.PodStatus{ + Phase: v1.PodFailed, + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "main-container", + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 10, + }, + }, + }, + }, + } podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{ Phase: v1.PodFailed, Conditions: []v1.PodCondition{ @@ -384,14 +544,18 @@ func TestJobPodFailurePolicy(t *testing.T) { }, }, } + podStatusNotMatchingAnyRule := v1.PodStatus{ + Phase: v1.PodFailed, + } testCases := map[string]struct { - 
enableJobPodFailurePolicy bool - restartController bool - job batchv1.Job - podStatus v1.PodStatus - wantActive int - wantFailed int - wantJobConditionType batchv1.JobConditionType + enableJobPodFailurePolicy bool + restartController bool + job batchv1.Job + podStatus v1.PodStatus + wantActive int + wantFailed int + wantJobConditionType batchv1.JobConditionType + wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue }{ "pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": { enableJobPodFailurePolicy: true, @@ -400,6 +564,10 @@ func TestJobPodFailurePolicy(t *testing.T) { wantActive: 0, wantFailed: 1, wantJobConditionType: batchv1.JobFailed, + wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ + Labels: []string{"FailJob"}, + Value: 1, + }, }, "pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": { enableJobPodFailurePolicy: true, @@ -425,11 +593,40 @@ func TestJobPodFailurePolicy(t *testing.T) { wantActive: 1, wantFailed: 0, wantJobConditionType: batchv1.JobComplete, + wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ + Labels: []string{"Ignore"}, + Value: 1, + }, + }, + "pod status matching the configured Count rule on exit codes; pod failure counted when JobPodFailurePolicy enabled": { + enableJobPodFailurePolicy: true, + job: job, + podStatus: podStatusMatchingOnExitCodesCountRule, + wantActive: 1, + wantFailed: 1, + wantJobConditionType: batchv1.JobComplete, + wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ + Labels: []string{"Count"}, + Value: 1, + }, + }, + "pod status non-matching any configured rule; pod failure counted when JobPodFailurePolicy enabled": { + enableJobPodFailurePolicy: true, + job: job, + podStatus: podStatusNotMatchingAnyRule, + wantActive: 1, + wantFailed: 1, + wantJobConditionType: batchv1.JobComplete, + wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ + Labels: []string{"Count"}, + Value: 0, + }, }, } for name, test := range testCases { for _, wFinalizers := range []bool{false, true} { t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) { + resetMetrics() defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)() defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)() @@ -475,6 +672,9 @@ func TestJobPodFailurePolicy(t *testing.T) { } } validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType) + if wFinalizers && test.wantPodFailuresHandledByPolicyRuleMetric != nil { + validateCounterMetric(t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric) + } validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) }) } @@ -1699,6 +1899,7 @@ func resetMetrics() { metrics.TerminatedPodsTrackingFinalizerTotal.Reset() metrics.JobFinishedNum.Reset() metrics.JobPodsFinished.Reset() + metrics.PodFailuresHandledByFailurePolicy.Reset() } func createJobControllerWithSharedInformers(restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
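Reviewer note, not part of the patch: a minimal, self-contained Go sketch of the counting pattern this change introduces. It mirrors how trackJobStatusAndRemoveFinalizers aggregates the action returned by matchPodFailurePolicy into podFailureCountByPolicyAction, and how recordJobPodFailurePolicyActions turns those counts into increments of pod_failures_handled_by_failure_policy_total. The pod data is made up and the Printf only stands in for the metric's Add call; the real code uses the controller's internal types and metrics package.

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
)

// actionPtr is an illustration-only helper to take the address of an action constant,
// matching the *batch.PodFailurePolicyAction return value added by this change.
func actionPtr(a batchv1.PodFailurePolicyAction) *batchv1.PodFailurePolicyAction { return &a }

func main() {
	// Hypothetical results of matchPodFailurePolicy for three failed pods handled in one sync.
	matchedActions := []*batchv1.PodFailurePolicyAction{
		actionPtr(batchv1.PodFailurePolicyActionIgnore),
		actionPtr(batchv1.PodFailurePolicyActionCount),
		nil, // no rule matched: nothing is recorded for this pod
	}

	// Same aggregation as podFailureCountByPolicyAction in trackJobStatusAndRemoveFinalizers.
	podFailureCountByPolicyAction := map[string]int{}
	for _, action := range matchedActions {
		if action != nil {
			podFailureCountByPolicyAction[string(*action)]++
		}
	}

	// Same shape as recordJobPodFailurePolicyActions: one increment per action label.
	// In the controller this is metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count)).
	for action, count := range podFailureCountByPolicyAction {
		fmt.Printf("pod_failures_handled_by_failure_policy_total{action=%q} += %d\n", action, count)
	}
}

Running this prints one line per matched action ("Ignore" and "Count" here), which is exactly the cardinality the new counter vector exposes; pods whose failure matches no rule fall back to the existing backoffLimit accounting and do not touch this metric.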