Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-19 09:52:49 +00:00)
Extend metrics with the new labels (#113324)

* Extend job metrics
* Refactor TestMetrics to extract its checks into dedicated tests per feature

parent 4d2128b523
commit 3628532311
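In short, the diff below widens the job controller's JobFinishedNum metric from {completion_mode, result} to {completion_mode, result, reason} (the reason label stays empty for successes) and adds a PodFailuresHandledByFailurePolicy counter keyed by the matched rule's action. A minimal sketch of the resulting label sets, using the plain prometheus client as a stand-in for the component-base wrappers the diff uses (metric names mirror the diff; the wiring is illustrative, not the controller's actual code):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

var (
	// Stand-in for job_finished_total: the commit widens the label set
	// from {completion_mode, result} to {completion_mode, result, reason}.
	jobFinished = prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "job_finished_total", Help: "The number of finished job"},
		[]string{"completion_mode", "result", "reason"},
	)
	// Stand-in for the new pod_failures_handled_by_failure_policy_total.
	podFailuresHandled = prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "pod_failures_handled_by_failure_policy_total", Help: "Failed Pods handled by a failure policy"},
		[]string{"action"},
	)
)

func main() {
	// Successful jobs carry an empty reason, mirroring the controller change.
	jobFinished.WithLabelValues("NonIndexed", "succeeded", "").Inc()
	// Failed jobs reuse the finished condition's reason.
	jobFinished.WithLabelValues("Indexed", "failed", "BackoffLimitExceeded").Inc()
	podFailuresHandled.WithLabelValues("Ignore").Inc()

	fmt.Println(testutil.ToFloat64(jobFinished.WithLabelValues("Indexed", "failed", "BackoffLimitExceeded"))) // 1
	fmt.Println(testutil.ToFloat64(podFailuresHandled.WithLabelValues("Ignore")))                             // 1
}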
@@ -1031,6 +1031,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 	if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
 		needsFlush = true
 	}
+	podFailureCountByPolicyAction := map[string]int{}
 	for _, pod := range pods {
 		if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) {
 			continue
@@ -1061,7 +1062,10 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 			ix := getCompletionIndex(pod.Annotations)
 			if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) {
 				if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
-					_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
+					_, countFailed, action := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
+					if action != nil {
+						podFailureCountByPolicyAction[string(*action)] += 1
+					}
 					if countFailed {
 						needsFlush = true
 						uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
@@ -1102,7 +1106,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 		}
 	}
 	var err error
-	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
+	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush); err != nil {
 		return err
 	}
 	jobFinished := jm.enactJobFinished(job, finishedCond)
@@ -1132,7 +1136,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 //
 // Returns whether there are pending changes in the Job status that need to be
 // flushed in subsequent calls.
-func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, needsFlush bool) (*batch.Job, bool, error) {
+func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool) (*batch.Job, bool, error) {
 	var err error
 	if needsFlush {
 		if job, err = jm.updateStatusHandler(ctx, job); err != nil {
@@ -1143,6 +1147,8 @@ func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job
 		*oldCounters = job.Status
 		needsFlush = false
 	}
+	recordJobPodFailurePolicyActions(job, podFailureCountByPolicyAction)
+
 	jobKey, err := controller.KeyFunc(job)
 	if err != nil {
 		return job, needsFlush, fmt.Errorf("getting job key: %w", err)
@@ -1263,10 +1269,10 @@ func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobC
 			jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
 		}
 		jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
-		metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
+		metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc()
 	} else {
 		jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message)
-		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
+		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc()
 	}
 	return true
 }
@@ -1345,7 +1351,7 @@ func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *s
 	}
 	for _, p := range pods {
 		if isPodFailed(p, uncounted != nil) {
-			jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
+			jobFailureMessage, _, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
 			if jobFailureMessage != nil {
 				return jobFailureMessage
 			}
@@ -1369,7 +1375,7 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod
 			if !isPodFailed(p, uncounted != nil) {
 				return false
 			}
-			_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
+			_, countFailed, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
 			return countFailed
 		} else {
 			return isPodFailed(p, uncounted != nil)
@@ -1768,6 +1774,12 @@ func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
 	metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
 }
 
+func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) {
+	for action, count := range podFailureCountByPolicyAction {
+		metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count))
+	}
+}
+
 func countReadyPods(pods []*v1.Pod) int32 {
 	cnt := int32(0)
 	for _, p := range pods {
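The hunks above implement a tally-then-record pattern: matches are counted per action in a plain map while the controller iterates pods, and the counter is written in one pass after the Job status flush. A self-contained sketch of that pattern (the Printf stands in for the metric write; the action values mirror the diff):

package main

import "fmt"

func main() {
	// Tally during the pod loop, as in trackJobStatusAndRemoveFinalizers.
	podFailureCountByPolicyAction := map[string]int{}
	for _, action := range []string{"Ignore", "Count", "Count"} { // pretend three pods matched rules
		podFailureCountByPolicyAction[action] += 1
	}

	// Record once after the status flush, as in recordJobPodFailurePolicyActions.
	for action, count := range podFailureCountByPolicyAction {
		fmt.Printf("pod_failures_handled_by_failure_policy_total{action=%q} += %d\n", action, count)
	}
}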
@@ -55,10 +55,12 @@ var (
 		},
 		[]string{"completion_mode", "result", "action"},
 	)
 
-	// JobFinishedNum tracks the number of Jobs that finish. Possible label
-	// values:
+	// JobFinishedNum tracks the number of Jobs that finish. Empty reason label
+	// is used to count successful jobs.
+	// Possible label values:
 	// completion_mode: Indexed, NonIndexed
 	// result: failed, succeeded
+	// reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", ""
 	JobFinishedNum = metrics.NewCounterVec(
 		&metrics.CounterOpts{
@@ -66,7 +68,7 @@ var (
 			Help:           "The number of finished job",
 			StabilityLevel: metrics.ALPHA,
 		},
-		[]string{"completion_mode", "result"},
+		[]string{"completion_mode", "result", "reason"},
 	)
 
 	// JobPodsFinished records the number of finished Pods that the job controller
@@ -84,6 +86,22 @@ var (
 		},
 		[]string{"completion_mode", "result"})
 
+	// PodFailuresHandledByFailurePolicy records the number of finished Pods
+	// handled by pod failure policy.
+	// Possible label values:
+	// action: FailJob, Ignore, Count
+	PodFailuresHandledByFailurePolicy = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem: JobControllerSubsystem,
+			Name:      "pod_failures_handled_by_failure_policy_total",
+			Help: `The number of failed Pods handled by failure policy with
+			respect to the failure policy action applied based on the matched
+			rule. Possible values of the action label correspond to the
+			possible values for the failure policy rule action, which are:
+			"FailJob", "Ignore" and "Count".`,
+		},
+		[]string{"action"})
+
 	// TerminatedPodsWithTrackingFinalizer records the addition and removal of
 	// terminated pods that have the finalizer batch.kubernetes.io/job-tracking,
 	// regardless of whether they are owned by a Job.
@@ -137,6 +155,7 @@ func Register() {
 		legacyregistry.MustRegister(JobSyncNum)
 		legacyregistry.MustRegister(JobFinishedNum)
 		legacyregistry.MustRegister(JobPodsFinished)
+		legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy)
 		legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal)
 	})
 }
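For context, component-base counters like the one defined above are no-ops until registered, which is why the commit also adds the legacyregistry.MustRegister call. A minimal, runnable sketch of defining, registering, and exposing such a counter (the subsystem and name mirror the diff; the HTTP wiring is illustrative, not how the controller serves metrics):

package main

import (
	"fmt"
	"net/http"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

var podFailuresHandled = metrics.NewCounterVec(
	&metrics.CounterOpts{
		Subsystem:      "job_controller",
		Name:           "pod_failures_handled_by_failure_policy_total",
		Help:           "Failed Pods handled by a pod failure policy, by action.",
		StabilityLevel: metrics.ALPHA,
	},
	[]string{"action"},
)

func main() {
	// Registration instantiates the metric; before this, Inc would be a no-op.
	legacyregistry.MustRegister(podFailuresHandled)
	podFailuresHandled.WithLabelValues("Ignore").Inc()

	// Expose /metrics via the legacy registry handler.
	http.Handle("/metrics", legacyregistry.Handler())
	fmt.Println("serving on :8080/metrics")
	_ = http.ListenAndServe(":8080", nil)
}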
@@ -26,42 +26,46 @@ import (
 // matchPodFailurePolicy returns information about matching a given failed pod
 // against the pod failure policy rules. The information is represented as an
 // optional job failure message (present in case the pod matched a 'FailJob'
-// rule) and a boolean indicating if the failure should be counted towards
-// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule).
-func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) {
+// rule), a boolean indicating if the failure should be counted towards
+// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule),
+// and a pointer to the matched pod failure policy action.
+func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) {
 	if podFailurePolicy == nil {
-		return nil, true
+		return nil, true, nil
 	}
+	ignore := batch.PodFailurePolicyActionIgnore
+	failJob := batch.PodFailurePolicyActionFailJob
+	count := batch.PodFailurePolicyActionCount
 	for index, podFailurePolicyRule := range podFailurePolicy.Rules {
 		if podFailurePolicyRule.OnExitCodes != nil {
 			if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
 				switch podFailurePolicyRule.Action {
 				case batch.PodFailurePolicyActionIgnore:
-					return nil, false
+					return nil, false, &ignore
 				case batch.PodFailurePolicyActionCount:
-					return nil, true
+					return nil, true, &count
 				case batch.PodFailurePolicyActionFailJob:
 					msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
 						containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
-					return &msg, true
+					return &msg, true, &failJob
 				}
 			}
 		} else if podFailurePolicyRule.OnPodConditions != nil {
 			if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
 				switch podFailurePolicyRule.Action {
 				case batch.PodFailurePolicyActionIgnore:
-					return nil, false
+					return nil, false, &ignore
 				case batch.PodFailurePolicyActionCount:
-					return nil, true
+					return nil, true, &count
 				case batch.PodFailurePolicyActionFailJob:
 					msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
 						failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
-					return &msg, true
+					return &msg, true, &failJob
 				}
 			}
 		}
 	}
-	return nil, true
+	return nil, true, nil
 }
 
 func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
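matchPodFailurePolicy now returns a third value, a pointer to the matched action, so callers can distinguish "no rule matched" (nil) from an explicit Ignore, Count, or FailJob match. Since the real function is unexported, the sketch below uses a trimmed-down stand-in (matchExitCode is a hypothetical name; it handles only OnExitCodes with the In operator) to show how the triple is consumed:

package main

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
)

// matchExitCode is a simplified stand-in for the controller's unexported
// matchPodFailurePolicy: only OnExitCodes/In rules, but the same
// (message, countFailed, action) result shape.
func matchExitCode(policy *batch.PodFailurePolicy, pod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) {
	if policy == nil {
		return nil, true, nil
	}
	for i, rule := range policy.Rules {
		if rule.OnExitCodes == nil || rule.OnExitCodes.Operator != batch.PodFailurePolicyOnExitCodesOpIn {
			continue
		}
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Terminated == nil {
				continue
			}
			for _, code := range rule.OnExitCodes.Values {
				if cs.State.Terminated.ExitCode != code {
					continue
				}
				action := rule.Action
				switch action {
				case batch.PodFailurePolicyActionIgnore:
					return nil, false, &action // do not count towards backoffLimit
				case batch.PodFailurePolicyActionCount:
					return nil, true, &action
				case batch.PodFailurePolicyActionFailJob:
					msg := fmt.Sprintf("container %s matched %v rule at index %d", cs.Name, action, i)
					return &msg, true, &action
				}
			}
		}
	}
	return nil, true, nil // no rule matched: count the failure, no action
}

func main() {
	policy := &batch.PodFailurePolicy{Rules: []batch.PodFailurePolicyRule{{
		Action: batch.PodFailurePolicyActionIgnore,
		OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
			Operator: batch.PodFailurePolicyOnExitCodesOpIn,
			Values:   []int32{42},
		},
	}}}
	pod := &v1.Pod{Status: v1.PodStatus{
		Phase: v1.PodFailed,
		ContainerStatuses: []v1.ContainerStatus{{
			Name:  "main",
			State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 42}},
		}},
	}}
	msg, countFailed, action := matchExitCode(policy, pod)
	fmt.Println(msg, countFailed, *action) // <nil> false Ignore
}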
@@ -19,6 +19,7 @@ package job
 import (
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	batch "k8s.io/api/batch/v1"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -31,12 +32,16 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 		Namespace: "default",
 		Name:      "mypod",
 	}
+	ignore := batch.PodFailurePolicyActionIgnore
+	failJob := batch.PodFailurePolicyActionFailJob
+	count := batch.PodFailurePolicyActionCount
 
 	testCases := map[string]struct {
 		podFailurePolicy      *batch.PodFailurePolicy
 		failedPod             *v1.Pod
 		wantJobFailureMessage *string
 		wantCountFailed       bool
+		wantAction            *batch.PodFailurePolicyAction
 	}{
 		"unknown action for rule matching by exit codes - skip rule with unknown action": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -75,6 +80,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"unknown action for rule matching by pod conditions - skip rule with unknown action": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -113,6 +119,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       false,
+			wantAction:            &ignore,
 		},
 		"unknown operator - rule with unknown action is skipped for onExitCodes": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -151,6 +158,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"no policy rules": {
 			podFailurePolicy: nil,
@@ -201,6 +209,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       false,
+			wantAction:            &ignore,
 		},
 		"FailJob rule matched for exit codes": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -232,6 +241,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"successful containers are skipped by the rules": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -320,6 +330,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"second jobfail rule matched for exit codes": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -358,6 +369,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"count rule matched for exit codes": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -389,6 +401,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       true,
+			wantAction:            &count,
 		},
 		"ignore rule matched for pod conditions": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -418,6 +431,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       false,
+			wantAction:            &ignore,
 		},
 		"ignore rule matches by the status=False": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -447,6 +461,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       false,
+			wantAction:            &ignore,
 		},
 		"ignore rule matches by the status=Unknown": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -476,6 +491,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       false,
+			wantAction:            &ignore,
 		},
 		"ignore rule does not match when status for pattern is False, but actual True": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -592,6 +608,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"),
 			wantCountFailed:       true,
+			wantAction:            &failJob,
 		},
 		"count rule matched for pod conditions": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -621,6 +638,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       true,
+			wantAction:            &count,
 		},
 		"no rule matched": {
 			podFailurePolicy: &batch.PodFailurePolicy{
@@ -683,25 +701,21 @@ func TestMatchPodFailurePolicy(t *testing.T) {
 			},
 			wantJobFailureMessage: nil,
 			wantCountFailed:       true,
+			wantAction:            &count,
 		},
 	}
 	for name, tc := range testCases {
 		t.Run(name, func(t *testing.T) {
-			jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
-			if tc.wantJobFailureMessage == nil {
-				if jobFailMessage != nil {
-					t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage)
-				}
-			} else {
-				if jobFailMessage == nil {
-					t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage)
-				} else if *tc.wantJobFailureMessage != *jobFailMessage {
-					t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage)
-				}
-			}
+			jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
+			if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" {
+				t.Errorf("Unexpected job failure message: %s", diff)
+			}
 			if tc.wantCountFailed != countFailed {
 				t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed)
 			}
+			if diff := cmp.Diff(tc.wantAction, action); diff != "" {
+				t.Errorf("Unexpected failure policy action: %s", diff)
+			}
 		})
 	}
 }
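The assertion rewrite above leans on cmp.Diff, which compares through pointers, so nil-vs-value and value-vs-value mismatches are reported uniformly instead of through the hand-rolled nil checks it replaces. A standalone illustration (ptr is a local helper, not from the test file):

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

func ptr(s string) *string { return &s }

func main() {
	want := ptr("FailJob")
	var got *string // nil, e.g. no rule matched

	// cmp.Diff dereferences the pointers and produces one readable report
	// covering both the nil case and a differing-value case.
	if diff := cmp.Diff(want, got); diff != "" {
		fmt.Printf("unexpected action (-want,+got):\n%s", diff)
	}
}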
@@ -68,7 +68,7 @@ type metricLabelsWithValue struct {
 	Value  int
 }
 
-func TestMetrics(t *testing.T) {
+func TestMetricsOnSuccesses(t *testing.T) {
 	nonIndexedCompletion := batchv1.NonIndexedCompletion
 	indexedCompletion := batchv1.IndexedCompletion
 	wFinalizers := true
@@ -94,7 +94,7 @@ func TestMetrics(t *testing.T) {
 			},
 		},
 		wantJobFinishedNumMetric: metricLabelsWithValue{
-			Labels: []string{"NonIndexed", "succeeded"},
+			Labels: []string{"NonIndexed", "succeeded", ""},
 			Value:  1,
 		},
 		wantJobPodsFinishedMetric: metricLabelsWithValue{
@@ -111,7 +111,7 @@ func TestMetrics(t *testing.T) {
 			},
 		},
 		wantJobFinishedNumMetric: metricLabelsWithValue{
-			Labels: []string{"Indexed", "succeeded"},
+			Labels: []string{"Indexed", "succeeded", ""},
 			Value:  1,
 		},
 		wantJobPodsFinishedMetric: metricLabelsWithValue{
@@ -149,6 +149,146 @@ func TestMetrics(t *testing.T) {
 	}
 }
 
+func TestJobFinishedNumReasonMetric(t *testing.T) {
+	wFinalizers := true
+	// setup the job controller
+	closeFn, restConfig, clientSet, ns := setup(t, "simple")
+	defer closeFn()
+	ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
+	defer cancel()
+
+	testCases := map[string]struct {
+		job                       batchv1.Job
+		podStatus                 v1.PodStatus
+		enableJobPodFailurePolicy bool
+		wantJobFinishedNumMetric  metricLabelsWithValue
+	}{
+		"non-indexed job; failed pod handled by FailJob action; JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job: batchv1.Job{
+				Spec: batchv1.JobSpec{
+					Completions:  pointer.Int32(1),
+					Parallelism:  pointer.Int32(1),
+					BackoffLimit: pointer.Int32(1),
+					PodFailurePolicy: &batchv1.PodFailurePolicy{
+						Rules: []batchv1.PodFailurePolicyRule{
+							{
+								Action: batchv1.PodFailurePolicyActionFailJob,
+								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+									Values:   []int32{5},
+								},
+							},
+						},
+					},
+				},
+			},
+			podStatus: v1.PodStatus{
+				Phase: v1.PodFailed,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Terminated: &v1.ContainerStateTerminated{
+								ExitCode: 5,
+							},
+						},
+					},
+				},
+			},
+			wantJobFinishedNumMetric: metricLabelsWithValue{
+				Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
+				Value:  1,
+			},
+		},
+		"non-indexed job; failed pod handled by Count action; JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job: batchv1.Job{
+				Spec: batchv1.JobSpec{
+					Completions:  pointer.Int32(1),
+					Parallelism:  pointer.Int32(1),
+					BackoffLimit: pointer.Int32(0),
+					PodFailurePolicy: &batchv1.PodFailurePolicy{
+						Rules: []batchv1.PodFailurePolicyRule{
+							{
+								Action: batchv1.PodFailurePolicyActionCount,
+								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+									Values:   []int32{5},
+								},
+							},
+						},
+					},
+				},
+			},
+			podStatus: v1.PodStatus{
+				Phase: v1.PodFailed,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Terminated: &v1.ContainerStateTerminated{
+								ExitCode: 5,
+							},
+						},
+					},
+				},
+			},
+			wantJobFinishedNumMetric: metricLabelsWithValue{
+				Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"},
+				Value:  1,
+			},
+		},
+		"non-indexed job; failed": {
+			job: batchv1.Job{
+				Spec: batchv1.JobSpec{
+					Completions:  pointer.Int32(1),
+					Parallelism:  pointer.Int32(1),
+					BackoffLimit: pointer.Int32(0),
+				},
+			},
+			podStatus: v1.PodStatus{
+				Phase: v1.PodFailed,
+			},
+			wantJobFinishedNumMetric: metricLabelsWithValue{
+				Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"},
+				Value:  1,
+			},
+		},
+	}
+	job_index := 0 // job index to avoid collisions between job names created by different test cases
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
+			resetMetrics()
+			// create a single job and wait for its completion
+			job := tc.job.DeepCopy()
+			job.Name = fmt.Sprintf("job-%v", job_index)
+			job_index++
+			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, job)
+			if err != nil {
+				t.Fatalf("Failed to create Job: %v", err)
+			}
+			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
+				Active: int(*jobObj.Spec.Parallelism),
+				Ready:  pointer.Int32(0),
+			}, wFinalizers)
+
+			op := func(p *v1.Pod) bool {
+				p.Status = tc.podStatus
+				return true
+			}
+			if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil {
+				t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name)
+			}
+
+			validateJobFailed(ctx, t, clientSet, jobObj)
+
+			// verify metric values after the job is finished
+			validateCounterMetric(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric)
+		})
+	}
+}
+
 func validateCounterMetric(t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
 	t.Helper()
 	var cmpErr error
@@ -351,6 +491,13 @@ func TestJobPodFailurePolicy(t *testing.T) {
 				},
 			},
 		},
+		{
+			Action: batchv1.PodFailurePolicyActionCount,
+			OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+				Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+				Values:   []int32{10},
+			},
+		},
 		{
 			Action: batchv1.PodFailurePolicyActionFailJob,
 			OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
@@ -375,6 +522,19 @@ func TestJobPodFailurePolicy(t *testing.T) {
 			},
 		},
 	}
+	podStatusMatchingOnExitCodesCountRule := v1.PodStatus{
+		Phase: v1.PodFailed,
+		ContainerStatuses: []v1.ContainerStatus{
+			{
+				Name: "main-container",
+				State: v1.ContainerState{
+					Terminated: &v1.ContainerStateTerminated{
+						ExitCode: 10,
+					},
+				},
+			},
+		},
+	}
 	podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{
 		Phase: v1.PodFailed,
 		Conditions: []v1.PodCondition{
@@ -384,14 +544,18 @@ func TestJobPodFailurePolicy(t *testing.T) {
 			},
 		},
 	}
+	podStatusNotMatchingAnyRule := v1.PodStatus{
+		Phase: v1.PodFailed,
+	}
 	testCases := map[string]struct {
-		enableJobPodFailurePolicy bool
-		restartController         bool
-		job                       batchv1.Job
-		podStatus                 v1.PodStatus
-		wantActive                int
-		wantFailed                int
-		wantJobConditionType      batchv1.JobConditionType
+		enableJobPodFailurePolicy                bool
+		restartController                        bool
+		job                                      batchv1.Job
+		podStatus                                v1.PodStatus
+		wantActive                               int
+		wantFailed                               int
+		wantJobConditionType                     batchv1.JobConditionType
+		wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue
 	}{
 		"pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": {
 			enableJobPodFailurePolicy: true,
@@ -400,6 +564,10 @@ func TestJobPodFailurePolicy(t *testing.T) {
 			wantActive:           0,
 			wantFailed:           1,
 			wantJobConditionType: batchv1.JobFailed,
+			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+				Labels: []string{"FailJob"},
+				Value:  1,
+			},
 		},
 		"pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": {
 			enableJobPodFailurePolicy: true,
@@ -425,11 +593,40 @@ func TestJobPodFailurePolicy(t *testing.T) {
 			wantActive:           1,
 			wantFailed:           0,
 			wantJobConditionType: batchv1.JobComplete,
+			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+				Labels: []string{"Ignore"},
+				Value:  1,
+			},
 		},
+		"pod status matching the configured Count rule on exit codes; pod failure counted when JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job:                       job,
+			podStatus:                 podStatusMatchingOnExitCodesCountRule,
+			wantActive:                1,
+			wantFailed:                1,
+			wantJobConditionType:      batchv1.JobComplete,
+			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+				Labels: []string{"Count"},
+				Value:  1,
+			},
+		},
+		"pod status non-matching any configured rule; pod failure counted when JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job:                       job,
+			podStatus:                 podStatusNotMatchingAnyRule,
+			wantActive:                1,
+			wantFailed:                1,
+			wantJobConditionType:      batchv1.JobComplete,
+			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+				Labels: []string{"Count"},
+				Value:  0,
+			},
+		},
 	}
 	for name, test := range testCases {
 		for _, wFinalizers := range []bool{false, true} {
 			t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
+				resetMetrics()
 				defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
 				defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)()
 
@@ -475,6 +672,9 @@ func TestJobPodFailurePolicy(t *testing.T) {
 				}
 			}
 			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
+			if wFinalizers && test.wantPodFailuresHandledByPolicyRuleMetric != nil {
+				validateCounterMetric(t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric)
+			}
 			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 		})
 	}
@@ -1699,6 +1899,7 @@ func resetMetrics() {
 	metrics.TerminatedPodsTrackingFinalizerTotal.Reset()
 	metrics.JobFinishedNum.Reset()
 	metrics.JobPodsFinished.Reset()
+	metrics.PodFailuresHandledByFailurePolicy.Reset()
 }
 
 func createJobControllerWithSharedInformers(restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
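The suite's validateCounterMetric (only its opening lines appear in this diff) asserts a labeled counter's value, presumably with retries since the controller records metrics asynchronously. A rough standalone analogue of that retry-until-match idea, using the prometheus client's testutil rather than the component-base helpers the suite actually uses (all names here are illustrative):

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// waitForCounter polls a labeled counter until it reaches want or the
// timeout expires, loosely mirroring a retrying metric assertion.
func waitForCounter(c prometheus.Counter, want float64, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		got := testutil.ToFloat64(c)
		if got == want {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("counter = %v, want %v", got, want)
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func main() {
	podFailures := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "pod_failures_handled_by_failure_policy_total", Help: "example"},
		[]string{"action"},
	)
	// Simulate the controller recording asynchronously.
	go podFailures.WithLabelValues("Count").Inc()

	if err := waitForCounter(podFailures.WithLabelValues("Count"), 1, time.Second); err != nil {
		fmt.Println("FAIL:", err)
		return
	}
	fmt.Println("ok")
}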