Extend metrics with the new labels (#113324)

* Extend job metrics

* Refactor TestMetrics to extract its checks into dedicated tests per feature
Michał Woźniak, 2022-10-31 16:50:45 +01:00, committed by GitHub
parent 4d2128b523
commit 3628532311
5 changed files with 292 additions and 42 deletions


@@ -1031,6 +1031,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
needsFlush = true
}
+ podFailureCountByPolicyAction := map[string]int{}
for _, pod := range pods {
if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) {
continue
@@ -1061,7 +1062,10 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
ix := getCompletionIndex(pod.Annotations)
if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) {
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
- _, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
+ _, countFailed, action := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
+ if action != nil {
+ podFailureCountByPolicyAction[string(*action)] += 1
+ }
if countFailed {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
@@ -1102,7 +1106,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
}
}
var err error
- if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
+ if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush); err != nil {
return err
}
jobFinished := jm.enactJobFinished(job, finishedCond)
@@ -1132,7 +1136,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
//
// Returns whether there are pending changes in the Job status that need to be
// flushed in subsequent calls.
- func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, needsFlush bool) (*batch.Job, bool, error) {
+ func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool) (*batch.Job, bool, error) {
var err error
if needsFlush {
if job, err = jm.updateStatusHandler(ctx, job); err != nil {
@@ -1143,6 +1147,8 @@ func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, job
*oldCounters = job.Status
needsFlush = false
}
+ recordJobPodFailurePolicyActions(job, podFailureCountByPolicyAction)
jobKey, err := controller.KeyFunc(job)
if err != nil {
return job, needsFlush, fmt.Errorf("getting job key: %w", err)
@@ -1263,10 +1269,10 @@ func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobC
jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
}
jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
- metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
+ metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc()
} else {
jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message)
- metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
+ metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc()
}
return true
}
@@ -1345,7 +1351,7 @@ func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *s
}
for _, p := range pods {
if isPodFailed(p, uncounted != nil) {
- jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
+ jobFailureMessage, _, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
if jobFailureMessage != nil {
return jobFailureMessage
}
@@ -1369,7 +1375,7 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod
if !isPodFailed(p, uncounted != nil) {
return false
}
- _, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
+ _, countFailed, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
return countFailed
} else {
return isPodFailed(p, uncounted != nil)
@@ -1768,6 +1774,12 @@ func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
}
+ func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) {
+ for action, count := range podFailureCountByPolicyAction {
+ metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count))
+ }
+ }
func countReadyPods(pods []*v1.Pod) int32 {
cnt := int32(0)
for _, p := range pods {
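The controller builds podFailureCountByPolicyAction while it walks the pods in the sync loop, then flushes the whole tally once in recordJobPodFailurePolicyActions, so the counter vector gets one Add per action rather than one Inc per pod. Below is a minimal, self-contained sketch of that tally-then-flush pattern; it uses the plain prometheus client instead of the k8s metrics wrappers, and the names (podFailuresHandled, flushActionCounts) are illustrative, not from this diff.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

var podFailuresHandled = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "pod_failures_handled_by_failure_policy_total",
		Help: "Failed pods handled by a pod failure policy, by action (sketch).",
	},
	[]string{"action"},
)

// flushActionCounts mirrors recordJobPodFailurePolicyActions above:
// one Add per distinct action, using the counts accumulated during a sync.
func flushActionCounts(countByAction map[string]int) {
	for action, count := range countByAction {
		podFailuresHandled.WithLabelValues(action).Add(float64(count))
	}
}

func main() {
	tally := map[string]int{}
	for _, action := range []string{"Count", "Count", "Ignore"} { // simulated sync loop
		tally[action]++
	}
	flushActionCounts(tally)
	fmt.Println(testutil.ToFloat64(podFailuresHandled.WithLabelValues("Count"))) // 2
}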


@@ -55,10 +55,12 @@ var (
},
[]string{"completion_mode", "result", "action"},
)
- // JobFinishedNum tracks the number of Jobs that finish. Possible label
- // values:
+ // JobFinishedNum tracks the number of Jobs that finish. Empty reason label
+ // is used to count successful jobs.
+ // Possible label values:
// completion_mode: Indexed, NonIndexed
// result: failed, succeeded
+ // reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", ""
JobFinishedNum = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: JobControllerSubsystem,
@@ -66,7 +68,7 @@ var (
Help: "The number of finished job",
StabilityLevel: metrics.ALPHA,
},
[]string{"completion_mode", "result"},
[]string{"completion_mode", "result", "reason"},
)
// JobPodsFinished records the number of finished Pods that the job controller
@@ -84,6 +86,22 @@ var (
},
[]string{"completion_mode", "result"})
+ // PodFailuresHandledByFailurePolicy records the number of finished Pods
+ // handled by pod failure policy.
+ // Possible label values:
+ // action: FailJob, Ignore, Count
+ PodFailuresHandledByFailurePolicy = metrics.NewCounterVec(
+ &metrics.CounterOpts{
+ Subsystem: JobControllerSubsystem,
+ Name: "pod_failures_handled_by_failure_policy_total",
+ Help: `The number of failed Pods handled by failure policy with
+ respect to the failure policy action applied based on the matched
+ rule. Possible values of the action label correspond to the
+ possible values for the failure policy rule action, which are:
+ "FailJob", "Ignore" and "Count".`,
+ },
+ []string{"action"})
// TerminatedPodsWithTrackingFinalizer records the addition and removal of
// terminated pods that have the finalizer batch.kubernetes.io/job-tracking,
// regardless of whether they are owned by a Job.
@@ -137,6 +155,7 @@ func Register() {
legacyregistry.MustRegister(JobSyncNum)
legacyregistry.MustRegister(JobFinishedNum)
legacyregistry.MustRegister(JobPodsFinished)
+ legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy)
legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal)
})
}
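Two things about the label scheme are worth noting: values passed to WithLabelValues are positional, so they must follow the []string{"completion_mode", "result", "reason"} declaration order, and successful jobs use the empty string for reason, as the JobFinishedNum comment above says. A small standalone sketch of that convention with the plain prometheus client (the metric name and registry handling here are simplified assumptions, not this file's wrappers):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

var jobFinished = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: "job_controller",
		Name:      "job_finished_total",
		Help:      "The number of finished jobs (sketch).",
	},
	[]string{"completion_mode", "result", "reason"},
)

func main() {
	// Success: the reason label stays empty.
	jobFinished.WithLabelValues("NonIndexed", "succeeded", "").Inc()
	// Failure: the reason label carries the terminal Job condition reason.
	jobFinished.WithLabelValues("NonIndexed", "failed", "BackoffLimitExceeded").Inc()

	fmt.Println(testutil.ToFloat64(jobFinished.WithLabelValues("NonIndexed", "failed", "BackoffLimitExceeded"))) // 1
}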


@@ -26,42 +26,46 @@ import (
// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
- // rule) and a boolean indicating if the failure should be counted towards
- // backoffLimit (it should not be counted if the pod matched an 'Ignore' rule).
- func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) {
+ // rule), a boolean indicating if the failure should be counted towards
+ // backoffLimit (it should not be counted if the pod matched an 'Ignore' rule),
+ // and a pointer to the matched pod failure policy action.
+ func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) {
if podFailurePolicy == nil {
- return nil, true
+ return nil, true, nil
}
+ ignore := batch.PodFailurePolicyActionIgnore
+ failJob := batch.PodFailurePolicyActionFailJob
+ count := batch.PodFailurePolicyActionCount
for index, podFailurePolicyRule := range podFailurePolicy.Rules {
if podFailurePolicyRule.OnExitCodes != nil {
if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
- return nil, false
+ return nil, false, &ignore
case batch.PodFailurePolicyActionCount:
- return nil, true
+ return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
- return &msg, true
+ return &msg, true, &failJob
}
}
} else if podFailurePolicyRule.OnPodConditions != nil {
if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
- return nil, false
+ return nil, false, &ignore
case batch.PodFailurePolicyActionCount:
- return nil, true
+ return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
- return &msg, true
+ return &msg, true, &failJob
}
}
}
}
- return nil, true
+ return nil, true, nil
}
func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
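The ignore/failJob/count locals at the top of matchPodFailurePolicy exist because the actions are typed string constants in the batch API, and Go does not allow taking the address of a constant; copying into a variable makes the value addressable. A minimal standalone illustration (the package main wrapper is hypothetical, not part of this diff):

package main

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
)

func main() {
	// action := &batch.PodFailurePolicyActionIgnore // does not compile: cannot take the address of a constant
	ignore := batch.PodFailurePolicyActionIgnore // a local copy is addressable
	var action *batch.PodFailurePolicyAction = &ignore
	fmt.Println(*action) // Ignore
}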


@@ -19,6 +19,7 @@ package job
import (
"testing"
"github.com/google/go-cmp/cmp"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -31,12 +32,16 @@ func TestMatchPodFailurePolicy(t *testing.T) {
Namespace: "default",
Name: "mypod",
}
+ ignore := batch.PodFailurePolicyActionIgnore
+ failJob := batch.PodFailurePolicyActionFailJob
+ count := batch.PodFailurePolicyActionCount
testCases := map[string]struct {
podFailurePolicy *batch.PodFailurePolicy
failedPod *v1.Pod
wantJobFailureMessage *string
wantCountFailed bool
+ wantAction *batch.PodFailurePolicyAction
}{
"unknown action for rule matching by exit codes - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -75,6 +80,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"unknown action for rule matching by pod conditions - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -113,6 +119,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: false,
+ wantAction: &ignore,
},
"unknown operator - rule with unknown action is skipped for onExitCodes": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -151,6 +158,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"no policy rules": {
podFailurePolicy: nil,
@@ -201,6 +209,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: false,
+ wantAction: &ignore,
},
"FailJob rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -232,6 +241,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"successful containers are skipped by the rules": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -320,6 +330,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"second jobfail rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -358,6 +369,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"count rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -389,6 +401,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: true,
+ wantAction: &count,
},
"ignore rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -418,6 +431,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: false,
+ wantAction: &ignore,
},
"ignore rule matches by the status=False": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -447,6 +461,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: false,
+ wantAction: &ignore,
},
"ignore rule matches by the status=Unknown": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -476,6 +491,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: false,
+ wantAction: &ignore,
},
"ignore rule does not match when status for pattern is False, but actual True": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -592,6 +608,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"),
wantCountFailed: true,
+ wantAction: &failJob,
},
"count rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -621,6 +638,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: true,
+ wantAction: &count,
},
"no rule matched": {
podFailurePolicy: &batch.PodFailurePolicy{
@@ -683,25 +701,21 @@ func TestMatchPodFailurePolicy(t *testing.T) {
},
wantJobFailureMessage: nil,
wantCountFailed: true,
+ wantAction: &count,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
- jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
- if tc.wantJobFailureMessage == nil {
- if jobFailMessage != nil {
- t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage)
- }
- } else {
- if jobFailMessage == nil {
- t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage)
- } else if *tc.wantJobFailureMessage != *jobFailMessage {
- t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage)
- }
+ jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
+ if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" {
+ t.Errorf("Unexpected job failure message: %s", diff)
}
if tc.wantCountFailed != countFailed {
t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed)
}
+ if diff := cmp.Diff(tc.wantAction, action); diff != "" {
+ t.Errorf("Unexpected failure policy action: %s", diff)
+ }
})
}
}
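Replacing the hand-written nil checks with cmp.Diff works because cmp follows pointers: it compares the pointed-to values and reports a nil/non-nil mismatch as a difference, so one check covers every branch of the old if/else ladder for both the *string message and the *batch.PodFailurePolicyAction result. A tiny standalone sketch of that behavior:

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

func main() {
	want := "Ignore"
	got := "Ignore"
	fmt.Println(cmp.Diff(&want, &got) == "")           // true: equal after dereferencing
	fmt.Println(cmp.Diff(&want, (*string)(nil)) != "") // true: nil vs non-nil is reported as a diff
}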


@@ -68,7 +68,7 @@ type metricLabelsWithValue struct {
Value int
}
- func TestMetrics(t *testing.T) {
+ func TestMetricsOnSuccesses(t *testing.T) {
nonIndexedCompletion := batchv1.NonIndexedCompletion
indexedCompletion := batchv1.IndexedCompletion
wFinalizers := true
@@ -94,7 +94,7 @@ func TestMetrics(t *testing.T) {
},
},
wantJobFinishedNumMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded"},
Labels: []string{"NonIndexed", "succeeded", ""},
Value: 1,
},
wantJobPodsFinishedMetric: metricLabelsWithValue{
@@ -111,7 +111,7 @@ func TestMetrics(t *testing.T) {
},
},
wantJobFinishedNumMetric: metricLabelsWithValue{
Labels: []string{"Indexed", "succeeded"},
Labels: []string{"Indexed", "succeeded", ""},
Value: 1,
},
wantJobPodsFinishedMetric: metricLabelsWithValue{
@@ -149,6 +149,146 @@ func TestMetrics(t *testing.T) {
}
}
+ func TestJobFinishedNumReasonMetric(t *testing.T) {
+ wFinalizers := true
+ // setup the job controller
+ closeFn, restConfig, clientSet, ns := setup(t, "simple")
+ defer closeFn()
+ ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
+ defer cancel()
+ testCases := map[string]struct {
+ job batchv1.Job
+ podStatus v1.PodStatus
+ enableJobPodFailurePolicy bool
+ wantJobFinishedNumMetric metricLabelsWithValue
+ }{
+ "non-indexed job; failed pod handled by FailJob action; JobPodFailurePolicy enabled": {
+ enableJobPodFailurePolicy: true,
+ job: batchv1.Job{
+ Spec: batchv1.JobSpec{
+ Completions: pointer.Int32(1),
+ Parallelism: pointer.Int32(1),
+ BackoffLimit: pointer.Int32(1),
+ PodFailurePolicy: &batchv1.PodFailurePolicy{
+ Rules: []batchv1.PodFailurePolicyRule{
+ {
+ Action: batchv1.PodFailurePolicyActionFailJob,
+ OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+ Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+ Values: []int32{5},
+ },
+ },
+ },
+ },
+ },
+ },
+ podStatus: v1.PodStatus{
+ Phase: v1.PodFailed,
+ ContainerStatuses: []v1.ContainerStatus{
+ {
+ State: v1.ContainerState{
+ Terminated: &v1.ContainerStateTerminated{
+ ExitCode: 5,
+ },
+ },
+ },
+ },
+ },
+ wantJobFinishedNumMetric: metricLabelsWithValue{
+ Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
+ Value: 1,
+ },
+ },
+ "non-indexed job; failed pod handled by Count action; JobPodFailurePolicy enabled": {
+ enableJobPodFailurePolicy: true,
+ job: batchv1.Job{
+ Spec: batchv1.JobSpec{
+ Completions: pointer.Int32(1),
+ Parallelism: pointer.Int32(1),
+ BackoffLimit: pointer.Int32(0),
+ PodFailurePolicy: &batchv1.PodFailurePolicy{
+ Rules: []batchv1.PodFailurePolicyRule{
+ {
+ Action: batchv1.PodFailurePolicyActionCount,
+ OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+ Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+ Values: []int32{5},
+ },
+ },
+ },
+ },
+ },
+ },
+ podStatus: v1.PodStatus{
+ Phase: v1.PodFailed,
+ ContainerStatuses: []v1.ContainerStatus{
+ {
+ State: v1.ContainerState{
+ Terminated: &v1.ContainerStateTerminated{
+ ExitCode: 5,
+ },
+ },
+ },
+ },
+ },
+ wantJobFinishedNumMetric: metricLabelsWithValue{
+ Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"},
+ Value: 1,
+ },
+ },
+ "non-indexed job; failed": {
+ job: batchv1.Job{
+ Spec: batchv1.JobSpec{
+ Completions: pointer.Int32(1),
+ Parallelism: pointer.Int32(1),
+ BackoffLimit: pointer.Int32(0),
+ },
+ },
+ podStatus: v1.PodStatus{
+ Phase: v1.PodFailed,
+ },
+ wantJobFinishedNumMetric: metricLabelsWithValue{
+ Labels: []string{"NonIndexed", "failed", "BackoffLimitExceeded"},
+ Value: 1,
+ },
+ },
+ }
+ job_index := 0 // job index to avoid collisions between job names created by different test cases
+ for name, tc := range testCases {
+ t.Run(name, func(t *testing.T) {
+ defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
+ defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
+ resetMetrics()
+ // create a single job and wait for its completion
+ job := tc.job.DeepCopy()
+ job.Name = fmt.Sprintf("job-%v", job_index)
+ job_index++
+ jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, job)
+ if err != nil {
+ t.Fatalf("Failed to create Job: %v", err)
+ }
+ validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
+ Active: int(*jobObj.Spec.Parallelism),
+ Ready: pointer.Int32(0),
+ }, wFinalizers)
+ op := func(p *v1.Pod) bool {
+ p.Status = tc.podStatus
+ return true
+ }
+ if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil {
+ t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name)
+ }
+ validateJobFailed(ctx, t, clientSet, jobObj)
+ // verify metric values after the job is finished
+ validateCounterMetric(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric)
+ })
+ }
+ }
func validateCounterMetric(t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
t.Helper()
var cmpErr error
@@ -351,6 +491,13 @@ func TestJobPodFailurePolicy(t *testing.T) {
},
},
},
+ {
+ Action: batchv1.PodFailurePolicyActionCount,
+ OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+ Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+ Values: []int32{10},
+ },
+ },
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
@@ -375,6 +522,19 @@ func TestJobPodFailurePolicy(t *testing.T) {
},
},
}
+ podStatusMatchingOnExitCodesCountRule := v1.PodStatus{
+ Phase: v1.PodFailed,
+ ContainerStatuses: []v1.ContainerStatus{
+ {
+ Name: "main-container",
+ State: v1.ContainerState{
+ Terminated: &v1.ContainerStateTerminated{
+ ExitCode: 10,
+ },
+ },
+ },
+ },
+ }
podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
@@ -384,14 +544,18 @@ func TestJobPodFailurePolicy(t *testing.T) {
},
},
}
+ podStatusNotMatchingAnyRule := v1.PodStatus{
+ Phase: v1.PodFailed,
+ }
testCases := map[string]struct {
enableJobPodFailurePolicy bool
restartController bool
job batchv1.Job
podStatus v1.PodStatus
wantActive int
wantFailed int
wantJobConditionType batchv1.JobConditionType
+ wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue
}{
"pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
@@ -400,6 +564,10 @@ func TestJobPodFailurePolicy(t *testing.T) {
wantActive: 0,
wantFailed: 1,
wantJobConditionType: batchv1.JobFailed,
+ wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+ Labels: []string{"FailJob"},
+ Value: 1,
+ },
},
"pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
@@ -425,11 +593,40 @@ func TestJobPodFailurePolicy(t *testing.T) {
wantActive: 1,
wantFailed: 0,
wantJobConditionType: batchv1.JobComplete,
+ wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+ Labels: []string{"Ignore"},
+ Value: 1,
+ },
},
+ "pod status matching the configured Count rule on exit codes; pod failure counted when JobPodFailurePolicy enabled": {
+ enableJobPodFailurePolicy: true,
+ job: job,
+ podStatus: podStatusMatchingOnExitCodesCountRule,
+ wantActive: 1,
+ wantFailed: 1,
+ wantJobConditionType: batchv1.JobComplete,
+ wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+ Labels: []string{"Count"},
+ Value: 1,
+ },
+ },
+ "pod status non-matching any configured rule; pod failure counted when JobPodFailurePolicy enabled": {
+ enableJobPodFailurePolicy: true,
+ job: job,
+ podStatus: podStatusNotMatchingAnyRule,
+ wantActive: 1,
+ wantFailed: 1,
+ wantJobConditionType: batchv1.JobComplete,
+ wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
+ Labels: []string{"Count"},
+ Value: 0,
+ },
+ },
}
for name, test := range testCases {
for _, wFinalizers := range []bool{false, true} {
t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
+ resetMetrics()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)()
@@ -475,6 +672,9 @@ func TestJobPodFailurePolicy(t *testing.T) {
}
}
validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
+ if wFinalizers && test.wantPodFailuresHandledByPolicyRuleMetric != nil {
+ validateCounterMetric(t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric)
+ }
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
})
}
@@ -1699,6 +1899,7 @@ func resetMetrics() {
metrics.TerminatedPodsTrackingFinalizerTotal.Reset()
metrics.JobFinishedNum.Reset()
metrics.JobPodsFinished.Reset()
+ metrics.PodFailuresHandledByFailurePolicy.Reset()
}
func createJobControllerWithSharedInformers(restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {