Job: Extend the jobs_finished_total metric reason label with SuccessPolicy and CompletionsReached

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
This commit is contained in:
Yuki Iwai 2024-07-14 09:19:58 +09:00
parent 594490fd77
commit 6e8dc2c250
3 changed files with 44 additions and 11 deletions

View File

@ -1424,7 +1424,7 @@ func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobC
jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
}
jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc()
metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", finishedCond.Reason).Inc()
} else {
jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message)
metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc()

View File

@ -55,12 +55,14 @@ var (
},
[]string{"completion_mode", "result", "action"},
)
// JobFinishedNum tracks the number of Jobs that finish. Empty reason label
// is used to count successful jobs.
// JobFinishedNum tracks the number of Jobs that finish.
// TODO: Once the JobSuccessPolicy feature gate is removed, drop the note about the empty ("") reason label.
// When the JobSuccessPolicy feature gate is disabled, the empty reason label is used to count successful jobs.
// Otherwise, the "CompletionsReached" or "SuccessPolicy" reason label is used to count successful jobs.
// Possible label values:
// completion_mode: Indexed, NonIndexed
// result: failed, succeeded
// reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "FailedIndexes", "MaxFailedIndexesExceeded", ""
// reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "FailedIndexes", "MaxFailedIndexesExceeded", "SuccessPolicy", "CompletionsReached", ""
JobFinishedNum = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: JobControllerSubsystem,

View File

@ -511,10 +511,10 @@ func TestSuccessPolicy(t *testing.T) {
testCases := map[string]struct {
enableJobSuccessPolicy bool
enableBackoffLimitPerIndex bool
job batchv1.Job
podTerminations []podTerminationWithExpectations
wantConditionTypes []batchv1.JobConditionType
wantJobFinishedNumMetric []metricLabelsWithValue
job batchv1.Job
podTerminations []podTerminationWithExpectations
wantConditionTypes []batchv1.JobConditionType
wantJobFinishedNumMetric []metricLabelsWithValue
}{
"all indexes succeeded; JobSuccessPolicy is enabled": {
enableJobSuccessPolicy: true,
@ -547,7 +547,7 @@ func TestSuccessPolicy(t *testing.T) {
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", ""},
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},
@ -587,6 +587,37 @@ func TestSuccessPolicy(t *testing.T) {
},
},
},
"job without successPolicy; incremented the jobs_finished_total metric with CompletionsReached reason": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Completions: ptr.To[int32](1),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "0",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", "CompletionsReached"},
Value: 1,
},
},
},
"job with successPolicy with succeededIndexes; job has SuccessCriteriaMet and Complete conditions even if some indexes remain pending": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
@ -629,7 +660,7 @@ func TestSuccessPolicy(t *testing.T) {
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", ""},
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},
@ -676,7 +707,7 @@ func TestSuccessPolicy(t *testing.T) {
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", ""},
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},