Add metric job_pod_finished

To count the number of pods that the job controller successfully tracked with the JobTrackingWithFinalizers feature gate.
This commit is contained in:
Aldo Culquicondor 2021-09-02 14:28:28 -04:00
parent 7d9cb88fed
commit a0e7a567c5
4 changed files with 74 additions and 9 deletions

View File

@ -338,3 +338,10 @@ func (bci byCompletionIndex) Swap(i, j int) {
func (bci byCompletionIndex) Len() int {
return len(bci)
}
func completionModeStr(job *batch.Job) string {
if job.Spec.CompletionMode != nil {
return string(*job.Spec.CompletionMode)
}
return string(batch.NonIndexedCompletion)
}

View File

@ -897,6 +897,8 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(job *batch.Job, pods []*
uidsWithFinalizer.Insert(string(p.UID))
}
}
// Shallow copy, as it will only be used to detect changes in the counters.
oldCounters := job.Status
if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
needsFlush = true
}
@ -951,7 +953,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(job *batch.Job, pods []*
job.Status.CompletedIndexes = succeededIndexes.String()
}
var err error
if needsFlush, err = jm.flushUncountedAndRemoveFinalizers(job, podsToRemoveFinalizer, uidsWithFinalizer, needsFlush); err != nil {
if needsFlush, err = jm.flushUncountedAndRemoveFinalizers(job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
return err
}
if jm.enactJobFinished(job, finishedCond) {
@ -961,6 +963,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(job *batch.Job, pods []*
if err := jm.updateStatusHandler(job); err != nil {
return fmt.Errorf("removing uncounted pods from status: %w", err)
}
recordJobPodFinished(job, oldCounters)
}
return nil
}
@ -974,11 +977,14 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(job *batch.Job, pods []*
// 4. (if not all removals succeeded) flush Job status again.
// Returns whether there are pending changes in the Job status that need to be
// flushed in subsequent calls.
func (jm *Controller) flushUncountedAndRemoveFinalizers(job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, needsFlush bool) (bool, error) {
func (jm *Controller) flushUncountedAndRemoveFinalizers(job *batch.Job, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.String, oldCounters *batch.JobStatus, needsFlush bool) (bool, error) {
if needsFlush {
if err := jm.updateStatusHandler(job); err != nil {
return needsFlush, fmt.Errorf("adding uncounted pods to status: %w", err)
}
recordJobPodFinished(job, *oldCounters)
// Shallow copy.
*oldCounters = job.Status
needsFlush = false
}
var rmErr error
@ -1545,3 +1551,11 @@ func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditio
}
return list, false
}
func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
completionMode := completionModeStr(job)
diff := job.Status.Succeeded - oldCounters.Succeeded
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))
diff = job.Status.Failed - oldCounters.Failed
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
}

View File

@ -46,8 +46,10 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
featuregatetesting "k8s.io/component-base/featuregate/testing"
metricstestutil "k8s.io/component-base/metrics/testutil"
_ "k8s.io/kubernetes/pkg/apis/core/install"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/job/metrics"
"k8s.io/kubernetes/pkg/controller/testutil"
"k8s.io/kubernetes/pkg/features"
"k8s.io/utils/pointer"
@ -1514,19 +1516,20 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
manager, _ := newControllerFromClient(clientSet, controller.NoResyncPeriodFunc)
fakePodControl := controller.FakePodControl{Err: tc.podControlErr}
metrics.JobPodsFinished.Reset()
manager.podControl = &fakePodControl
var statusUpdates []batch.JobStatus
manager.updateStatusHandler = func(job *batch.Job) error {
statusUpdates = append(statusUpdates, *job.Status.DeepCopy())
return tc.statusUpdateErr
}
if tc.job.Status.UncountedTerminatedPods == nil {
tc.job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
job := tc.job.DeepCopy()
if job.Status.UncountedTerminatedPods == nil {
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
}
uncounted := newUncountedTerminatedPods(*tc.job.Status.UncountedTerminatedPods)
succeededIndexes := succeededIndexesFromJob(&tc.job)
err := manager.trackJobStatusAndRemoveFinalizers(&tc.job, tc.pods, succeededIndexes, *uncounted, tc.finishedCond, tc.needsFlush)
uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
succeededIndexes := succeededIndexesFromJob(job)
err := manager.trackJobStatusAndRemoveFinalizers(job, tc.pods, succeededIndexes, *uncounted, tc.finishedCond, tc.needsFlush)
if !errors.Is(err, tc.wantErr) {
t.Errorf("Got error %v, want %w", err, tc.wantErr)
}
@ -1537,6 +1540,25 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
if rmFinalizers != tc.wantRmFinalizers {
t.Errorf("Removed %d finalizers, want %d", rmFinalizers, tc.wantRmFinalizers)
}
if tc.wantErr == nil {
completionMode := completionModeStr(job)
v, err := metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded))
if err != nil {
t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err)
}
newSucceeded := job.Status.Succeeded - tc.job.Status.Succeeded
if float64(newSucceeded) != v {
t.Errorf("Metric reports %.0f succeeded pods, want %d", v, newSucceeded)
}
v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed))
if err != nil {
t.Fatalf("Obtaining failed job_pods_finished_total: %v", err)
}
newFailed := job.Status.Failed - tc.job.Status.Failed
if float64(newFailed) != v {
t.Errorf("Metric reports %.0f failed pods, want %d", v, newFailed)
}
}
})
}
}

View File

@ -68,10 +68,26 @@ var (
},
[]string{"completion_mode", "result"},
)
// JobPodsFinished records the number of finished Pods that the job controller
// finished tracking.
// It only applies to Jobs that were created while the feature gate
// JobTrackingWithFinalizers was enabled.
// Possible label values:
// completion_mode: Indexed, NonIndexed
// result: failed, succeeded
JobPodsFinished = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: JobControllerSubsystem,
Name: "job_pods_finished_total",
Help: "The number of finished Pods that are fully tracked",
},
[]string{"completion_mode", "result"})
)
// Possible values for the "action" label in the above metrics.
const (
// Possible values for the "action" label in the above metrics.
// JobSyncActionReconciling when the Job's pod creation/deletion expectations
// are unsatisfied and the controller is waiting for issued Pod
// creation/deletions to complete.
@ -88,6 +104,11 @@ const (
// if a Job is suspended or if the number of active Pods is more than
// parallelism.
JobSyncActionPodsDeleted = "pods_deleted"
// Possible values for "result" label in the above metrics.
Succeeded = "succeeded"
Failed = "failed"
)
var registerMetrics sync.Once
@ -98,5 +119,6 @@ func Register() {
legacyregistry.MustRegister(JobSyncDurationSeconds)
legacyregistry.MustRegister(JobSyncNum)
legacyregistry.MustRegister(JobFinishedNum)
legacyregistry.MustRegister(JobPodsFinished)
})
}