Merge pull request #101292 from AliceZhang2016/job_controller_metrics

Graduate indexed job to beta
Kubernetes Prow Robot
2021-05-07 13:31:44 -07:00
committed by GitHub
14 changed files with 143 additions and 23 deletions

View File

@@ -170,7 +170,7 @@ func getCompletionIndex(annotations map[string]string) int {
if annotations == nil {
return unknownCompletionIndex
}
-v, ok := annotations[batch.JobCompletionIndexAnnotationAlpha]
+v, ok := annotations[batch.JobCompletionIndexAnnotation]
if !ok {
return unknownCompletionIndex
}
@@ -203,7 +203,7 @@ func addCompletionIndexEnvVariable(container *v1.Container) {
Name: completionIndexEnvName,
ValueFrom: &v1.EnvVarSource{
FieldRef: &v1.ObjectFieldSelector{
-FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
},
},
})
@@ -213,7 +213,7 @@ func addCompletionIndexAnnotation(template *v1.PodTemplateSpec, index int) {
if template.Annotations == nil {
template.Annotations = make(map[string]string, 1)
}
-template.Annotations[batch.JobCompletionIndexAnnotationAlpha] = strconv.Itoa(index)
+template.Annotations[batch.JobCompletionIndexAnnotation] = strconv.Itoa(index)
}
type byCompletionIndex []*v1.Pod
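For context on the annotation rename above: the controller publishes the completion index both as a pod annotation and, through the downward API, as the JOB_COMPLETION_INDEX environment variable (see the test expectation further down). A minimal, illustrative sketch of how a workload inside the pod might consume it; this program is not part of the change:

package main

import (
	"fmt"
	"os"
	"strconv"
)

func main() {
	// JOB_COMPLETION_INDEX is populated from the pod annotation via the
	// downward-API field reference wired up by addCompletionIndexEnvVariable.
	raw, ok := os.LookupEnv("JOB_COMPLETION_INDEX")
	if !ok {
		fmt.Println("not running as part of an Indexed Job")
		return
	}
	idx, err := strconv.Atoi(raw)
	if err != nil {
		fmt.Printf("unexpected index %q: %v\n", raw, err)
		return
	}
	fmt.Printf("processing shard %d\n", idx)
}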

View File

@@ -279,7 +279,7 @@ func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {
}
if desc.Index != noIndex {
p.Annotations = map[string]string{
-batch.JobCompletionIndexAnnotationAlpha: desc.Index,
+batch.JobCompletionIndexAnnotation: desc.Index,
}
}
pods = append(pods, p)
@@ -297,7 +297,7 @@ func toIndexPhases(pods []*v1.Pod) []indexPhase {
for i, p := range pods {
index := noIndex
if p.Annotations != nil {
-index = p.Annotations[batch.JobCompletionIndexAnnotationAlpha]
+index = p.Annotations[batch.JobCompletionIndexAnnotation]
}
result[i] = indexPhase{index, p.Status.Phase}
}

View File

@@ -47,6 +47,7 @@ import (
"k8s.io/component-base/metrics/prometheus/ratelimiter"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/controller"
+"k8s.io/kubernetes/pkg/controller/job/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/utils/integer"
)
@@ -60,7 +61,8 @@ var (
// DefaultJobBackOff is the default backoff period, exported for the e2e test
DefaultJobBackOff = 10 * time.Second
// MaxJobBackOff is the max backoff period, exported for the e2e test
-MaxJobBackOff = 360 * time.Second
+MaxJobBackOff = 360 * time.Second
+maxPodCreateDeletePerSync = 500
)
// Controller ensures that all Job objects have corresponding pods to
@@ -139,6 +141,8 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
jm.updateHandler = jm.updateJobStatus
jm.syncHandler = jm.syncJob
+metrics.Register()
return jm
}
@@ -440,7 +444,7 @@ func (jm *Controller) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
-func (jm *Controller) syncJob(key string) (bool, error) {
+func (jm *Controller) syncJob(key string) (forget bool, rErr error) {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -480,6 +484,21 @@ func (jm *Controller) syncJob(key string) (bool, error) {
return false, nil
}
+completionMode := string(batch.NonIndexedCompletion)
+if isIndexedJob(&job) {
+completionMode = string(batch.IndexedCompletion)
+}
+defer func() {
+result := "success"
+if rErr != nil {
+result = "error"
+}
+metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(startTime).Seconds())
+metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
+}()
// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
// and update the expectations after we've retrieved active pods from the store. If a new pod enters
// the store after we've checked the expectation, the job sync is just deferred till the next relist.
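The signature change from (bool, error) to named returns (forget bool, rErr error) is what allows the deferred closure above to see the error syncJob actually returns and label the metrics accordingly. A small self-contained sketch of the same pattern, with illustrative names that are not part of the change:

package example

import (
	"time"

	"k8s.io/component-base/metrics"
)

// syncDuration stands in for JobSyncDurationSeconds, with the labels reduced
// to just "result" to keep the sketch small.
var syncDuration = metrics.NewHistogramVec(
	&metrics.HistogramOpts{Name: "example_sync_duration_seconds"},
	[]string{"result"},
)

// doSync uses a named error return so the deferred closure can inspect the
// value that is actually returned, whichever return path is taken.
func doSync() (rErr error) {
	start := time.Now()
	defer func() {
		result := "success"
		if rErr != nil {
			result = "error"
		}
		syncDuration.WithLabelValues(result).Observe(time.Since(start).Seconds())
	}()
	// ... work that may assign rErr or return an error directly ...
	return nil
}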
@@ -546,6 +565,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, v1.ConditionTrue, failureReason, failureMessage))
jobConditionsChanged = true
jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
+metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
} else {
if jobNeedsSync && job.DeletionTimestamp == nil {
active, manageJobErr = jm.manageJob(&job, activePods, succeeded, pods)
@@ -581,6 +601,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
now := metav1.Now()
job.Status.CompletionTime = &now
jm.recorder.Event(&job, v1.EventTypeNormal, "Completed", "Job completed")
+metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
} else if utilfeature.DefaultFeatureGate.Enabled(features.SuspendJob) && manageJobCalled {
// Update the conditions / emit events only if manageJob was called in
// this syncJob. Otherwise wait for the right syncJob call to make
@@ -613,7 +634,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
}
}
-forget := false
+forget = false
// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
@@ -783,6 +804,9 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
rmAtLeast = 0
}
podsToDelete := activePodsForRemoval(job, activePods, int(rmAtLeast))
+if len(podsToDelete) > maxPodCreateDeletePerSync {
+podsToDelete = podsToDelete[:maxPodCreateDeletePerSync]
+}
if len(podsToDelete) > 0 {
jm.expectations.ExpectDeletions(jobKey, len(podsToDelete))
klog.V(4).InfoS("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", parallelism)
@@ -803,6 +827,10 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
return active, nil
}
+if diff > int32(maxPodCreateDeletePerSync) {
+diff = int32(maxPodCreateDeletePerSync)
+}
jm.expectations.ExpectCreations(jobKey, int(diff))
errCh := make(chan error, diff)
klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
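Both of these hunks clamp the work done in a single sync to maxPodCreateDeletePerSync (500): excess deletions are trimmed from podsToDelete, excess creations are trimmed from diff, and the remainder is handled by later syncs once the recorded expectations are observed. A rough illustration of the clamping arithmetic, using an assumed backlog of 1200 pods and ignoring the controller's other batching and expectation bookkeeping:

package main

import "fmt"

// maxPodCreateDeletePerSync mirrors the new cap introduced in this change.
const maxPodCreateDeletePerSync = 500

// capPerSync mirrors the clamping applied above to both podsToDelete and diff.
func capPerSync(want int32) int32 {
	if want > int32(maxPodCreateDeletePerSync) {
		return int32(maxPodCreateDeletePerSync)
	}
	return want
}

func main() {
	remaining := int32(1200) // hypothetical backlog of pods still to create
	for syncNo := 1; remaining > 0; syncNo++ {
		batch := capPerSync(remaining)
		fmt.Printf("sync %d: create %d pods\n", syncNo, batch)
		remaining -= batch
	}
	// Prints batches of 500, 500 and then 200.
}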

View File

@@ -149,7 +149,7 @@ func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
p.Status = v1.PodStatus{Phase: s.Phase}
if s.Index != noIndex {
p.Annotations = map[string]string{
-batch.JobCompletionIndexAnnotationAlpha: s.Index,
+batch.JobCompletionIndexAnnotation: s.Index,
}
}
podIndexer.Add(p)
@@ -2176,7 +2176,7 @@ func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec) {
Name: "JOB_COMPLETION_INDEX",
ValueFrom: &v1.EnvVarSource{
FieldRef: &v1.ObjectFieldSelector{
-FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
},
},
},

View File

@@ -0,0 +1,75 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics

import (
	"sync"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// JobControllerSubsystem - subsystem name used for this controller.
const JobControllerSubsystem = "job_controller"

var (
	// JobSyncDurationSeconds tracks the latency of job syncs as
	// completion_mode = Indexed / NonIndexed and result = success / error.
	JobSyncDurationSeconds = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_sync_duration_seconds",
			Help:           "The time it took to sync a job",
			StabilityLevel: metrics.ALPHA,
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
		},
		[]string{"completion_mode", "result"},
	)
	// JobSyncNum tracks the number of job syncs as
	// completion_mode = Indexed / NonIndexed and result = success / error.
	JobSyncNum = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_sync_total",
			Help:           "The number of job syncs",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"completion_mode", "result"},
	)
	// JobFinishedNum tracks the number of jobs that finish as
	// completion_mode = Indexed / NonIndexed and result = failed / succeeded.
	JobFinishedNum = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_finished_total",
			Help:           "The number of finished job",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"completion_mode", "result"},
	)
)

var registerMetrics sync.Once

// Register registers Job controller metrics.
func Register() {
	registerMetrics.Do(func() {
		legacyregistry.MustRegister(JobSyncDurationSeconds)
		legacyregistry.MustRegister(JobSyncNum)
		legacyregistry.MustRegister(JobFinishedNum)
	})
}
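For reference, a minimal sketch (not part of the change) of how the controller code above exercises these metrics: Register is idempotent thanks to sync.Once, and each sync records one histogram observation and one counter increment labelled with the completion mode and result. With the job_controller subsystem, they are exported as job_controller_job_sync_duration_seconds, job_controller_job_sync_total and job_controller_job_finished_total.

package main

import (
	"time"

	"k8s.io/kubernetes/pkg/controller/job/metrics"
)

func main() {
	metrics.Register() // safe to call more than once; guarded by sync.Once

	start := time.Now()
	// ... a hypothetical successful sync of an Indexed Job ...
	metrics.JobSyncDurationSeconds.WithLabelValues("Indexed", "success").Observe(time.Since(start).Seconds())
	metrics.JobSyncNum.WithLabelValues("Indexed", "success").Inc()

	// When a Job finishes, the controller also increments the finished counter.
	metrics.JobFinishedNum.WithLabelValues("Indexed", "succeeded").Inc()
}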