Add profile label to schedule_attempts_total and e2e_scheduling_duration_seconds metrics

Also add a result label to e2e_scheduling_duration_seconds; previously, that metric was only updated for successful attempts (a rough example of the resulting series is sketched below).

Signed-off-by: Aldo Culquicondor <acondor@google.com>
Aldo Culquicondor 2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions
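
For illustration, a rough sketch of what a single recorded attempt produces after this change; the scheduler_ prefix (from the scheduler metrics subsystem), the default-scheduler profile name, the helper name recordOneAttempt, and the sample values are assumptions for this sketch, not part of the commit:

// Minimal sketch; assumes schedmetrics is "k8s.io/kubernetes/pkg/scheduler/metrics"
// and that schedmetrics.Register() has already been called.
func recordOneAttempt() {
	// One successful attempt for a pod handled by the "default-scheduler" profile,
	// taking roughly 15ms from the start of the scheduling cycle to bind.
	schedmetrics.PodScheduled("default-scheduler", 0.015)

	// Roughly the per-result, per-profile series this produces:
	//   scheduler_schedule_attempts_total{profile="default-scheduler",result="scheduled"} 1
	//   scheduler_e2e_scheduling_duration_seconds_bucket{profile="default-scheduler",result="scheduled",le="0.016"} 1
}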

View File

@@ -7,6 +7,7 @@ go_library(
srcs = [
"metric_recorder.go",
"metrics.go",
"profile_metrics.go",
],
importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
deps = [

View File

@@ -54,16 +54,7 @@ var (
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA,
}, []string{"result"})
// PodScheduleSuccesses counts how many pods were scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
}, []string{"result", "profile"})
DeprecatedSchedulingDuration = metrics.NewSummaryVec(
&metrics.SummaryOpts{
Subsystem: SchedulerSubsystem,
@@ -77,15 +68,14 @@ var (
},
[]string{OperationLabel},
)
E2eSchedulingLatency = metrics.NewHistogram(
e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
},
)
}, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
@@ -235,7 +225,7 @@ var (
metricsList = []metrics.Registerable{
scheduleAttempts,
DeprecatedSchedulingDuration,
E2eSchedulingLatency,
e2eSchedulingLatency,
SchedulingAlgorithmLatency,
BindingLatency,
DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
@@ -263,9 +253,6 @@ func Register() {
registerMetrics.Do(func() {
RegisterMetrics(metricsList...)
volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
})
}
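
A note on the With() calls dropped from Register(): the {result, profile} series are now created lazily on first observation instead of eagerly at registration time, since profile names only become known from the scheduler's configuration. If eager creation were ever wanted for a known profile, a hypothetical in-package helper could look roughly like this; the function name is assumed and it is not part of this commit:

// Hypothetical: eagerly create the {result, profile} series for one profile so
// they are exported with an initial value of 0. Must run after Register(),
// since the vectors are no-op metrics until registration.
func initProfileSeries(profile string) {
	for _, result := range []string{"scheduled", "unschedulable", "error"} {
		scheduleAttempts.WithLabelValues(result, profile)
		e2eSchedulingLatency.WithLabelValues(result, profile)
	}
}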

View File

@@ -0,0 +1,48 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// This file contains helpers for metrics that are associated with a profile.
var (
scheduledResult = "scheduled"
unschedulableResult = "unschedulable"
errorResult = "error"
)
// PodScheduled records a successful scheduling attempt and its duration in
// seconds.
func PodScheduled(profile string, duration float64) {
observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
}
// PodUnschedulable records a scheduling attempt for an unschedulable pod and
// its duration in seconds.
func PodUnschedulable(profile string, duration float64) {
observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
}
// PodScheduleError records a scheduling attempt that resulted in an error and
// its duration in seconds.
func PodScheduleError(profile string, duration float64) {
observeScheduleAttemptAndLatency(errorResult, profile, duration)
}
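// observeScheduleAttemptAndLatency records a single scheduling attempt in both
// vectors: it observes the end-to-end latency histogram and increments the
// attempts counter, labeled by result and profile.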
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc()
}
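
The intended call pattern mirrors the scheduler.go changes further below; a sketch, assuming "time" and this metrics package are imported and using a made-up reportAttempt name:

// Sketch of how a scheduling cycle reports its outcome with these helpers.
// start marks the beginning of the attempt; profileName is Profile.Name.
func reportAttempt(profileName string, start time.Time, err error) {
	if err != nil {
		metrics.PodScheduleError(profileName, metrics.SinceInSeconds(start))
		return
	}
	metrics.PodScheduled(profileName, metrics.SinceInSeconds(start))
}

Because observeScheduleAttemptAndLatency observes the latency histogram for every result, the success-only metrics.E2eSchedulingLatency.Observe call in scheduler.go is removed in this commit (see that file's diff below).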

View File

@@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
type Profile struct {
framework.Framework
Recorder events.EventRecorder
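// Name is the scheduler name associated with this profile; it is used as the
// "profile" label value in scheduling metrics.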
Name string
}
// NewProfile builds a Profile for the given configuration.
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
opts ...frameworkruntime.Option) (*Profile, error) {
r := recorderFact(cfg.SchedulerName)
f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...)
recorder := recorderFact(cfg.SchedulerName)
opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
fwk, err := frameworkFact(cfg, opts...)
if err != nil {
return nil, err
}
return &Profile{
Framework: f,
Recorder: r,
Name: cfg.SchedulerName,
Framework: fwk,
Recorder: recorder,
}, nil
}
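
A rough sketch of how the new Name field reaches the metrics; buildProfile and the factory arguments are placeholders, and error handling is minimal:

// Hypothetical wiring: build a profile and thread its Name to the metrics label.
func buildProfile(cfg config.KubeSchedulerProfile, ff profile.FrameworkFactory, rf profile.RecorderFactory) (*profile.Profile, error) {
	prof, err := profile.NewProfile(cfg, ff, rf)
	if err != nil {
		return nil, err
	}
	// prof.Name now carries cfg.SchedulerName, which the scheduler passes as the
	// "profile" label, e.g. metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start)).
	return prof, nil
}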

View File

@@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else if err == core.ErrNoNodesAvailable {
// No nodes available is counted as unschedulable rather than an error.
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else {
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
}
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
return
@@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Run "reserve" plugins.
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
return
}
@@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
return
@@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string
if runPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if !waitOnPermitStatus.IsSuccess() {
var reason string
if waitOnPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() {
var reason string
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
}
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
if err != nil {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
}
metrics.PodScheduleSuccesses.Inc()
metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

View File

@@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
testSchedulerName: &profile.Profile{
Framework: fwk,
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
Name: testSchedulerName,
},
},
}
@@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
prof := &profile.Profile{
Framework: fwk,
Recorder: &events.FakeRecorder{},
Name: testSchedulerName,
}
if broadcaster != nil {
prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)