Add profile label to schedule_attempts_total metric

and e2e_scheduling_duration_seconds

Also add a result label to e2e_scheduling_duration_seconds. Previously, that metric was only updated for successful scheduling attempts.

Signed-off-by: Aldo Culquicondor <acondor@google.com>
This commit is contained in:
Aldo Culquicondor 2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions

View File

@ -7,6 +7,7 @@ go_library(
srcs = [ srcs = [
"metric_recorder.go", "metric_recorder.go",
"metrics.go", "metrics.go",
"profile_metrics.go",
], ],
importpath = "k8s.io/kubernetes/pkg/scheduler/metrics", importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
deps = [ deps = [

View File

@ -54,16 +54,7 @@ var (
Name: "schedule_attempts_total", Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, []string{"result"}) }, []string{"result", "profile"})
// PodScheduleSuccesses counts how many pods were scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
DeprecatedSchedulingDuration = metrics.NewSummaryVec( DeprecatedSchedulingDuration = metrics.NewSummaryVec(
&metrics.SummaryOpts{ &metrics.SummaryOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
@ -77,15 +68,14 @@ var (
}, },
[]string{OperationLabel}, []string{OperationLabel},
) )
E2eSchedulingLatency = metrics.NewHistogram( e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds", Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)", Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15), Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, }, []string{"result", "profile"})
)
SchedulingAlgorithmLatency = metrics.NewHistogram( SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
@ -235,7 +225,7 @@ var (
metricsList = []metrics.Registerable{ metricsList = []metrics.Registerable{
scheduleAttempts, scheduleAttempts,
DeprecatedSchedulingDuration, DeprecatedSchedulingDuration,
E2eSchedulingLatency, e2eSchedulingLatency,
SchedulingAlgorithmLatency, SchedulingAlgorithmLatency,
BindingLatency, BindingLatency,
DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration, DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
@ -263,9 +253,6 @@ func Register() {
registerMetrics.Do(func() { registerMetrics.Do(func() {
RegisterMetrics(metricsList...) RegisterMetrics(metricsList...)
volumeschedulingmetrics.RegisterVolumeSchedulingMetrics() volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
}) })
} }

View File

@ -0,0 +1,48 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// This file contains helpers for metrics that are associated to a profile.
// Label values for the "result" dimension of schedule_attempts_total and
// e2e_scheduling_duration_seconds. These never change at runtime, so they
// are declared as constants rather than vars.
const (
	scheduledResult     = "scheduled"     // pod was successfully bound
	unschedulableResult = "unschedulable" // no node could fit the pod
	errorResult         = "error"         // internal scheduler error
)
// PodScheduled records a successful scheduling attempt for the given profile,
// observing the e2e scheduling duration in seconds under the "scheduled" result
// label.
func PodScheduled(profile string, duration float64) {
observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
}
// PodUnschedulable records a scheduling attempt that found the pod
// unschedulable for the given profile, observing the e2e scheduling duration
// in seconds under the "unschedulable" result label.
func PodUnschedulable(profile string, duration float64) {
observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
}
// PodScheduleError records a scheduling attempt that failed with an internal
// error for the given profile, observing the e2e scheduling duration in
// seconds under the "error" result label.
func PodScheduleError(profile string, duration float64) {
observeScheduleAttemptAndLatency(errorResult, profile, duration)
}
// observeScheduleAttemptAndLatency updates both per-attempt metrics for a
// single scheduling attempt: the e2e scheduling latency histogram and the
// schedule attempts counter, each labeled with the attempt result and the
// profile name. Both metrics are declared in metrics.go of this package.
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc()
}

View File

@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
type Profile struct { type Profile struct {
framework.Framework framework.Framework
Recorder events.EventRecorder Recorder events.EventRecorder
Name string
} }
// NewProfile builds a Profile for the given configuration. // NewProfile builds a Profile for the given configuration.
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory, func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
opts ...frameworkruntime.Option) (*Profile, error) { opts ...frameworkruntime.Option) (*Profile, error) {
r := recorderFact(cfg.SchedulerName) recorder := recorderFact(cfg.SchedulerName)
f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...) opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
fwk, err := frameworkFact(cfg, opts...)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &Profile{ return &Profile{
Framework: f, Name: cfg.SchedulerName,
Recorder: r, Framework: fwk,
Recorder: recorder,
}, nil }, nil
} }

View File

@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Pod did not fit anywhere, so it is counted as a failure. If preemption // Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to // succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully) // schedule it. (hopefully)
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else if err == core.ErrNoNodesAvailable { } else if err == core.ErrNoNodesAvailable {
// No nodes available is counted as unschedulable rather than an error. // No nodes available is counted as unschedulable rather than an error.
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else { } else {
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod)) klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
} }
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode) sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
return return
@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Run "reserve" plugins. // Run "reserve" plugins.
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() { if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
return return
} }
@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// to a node and if so will not add it back to the unscheduled pods queue // to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop). // (otherwise this would cause an infinite loop).
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod // trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
return return
@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() { if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string var reason string
if runPermitStatus.IsUnschedulable() { if runPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable reason = v1.PodReasonUnschedulable
} else { } else {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
} }
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if !waitOnPermitStatus.IsSuccess() { if !waitOnPermitStatus.IsSuccess() {
var reason string var reason string
if waitOnPermitStatus.IsUnschedulable() { if waitOnPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable reason = v1.PodReasonUnschedulable
} else { } else {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
} }
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() { if !preBindStatus.IsSuccess() {
var reason string var reason string
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr) klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
} }
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state) err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
if err != nil { if err != nil {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod // trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes) klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
} }
metrics.PodScheduleSuccesses.Inc() metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts)) metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp)) metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

View File

@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
testSchedulerName: &profile.Profile{ testSchedulerName: &profile.Profile{
Framework: fwk, Framework: fwk,
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName), Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
Name: testSchedulerName,
}, },
}, },
} }
@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
prof := &profile.Profile{ prof := &profile.Profile{
Framework: fwk, Framework: fwk,
Recorder: &events.FakeRecorder{}, Recorder: &events.FakeRecorder{},
Name: testSchedulerName,
} }
if broadcaster != nil { if broadcaster != nil {
prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName) prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)