Add profile label to schedule_attempts_total metric

and e2e_scheduling_duration_seconds Also adding result label to e2e_scheduling_duration_seconds. Previously, the metric was only updated for successful attempts Signed-off-by: Aldo Culquicondor <acondor@google.com>
2025-09-26 12:46:06 +00:00 · 2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 			// Pod did not fit anywhere, so it is counted as a failure. If preemption
 			// succeeds, the pod should get counted as a success the next time we try to
 			// schedule it. (hopefully)
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 		} else if err == core.ErrNoNodesAvailable {
 			// No nodes available is counted as unschedulable rather than an error.
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 		} else {
 			klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		}
 		sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
 		return
@@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 	// Run "reserve" plugins.
 	if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
 		sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
-		metrics.PodScheduleErrors.Inc()
+		metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		return
 	}

@@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		// to a node and if so will not add it back to the unscheduled pods queue
 		// (otherwise this would cause an infinite loop).
 		sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
-		metrics.PodScheduleErrors.Inc()
+		metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		// trigger un-reserve plugins to clean up state associated with the reserved Pod
 		prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 		return
@@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 	if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
 		var reason string
 		if runPermitStatus.IsUnschedulable() {
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 			reason = v1.PodReasonUnschedulable
 		} else {
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			reason = SchedulerError
 		}
 		if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		if !waitOnPermitStatus.IsSuccess() {
 			var reason string
 			if waitOnPermitStatus.IsUnschedulable() {
-				metrics.PodScheduleFailures.Inc()
+				metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 				reason = v1.PodReasonUnschedulable
 			} else {
-				metrics.PodScheduleErrors.Inc()
+				metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 				reason = SchedulerError
 			}
 			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 		if !preBindStatus.IsSuccess() {
 			var reason string
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			reason = SchedulerError
 			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
 				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		}

 		err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
-		metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
 		if err != nil {
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			// trigger un-reserve plugins to clean up state associated with the reserved Pod
 			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 			sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 				klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
 			}

-			metrics.PodScheduleSuccesses.Inc()
+			metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
 			metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
 			metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))