diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index 2416e830578..e3623e1da09 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -154,7 +154,7 @@ var (
 			StabilityLevel: metrics.ALPHA,
 		}, []string{"work"})
 
-	PodSchedulingDuration = metrics.NewHistogram(
+	PodSchedulingDuration = metrics.NewHistogramVec(
 		&metrics.HistogramOpts{
 			Subsystem:      SchedulerSubsystem,
 			Name:           "pod_scheduling_duration_seconds",
@@ -162,7 +162,8 @@ var (
 			// Start with 1ms with the last bucket being [~16s, Inf)
 			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
 			StabilityLevel: metrics.ALPHA,
-		})
+		},
+		[]string{"attempts"})
 
 	PodSchedulingAttempts = metrics.NewHistogram(
 		&metrics.HistogramOpts{
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 7b0a2a452e1..5d8557f4571 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -612,10 +612,9 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 			if klog.V(2).Enabled() {
 				klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
 			}
-
 			metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
 			metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
-			metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
+			metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
 
 			// Run "postbind" plugins.
 			prof.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
@@ -623,6 +622,22 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 	}()
 }
 
+// getAttemptsLabel returns the "attempts" label value under which a pod's
+// scheduling duration is recorded: the decimal form of p.Attempts, or "15+"
+// once the pod has reached 15 or more attempts.
+func getAttemptsLabel(p *framework.QueuedPodInfo) string {
+	// We break down the pod scheduling duration by attempts capped to a limit
+	// to avoid ending up with a high cardinality metric.
+	if p.Attempts >= 15 {
+		return "15+"
+	}
+	// Use strconv.Itoa rather than string(p.Attempts): string() on an integer
+	// yields the UTF-8 encoding of that code point (1 becomes "\x01"), not "1".
+	// NOTE(review): assumes Attempts is a plain int and that "strconv" is
+	// imported in scheduler.go; neither is visible in this patch — confirm.
+	return strconv.Itoa(p.Attempts)
+}
+
 func (sched *Scheduler) profileForPod(pod *v1.Pod) (*profile.Profile, error) {
 	prof, ok := sched.Profiles[pod.Spec.SchedulerName]
 	if !ok {