Break down PodSchedulingDuration by number of attempts

This commit is contained in:
Abdullah Gharaibeh 2020-06-30 10:53:02 -04:00
parent 908847c01e
commit d1ea49bcd9
2 changed files with 13 additions and 4 deletions

View File

@ -154,7 +154,7 @@ var (
StabilityLevel: metrics.ALPHA,
}, []string{"work"})
PodSchedulingDuration = metrics.NewHistogram(
PodSchedulingDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_duration_seconds",
@ -162,7 +162,8 @@ var (
// Start with 1ms with the last bucket being [~16s, Inf)
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
})
},
[]string{"attempts"})
PodSchedulingAttempts = metrics.NewHistogram(
&metrics.HistogramOpts{

View File

@ -612,10 +612,9 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if klog.V(2).Enabled() {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
}
metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
// Run "postbind" plugins.
prof.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
@ -623,6 +622,15 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
}()
}
// getAttemptsLabel returns the value used for the "attempts" label when
// observing the pod scheduling duration for p.
//
// The duration is broken down by the pod's attempt count, capped at a limit
// so the metric does not end up with high cardinality: counts of 15 or more
// all map to "15+", and smaller counts map to their decimal representation.
func getAttemptsLabel(p *framework.QueuedPodInfo) string {
	if p.Attempts >= 15 {
		return "15+"
	}
	// strconv.Itoa yields the decimal text ("1", "2", ...). A plain
	// string(int) conversion would instead produce the UTF-8 encoding of the
	// Unicode code point with that value (e.g. "\x01"), which is not a usable
	// label value.
	// NOTE(review): this needs "strconv" in the file's import block.
	return strconv.Itoa(p.Attempts)
}
func (sched *Scheduler) profileForPod(pod *v1.Pod) (*profile.Profile, error) {
prof, ok := sched.Profiles[pod.Spec.SchedulerName]
if !ok {