diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index 0250ddb76d1..2fbba9980c5 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -27,16 +27,15 @@ const (
 	// SchedulerSubsystem - subsystem name used by scheduler
 	SchedulerSubsystem = "scheduler"
 	// SchedulingLatencyName - scheduler latency metric name
-	SchedulingLatencyName = "scheduling_latencies_summary"
+	SchedulingLatencyName = "scheduling_latency_seconds"
 
 	// OperationLabel - operation label name
 	OperationLabel = "operation"
 	// Binding - binding operation label value
 	Binding = "binding"
-	// SchedulingAlgorithm - scheduling algorithm operation label value
-	SchedulingAlgorithm = "scheduling_algorithm"
+	// SelectingNode - selecting node operation label value
+	SelectingNode = "selecting_node"
 	// E2eScheduling - e2e scheduling operation label value
-	E2eScheduling = "e2e_scheduling"
 )
 
 // All the histogram based metrics have 1ms as size for the smallest bucket.
@@ -45,13 +44,29 @@ var (
 		prometheus.SummaryOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      SchedulingLatencyName,
-			Help:      "Scheduling latency in microseconds split by sub-parts of the scheduling operation",
+			Help:      "Scheduling latency in seconds split by sub-parts of the scheduling operation",
 			// Make the sliding window of 5h.
 			// TODO: The value for this should be based on some SLI definition (long term).
 			MaxAge: 5 * time.Hour,
 		},
 		[]string{OperationLabel},
 	)
+	E2eSchedulingLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "e2e_scheduling_latency_microseconds",
+			Help:      "E2e scheduling latency (scheduling algorithm + binding)",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
+	SchedulingAlgorithmLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "scheduling_algorithm_latency_microseconds",
+			Help:      "Scheduling algorithm latency",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
 	SchedulingAlgorithmPredicateEvaluationDuration = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
@@ -76,6 +91,14 @@ var (
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
 		},
 	)
+	BindingLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "binding_latency_microseconds",
+			Help:      "Binding latency",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
 	PreemptionVictims = prometheus.NewGauge(
 		prometheus.GaugeOpts{
 			Subsystem: SchedulerSubsystem,
@@ -90,6 +113,9 @@ var (
 		})
 	metricsList = []prometheus.Collector{
 		SchedulingLatency,
+		E2eSchedulingLatency,
+		SchedulingAlgorithmLatency,
+		BindingLatency,
 		SchedulingAlgorithmPredicateEvaluationDuration,
 		SchedulingAlgorithmPriorityEvaluationDuration,
 		SchedulingAlgorithmPremptionEvaluationDuration,
@@ -102,6 +128,7 @@ var registerMetrics sync.Once
 
 // Register all metrics.
 func Register() {
+	// Register the metrics.
 	registerMetrics.Do(func() {
 		for _, metric := range metricsList {
 			prometheus.MustRegister(metric)
@@ -118,3 +145,8 @@ func Reset() {
 func SinceInMicroseconds(start time.Time) float64 {
 	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
 }
+
+// SinceInSeconds gets the time since the specified start in seconds.
+func SinceInSeconds(start time.Time) float64 {
+	return time.Since(start).Seconds()
+}
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index b4667d4ecb4..bc646548b4d 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -429,7 +429,8 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 		return err
 	}
 
-	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInMicroseconds(bindingStart))
+	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
+	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart))
 	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name)
 	return nil
 }
@@ -461,7 +462,8 @@ func (sched *Scheduler) scheduleOne() {
 		}
 		return
 	}
-	metrics.SchedulingLatency.WithLabelValues(metrics.SchedulingAlgorithm).Observe(metrics.SinceInMicroseconds(start))
+	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
+	metrics.SchedulingLatency.WithLabelValues(metrics.SelectingNode).Observe(metrics.SinceInSeconds(start))
 	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
 	// This allows us to keep scheduling without waiting on binding to occur.
 	assumedPod := pod.DeepCopy()
@@ -496,7 +498,7 @@ func (sched *Scheduler) scheduleOne() {
 				Name: suggestedHost,
 			},
 		})
-		metrics.SchedulingLatency.WithLabelValues(metrics.E2eScheduling).Observe(metrics.SinceInMicroseconds(start))
+		metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
 		if err != nil {
 			glog.Errorf("Internal error binding pod: (%v)", err)
 		}
diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go
index 9b7b5b2aa8e..2562dc47209 100644
--- a/test/e2e/framework/metrics_util.go
+++ b/test/e2e/framework/metrics_util.go
@@ -210,13 +210,12 @@ func (l *PodStartupLatency) PrintJSON() string {
 }
 
 type SchedulingMetrics struct {
-	SchedulingLatency LatencyMetric `json:"schedulingLatency"`
-	BindingLatency    LatencyMetric `json:"bindingLatency"`
-	E2ELatency        LatencyMetric `json:"e2eLatency"`
-	ThroughputAverage float64       `json:"throughputAverage"`
-	ThroughputPerc50  float64       `json:"throughputPerc50"`
-	ThroughputPerc90  float64       `json:"throughputPerc90"`
-	ThroughputPerc99  float64       `json:"throughputPerc99"`
+	SelectingNodeLatency LatencyMetric `json:"selectingNodeLatency"`
+	BindingLatency       LatencyMetric `json:"bindingLatency"`
+	ThroughputAverage    float64       `json:"throughputAverage"`
+	ThroughputPerc50     float64       `json:"throughputPerc50"`
+	ThroughputPerc90     float64       `json:"throughputPerc90"`
+	ThroughputPerc99     float64       `json:"throughputPerc99"`
 }
 
 func (l *SchedulingMetrics) SummaryKind() string {
@@ -512,23 +511,20 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
 
 		var metric *LatencyMetric = nil
 		switch sample.Metric[schedulermetric.OperationLabel] {
-		case schedulermetric.SchedulingAlgorithm:
-			metric = &result.SchedulingLatency
+		case schedulermetric.SelectingNode:
+			metric = &result.SelectingNodeLatency
 		case schedulermetric.Binding:
 			metric = &result.BindingLatency
-		case schedulermetric.E2eScheduling:
-			metric = &result.E2ELatency
 		}
 		if metric == nil {
 			continue
 		}
 
-		latency := sample.Value
 		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
 		if err != nil {
 			return nil, err
 		}
-		setQuantile(metric, quantile, time.Duration(int64(latency)))
+		setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
 	}
 	return &result, nil
 }