From 8995c1e030643f26cad45fa33004d86726a2911d Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Wed, 13 Nov 2019 16:35:59 -0500 Subject: [PATCH] Update bucket for scheduler framework latency histograms. --- pkg/scheduler/metrics/metrics.go | 34 +++++++++++-------- pkg/scheduler/scheduler.go | 4 +-- .../scheduler_perf/scheduler_bench_test.go | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 8f3d88e0179..3e5887aceb6 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -159,7 +159,7 @@ var ( StabilityLevel: metrics.ALPHA, }, ) - SchedulingAlgorithmPremptionEvaluationDuration = metrics.NewHistogram( + SchedulingAlgorithmPreemptionEvaluationDuration = metrics.NewHistogram( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, Name: "scheduling_algorithm_preemption_evaluation_seconds", @@ -168,7 +168,7 @@ var ( StabilityLevel: metrics.ALPHA, }, ) - DeprecatedSchedulingAlgorithmPremptionEvaluationDuration = metrics.NewHistogram( + DeprecatedSchedulingAlgorithmPreemptionEvaluationDuration = metrics.NewHistogram( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, Name: "scheduling_algorithm_preemption_evaluation", @@ -228,9 +228,10 @@ var ( PodSchedulingDuration = metrics.NewHistogram( &metrics.HistogramOpts{ - Subsystem: SchedulerSubsystem, - Name: "pod_scheduling_duration_seconds", - Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.", + Subsystem: SchedulerSubsystem, + Name: "pod_scheduling_duration_seconds", + Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.", + // Start with 1ms with the last bucket being [~16s, Inf) Buckets: metrics.ExponentialBuckets(0.001, 2, 15), StabilityLevel: metrics.ALPHA, }) @@ -246,20 +247,23 @@ var ( FrameworkExtensionPointDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ - Subsystem: SchedulerSubsystem, - Name: "framework_extension_point_duration_seconds", - Help: "Latency for running all plugins of a specific extension point.", - Buckets: nil, + Subsystem: SchedulerSubsystem, + Name: "framework_extension_point_duration_seconds", + Help: "Latency for running all plugins of a specific extension point.", + // Start with 0.1ms with the last bucket being [~200ms, Inf) + Buckets: metrics.ExponentialBuckets(0.0001, 2, 12), StabilityLevel: metrics.ALPHA, }, []string{"extension_point", "status"}) PluginExecutionDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ - Subsystem: SchedulerSubsystem, - Name: "plugin_execution_duration_seconds", - Help: "Duration for running a plugin at a specific extension point.", - Buckets: nil, + Subsystem: SchedulerSubsystem, + Name: "plugin_execution_duration_seconds", + Help: "Duration for running a plugin at a specific extension point.", + // Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5) + // so that we have better granularity since plugin latency is very sensitive. + Buckets: metrics.ExponentialBuckets(0.00001, 1.5, 20), StabilityLevel: metrics.ALPHA, }, []string{"plugin", "extension_point", "status"}) @@ -304,8 +308,8 @@ var ( DeprecatedSchedulingAlgorithmPredicateEvaluationDuration, SchedulingAlgorithmPriorityEvaluationDuration, DeprecatedSchedulingAlgorithmPriorityEvaluationDuration, - SchedulingAlgorithmPremptionEvaluationDuration, - DeprecatedSchedulingAlgorithmPremptionEvaluationDuration, + SchedulingAlgorithmPreemptionEvaluationDuration, + DeprecatedSchedulingAlgorithmPreemptionEvaluationDuration, PreemptionVictims, PreemptionAttempts, pendingPods, diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index f80d5deb134..e20b2d8b4c8 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -614,8 +614,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { preemptionStartTime := time.Now() sched.preempt(schedulingCycleCtx, state, fwk, pod, fitError) metrics.PreemptionAttempts.Inc() - metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInSeconds(preemptionStartTime)) - metrics.DeprecatedSchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime)) + metrics.SchedulingAlgorithmPreemptionEvaluationDuration.Observe(metrics.SinceInSeconds(preemptionStartTime)) + metrics.DeprecatedSchedulingAlgorithmPreemptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime)) metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime)) metrics.DeprecatedSchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime)) } diff --git a/test/integration/scheduler_perf/scheduler_bench_test.go b/test/integration/scheduler_perf/scheduler_bench_test.go index ab473a2c5bb..703c48a3904 100644 --- a/test/integration/scheduler_perf/scheduler_bench_test.go +++ b/test/integration/scheduler_perf/scheduler_bench_test.go @@ -48,7 +48,7 @@ var ( // BenchmarkScheduling benchmarks the scheduling rate when the cluster has // various quantities of nodes and scheduled pods. -func BenchmarkSchedulingV(b *testing.B) { +func BenchmarkScheduling(b *testing.B) { tests := []struct{ nodes, existingPods, minPods int }{ {nodes: 100, existingPods: 0, minPods: 100}, {nodes: 100, existingPods: 1000, minPods: 100},