Merge pull request #105941 from rezakrimi/issue/105861

Make some scheduler metrics stable
2025-09-14 13:45:06 +00:00 · 2021-11-04 10:06:03 -07:00
parent 4c659c5342 bb15f02039
commit c2706035f2
4 changed files with 86 additions and 12 deletions
--- a/pkg/scheduler/internal/queue/scheduling_queue_test.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go
@@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) {
 			},
 			metricsName: "scheduler_pending_pods",
 			wants: `
-# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
+# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
 # TYPE scheduler_pending_pods gauge
 scheduler_pending_pods{queue="active"} 30
 scheduler_pending_pods{queue="backoff"} 0
@@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20
 			},
 			metricsName: "scheduler_pending_pods",
 			wants: `
-# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
+# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
 # TYPE scheduler_pending_pods gauge
 scheduler_pending_pods{queue="active"} 15
 scheduler_pending_pods{queue="backoff"} 25
@@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10
 			},
 			metricsName: "scheduler_pending_pods",
 			wants: `
-# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
+# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
 # TYPE scheduler_pending_pods gauge
 scheduler_pending_pods{queue="active"} 50
 scheduler_pending_pods{queue="backoff"} 0
@@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
 			},
 			metricsName: "scheduler_pending_pods",
 			wants: `
-# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
+# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
 # TYPE scheduler_pending_pods gauge
 scheduler_pending_pods{queue="active"} 30
 scheduler_pending_pods{queue="backoff"} 20
@@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
 			},
 			metricsName: "scheduler_pending_pods",
 			wants: `
-# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
+# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
 # TYPE scheduler_pending_pods gauge
 scheduler_pending_pods{queue="active"} 50
 scheduler_pending_pods{queue="backoff"} 0
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -44,16 +44,25 @@ var (
 			Subsystem:      SchedulerSubsystem,
 			Name:           "schedule_attempts_total",
 			Help:           "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.STABLE,
 		}, []string{"result", "profile"})

 	e2eSchedulingLatency = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem:         SchedulerSubsystem,
+			Name:              "e2e_scheduling_duration_seconds",
+			DeprecatedVersion: "1.23.0",
+			Help:              "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.",
+			Buckets:           metrics.ExponentialBuckets(0.001, 2, 15),
+			StabilityLevel:    metrics.ALPHA,
+		}, []string{"result", "profile"})
+	schedulingLatency = metrics.NewHistogramVec(
 		&metrics.HistogramOpts{
 			Subsystem:      SchedulerSubsystem,
-			Name:           "e2e_scheduling_duration_seconds",
-			Help:           "E2e scheduling latency in seconds (scheduling algorithm + binding)",
+			Name:           "scheduling_attempt_duration_seconds",
+			Help:           "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
 			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.STABLE,
 		}, []string{"result", "profile"})
 	SchedulingAlgorithmLatency = metrics.NewHistogram(
 		&metrics.HistogramOpts{
@@ -71,21 +80,21 @@ var (
 			Help:      "Number of selected preemption victims",
 			// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
 			Buckets:        metrics.LinearBuckets(5, 5, 10),
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.STABLE,
 		})
 	PreemptionAttempts = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem:      SchedulerSubsystem,
 			Name:           "preemption_attempts_total",
 			Help:           "Total preemption attempts in the cluster till now",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.STABLE,
 		})
 	pendingPods = metrics.NewGaugeVec(
 		&metrics.GaugeOpts{
 			Subsystem:      SchedulerSubsystem,
 			Name:           "pending_pods",
 			Help:           "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.STABLE,
 		}, []string{"queue"})
 	SchedulerGoroutines = metrics.NewGaugeVec(
 		&metrics.GaugeOpts{
@@ -167,6 +176,7 @@ var (
 	metricsList = []metrics.Registerable{
 		scheduleAttempts,
 		e2eSchedulingLatency,
+		schedulingLatency,
 		SchedulingAlgorithmLatency,
 		PreemptionVictims,
 		PreemptionAttempts,
--- a/pkg/scheduler/metrics/profile_metrics.go
+++ b/pkg/scheduler/metrics/profile_metrics.go
@@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) {

 func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
 	e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
+	schedulingLatency.WithLabelValues(result, profile).Observe(duration)
 	scheduleAttempts.WithLabelValues(result, profile).Inc()
 }
--- a/test/instrumentation/testdata/stable-metrics-list.yaml
+++ b/test/instrumentation/testdata/stable-metrics-list.yaml
@@ -1,3 +1,66 @@
+- name: pending_pods
+  subsystem: scheduler
+  help: Number of pending pods, by the queue type. 'active' means number of pods in
+    activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
+    of pods in unschedulableQ.
+  type: Gauge
+  stabilityLevel: STABLE
+  labels:
+  - queue
+- name: preemption_attempts_total
+  subsystem: scheduler
+  help: Total preemption attempts in the cluster till now
+  type: Counter
+  stabilityLevel: STABLE
+- name: preemption_victims
+  subsystem: scheduler
+  help: Number of selected preemption victims
+  type: Histogram
+  stabilityLevel: STABLE
+  buckets:
+  - 5
+  - 10
+  - 15
+  - 20
+  - 25
+  - 30
+  - 35
+  - 40
+  - 45
+  - 50
+- name: schedule_attempts_total
+  subsystem: scheduler
+  help: Number of attempts to schedule pods, by the result. 'unschedulable' means
+    a pod could not be scheduled, while 'error' means an internal scheduler problem.
+  type: Counter
+  stabilityLevel: STABLE
+  labels:
+  - profile
+  - result
+- name: scheduling_attempt_duration_seconds
+  subsystem: scheduler
+  help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
+  type: Histogram
+  stabilityLevel: STABLE
+  labels:
+  - profile
+  - result
+  buckets:
+  - 0.001
+  - 0.002
+  - 0.004
+  - 0.008
+  - 0.016
+  - 0.032
+  - 0.064
+  - 0.128
+  - 0.256
+  - 0.512
+  - 1.024
+  - 2.048
+  - 4.096
+  - 8.192
+  - 16.384
 - name: apiserver_request_duration_seconds
  help: Response latency distribution in seconds for each verb, dry run value, group,
    version, resource, subresource, scope and component.