diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go index ae89ad2e4b5..e36d45abd1d 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue_test.go +++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go @@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) { }, metricsName: "scheduler_pending_pods", wants: ` -# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. +# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # TYPE scheduler_pending_pods gauge scheduler_pending_pods{queue="active"} 30 scheduler_pending_pods{queue="backoff"} 0 @@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20 }, metricsName: "scheduler_pending_pods", wants: ` -# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. +# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # TYPE scheduler_pending_pods gauge scheduler_pending_pods{queue="active"} 15 scheduler_pending_pods{queue="backoff"} 25 @@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10 }, metricsName: "scheduler_pending_pods", wants: ` -# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. +# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # TYPE scheduler_pending_pods gauge scheduler_pending_pods{queue="active"} 50 scheduler_pending_pods{queue="backoff"} 0 @@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0 }, metricsName: "scheduler_pending_pods", wants: ` -# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. +# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # TYPE scheduler_pending_pods gauge scheduler_pending_pods{queue="active"} 30 scheduler_pending_pods{queue="backoff"} 20 @@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0 }, metricsName: "scheduler_pending_pods", wants: ` -# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. +# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # TYPE scheduler_pending_pods gauge scheduler_pending_pods{queue="active"} 50 scheduler_pending_pods{queue="backoff"} 0 diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 8d5fa803cd7..bd8a30f08c3 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -44,16 +44,25 @@ var ( Subsystem: SchedulerSubsystem, Name: "schedule_attempts_total", Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.STABLE, }, []string{"result", "profile"}) e2eSchedulingLatency = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Subsystem: SchedulerSubsystem, + Name: "e2e_scheduling_duration_seconds", + DeprecatedVersion: "1.23.0", + Help: "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.", + Buckets: metrics.ExponentialBuckets(0.001, 2, 15), + StabilityLevel: metrics.ALPHA, + }, []string{"result", "profile"}) + schedulingLatency = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, - Name: "e2e_scheduling_duration_seconds", - Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)", + Name: "scheduling_attempt_duration_seconds", + Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)", Buckets: metrics.ExponentialBuckets(0.001, 2, 15), - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.STABLE, }, []string{"result", "profile"}) SchedulingAlgorithmLatency = metrics.NewHistogram( &metrics.HistogramOpts{ @@ -71,21 +80,21 @@ var ( Help: "Number of selected preemption victims", // we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket. Buckets: metrics.LinearBuckets(5, 5, 10), - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.STABLE, }) PreemptionAttempts = metrics.NewCounter( &metrics.CounterOpts{ Subsystem: SchedulerSubsystem, Name: "preemption_attempts_total", Help: "Total preemption attempts in the cluster till now", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.STABLE, }) pendingPods = metrics.NewGaugeVec( &metrics.GaugeOpts{ Subsystem: SchedulerSubsystem, Name: "pending_pods", Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.STABLE, }, []string{"queue"}) SchedulerGoroutines = metrics.NewGaugeVec( &metrics.GaugeOpts{ @@ -167,6 +176,7 @@ var ( metricsList = []metrics.Registerable{ scheduleAttempts, e2eSchedulingLatency, + schedulingLatency, SchedulingAlgorithmLatency, PreemptionVictims, PreemptionAttempts, diff --git a/pkg/scheduler/metrics/profile_metrics.go b/pkg/scheduler/metrics/profile_metrics.go index c570e28a9bd..b844ef68746 100644 --- a/pkg/scheduler/metrics/profile_metrics.go +++ b/pkg/scheduler/metrics/profile_metrics.go @@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) { func observeScheduleAttemptAndLatency(result, profile string, duration float64) { e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration) + schedulingLatency.WithLabelValues(result, profile).Observe(duration) scheduleAttempts.WithLabelValues(result, profile).Inc() } diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml index 7b4f87231a0..4f48d397cb3 100644 --- a/test/instrumentation/testdata/stable-metrics-list.yaml +++ b/test/instrumentation/testdata/stable-metrics-list.yaml @@ -1,3 +1,66 @@ +- name: pending_pods + subsystem: scheduler + help: Number of pending pods, by the queue type. 'active' means number of pods in + activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number + of pods in unschedulableQ. + type: Gauge + stabilityLevel: STABLE + labels: + - queue +- name: preemption_attempts_total + subsystem: scheduler + help: Total preemption attempts in the cluster till now + type: Counter + stabilityLevel: STABLE +- name: preemption_victims + subsystem: scheduler + help: Number of selected preemption victims + type: Histogram + stabilityLevel: STABLE + buckets: + - 5 + - 10 + - 15 + - 20 + - 25 + - 30 + - 35 + - 40 + - 45 + - 50 +- name: schedule_attempts_total + subsystem: scheduler + help: Number of attempts to schedule pods, by the result. 'unschedulable' means + a pod could not be scheduled, while 'error' means an internal scheduler problem. + type: Counter + stabilityLevel: STABLE + labels: + - profile + - result +- name: scheduling_attempt_duration_seconds + subsystem: scheduler + help: Scheduling attempt latency in seconds (scheduling algorithm + binding) + type: Histogram + stabilityLevel: STABLE + labels: + - profile + - result + buckets: + - 0.001 + - 0.002 + - 0.004 + - 0.008 + - 0.016 + - 0.032 + - 0.064 + - 0.128 + - 0.256 + - 0.512 + - 1.024 + - 2.048 + - 4.096 + - 8.192 + - 16.384 - name: apiserver_request_duration_seconds help: Response latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.