Merge pull request #105941 from rezakrimi/issue/105861

Make some scheduler metrics stable
This commit is contained in:
Kubernetes Prow Robot 2021-11-04 10:06:03 -07:00 committed by GitHub
commit c2706035f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 86 additions and 12 deletions

View File

@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) {
}, },
metricsName: "scheduler_pending_pods", metricsName: "scheduler_pending_pods",
wants: ` wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge # TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30 scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 0 scheduler_pending_pods{queue="backoff"} 0
@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20
}, },
metricsName: "scheduler_pending_pods", metricsName: "scheduler_pending_pods",
wants: ` wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge # TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 15 scheduler_pending_pods{queue="active"} 15
scheduler_pending_pods{queue="backoff"} 25 scheduler_pending_pods{queue="backoff"} 25
@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10
}, },
metricsName: "scheduler_pending_pods", metricsName: "scheduler_pending_pods",
wants: ` wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge # TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50 scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0 scheduler_pending_pods{queue="backoff"} 0
@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
}, },
metricsName: "scheduler_pending_pods", metricsName: "scheduler_pending_pods",
wants: ` wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge # TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30 scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 20 scheduler_pending_pods{queue="backoff"} 20
@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
}, },
metricsName: "scheduler_pending_pods", metricsName: "scheduler_pending_pods",
wants: ` wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ. # HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge # TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50 scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0 scheduler_pending_pods{queue="backoff"} 0

View File

@ -44,16 +44,25 @@ var (
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total", Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"}) }, []string{"result", "profile"})
e2eSchedulingLatency = metrics.NewHistogramVec( e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
DeprecatedVersion: "1.23.0",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
}, []string{"result", "profile"})
schedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds", Name: "scheduling_attempt_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)", Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15), Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"}) }, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram( SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
@ -71,21 +80,21 @@ var (
Help: "Number of selected preemption victims", Help: "Number of selected preemption victims",
// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket. // we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
Buckets: metrics.LinearBuckets(5, 5, 10), Buckets: metrics.LinearBuckets(5, 5, 10),
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.STABLE,
}) })
PreemptionAttempts = metrics.NewCounter( PreemptionAttempts = metrics.NewCounter(
&metrics.CounterOpts{ &metrics.CounterOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "preemption_attempts_total", Name: "preemption_attempts_total",
Help: "Total preemption attempts in the cluster till now", Help: "Total preemption attempts in the cluster till now",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.STABLE,
}) })
pendingPods = metrics.NewGaugeVec( pendingPods = metrics.NewGaugeVec(
&metrics.GaugeOpts{ &metrics.GaugeOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "pending_pods", Name: "pending_pods",
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.", Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.STABLE,
}, []string{"queue"}) }, []string{"queue"})
SchedulerGoroutines = metrics.NewGaugeVec( SchedulerGoroutines = metrics.NewGaugeVec(
&metrics.GaugeOpts{ &metrics.GaugeOpts{
@ -167,6 +176,7 @@ var (
metricsList = []metrics.Registerable{ metricsList = []metrics.Registerable{
scheduleAttempts, scheduleAttempts,
e2eSchedulingLatency, e2eSchedulingLatency,
schedulingLatency,
SchedulingAlgorithmLatency, SchedulingAlgorithmLatency,
PreemptionVictims, PreemptionVictims,
PreemptionAttempts, PreemptionAttempts,

View File

@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) {
func observeScheduleAttemptAndLatency(result, profile string, duration float64) { func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration) e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
schedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc() scheduleAttempts.WithLabelValues(result, profile).Inc()
} }

View File

@ -1,3 +1,66 @@
- name: pending_pods
  subsystem: scheduler
  help: Number of pending pods, by the queue type. 'active' means number of pods in
    activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
    of pods in unschedulableQ.
  type: Gauge
  stabilityLevel: STABLE
  labels:
  - queue
- name: preemption_attempts_total
  subsystem: scheduler
  help: Total preemption attempts in the cluster till now
  type: Counter
  stabilityLevel: STABLE
- name: preemption_victims
  subsystem: scheduler
  help: Number of selected preemption victims
  type: Histogram
  stabilityLevel: STABLE
  buckets:
  - 5
  - 10
  - 15
  - 20
  - 25
  - 30
  - 35
  - 40
  - 45
  - 50
- name: schedule_attempts_total
  subsystem: scheduler
  help: Number of attempts to schedule pods, by the result. 'unschedulable' means
    a pod could not be scheduled, while 'error' means an internal scheduler problem.
  type: Counter
  stabilityLevel: STABLE
  labels:
  - profile
  - result
- name: scheduling_attempt_duration_seconds
  subsystem: scheduler
  help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
  type: Histogram
  stabilityLevel: STABLE
  labels:
  - profile
  - result
  buckets:
  - 0.001
  - 0.002
  - 0.004
  - 0.008
  - 0.016
  - 0.032
  - 0.064
  - 0.128
  - 0.256
  - 0.512
  - 1.024
  - 2.048
  - 4.096
  - 8.192
  - 16.384
- name: apiserver_request_duration_seconds - name: apiserver_request_duration_seconds
help: Response latency distribution in seconds for each verb, dry run value, group, help: Response latency distribution in seconds for each verb, dry run value, group,
version, resource, subresource, scope and component. version, resource, subresource, scope and component.