Merge pull request #105941 from rezakrimi/issue/105861

Make some scheduler metrics stable
This commit is contained in:
Kubernetes Prow Robot 2021-11-04 10:06:03 -07:00 committed by GitHub
commit c2706035f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 86 additions and 12 deletions

View File

@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) {
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 0
@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 15
scheduler_pending_pods{queue="backoff"} 25
@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0
@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 20
@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0

View File

@ -44,16 +44,25 @@ var (
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})
e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
DeprecatedVersion: "1.23.0",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
}, []string{"result", "profile"})
schedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Name: "scheduling_attempt_duration_seconds",
Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
@ -71,21 +80,21 @@ var (
Help: "Number of selected preemption victims",
// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
Buckets: metrics.LinearBuckets(5, 5, 10),
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
})
PreemptionAttempts = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_attempts_total",
Help: "Total preemption attempts in the cluster till now",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
})
pendingPods = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "pending_pods",
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"queue"})
SchedulerGoroutines = metrics.NewGaugeVec(
&metrics.GaugeOpts{
@ -167,6 +176,7 @@ var (
metricsList = []metrics.Registerable{
scheduleAttempts,
e2eSchedulingLatency,
schedulingLatency,
SchedulingAlgorithmLatency,
PreemptionVictims,
PreemptionAttempts,

View File

@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) {
// observeScheduleAttemptAndLatency records one scheduling attempt for the given
// result and profile label values: it observes duration on both latency
// histograms and then increments the schedule-attempts metric.
//
// duration is recorded as-is; the metric names and help text indicate the unit
// is seconds.
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
// e2e_scheduling_duration_seconds is marked deprecated (1.23.0) but is still
// observed here alongside its replacement, so both series receive the same
// samples.
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
// Replacement metric: scheduling_attempt_duration_seconds.
schedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc()
}

View File

@ -1,3 +1,66 @@
- name: pending_pods
subsystem: scheduler
help: Number of pending pods, by the queue type. 'active' means number of pods in
activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
of pods in unschedulableQ.
type: Gauge
stabilityLevel: STABLE
labels:
- queue
- name: preemption_attempts_total
subsystem: scheduler
help: Total preemption attempts in the cluster till now
type: Counter
stabilityLevel: STABLE
- name: preemption_victims
subsystem: scheduler
help: Number of selected preemption victims
type: Histogram
stabilityLevel: STABLE
buckets:
- 5
- 10
- 15
- 20
- 25
- 30
- 35
- 40
- 45
- 50
- name: schedule_attempts_total
subsystem: scheduler
help: Number of attempts to schedule pods, by the result. 'unschedulable' means
a pod could not be scheduled, while 'error' means an internal scheduler problem.
type: Counter
stabilityLevel: STABLE
labels:
- profile
- result
- name: scheduling_attempt_duration_seconds
subsystem: scheduler
help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
type: Histogram
stabilityLevel: STABLE
labels:
- profile
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: apiserver_request_duration_seconds
help: Response latency distribution in seconds for each verb, dry run value, group,
version, resource, subresource, scope and component.