mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-08 11:38:15 +00:00
Merge pull request #105941 from rezakrimi/issue/105861
Make some scheduler metrics stable
This commit is contained in:
commit
c2706035f2
@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) {
|
|||||||
},
|
},
|
||||||
metricsName: "scheduler_pending_pods",
|
metricsName: "scheduler_pending_pods",
|
||||||
wants: `
|
wants: `
|
||||||
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
||||||
# TYPE scheduler_pending_pods gauge
|
# TYPE scheduler_pending_pods gauge
|
||||||
scheduler_pending_pods{queue="active"} 30
|
scheduler_pending_pods{queue="active"} 30
|
||||||
scheduler_pending_pods{queue="backoff"} 0
|
scheduler_pending_pods{queue="backoff"} 0
|
||||||
@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20
|
|||||||
},
|
},
|
||||||
metricsName: "scheduler_pending_pods",
|
metricsName: "scheduler_pending_pods",
|
||||||
wants: `
|
wants: `
|
||||||
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
||||||
# TYPE scheduler_pending_pods gauge
|
# TYPE scheduler_pending_pods gauge
|
||||||
scheduler_pending_pods{queue="active"} 15
|
scheduler_pending_pods{queue="active"} 15
|
||||||
scheduler_pending_pods{queue="backoff"} 25
|
scheduler_pending_pods{queue="backoff"} 25
|
||||||
@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10
|
|||||||
},
|
},
|
||||||
metricsName: "scheduler_pending_pods",
|
metricsName: "scheduler_pending_pods",
|
||||||
wants: `
|
wants: `
|
||||||
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
||||||
# TYPE scheduler_pending_pods gauge
|
# TYPE scheduler_pending_pods gauge
|
||||||
scheduler_pending_pods{queue="active"} 50
|
scheduler_pending_pods{queue="active"} 50
|
||||||
scheduler_pending_pods{queue="backoff"} 0
|
scheduler_pending_pods{queue="backoff"} 0
|
||||||
@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
|
|||||||
},
|
},
|
||||||
metricsName: "scheduler_pending_pods",
|
metricsName: "scheduler_pending_pods",
|
||||||
wants: `
|
wants: `
|
||||||
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
||||||
# TYPE scheduler_pending_pods gauge
|
# TYPE scheduler_pending_pods gauge
|
||||||
scheduler_pending_pods{queue="active"} 30
|
scheduler_pending_pods{queue="active"} 30
|
||||||
scheduler_pending_pods{queue="backoff"} 20
|
scheduler_pending_pods{queue="backoff"} 20
|
||||||
@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
|
|||||||
},
|
},
|
||||||
metricsName: "scheduler_pending_pods",
|
metricsName: "scheduler_pending_pods",
|
||||||
wants: `
|
wants: `
|
||||||
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
|
||||||
# TYPE scheduler_pending_pods gauge
|
# TYPE scheduler_pending_pods gauge
|
||||||
scheduler_pending_pods{queue="active"} 50
|
scheduler_pending_pods{queue="active"} 50
|
||||||
scheduler_pending_pods{queue="backoff"} 0
|
scheduler_pending_pods{queue="backoff"} 0
|
||||||
|
@ -44,16 +44,25 @@ var (
|
|||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "schedule_attempts_total",
|
Name: "schedule_attempts_total",
|
||||||
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
|
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.STABLE,
|
||||||
}, []string{"result", "profile"})
|
}, []string{"result", "profile"})
|
||||||
|
|
||||||
e2eSchedulingLatency = metrics.NewHistogramVec(
|
e2eSchedulingLatency = metrics.NewHistogramVec(
|
||||||
|
&metrics.HistogramOpts{
|
||||||
|
Subsystem: SchedulerSubsystem,
|
||||||
|
Name: "e2e_scheduling_duration_seconds",
|
||||||
|
DeprecatedVersion: "1.23.0",
|
||||||
|
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.",
|
||||||
|
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
}, []string{"result", "profile"})
|
||||||
|
schedulingLatency = metrics.NewHistogramVec(
|
||||||
&metrics.HistogramOpts{
|
&metrics.HistogramOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "e2e_scheduling_duration_seconds",
|
Name: "scheduling_attempt_duration_seconds",
|
||||||
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
|
Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
|
||||||
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
|
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.STABLE,
|
||||||
}, []string{"result", "profile"})
|
}, []string{"result", "profile"})
|
||||||
SchedulingAlgorithmLatency = metrics.NewHistogram(
|
SchedulingAlgorithmLatency = metrics.NewHistogram(
|
||||||
&metrics.HistogramOpts{
|
&metrics.HistogramOpts{
|
||||||
@ -71,21 +80,21 @@ var (
|
|||||||
Help: "Number of selected preemption victims",
|
Help: "Number of selected preemption victims",
|
||||||
// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
|
// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
|
||||||
Buckets: metrics.LinearBuckets(5, 5, 10),
|
Buckets: metrics.LinearBuckets(5, 5, 10),
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.STABLE,
|
||||||
})
|
})
|
||||||
PreemptionAttempts = metrics.NewCounter(
|
PreemptionAttempts = metrics.NewCounter(
|
||||||
&metrics.CounterOpts{
|
&metrics.CounterOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "preemption_attempts_total",
|
Name: "preemption_attempts_total",
|
||||||
Help: "Total preemption attempts in the cluster till now",
|
Help: "Total preemption attempts in the cluster till now",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.STABLE,
|
||||||
})
|
})
|
||||||
pendingPods = metrics.NewGaugeVec(
|
pendingPods = metrics.NewGaugeVec(
|
||||||
&metrics.GaugeOpts{
|
&metrics.GaugeOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "pending_pods",
|
Name: "pending_pods",
|
||||||
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
|
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.STABLE,
|
||||||
}, []string{"queue"})
|
}, []string{"queue"})
|
||||||
SchedulerGoroutines = metrics.NewGaugeVec(
|
SchedulerGoroutines = metrics.NewGaugeVec(
|
||||||
&metrics.GaugeOpts{
|
&metrics.GaugeOpts{
|
||||||
@ -167,6 +176,7 @@ var (
|
|||||||
metricsList = []metrics.Registerable{
|
metricsList = []metrics.Registerable{
|
||||||
scheduleAttempts,
|
scheduleAttempts,
|
||||||
e2eSchedulingLatency,
|
e2eSchedulingLatency,
|
||||||
|
schedulingLatency,
|
||||||
SchedulingAlgorithmLatency,
|
SchedulingAlgorithmLatency,
|
||||||
PreemptionVictims,
|
PreemptionVictims,
|
||||||
PreemptionAttempts,
|
PreemptionAttempts,
|
||||||
|
@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) {
|
|||||||
|
|
||||||
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
|
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
|
||||||
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
|
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
|
||||||
|
schedulingLatency.WithLabelValues(result, profile).Observe(duration)
|
||||||
scheduleAttempts.WithLabelValues(result, profile).Inc()
|
scheduleAttempts.WithLabelValues(result, profile).Inc()
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,66 @@
|
|||||||
|
- name: pending_pods
|
||||||
|
subsystem: scheduler
|
||||||
|
help: Number of pending pods, by the queue type. 'active' means number of pods in
|
||||||
|
activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
|
||||||
|
of pods in unschedulableQ.
|
||||||
|
type: Gauge
|
||||||
|
stabilityLevel: STABLE
|
||||||
|
labels:
|
||||||
|
- queue
|
||||||
|
- name: preemption_attempts_total
|
||||||
|
subsystem: scheduler
|
||||||
|
help: Total preemption attempts in the cluster till now
|
||||||
|
type: Counter
|
||||||
|
stabilityLevel: STABLE
|
||||||
|
- name: preemption_victims
|
||||||
|
subsystem: scheduler
|
||||||
|
help: Number of selected preemption victims
|
||||||
|
type: Histogram
|
||||||
|
stabilityLevel: STABLE
|
||||||
|
buckets:
|
||||||
|
- 5
|
||||||
|
- 10
|
||||||
|
- 15
|
||||||
|
- 20
|
||||||
|
- 25
|
||||||
|
- 30
|
||||||
|
- 35
|
||||||
|
- 40
|
||||||
|
- 45
|
||||||
|
- 50
|
||||||
|
- name: schedule_attempts_total
|
||||||
|
subsystem: scheduler
|
||||||
|
help: Number of attempts to schedule pods, by the result. 'unschedulable' means
|
||||||
|
a pod could not be scheduled, while 'error' means an internal scheduler problem.
|
||||||
|
type: Counter
|
||||||
|
stabilityLevel: STABLE
|
||||||
|
labels:
|
||||||
|
- profile
|
||||||
|
- result
|
||||||
|
- name: scheduling_attempt_duration_seconds
|
||||||
|
subsystem: scheduler
|
||||||
|
help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
|
||||||
|
type: Histogram
|
||||||
|
stabilityLevel: STABLE
|
||||||
|
labels:
|
||||||
|
- profile
|
||||||
|
- result
|
||||||
|
buckets:
|
||||||
|
- 0.001
|
||||||
|
- 0.002
|
||||||
|
- 0.004
|
||||||
|
- 0.008
|
||||||
|
- 0.016
|
||||||
|
- 0.032
|
||||||
|
- 0.064
|
||||||
|
- 0.128
|
||||||
|
- 0.256
|
||||||
|
- 0.512
|
||||||
|
- 1.024
|
||||||
|
- 2.048
|
||||||
|
- 4.096
|
||||||
|
- 8.192
|
||||||
|
- 16.384
|
||||||
- name: apiserver_request_duration_seconds
|
- name: apiserver_request_duration_seconds
|
||||||
help: Response latency distribution in seconds for each verb, dry run value, group,
|
help: Response latency distribution in seconds for each verb, dry run value, group,
|
||||||
version, resource, subresource, scope and component.
|
version, resource, subresource, scope and component.
|
||||||
|
Loading…
Reference in New Issue
Block a user