Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-23 19:56:01 +00:00)
Merge pull request #64838 from krzysied/scheduling_latency_metric_fix
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Adding summary metric for scheduling latency

**What this PR does / why we need it**: Re-introduces the histogram metrics for backward compatibility. Changes the SchedulingLatency metric to follow Prometheus best practices (base unit of seconds, `_seconds` name suffix). ref #64316

**Release note**:
```release-note
NONE
```
This commit is contained in commit a6e61e7452.
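To make the diff below easier to follow, here is a condensed sketch of the metric shapes after this change: the operation-labelled summary now reports seconds under `scheduling_latency_seconds`, while per-phase microsecond histograms are re-introduced alongside it for backward compatibility. Metric names, labels, and bucket choices are taken from the diff; the package scaffolding and the `observeBinding` helper are assumptions for illustration only, not part of the patch.

```go
package metrics

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

const (
	// Constant values as they appear in the diff below.
	SchedulerSubsystem    = "scheduler"
	SchedulingLatencyName = "scheduling_latency_seconds"
	OperationLabel        = "operation"
	Binding               = "binding"
	SelectingNode         = "selecting_node"
)

var (
	// Summary in seconds, split by operation (Prometheus best practice: base unit is seconds).
	SchedulingLatency = prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Subsystem: SchedulerSubsystem,
			Name:      SchedulingLatencyName,
			Help:      "Scheduling latency in seconds split by sub-parts of the scheduling operation",
			MaxAge:    5 * time.Hour,
		},
		[]string{OperationLabel},
	)

	// One of the re-introduced microsecond histograms, kept for backward compatibility.
	BindingLatency = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "binding_latency_microseconds",
			Help:      "Binding latency",
			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
		},
	)
)

// observeBinding mirrors the dual reporting at the call sites changed in this PR:
// the legacy histogram keeps microseconds, the summary switches to seconds.
func observeBinding(start time.Time) {
	BindingLatency.Observe(float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()))
	SchedulingLatency.WithLabelValues(Binding).Observe(time.Since(start).Seconds())
}
```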
```diff
@@ -27,16 +27,15 @@ const (
 	// SchedulerSubsystem - subsystem name used by scheduler
 	SchedulerSubsystem = "scheduler"
 	// SchedulingLatencyName - scheduler latency metric name
-	SchedulingLatencyName = "scheduling_latencies_summary"
+	SchedulingLatencyName = "scheduling_latency_seconds"
 
 	// OperationLabel - operation label name
 	OperationLabel = "operation"
 	// Binding - binding operation label value
 	Binding = "binding"
-	// SchedulingAlgorithm - scheduling algorithm operation label value
-	SchedulingAlgorithm = "scheduling_algorithm"
+	// SelectingNode - selecting node operation label value
+	SelectingNode = "selecting_node"
 	// E2eScheduling - e2e scheduling operation label value
-	E2eScheduling = "e2e_scheduling"
 )
 
 // All the histogram based metrics have 1ms as size for the smallest bucket.
@@ -45,13 +44,29 @@ var (
 		prometheus.SummaryOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      SchedulingLatencyName,
-			Help:      "Scheduling latency in microseconds split by sub-parts of the scheduling operation",
+			Help:      "Scheduling latency in seconds split by sub-parts of the scheduling operation",
 			// Make the sliding window of 5h.
 			// TODO: The value for this should be based on some SLI definition (long term).
 			MaxAge: 5 * time.Hour,
 		},
 		[]string{OperationLabel},
 	)
+	E2eSchedulingLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "e2e_scheduling_latency_microseconds",
+			Help:      "E2e scheduling latency (scheduling algorithm + binding)",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
+	SchedulingAlgorithmLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "scheduling_algorithm_latency_microseconds",
+			Help:      "Scheduling algorithm latency",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
 	SchedulingAlgorithmPredicateEvaluationDuration = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
@@ -76,6 +91,14 @@ var (
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
 		},
 	)
+	BindingLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "binding_latency_microseconds",
+			Help:      "Binding latency",
+			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
+		},
+	)
 	PreemptionVictims = prometheus.NewGauge(
 		prometheus.GaugeOpts{
 			Subsystem: SchedulerSubsystem,
@@ -90,6 +113,9 @@ var (
 		})
 	metricsList = []prometheus.Collector{
 		SchedulingLatency,
+		E2eSchedulingLatency,
+		SchedulingAlgorithmLatency,
+		BindingLatency,
 		SchedulingAlgorithmPredicateEvaluationDuration,
 		SchedulingAlgorithmPriorityEvaluationDuration,
 		SchedulingAlgorithmPremptionEvaluationDuration,
@@ -102,6 +128,7 @@ var registerMetrics sync.Once
 
 // Register all metrics.
 func Register() {
+	// Register the metrics.
 	registerMetrics.Do(func() {
 		for _, metric := range metricsList {
 			prometheus.MustRegister(metric)
@@ -118,3 +145,8 @@ func Reset() {
 func SinceInMicroseconds(start time.Time) float64 {
 	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
 }
+
+// SinceInSeconds gets the time since the specified start in seconds.
+func SinceInSeconds(start time.Time) float64 {
+	return time.Since(start).Seconds()
+}
```
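As a quick sanity check on the unit change (an illustrative sketch, not part of the patch): for the same elapsed time, the two helpers report values that differ by a factor of one million, which is why the summary's help text now says seconds.

```go
package main

import (
	"fmt"
	"time"
)

// Copies of the two helpers from the diff above, reproduced here for illustration only.
func SinceInMicroseconds(start time.Time) float64 {
	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
}

func SinceInSeconds(start time.Time) float64 {
	return time.Since(start).Seconds()
}

func main() {
	// Pretend roughly 2.5ms of scheduling work has elapsed.
	start := time.Now().Add(-2500 * time.Microsecond)
	fmt.Println(SinceInMicroseconds(start)) // ~2500 (microseconds, fed to the legacy histograms)
	fmt.Println(SinceInSeconds(start))      // ~0.0025 (seconds, fed to the new summary)
}
```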
```diff
@@ -429,7 +429,8 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 		return err
 	}
 
-	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInMicroseconds(bindingStart))
+	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
+	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart))
 	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name)
 	return nil
 }
@@ -461,7 +462,8 @@ func (sched *Scheduler) scheduleOne() {
 		}
 		return
 	}
-	metrics.SchedulingLatency.WithLabelValues(metrics.SchedulingAlgorithm).Observe(metrics.SinceInMicroseconds(start))
+	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
+	metrics.SchedulingLatency.WithLabelValues(metrics.SelectingNode).Observe(metrics.SinceInSeconds(start))
 	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
 	// This allows us to keep scheduling without waiting on binding to occur.
 	assumedPod := pod.DeepCopy()
@@ -496,7 +498,7 @@ func (sched *Scheduler) scheduleOne() {
 				Name: suggestedHost,
 			},
 		})
-		metrics.SchedulingLatency.WithLabelValues(metrics.E2eScheduling).Observe(metrics.SinceInMicroseconds(start))
+		metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
 		if err != nil {
 			glog.Errorf("Internal error binding pod: (%v)", err)
 		}
```
```diff
@@ -210,9 +210,8 @@ func (l *PodStartupLatency) PrintJSON() string {
 }
 
 type SchedulingMetrics struct {
-	SchedulingLatency LatencyMetric `json:"schedulingLatency"`
+	SelectingNodeLatency LatencyMetric `json:"selectingNodeLatency"`
 	BindingLatency    LatencyMetric `json:"bindingLatency"`
-	E2ELatency        LatencyMetric `json:"e2eLatency"`
 	ThroughputAverage float64 `json:"throughputAverage"`
 	ThroughputPerc50  float64 `json:"throughputPerc50"`
 	ThroughputPerc90  float64 `json:"throughputPerc90"`
@@ -512,23 +511,20 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
 
 		var metric *LatencyMetric = nil
 		switch sample.Metric[schedulermetric.OperationLabel] {
-		case schedulermetric.SchedulingAlgorithm:
-			metric = &result.SchedulingLatency
+		case schedulermetric.SelectingNode:
+			metric = &result.SelectingNodeLatency
 		case schedulermetric.Binding:
 			metric = &result.BindingLatency
-		case schedulermetric.E2eScheduling:
-			metric = &result.E2ELatency
 		}
 		if metric == nil {
 			continue
 		}
 
-		latency := sample.Value
 		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
 		if err != nil {
 			return nil, err
 		}
-		setQuantile(metric, quantile, time.Duration(int64(latency)))
+		setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
 	}
 	return &result, nil
 }
```
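The new setQuantile call converts the scraped quantile value, which the summary now reports in seconds, into a time.Duration by scaling with time.Second; the old code passed the value through unscaled. A minimal sketch of the arithmetic (the sample value and surrounding code are assumptions, not taken from the e2e framework):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Suppose the scraped quantile of scheduler_scheduling_latency_seconds is 0.0025, i.e. 2.5ms.
	sampleValue := 0.0025

	// Conversion used after this change: scale seconds into a nanosecond-based time.Duration.
	latency := time.Duration(int64(sampleValue * float64(time.Second)))

	fmt.Println(latency) // prints "2.5ms"
}
```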