Add profile label to schedule_attempts_total metric and e2e_scheduling_duration_seconds

Also add a result label to e2e_scheduling_duration_seconds. Previously, that metric was only updated for successful attempts.

Signed-off-by: Aldo Culquicondor <acondor@google.com>

commit eb9711dc1f (parent 9cd906e932)
pkg/scheduler/metrics/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "metric_recorder.go",
         "metrics.go",
+        "profile_metrics.go",
     ],
     importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
     deps = [
pkg/scheduler/metrics/metrics.go
@@ -54,16 +54,7 @@ var (
 			Name:           "schedule_attempts_total",
 			Help:           "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
 			StabilityLevel: metrics.ALPHA,
-		}, []string{"result"})
-	// PodScheduleSuccesses counts how many pods were scheduled.
-	// This metric will be initialized again in Register() to assure the metric is not no-op metric.
-	PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
-	// PodScheduleFailures counts how many pods could not be scheduled.
-	// This metric will be initialized again in Register() to assure the metric is not no-op metric.
-	PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
-	// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
-	// This metric will be initialized again in Register() to assure the metric is not no-op metric.
-	PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
+		}, []string{"result", "profile"})
 	DeprecatedSchedulingDuration = metrics.NewSummaryVec(
 		&metrics.SummaryOpts{
 			Subsystem: SchedulerSubsystem,
@@ -77,15 +68,14 @@ var (
 		},
 		[]string{OperationLabel},
 	)
-	E2eSchedulingLatency = metrics.NewHistogram(
+	e2eSchedulingLatency = metrics.NewHistogramVec(
 		&metrics.HistogramOpts{
 			Subsystem:      SchedulerSubsystem,
 			Name:           "e2e_scheduling_duration_seconds",
 			Help:           "E2e scheduling latency in seconds (scheduling algorithm + binding)",
 			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
 			StabilityLevel: metrics.ALPHA,
-		},
-	)
+		}, []string{"result", "profile"})
 	SchedulingAlgorithmLatency = metrics.NewHistogram(
 		&metrics.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
@@ -235,7 +225,7 @@ var (
 	metricsList = []metrics.Registerable{
 		scheduleAttempts,
 		DeprecatedSchedulingDuration,
-		E2eSchedulingLatency,
+		e2eSchedulingLatency,
 		SchedulingAlgorithmLatency,
 		BindingLatency,
 		DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
@@ -263,9 +253,6 @@ func Register() {
 	registerMetrics.Do(func() {
 		RegisterMetrics(metricsList...)
 		volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
-		PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
-		PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
-		PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
 	})
 }
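Note: the hunks above only touch the tails of the declarations. Reassembled from those fragments, the two vector metrics read roughly as follows after this patch; everything above the counter's Name field (its `metrics.NewCounterVec(&metrics.CounterOpts{` head and Subsystem) lies outside the hunks and is assumed, so treat this as a sketch rather than a verbatim excerpt of metrics.go.

	scheduleAttempts = metrics.NewCounterVec( // head assumed; not shown in the hunks
		&metrics.CounterOpts{
			Subsystem:      SchedulerSubsystem, // assumed
			Name:           "schedule_attempts_total",
			Help:           "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
			StabilityLevel: metrics.ALPHA,
		}, []string{"result", "profile"})

	e2eSchedulingLatency = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "e2e_scheduling_duration_seconds",
			Help:           "E2e scheduling latency in seconds (scheduling algorithm + binding)",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		}, []string{"result", "profile"})

With the per-result counter variables (PodScheduleSuccesses, PodScheduleFailures, PodScheduleErrors) removed, callers now go through the helpers added in profile_metrics.go below instead of incrementing label-bound counters directly.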
pkg/scheduler/metrics/profile_metrics.go (new file, 48 lines)
@@ -0,0 +1,48 @@
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+// This file contains helpers for metrics that are associated with a profile.
+
+var (
+	scheduledResult     = "scheduled"
+	unschedulableResult = "unschedulable"
+	errorResult         = "error"
+)
+
+// PodScheduled records a successful scheduling attempt and the duration
+// since `start`.
+func PodScheduled(profile string, duration float64) {
+	observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
+}
+
+// PodUnschedulable records a scheduling attempt for an unschedulable pod
+// and the duration since `start`.
+func PodUnschedulable(profile string, duration float64) {
+	observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
+}
+
+// PodScheduleError records a scheduling attempt that had an error and the
+// duration since `start`.
+func PodScheduleError(profile string, duration float64) {
+	observeScheduleAttemptAndLatency(errorResult, profile, duration)
+}
+
+func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
+	e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
+	scheduleAttempts.WithLabelValues(result, profile).Inc()
+}
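Note: the helper shape above (one function feeding a counter and a histogram that share the same label set, so every attempt is counted exactly once together with its latency) is easy to exercise outside the scheduler. A minimal, self-contained sketch using plain client_golang rather than the k8s.io/component-base wrappers; the metric names and the profile string are made up for the demo:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// attempts and latency mirror the shape of the scheduler metrics:
// a counter and a histogram sharing the "result" and "profile" labels.
var (
	attempts = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "demo_schedule_attempts_total",
		Help: "Scheduling attempts by result and profile (demo).",
	}, []string{"result", "profile"})

	latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "demo_e2e_scheduling_duration_seconds",
		Help:    "End-to-end scheduling latency by result and profile (demo).",
		Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
	}, []string{"result", "profile"})
)

// observeAttempt plays the role of observeScheduleAttemptAndLatency:
// each attempt is recorded exactly once, with both its result and latency.
func observeAttempt(result, profile string, seconds float64) {
	latency.WithLabelValues(result, profile).Observe(seconds)
	attempts.WithLabelValues(result, profile).Inc()
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(attempts, latency)

	start := time.Now()
	// ... pretend to run a scheduling cycle ...
	observeAttempt("scheduled", "default-scheduler", time.Since(start).Seconds())
	observeAttempt("unschedulable", "default-scheduler", 0.004)

	// Gather and print what was recorded.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "->", len(mf.GetMetric()), "label combinations")
	}
}

Keeping the Observe and the Inc in one helper is what removes the earlier skew where e2e_scheduling_duration_seconds was only recorded for attempts that reached binding.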
pkg/scheduler/profile/profile.go
@@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
 type Profile struct {
 	framework.Framework
 	Recorder events.EventRecorder
+	Name     string
 }
 
 // NewProfile builds a Profile for the given configuration.
 func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
 	opts ...frameworkruntime.Option) (*Profile, error) {
-	r := recorderFact(cfg.SchedulerName)
-	f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...)
+	recorder := recorderFact(cfg.SchedulerName)
+	opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
+	fwk, err := frameworkFact(cfg, opts...)
 	if err != nil {
 		return nil, err
 	}
 	return &Profile{
-		Framework: f,
-		Recorder:  r,
+		Name:      cfg.SchedulerName,
+		Framework: fwk,
+		Recorder:  recorder,
 	}, nil
 }
 
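Note: the new field is plain plumbing: the configured SchedulerName is copied onto the Profile so later code can label metrics with it. A toy, self-contained sketch of that flow (the types here are stand-ins, not the scheduler's):

package main

import "fmt"

// profileConfig stands in for config.KubeSchedulerProfile; only the field
// that matters for the metrics change is kept.
type profileConfig struct {
	SchedulerName string
}

// schedProfile stands in for profile.Profile; Framework and Recorder are
// omitted, only the Name plumbing added by this patch is shown.
type schedProfile struct {
	Name string
}

// newProfile mirrors NewProfile above: the configured scheduler name is
// carried onto the profile so metric helpers can later be called with it.
func newProfile(cfg profileConfig) *schedProfile {
	return &schedProfile{Name: cfg.SchedulerName}
}

func main() {
	prof := newProfile(profileConfig{SchedulerName: "default-scheduler"})
	fmt.Println("metrics will carry profile =", prof.Name)
}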
pkg/scheduler/scheduler.go
@@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 			// Pod did not fit anywhere, so it is counted as a failure. If preemption
 			// succeeds, the pod should get counted as a success the next time we try to
 			// schedule it. (hopefully)
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 		} else if err == core.ErrNoNodesAvailable {
 			// No nodes available is counted as unschedulable rather than an error.
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 		} else {
 			klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		}
 		sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
 		return
@@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 	// Run "reserve" plugins.
 	if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
 		sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
-		metrics.PodScheduleErrors.Inc()
+		metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		return
 	}
 
@@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		// to a node and if so will not add it back to the unscheduled pods queue
 		// (otherwise this would cause an infinite loop).
 		sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
-		metrics.PodScheduleErrors.Inc()
+		metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 		// trigger un-reserve plugins to clean up state associated with the reserved Pod
 		prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 		return
@@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 	if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
 		var reason string
 		if runPermitStatus.IsUnschedulable() {
-			metrics.PodScheduleFailures.Inc()
+			metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 			reason = v1.PodReasonUnschedulable
 		} else {
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			reason = SchedulerError
 		}
 		if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		if !waitOnPermitStatus.IsSuccess() {
 			var reason string
 			if waitOnPermitStatus.IsUnschedulable() {
-				metrics.PodScheduleFailures.Inc()
+				metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
 				reason = v1.PodReasonUnschedulable
 			} else {
-				metrics.PodScheduleErrors.Inc()
+				metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 				reason = SchedulerError
 			}
 			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 		if !preBindStatus.IsSuccess() {
 			var reason string
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			reason = SchedulerError
 			if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
 				klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 		}
 
 		err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
-		metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
 		if err != nil {
-			metrics.PodScheduleErrors.Inc()
+			metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
 			// trigger un-reserve plugins to clean up state associated with the reserved Pod
 			prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
 			sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
 			klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
 		}
 
-		metrics.PodScheduleSuccesses.Inc()
+		metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
 		metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
 		metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
 
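Note: every call site above passes metrics.SinceInSeconds(start), i.e. the wall-clock seconds since the attempt began, measured at whichever exit path fires, so the histogram now sees a latency for unschedulable and errored attempts as well. A tiny self-contained illustration of that contract (sinceInSeconds here is a stand-in; the scheduler's own metrics.SinceInSeconds is assumed to behave the same way):

package main

import (
	"fmt"
	"time"
)

// sinceInSeconds is a stand-in for the scheduler's metrics.SinceInSeconds:
// it converts the elapsed time since `start` into float64 seconds, the unit
// the e2e_scheduling_duration_seconds histogram expects.
func sinceInSeconds(start time.Time) float64 {
	return time.Since(start).Seconds()
}

func main() {
	start := time.Now()
	time.Sleep(3 * time.Millisecond) // pretend to run the scheduling cycle
	fmt.Printf("observed duration: %.4fs\n", sinceInSeconds(start))
}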
pkg/scheduler/scheduler_test.go
@@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
 				testSchedulerName: &profile.Profile{
 					Framework: fwk,
 					Recorder:  eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
+					Name:      testSchedulerName,
 				},
 			},
 		}
@@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
 	prof := &profile.Profile{
 		Framework: fwk,
 		Recorder:  &events.FakeRecorder{},
+		Name:      testSchedulerName,
 	}
 	if broadcaster != nil {
 		prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)