Add profile label to schedule_attempts_total metric

and e2e_scheduling_duration_seconds

Also add a result label to e2e_scheduling_duration_seconds. Previously, that metric was only updated for successful scheduling attempts.

Signed-off-by: Aldo Culquicondor <acondor@google.com>
This commit is contained in:
Aldo Culquicondor 2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions

View File

@ -7,6 +7,7 @@ go_library(
srcs = [ srcs = [
"metric_recorder.go", "metric_recorder.go",
"metrics.go", "metrics.go",
"profile_metrics.go",
], ],
importpath = "k8s.io/kubernetes/pkg/scheduler/metrics", importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
deps = [ deps = [

View File

@ -54,16 +54,7 @@ var (
Name: "schedule_attempts_total", Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, []string{"result"}) }, []string{"result", "profile"})
// PodScheduleSuccesses counts how many pods were scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
DeprecatedSchedulingDuration = metrics.NewSummaryVec( DeprecatedSchedulingDuration = metrics.NewSummaryVec(
&metrics.SummaryOpts{ &metrics.SummaryOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
@ -77,15 +68,14 @@ var (
}, },
[]string{OperationLabel}, []string{OperationLabel},
) )
E2eSchedulingLatency = metrics.NewHistogram( e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds", Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)", Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15), Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, }, []string{"result", "profile"})
)
SchedulingAlgorithmLatency = metrics.NewHistogram( SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{ &metrics.HistogramOpts{
Subsystem: SchedulerSubsystem, Subsystem: SchedulerSubsystem,
@ -235,7 +225,7 @@ var (
metricsList = []metrics.Registerable{ metricsList = []metrics.Registerable{
scheduleAttempts, scheduleAttempts,
DeprecatedSchedulingDuration, DeprecatedSchedulingDuration,
E2eSchedulingLatency, e2eSchedulingLatency,
SchedulingAlgorithmLatency, SchedulingAlgorithmLatency,
BindingLatency, BindingLatency,
DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration, DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
@ -263,9 +253,6 @@ func Register() {
registerMetrics.Do(func() { registerMetrics.Do(func() {
RegisterMetrics(metricsList...) RegisterMetrics(metricsList...)
volumeschedulingmetrics.RegisterVolumeSchedulingMetrics() volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
}) })
} }

View File

@ -0,0 +1,48 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// This file contains helpers for metrics that are associated to a profile.
// Label values for the "result" dimension of schedule_attempts_total and
// e2e_scheduling_duration_seconds. These never change at runtime, so they
// are declared as constants rather than vars.
const (
	scheduledResult     = "scheduled"     // pod was successfully bound
	unschedulableResult = "unschedulable" // no node could fit the pod
	errorResult         = "error"         // internal scheduler error
)
// PodScheduled records a successful scheduling attempt for the given profile,
// observing the e2e scheduling duration in seconds under the "scheduled" result
// label.
func PodScheduled(profile string, duration float64) {
observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
}
// PodUnschedulable records a scheduling attempt that found the pod
// unschedulable for the given profile, observing the e2e scheduling duration
// in seconds under the "unschedulable" result label.
func PodUnschedulable(profile string, duration float64) {
observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
}
// PodScheduleError records a scheduling attempt that failed with an internal
// error for the given profile, observing the e2e scheduling duration in
// seconds under the "error" result label.
func PodScheduleError(profile string, duration float64) {
observeScheduleAttemptAndLatency(errorResult, profile, duration)
}
// observeScheduleAttemptAndLatency updates both per-attempt metrics for a
// single scheduling attempt: the e2e scheduling latency histogram and the
// schedule attempts counter, each labeled with the attempt result and the
// profile name. Both metrics are declared in metrics.go of this package.
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc()
}

View File

@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
type Profile struct { type Profile struct {
framework.Framework framework.Framework
Recorder events.EventRecorder Recorder events.EventRecorder
Name string
} }
// NewProfile builds a Profile for the given configuration. // NewProfile builds a Profile for the given configuration.
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory, func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
opts ...frameworkruntime.Option) (*Profile, error) { opts ...frameworkruntime.Option) (*Profile, error) {
r := recorderFact(cfg.SchedulerName) recorder := recorderFact(cfg.SchedulerName)
f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...) opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
fwk, err := frameworkFact(cfg, opts...)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &Profile{ return &Profile{
Framework: f, Name: cfg.SchedulerName,
Recorder: r, Framework: fwk,
Recorder: recorder,
}, nil }, nil
} }

View File

@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Pod did not fit anywhere, so it is counted as a failure. If preemption // Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to // succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully) // schedule it. (hopefully)
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else if err == core.ErrNoNodesAvailable { } else if err == core.ErrNoNodesAvailable {
// No nodes available is counted as unschedulable rather than an error. // No nodes available is counted as unschedulable rather than an error.
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else { } else {
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod)) klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
} }
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode) sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
return return
@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Run "reserve" plugins. // Run "reserve" plugins.
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() { if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
return return
} }
@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// to a node and if so will not add it back to the unscheduled pods queue // to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop). // (otherwise this would cause an infinite loop).
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod // trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
return return
@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() { if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string var reason string
if runPermitStatus.IsUnschedulable() { if runPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable reason = v1.PodReasonUnschedulable
} else { } else {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
} }
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if !waitOnPermitStatus.IsSuccess() { if !waitOnPermitStatus.IsSuccess() {
var reason string var reason string
if waitOnPermitStatus.IsUnschedulable() { if waitOnPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc() metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable reason = v1.PodReasonUnschedulable
} else { } else {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
} }
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() { if !preBindStatus.IsSuccess() {
var reason string var reason string
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError reason = SchedulerError
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr) klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
} }
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state) err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
if err != nil { if err != nil {
metrics.PodScheduleErrors.Inc() metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod // trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "") sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes) klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
} }
metrics.PodScheduleSuccesses.Inc() metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts)) metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp)) metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

View File

@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
testSchedulerName: &profile.Profile{ testSchedulerName: &profile.Profile{
Framework: fwk, Framework: fwk,
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName), Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
Name: testSchedulerName,
}, },
}, },
} }
@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
prof := &profile.Profile{ prof := &profile.Profile{
Framework: fwk, Framework: fwk,
Recorder: &events.FakeRecorder{}, Recorder: &events.FakeRecorder{},
Name: testSchedulerName,
} }
if broadcaster != nil { if broadcaster != nil {
prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName) prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)