Add profile label to schedule_attempts_total and e2e_scheduling_duration_seconds metrics

Also add a result label to e2e_scheduling_duration_seconds; previously, that metric was only updated for successful attempts (a rough example of the resulting series is sketched below).

Signed-off-by: Aldo Culquicondor <acondor@google.com>
Aldo Culquicondor 2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions
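
For illustration, a rough sketch of what a single recorded attempt produces after this change; the scheduler_ prefix (from the scheduler metrics subsystem), the default-scheduler profile name, the helper name recordOneAttempt, and the sample values are assumptions for this sketch, not part of the commit:

// Minimal sketch; assumes schedmetrics is "k8s.io/kubernetes/pkg/scheduler/metrics"
// and that schedmetrics.Register() has already been called.
func recordOneAttempt() {
	// One successful attempt for a pod handled by the "default-scheduler" profile,
	// taking roughly 15ms from the start of the scheduling cycle to bind.
	schedmetrics.PodScheduled("default-scheduler", 0.015)

	// Roughly the per-result, per-profile series this produces:
	//   scheduler_schedule_attempts_total{profile="default-scheduler",result="scheduled"} 1
	//   scheduler_e2e_scheduling_duration_seconds_bucket{profile="default-scheduler",result="scheduled",le="0.016"} 1
}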

View File

@@ -7,6 +7,7 @@ go_library(
srcs = [
"metric_recorder.go",
"metrics.go",
"profile_metrics.go",
],
importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
deps = [

View File

@@ -54,16 +54,7 @@ var (
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA,
}, []string{"result"})
// PodScheduleSuccesses counts how many pods were scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
}, []string{"result", "profile"})
DeprecatedSchedulingDuration = metrics.NewSummaryVec(
&metrics.SummaryOpts{
Subsystem: SchedulerSubsystem,
@@ -77,15 +68,14 @@ var (
},
[]string{OperationLabel},
)
E2eSchedulingLatency = metrics.NewHistogram(
e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
},
)
}, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
@@ -235,7 +225,7 @@ var (
metricsList = []metrics.Registerable{
scheduleAttempts,
DeprecatedSchedulingDuration,
E2eSchedulingLatency,
e2eSchedulingLatency,
SchedulingAlgorithmLatency,
BindingLatency,
DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
@@ -263,9 +253,6 @@ func Register() {
registerMetrics.Do(func() {
RegisterMetrics(metricsList...)
volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
})
}
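
A note on the With() calls dropped from Register(): the {result, profile} series are now created lazily on first observation instead of eagerly at registration time, since profile names only become known from the scheduler's configuration. If eager creation were ever wanted for a known profile, a hypothetical in-package helper could look roughly like this; the function name is assumed and it is not part of this commit:

// Hypothetical: eagerly create the {result, profile} series for one profile so
// they are exported with an initial value of 0. Must run after Register(),
// since the vectors are no-op metrics until registration.
func initProfileSeries(profile string) {
	for _, result := range []string{"scheduled", "unschedulable", "error"} {
		scheduleAttempts.WithLabelValues(result, profile)
		e2eSchedulingLatency.WithLabelValues(result, profile)
	}
}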

View File

@@ -0,0 +1,48 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// This file contains helpers for metrics that are associated with a profile.
var (
scheduledResult = "scheduled"
unschedulableResult = "unschedulable"
errorResult = "error"
)
// PodScheduled records a successful scheduling attempt and its duration in
// seconds.
func PodScheduled(profile string, duration float64) {
observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
}
// PodUnschedulable records a scheduling attempt for an unschedulable pod and
// its duration in seconds.
func PodUnschedulable(profile string, duration float64) {
observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
}
// PodScheduleError records a scheduling attempt that resulted in an error and
// its duration in seconds.
func PodScheduleError(profile string, duration float64) {
observeScheduleAttemptAndLatency(errorResult, profile, duration)
}
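// observeScheduleAttemptAndLatency records a single scheduling attempt in both
// vectors: it observes the end-to-end latency histogram and increments the
// attempts counter, labeled by result and profile.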
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
scheduleAttempts.WithLabelValues(result, profile).Inc()
}
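
The intended call pattern mirrors the scheduler.go changes further below; a sketch, assuming "time" and this metrics package are imported and using a made-up reportAttempt name:

// Sketch of how a scheduling cycle reports its outcome with these helpers.
// start marks the beginning of the attempt; profileName is Profile.Name.
func reportAttempt(profileName string, start time.Time, err error) {
	if err != nil {
		metrics.PodScheduleError(profileName, metrics.SinceInSeconds(start))
		return
	}
	metrics.PodScheduled(profileName, metrics.SinceInSeconds(start))
}

Because observeScheduleAttemptAndLatency observes the latency histogram for every result, the success-only metrics.E2eSchedulingLatency.Observe call in scheduler.go is removed in this commit (see that file's diff below).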

View File

@@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
type Profile struct {
framework.Framework
Recorder events.EventRecorder
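// Name is the scheduler name associated with this profile; it is used as the
// "profile" label value in scheduling metrics.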
Name string
}
// NewProfile builds a Profile for the given configuration.
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
opts ...frameworkruntime.Option) (*Profile, error) {
r := recorderFact(cfg.SchedulerName)
f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...)
recorder := recorderFact(cfg.SchedulerName)
opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
fwk, err := frameworkFact(cfg, opts...)
if err != nil {
return nil, err
}
return &Profile{
Framework: f,
Recorder: r,
Name: cfg.SchedulerName,
Framework: fwk,
Recorder: recorder,
}, nil
}
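
A rough sketch of how the new Name field reaches the metrics; buildProfile and the factory arguments are placeholders, and error handling is minimal:

// Hypothetical wiring: build a profile and thread its Name to the metrics label.
func buildProfile(cfg config.KubeSchedulerProfile, ff profile.FrameworkFactory, rf profile.RecorderFactory) (*profile.Profile, error) {
	prof, err := profile.NewProfile(cfg, ff, rf)
	if err != nil {
		return nil, err
	}
	// prof.Name now carries cfg.SchedulerName, which the scheduler passes as the
	// "profile" label, e.g. metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start)).
	return prof, nil
}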

View File

@@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else if err == core.ErrNoNodesAvailable {
// No nodes available is counted as unschedulable rather than an error.
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else {
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
}
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
return
@@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Run "reserve" plugins.
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
return
}
@@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
return
@@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string
if runPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if !waitOnPermitStatus.IsSuccess() {
var reason string
if waitOnPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() {
var reason string
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
}
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
if err != nil {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
}
metrics.PodScheduleSuccesses.Inc()
metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))

View File

@@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
testSchedulerName: &profile.Profile{
Framework: fwk,
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
Name: testSchedulerName,
},
},
}
@@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
prof := &profile.Profile{
Framework: fwk,
Recorder: &events.FakeRecorder{},
Name: testSchedulerName,
}
if broadcaster != nil {
prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)