diff --git a/pkg/scheduler/metrics/BUILD b/pkg/scheduler/metrics/BUILD index 7cd56dda5b3..e31dcb266c8 100644 --- a/pkg/scheduler/metrics/BUILD +++ b/pkg/scheduler/metrics/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "metric_recorder.go", "metrics.go", + "profile_metrics.go", ], importpath = "k8s.io/kubernetes/pkg/scheduler/metrics", deps = [ diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index f5bbd164858..52949c39d8d 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -54,16 +54,7 @@ var ( Name: "schedule_attempts_total", Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", StabilityLevel: metrics.ALPHA, - }, []string{"result"}) - // PodScheduleSuccesses counts how many pods were scheduled. - // This metric will be initialized again in Register() to assure the metric is not no-op metric. - PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"}) - // PodScheduleFailures counts how many pods could not be scheduled. - // This metric will be initialized again in Register() to assure the metric is not no-op metric. - PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"}) - // PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error. - // This metric will be initialized again in Register() to assure the metric is not no-op metric. 
- PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"}) + }, []string{"result", "profile"}) DeprecatedSchedulingDuration = metrics.NewSummaryVec( &metrics.SummaryOpts{ Subsystem: SchedulerSubsystem, @@ -77,15 +68,14 @@ var ( }, []string{OperationLabel}, ) - E2eSchedulingLatency = metrics.NewHistogram( + e2eSchedulingLatency = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, Name: "e2e_scheduling_duration_seconds", Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)", Buckets: metrics.ExponentialBuckets(0.001, 2, 15), StabilityLevel: metrics.ALPHA, - }, - ) + }, []string{"result", "profile"}) SchedulingAlgorithmLatency = metrics.NewHistogram( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, @@ -235,7 +225,7 @@ var ( metricsList = []metrics.Registerable{ scheduleAttempts, DeprecatedSchedulingDuration, - E2eSchedulingLatency, + e2eSchedulingLatency, SchedulingAlgorithmLatency, BindingLatency, DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration, @@ -263,9 +253,6 @@ func Register() { registerMetrics.Do(func() { RegisterMetrics(metricsList...) volumeschedulingmetrics.RegisterVolumeSchedulingMetrics() - PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"}) - PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"}) - PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"}) }) } diff --git a/pkg/scheduler/metrics/profile_metrics.go b/pkg/scheduler/metrics/profile_metrics.go new file mode 100644 index 00000000000..c570e28a9bd --- /dev/null +++ b/pkg/scheduler/metrics/profile_metrics.go @@ -0,0 +1,48 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +// This file contains helpers for metrics that are associated with a profile. + +var ( + scheduledResult = "scheduled" + unschedulableResult = "unschedulable" + errorResult = "error" +) + +// PodScheduled records a successful scheduling attempt and the duration +// of the attempt. +func PodScheduled(profile string, duration float64) { + observeScheduleAttemptAndLatency(scheduledResult, profile, duration) +} + +// PodUnschedulable records a scheduling attempt for an unschedulable pod +// and the duration of the attempt. +func PodUnschedulable(profile string, duration float64) { + observeScheduleAttemptAndLatency(unschedulableResult, profile, duration) +} + +// PodScheduleError records a scheduling attempt that had an error and the +// duration of the attempt. +func PodScheduleError(profile string, duration float64) { + observeScheduleAttemptAndLatency(errorResult, profile, duration) +} + +func observeScheduleAttemptAndLatency(result, profile string, duration float64) { + e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration) + scheduleAttempts.WithLabelValues(result, profile).Inc() +} diff --git a/pkg/scheduler/profile/profile.go b/pkg/scheduler/profile/profile.go index 8e1397247d2..6726eaeba70 100644 --- a/pkg/scheduler/profile/profile.go +++ b/pkg/scheduler/profile/profile.go @@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti type Profile struct { framework.Framework Recorder events.EventRecorder + Name string } // NewProfile builds a Profile for the given configuration. 
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory, opts ...frameworkruntime.Option) (*Profile, error) { - r := recorderFact(cfg.SchedulerName) - f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...) + recorder := recorderFact(cfg.SchedulerName) + opts = append(opts, frameworkruntime.WithEventRecorder(recorder)) + fwk, err := frameworkFact(cfg, opts...) if err != nil { return nil, err } return &Profile{ - Framework: f, - Recorder: r, + Name: cfg.SchedulerName, + Framework: fwk, + Recorder: recorder, }, nil } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 2cf3872329e..cdade50d237 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { // Pod did not fit anywhere, so it is counted as a failure. If preemption // succeeds, the pod should get counted as a success the next time we try to // schedule it. (hopefully) - metrics.PodScheduleFailures.Inc() + metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start)) } else if err == core.ErrNoNodesAvailable { // No nodes available is counted as unschedulable rather than an error. - metrics.PodScheduleFailures.Inc() + metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start)) } else { klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod)) - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) } sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode) return @@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { // Run "reserve" plugins. 
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() { sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "") - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) return } @@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { // to a node and if so will not add it back to the unscheduled pods queue // (otherwise this would cause an infinite loop). sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "") - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) // trigger un-reserve plugins to clean up state associated with the reserved Pod prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) return @@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() { var reason string if runPermitStatus.IsUnschedulable() { - metrics.PodScheduleFailures.Inc() + metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start)) reason = v1.PodReasonUnschedulable } else { - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) reason = SchedulerError } if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { @@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { if !waitOnPermitStatus.IsSuccess() { var reason string if waitOnPermitStatus.IsUnschedulable() { - metrics.PodScheduleFailures.Inc() + metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start)) reason = v1.PodReasonUnschedulable } else { - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) reason = SchedulerError } if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { @@ -595,7 +595,7 @@ 
func (sched *Scheduler) scheduleOne(ctx context.Context) { preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) if !preBindStatus.IsSuccess() { var reason string - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) reason = SchedulerError if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil { klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr) @@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { } err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state) - metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start)) if err != nil { - metrics.PodScheduleErrors.Inc() + metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start)) // trigger un-reserve plugins to clean up state associated with the reserved Pod prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "") @@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) { klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes) } - metrics.PodScheduleSuccesses.Inc() + metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start)) metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts)) metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp)) diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 00d531a7422..54ddd06c027 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) { testSchedulerName: &profile.Profile{ Framework: fwk, Recorder: 
eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName), + Name: testSchedulerName, }, }, } @@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C prof := &profile.Profile{ Framework: fwk, Recorder: &events.FakeRecorder{}, + Name: testSchedulerName, } if broadcaster != nil { prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)