mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 03:41:45 +00:00
Add profile label to schedule_attempts_total metric
and e2e_scheduling_duration_seconds. Also adds a result label to e2e_scheduling_duration_seconds. Previously, the metric was only updated for successful attempts. Signed-off-by: Aldo Culquicondor <acondor@google.com>
This commit is contained in:
parent
9cd906e932
commit
eb9711dc1f
@ -7,6 +7,7 @@ go_library(
|
||||
srcs = [
|
||||
"metric_recorder.go",
|
||||
"metrics.go",
|
||||
"profile_metrics.go",
|
||||
],
|
||||
importpath = "k8s.io/kubernetes/pkg/scheduler/metrics",
|
||||
deps = [
|
||||
|
@ -54,16 +54,7 @@ var (
|
||||
Name: "schedule_attempts_total",
|
||||
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
}, []string{"result"})
|
||||
// PodScheduleSuccesses counts how many pods were scheduled.
|
||||
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
|
||||
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
|
||||
// PodScheduleFailures counts how many pods could not be scheduled.
|
||||
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
|
||||
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
|
||||
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
|
||||
// This metric will be initialized again in Register() to assure the metric is not no-op metric.
|
||||
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
|
||||
}, []string{"result", "profile"})
|
||||
DeprecatedSchedulingDuration = metrics.NewSummaryVec(
|
||||
&metrics.SummaryOpts{
|
||||
Subsystem: SchedulerSubsystem,
|
||||
@ -77,15 +68,14 @@ var (
|
||||
},
|
||||
[]string{OperationLabel},
|
||||
)
|
||||
E2eSchedulingLatency = metrics.NewHistogram(
|
||||
e2eSchedulingLatency = metrics.NewHistogramVec(
|
||||
&metrics.HistogramOpts{
|
||||
Subsystem: SchedulerSubsystem,
|
||||
Name: "e2e_scheduling_duration_seconds",
|
||||
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
|
||||
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
}, []string{"result", "profile"})
|
||||
SchedulingAlgorithmLatency = metrics.NewHistogram(
|
||||
&metrics.HistogramOpts{
|
||||
Subsystem: SchedulerSubsystem,
|
||||
@ -235,7 +225,7 @@ var (
|
||||
metricsList = []metrics.Registerable{
|
||||
scheduleAttempts,
|
||||
DeprecatedSchedulingDuration,
|
||||
E2eSchedulingLatency,
|
||||
e2eSchedulingLatency,
|
||||
SchedulingAlgorithmLatency,
|
||||
BindingLatency,
|
||||
DeprecatedSchedulingAlgorithmPredicateEvaluationSecondsDuration,
|
||||
@ -263,9 +253,6 @@ func Register() {
|
||||
registerMetrics.Do(func() {
|
||||
RegisterMetrics(metricsList...)
|
||||
volumeschedulingmetrics.RegisterVolumeSchedulingMetrics()
|
||||
PodScheduleSuccesses = scheduleAttempts.With(metrics.Labels{"result": "scheduled"})
|
||||
PodScheduleFailures = scheduleAttempts.With(metrics.Labels{"result": "unschedulable"})
|
||||
PodScheduleErrors = scheduleAttempts.With(metrics.Labels{"result": "error"})
|
||||
})
|
||||
}
|
||||
|
||||
|
48
pkg/scheduler/metrics/profile_metrics.go
Normal file
48
pkg/scheduler/metrics/profile_metrics.go
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
// This file contains helpers for metrics that are associated to a profile.
|
||||
|
||||
// Values for the "result" label shared by the schedule_attempts_total and
// e2e_scheduling_duration_seconds metrics. Declared as constants since they
// are fixed label values and are never reassigned.
const (
	scheduledResult     = "scheduled"
	unschedulableResult = "unschedulable"
	errorResult         = "error"
)
|
||||
|
||||
// PodScheduled records a successful scheduling attempt for the given profile
// together with its duration in seconds (callers typically pass the time
// elapsed since the attempt started).
func PodScheduled(profile string, duration float64) {
	observeScheduleAttemptAndLatency(scheduledResult, profile, duration)
}
|
||||
|
||||
// PodUnschedulable records a scheduling attempt that found the pod
// unschedulable for the given profile, together with its duration in seconds
// (callers typically pass the time elapsed since the attempt started).
func PodUnschedulable(profile string, duration float64) {
	observeScheduleAttemptAndLatency(unschedulableResult, profile, duration)
}
|
||||
|
||||
// PodScheduleError records a scheduling attempt that ended in an internal
// scheduler error for the given profile, together with its duration in
// seconds (callers typically pass the time elapsed since the attempt started).
func PodScheduleError(profile string, duration float64) {
	observeScheduleAttemptAndLatency(errorResult, profile, duration)
}
|
||||
|
||||
func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
|
||||
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
|
||||
scheduleAttempts.WithLabelValues(result, profile).Inc()
|
||||
}
|
@ -40,19 +40,22 @@ type FrameworkFactory func(config.KubeSchedulerProfile, ...frameworkruntime.Opti
|
||||
type Profile struct {
|
||||
framework.Framework
|
||||
Recorder events.EventRecorder
|
||||
Name string
|
||||
}
|
||||
|
||||
// NewProfile builds a Profile for the given configuration.
|
||||
func NewProfile(cfg config.KubeSchedulerProfile, frameworkFact FrameworkFactory, recorderFact RecorderFactory,
|
||||
opts ...frameworkruntime.Option) (*Profile, error) {
|
||||
r := recorderFact(cfg.SchedulerName)
|
||||
f, err := frameworkFact(cfg, append(opts, frameworkruntime.WithEventRecorder(r))...)
|
||||
recorder := recorderFact(cfg.SchedulerName)
|
||||
opts = append(opts, frameworkruntime.WithEventRecorder(recorder))
|
||||
fwk, err := frameworkFact(cfg, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &Profile{
|
||||
Framework: f,
|
||||
Recorder: r,
|
||||
Name: cfg.SchedulerName,
|
||||
Framework: fwk,
|
||||
Recorder: recorder,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
// Pod did not fit anywhere, so it is counted as a failure. If preemption
|
||||
// succeeds, the pod should get counted as a success the next time we try to
|
||||
// schedule it. (hopefully)
|
||||
metrics.PodScheduleFailures.Inc()
|
||||
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
|
||||
} else if err == core.ErrNoNodesAvailable {
|
||||
// No nodes available is counted as unschedulable rather than an error.
|
||||
metrics.PodScheduleFailures.Inc()
|
||||
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
|
||||
} else {
|
||||
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
}
|
||||
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
|
||||
return
|
||||
@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
// Run "reserve" plugins.
|
||||
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
|
||||
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
return
|
||||
}
|
||||
|
||||
@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
// to a node and if so will not add it back to the unscheduled pods queue
|
||||
// (otherwise this would cause an infinite loop).
|
||||
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
// trigger un-reserve plugins to clean up state associated with the reserved Pod
|
||||
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
|
||||
return
|
||||
@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
|
||||
var reason string
|
||||
if runPermitStatus.IsUnschedulable() {
|
||||
metrics.PodScheduleFailures.Inc()
|
||||
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
|
||||
reason = v1.PodReasonUnschedulable
|
||||
} else {
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
reason = SchedulerError
|
||||
}
|
||||
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
|
||||
@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
if !waitOnPermitStatus.IsSuccess() {
|
||||
var reason string
|
||||
if waitOnPermitStatus.IsUnschedulable() {
|
||||
metrics.PodScheduleFailures.Inc()
|
||||
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
|
||||
reason = v1.PodReasonUnschedulable
|
||||
} else {
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
reason = SchedulerError
|
||||
}
|
||||
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
|
||||
@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
|
||||
if !preBindStatus.IsSuccess() {
|
||||
var reason string
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
reason = SchedulerError
|
||||
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
|
||||
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
|
||||
@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
}
|
||||
|
||||
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
|
||||
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
|
||||
if err != nil {
|
||||
metrics.PodScheduleErrors.Inc()
|
||||
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
|
||||
// trigger un-reserve plugins to clean up state associated with the reserved Pod
|
||||
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
|
||||
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
|
||||
@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
|
||||
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
|
||||
}
|
||||
|
||||
metrics.PodScheduleSuccesses.Inc()
|
||||
metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
|
||||
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
|
||||
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
|
||||
|
||||
|
@ -319,6 +319,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
|
||||
testSchedulerName: &profile.Profile{
|
||||
Framework: fwk,
|
||||
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName),
|
||||
Name: testSchedulerName,
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -770,6 +771,7 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache internalcache.C
|
||||
prof := &profile.Profile{
|
||||
Framework: fwk,
|
||||
Recorder: &events.FakeRecorder{},
|
||||
Name: testSchedulerName,
|
||||
}
|
||||
if broadcaster != nil {
|
||||
prof.Recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName)
|
||||
|
Loading…
Reference in New Issue
Block a user