diff --git a/pkg/scheduler/framework/preemption/preemption.go b/pkg/scheduler/framework/preemption/preemption.go index 91a0d629002..9569851612c 100644 --- a/pkg/scheduler/framework/preemption/preemption.go +++ b/pkg/scheduler/framework/preemption/preemption.go @@ -23,6 +23,7 @@ import ( "math" "sync" "sync/atomic" + "time" v1 "k8s.io/api/core/v1" policy "k8s.io/api/policy/v1" @@ -487,6 +488,10 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName logger := klog.FromContext(ctx) go func() { + startTime := time.Now() + result := metrics.GoroutineResultSuccess + defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime)) + defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc() defer cancel() logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods)) @@ -497,6 +502,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name()) if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil { logger.Error(err, "Cannot clear 'NominatedNodeName' field") + result = metrics.GoroutineResultError // We do not return as this error is not critical. } @@ -518,6 +524,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName) if err := errCh.ReceiveError(); err != nil { logger.Error(err, "Error occurred during preemption") + result = metrics.GoroutineResultError } ev.mu.Lock() @@ -526,9 +533,10 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil { logger.Error(err, "Error occurred during preemption") + result = metrics.GoroutineResultError } - logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name()) + logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result) }() } diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 84a1bb9e7e5..954b61aa88b 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -40,6 +40,11 @@ const ( Binding = "binding" ) +const ( + GoroutineResultSuccess = "success" + GoroutineResultError = "error" +) + // ExtentionPoints is a list of possible values for the extension_point label. var ExtentionPoints = []string{ PreFilter, @@ -105,14 +110,20 @@ var ( FrameworkExtensionPointDuration *metrics.HistogramVec PluginExecutionDuration *metrics.HistogramVec - // This is only available when the QHint feature gate is enabled. + PermitWaitDuration *metrics.HistogramVec + CacheSize *metrics.GaugeVec + unschedulableReasons *metrics.GaugeVec + PluginEvaluationTotal *metrics.CounterVec + // The below two are only available when the QHint feature gate is enabled. queueingHintExecutionDuration *metrics.HistogramVec SchedulerQueueIncomingPods *metrics.CounterVec - PermitWaitDuration *metrics.HistogramVec - CacheSize *metrics.GaugeVec - unschedulableReasons *metrics.GaugeVec - PluginEvaluationTotal *metrics.CounterVec - metricsList []metrics.Registerable + + // The below two are only available when the async-preemption feature gate is enabled. + PreemptionGoroutinesDuration *metrics.HistogramVec + PreemptionGoroutinesExecutionTotal *metrics.CounterVec + + // metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value. + metricsList []metrics.Registerable ) var registerMetrics sync.Once @@ -123,11 +134,14 @@ func Register() { registerMetrics.Do(func() { InitMetrics() RegisterMetrics(metricsList...) - if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) { - RegisterMetrics(queueingHintExecutionDuration) - RegisterMetrics(InFlightEvents) - } volumebindingmetrics.RegisterVolumeSchedulingMetrics() + + if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) { + RegisterMetrics(queueingHintExecutionDuration, InFlightEvents) + } + if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption) { + RegisterMetrics(PreemptionGoroutinesDuration, PreemptionGoroutinesExecutionTotal) + } }) } @@ -316,6 +330,24 @@ func InitMetrics() { Help: "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).", StabilityLevel: metrics.ALPHA, }, []string{"plugin", "extension_point", "profile"}) + PreemptionGoroutinesDuration = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Subsystem: SchedulerSubsystem, + Name: "preemption_goroutines_duration_seconds", + Help: "Duration in seconds for running goroutines for the preemption.", + Buckets: metrics.ExponentialBuckets(0.01, 2, 20), + StabilityLevel: metrics.ALPHA, + }, + []string{"result"}, + ) + PreemptionGoroutinesExecutionTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: SchedulerSubsystem, + Name: "preemption_goroutines_execution_total", + Help: "Number of preemption goroutines executed.", + StabilityLevel: metrics.ALPHA, + }, + []string{"result"}) metricsList = []metrics.Registerable{ scheduleAttempts,