feat: support metrics

This commit is contained in:
Kensei Nakada 2024-10-31 23:12:57 +09:00
parent fe3119fa69
commit 1e2511031e
2 changed files with 51 additions and 11 deletions

View File

@ -23,6 +23,7 @@ import (
"math"
"sync"
"sync/atomic"
"time"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
@ -487,6 +488,10 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
logger := klog.FromContext(ctx)
go func() {
startTime := time.Now()
result := metrics.GoroutineResultSuccess
defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
defer cancel()
logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
@ -497,6 +502,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field")
result = metrics.GoroutineResultError
// We do not return as this error is not critical.
}
@ -518,6 +524,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
logger.Error(err, "Error occurred during preemption")
result = metrics.GoroutineResultError
}
ev.mu.Lock()
@ -526,9 +533,10 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
logger.Error(err, "Error occurred during preemption")
result = metrics.GoroutineResultError
}
logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name())
logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result)
}()
}

View File

@ -40,6 +40,11 @@ const (
Binding = "binding"
)
const (
GoroutineResultSuccess = "success"
GoroutineResultError = "error"
)
// ExtentionPoints is a list of possible values for the extension_point label.
var ExtentionPoints = []string{
PreFilter,
@ -105,14 +110,20 @@ var (
FrameworkExtensionPointDuration *metrics.HistogramVec
PluginExecutionDuration *metrics.HistogramVec
// This is only available when the QHint feature gate is enabled.
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec
// The below two are only available when the QHint feature gate is enabled.
queueingHintExecutionDuration *metrics.HistogramVec
SchedulerQueueIncomingPods *metrics.CounterVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec
metricsList []metrics.Registerable
// The below two are only available when the async-preemption feature gate is enabled.
PreemptionGoroutinesDuration *metrics.HistogramVec
PreemptionGoroutinesExecutionTotal *metrics.CounterVec
// metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value.
metricsList []metrics.Registerable
)
var registerMetrics sync.Once
@ -123,11 +134,14 @@ func Register() {
registerMetrics.Do(func() {
InitMetrics()
RegisterMetrics(metricsList...)
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
RegisterMetrics(queueingHintExecutionDuration)
RegisterMetrics(InFlightEvents)
}
volumebindingmetrics.RegisterVolumeSchedulingMetrics()
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
RegisterMetrics(queueingHintExecutionDuration, InFlightEvents)
}
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption) {
RegisterMetrics(PreemptionGoroutinesDuration, PreemptionGoroutinesExecutionTotal)
}
})
}
@ -316,6 +330,24 @@ func InitMetrics() {
Help: "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).",
StabilityLevel: metrics.ALPHA,
}, []string{"plugin", "extension_point", "profile"})
PreemptionGoroutinesDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_duration_seconds",
Help: "Duration in seconds for running goroutines for the preemption.",
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.ALPHA,
},
[]string{"result"},
)
PreemptionGoroutinesExecutionTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_execution_total",
Help: "Number of preemption goroutines executed.",
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})
metricsList = []metrics.Registerable{
scheduleAttempts,