Merge pull request #84011 from liu-cong/permit

Add permit_wait_duration_seconds metric for scheduler.
This commit is contained in:
Kubernetes Prow Robot 2019-10-24 11:55:41 -07:00 committed by GitHub
commit 4fdf582687
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 7 deletions

View File

@ -571,11 +571,13 @@ func (f *framework) RunPermitPlugins(
// We now wait for the minimum duration if at least one plugin asked to
// wait (and no plugin rejected the pod)
if statusCode == Wait {
startTime := time.Now()
w := newWaitingPod(pod, pluginsWaitTime)
f.waitingPods.add(w)
defer f.waitingPods.remove(pod.UID)
klog.V(4).Infof("waiting for pod %q at permit", pod.Name)
s := <-w.s
metrics.PermitWaitDuration.WithLabelValues(s.Code().String()).Observe(metrics.SinceInSeconds(startTime))
if !s.IsSuccess() {
if s.IsUnschedulable() {
msg := fmt.Sprintf("pod %q rejected while waiting at permit: %v", pod.Name, s.Message())

View File

@ -727,7 +727,48 @@ func TestRecordingMetrics(t *testing.T) {
tt.action(f)
collectAndCompare(t, tt.wantExtensionPoint, tt.wantStatus)
collectAndCompareFrameworkMetrics(t, tt.wantExtensionPoint, tt.wantStatus)
})
}
}
func TestPermitWaitingMetric(t *testing.T) {
tests := []struct {
name string
inject injectedResult
wantRes string
}{
{
name: "Permit - Success",
},
{
name: "Permit - Wait Timeout",
inject: injectedResult{PermitStatus: int(Wait)},
wantRes: "Unschedulable",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := &TestPlugin{name: testPlugin, inj: tt.inject}
r := make(Registry)
r.Register(testPlugin,
func(_ *runtime.Unknown, fh FrameworkHandle) (Plugin, error) {
return plugin, nil
})
plugins := &config.Plugins{
Permit: &config.PluginSet{Enabled: []config.Plugin{{Name: testPlugin, Weight: 1}}},
}
f, err := NewFramework(r, plugins, emptyArgs)
if err != nil {
t.Fatalf("Failed to create framework for testing: %v", err)
}
metrics.Register()
metrics.PermitWaitDuration.Reset()
f.RunPermitPlugins(context.TODO(), nil, pod, "")
collectAndComparePermitWaitDuration(t, tt.wantRes)
})
}
}
@ -775,12 +816,8 @@ func injectNormalizeRes(inj injectedResult, scores NodeScoreList) *Status {
return nil
}
func collectAndCompare(t *testing.T, wantExtensionPoint string, wantStatus Code) {
ch := make(chan prometheus.Metric, 1)
m := &dto.Metric{}
metrics.FrameworkExtensionPointDuration.Collect(ch)
got := <-ch
got.Write(m)
func collectAndCompareFrameworkMetrics(t *testing.T, wantExtensionPoint string, wantStatus Code) {
m := collectHistogramMetric(metrics.FrameworkExtensionPointDuration)
if len(m.Label) != 2 {
t.Fatalf("Unexpected number of label pairs, got: %v, want: 2", len(m.Label))
@ -802,3 +839,43 @@ func collectAndCompare(t *testing.T, wantExtensionPoint string, wantStatus Code)
t.Errorf("Expect latency to be greater than 0, got: %v", m.Histogram.SampleSum)
}
}
func collectAndComparePermitWaitDuration(t *testing.T, wantRes string) {
m := collectHistogramMetric(metrics.PermitWaitDuration)
if wantRes == "" {
if m != nil {
t.Errorf("PermitWaitDuration shouldn't be recorded but got %+v", m)
}
return
}
if wantRes != "" {
if len(m.Label) != 1 {
t.Fatalf("Unexpected number of label pairs, got: %v, want: 1", len(m.Label))
}
if *m.Label[0].Value != wantRes {
t.Errorf("Unexpected result label, got: %q, want %q", *m.Label[0].Value, wantRes)
}
if *m.Histogram.SampleCount != 1 {
t.Errorf("Expect 1 sample, got: %v", m.Histogram.SampleCount)
}
if *m.Histogram.SampleSum <= 0 {
t.Errorf("Expect latency to be greater than 0, got: %v", m.Histogram.SampleSum)
}
}
}
func collectHistogramMetric(metric prometheus.Collector) *dto.Metric {
ch := make(chan prometheus.Metric, 1)
metric.Collect(ch)
select {
case got := <-ch:
m := &dto.Metric{}
got.Write(m)
return m
default:
return nil
}
}

View File

@ -85,6 +85,10 @@ func newWaitingPod(pod *v1.Pod, pluginsMaxWaitTime map[string]time.Duration) *wa
}
wp.pendingPlugins = make(map[string]*time.Timer, len(pluginsMaxWaitTime))
// The time.AfterFunc calls wp.Reject which iterates through pendingPlugins map. Acquire the
// lock here so that time.AfterFunc can only execute after newWaitingPod finishes.
wp.mu.Lock()
defer wp.mu.Unlock()
for k, v := range pluginsMaxWaitTime {
plugin, waitTime := k, v
wp.pendingPlugins[plugin] = time.AfterFunc(waitTime, func() {

View File

@ -262,6 +262,16 @@ var (
StabilityLevel: metrics.ALPHA,
}, []string{"queue", "event"})
PermitWaitDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "permit_wait_duration_seconds",
Help: "Duration of waiting in RunPermitPlugins.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})
metricsList = []metrics.Registerable{
scheduleAttempts,
SchedulingLatency,
@ -286,6 +296,7 @@ var (
FrameworkExtensionPointDuration,
SchedulerQueueIncomingPods,
SchedulerGoroutines,
PermitWaitDuration,
}
)