From 224087abfa0d0dc25f2c6c8e86cb617d0430e908 Mon Sep 17 00:00:00 2001
From: Heba Elayoty <31887807+helayoty@users.noreply.github.com>
Date: Tue, 15 Aug 2023 15:17:41 -0700
Subject: [PATCH] Add Pod Scheduling SLI Duration metric (#119049)

Signed-off-by: Heba Elayoty
Co-authored-by: Aldo Culquicondor <1299064+alculquicondor@users.noreply.github.com>
---
 pkg/scheduler/metrics/metrics.go              | 18 ++++++++++-
 pkg/scheduler/schedule_one.go                 |  1 +
 .../testdata/stable-metrics-list.yaml         | 30 +++++++++++++++++++
 .../scheduler_perf/scheduler_perf_test.go     |  3 +-
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index c76e1a28d64..d4871e70d7f 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -113,14 +113,29 @@ var (
 		Help:           "Number of running goroutines split by the work they do such as binding.",
 		StabilityLevel: metrics.ALPHA,
 	}, []string{"operation"})
+
+	// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
+	// in v1.31. Please use PodSchedulingSLIDuration instead.
 	PodSchedulingDuration = metrics.NewHistogramVec(
 		&metrics.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "pod_scheduling_duration_seconds",
 			Help:      "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
 			// Start with 10ms with the last bucket being [~88m, Inf).
+			Buckets:           metrics.ExponentialBuckets(0.01, 2, 20),
+			StabilityLevel:    metrics.STABLE,
+			DeprecatedVersion: "1.28.0",
+		},
+		[]string{"attempts"})
+
+	PodSchedulingSLIDuration = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "pod_scheduling_sli_duration_seconds",
+			Help:      "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue and might involve multiple scheduling attempts.",
+			// Start with 10ms with the last bucket being [~88m, Inf).
 			Buckets:        metrics.ExponentialBuckets(0.01, 2, 20),
-			StabilityLevel: metrics.STABLE,
+			StabilityLevel: metrics.BETA,
 		},
 		[]string{"attempts"})
 
@@ -206,6 +221,7 @@ var (
 		PreemptionAttempts,
 		pendingPods,
 		PodSchedulingDuration,
+		PodSchedulingSLIDuration,
 		PodSchedulingAttempts,
 		FrameworkExtensionPointDuration,
 		PluginExecutionDuration,
diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go
index 525e1af8632..4122ceed142 100644
--- a/pkg/scheduler/schedule_one.go
+++ b/pkg/scheduler/schedule_one.go
@@ -288,6 +288,7 @@ func (sched *Scheduler) bindingCycle(
 	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
 	if assumedPodInfo.InitialAttemptTimestamp != nil {
 		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
+		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
 	}
 	// Run "postbind" plugins.
 	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml
index 0dc8ac811eb..a33d6c6476f 100644
--- a/test/instrumentation/testdata/stable-metrics-list.yaml
+++ b/test/instrumentation/testdata/stable-metrics-list.yaml
@@ -74,6 +74,35 @@
   stabilityLevel: STABLE
   labels:
   - zone
+- name: pod_scheduling_sli_duration_seconds
+  subsystem: scheduler
+  help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
+    queue and might involve multiple scheduling attempts.
+  type: Histogram
+  stabilityLevel: BETA
+  labels:
+  - attempts
+  buckets:
+  - 0.01
+  - 0.02
+  - 0.04
+  - 0.08
+  - 0.16
+  - 0.32
+  - 0.64
+  - 1.28
+  - 2.56
+  - 5.12
+  - 10.24
+  - 20.48
+  - 40.96
+  - 81.92
+  - 163.84
+  - 327.68
+  - 655.36
+  - 1310.72
+  - 2621.44
+  - 5242.88
 - name: kube_pod_resource_limit
   help: Resources limit for workloads on the cluster, broken down by pod. This shows
     the resource usage the scheduler and kubelet expect per pod for resources along
@@ -151,6 +180,7 @@
   help: E2e latency for a pod being scheduled which may include multiple scheduling
     attempts.
   type: Histogram
+  deprecatedVersion: 1.28.0
   stabilityLevel: STABLE
   labels:
   - attempts
diff --git a/test/integration/scheduler_perf/scheduler_perf_test.go b/test/integration/scheduler_perf/scheduler_perf_test.go
index 8671b6381ce..84aae101443 100644
--- a/test/integration/scheduler_perf/scheduler_perf_test.go
+++ b/test/integration/scheduler_perf/scheduler_perf_test.go
@@ -102,7 +102,8 @@ var (
 				label:  resultLabelName,
 				values: []string{metrics.ScheduledResult, metrics.UnschedulableResult, metrics.ErrorResult},
 			},
-			"scheduler_pod_scheduling_duration_seconds": nil,
+			"scheduler_pod_scheduling_duration_seconds":     nil,
+			"scheduler_pod_scheduling_sli_duration_seconds": nil,
 		},
 	}
 )