From 224087abfa0d0dc25f2c6c8e86cb617d0430e908 Mon Sep 17 00:00:00 2001
From: Heba Elayoty <31887807+helayoty@users.noreply.github.com>
Date: Tue, 15 Aug 2023 15:17:41 -0700
Subject: [PATCH] Add Pod Scheduling SLI Duration metric (#119049)

Signed-off-by: Heba Elayoty <hebaelayoty@gmail.com>
Co-authored-by: Aldo Culquicondor <1299064+alculquicondor@users.noreply.github.com>
---
 pkg/scheduler/metrics/metrics.go              | 18 ++++++++++-
 pkg/scheduler/schedule_one.go                 |  1 +
 .../testdata/stable-metrics-list.yaml         | 30 +++++++++++++++++++
 .../scheduler_perf/scheduler_perf_test.go     |  3 +-
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index c76e1a28d64..d4871e70d7f 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -113,14 +113,29 @@ var (
 			Help:           "Number of running goroutines split by the work they do such as binding.",
 			StabilityLevel: metrics.ALPHA,
 		}, []string{"operation"})
+
+	// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
+	// in v1.31. Please use PodSchedulingSLIDuration instead.
 	PodSchedulingDuration = metrics.NewHistogramVec(
 		&metrics.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "pod_scheduling_duration_seconds",
 			Help:      "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
 			// Start with 10ms with the last bucket being [~88m, Inf).
+			Buckets:           metrics.ExponentialBuckets(0.01, 2, 20),
+			StabilityLevel:    metrics.STABLE,
+			DeprecatedVersion: "1.28.0",
+		},
+		[]string{"attempts"})
+
+	PodSchedulingSLIDuration = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "pod_scheduling_sli_duration_seconds",
+			Help:      "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue and might involve multiple scheduling attempts.",
+			// Start with 10ms with the last bucket being [~88m, Inf).
 			Buckets:        metrics.ExponentialBuckets(0.01, 2, 20),
-			StabilityLevel: metrics.STABLE,
+			StabilityLevel: metrics.BETA,
 		},
 		[]string{"attempts"})
 
@@ -206,6 +221,7 @@ var (
 		PreemptionAttempts,
 		pendingPods,
 		PodSchedulingDuration,
+		PodSchedulingSLIDuration,
 		PodSchedulingAttempts,
 		FrameworkExtensionPointDuration,
 		PluginExecutionDuration,
diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go
index 525e1af8632..4122ceed142 100644
--- a/pkg/scheduler/schedule_one.go
+++ b/pkg/scheduler/schedule_one.go
@@ -288,6 +288,7 @@ func (sched *Scheduler) bindingCycle(
 	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
 	if assumedPodInfo.InitialAttemptTimestamp != nil {
 		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
+		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
 	}
 	// Run "postbind" plugins.
 	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml
index 0dc8ac811eb..a33d6c6476f 100644
--- a/test/instrumentation/testdata/stable-metrics-list.yaml
+++ b/test/instrumentation/testdata/stable-metrics-list.yaml
@@ -74,6 +74,35 @@
   stabilityLevel: STABLE
   labels:
   - zone
+- name: pod_scheduling_sli_duration_seconds
+  subsystem: scheduler
+  help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
+    queue and might involve multiple scheduling attempts.
+  type: Histogram
+  stabilityLevel: BETA
+  labels:
+  - attempts
+  buckets:
+  - 0.01
+  - 0.02
+  - 0.04
+  - 0.08
+  - 0.16
+  - 0.32
+  - 0.64
+  - 1.28
+  - 2.56
+  - 5.12
+  - 10.24
+  - 20.48
+  - 40.96
+  - 81.92
+  - 163.84
+  - 327.68
+  - 655.36
+  - 1310.72
+  - 2621.44
+  - 5242.88
 - name: kube_pod_resource_limit
   help: Resources limit for workloads on the cluster, broken down by pod. This shows
     the resource usage the scheduler and kubelet expect per pod for resources along
@@ -151,6 +180,7 @@
   help: E2e latency for a pod being scheduled which may include multiple scheduling
     attempts.
   type: Histogram
+  deprecatedVersion: 1.28.0
   stabilityLevel: STABLE
   labels:
   - attempts
diff --git a/test/integration/scheduler_perf/scheduler_perf_test.go b/test/integration/scheduler_perf/scheduler_perf_test.go
index 8671b6381ce..84aae101443 100644
--- a/test/integration/scheduler_perf/scheduler_perf_test.go
+++ b/test/integration/scheduler_perf/scheduler_perf_test.go
@@ -102,7 +102,8 @@ var (
 				label:  resultLabelName,
 				values: []string{metrics.ScheduledResult, metrics.UnschedulableResult, metrics.ErrorResult},
 			},
-			"scheduler_pod_scheduling_duration_seconds": nil,
+			"scheduler_pod_scheduling_duration_seconds":     nil,
+			"scheduler_pod_scheduling_sli_duration_seconds": nil,
 		},
 	}
 )