Merge pull request #64266 from shyamjvs/measure-max-scheduler-throughput-metric

Automatic merge from submit-queue (batch tested with PRs 63232, 64257, 64183, 64266, 64134). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Measure scheduler throughput in density test

This is a step towards exposing scheduler-related metrics on [perf-dash](http://perf-dash.k8s.io/).
This particular PR adds scheduler throughput computation and makes the results available in our test artifacts.
That way, when you run experiments, you'll have historical baseline data to compare against.
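
For context, a minimal sketch of the quantity being recorded: pods newly scheduled during one poll interval, divided by the interval length in seconds. The function and numbers below are illustrative only, not the actual test code.

```go
// Illustrative sketch (not the e2e test code): scheduling throughput over one
// poll interval = newly scheduled pods / interval length in seconds.
package main

import (
    "fmt"
    "time"
)

func schedulingThroughput(scheduledNow, scheduledBefore int, period time.Duration) float64 {
    return float64(scheduledNow-scheduledBefore) / period.Seconds()
}

func main() {
    // E.g. 150 pods scheduled at the previous tick, 200 now, with a 10s poll interval.
    fmt.Printf("throughput: %.1f pods/s\n", schedulingThroughput(200, 150, 10*time.Second))
    // Prints: throughput: 5.0 pods/s
}
```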

xref https://github.com/kubernetes/kubernetes/issues/63493

fyi - @wojtek-t @davidopp @bsalamat @misterikkit 
cc @kubernetes/sig-scheduling-misc @kubernetes/sig-scalability-misc 

```release-note
NONE
```
Kubernetes Submit Queue, 2018-05-25 08:24:22 -07:00 (committed by GitHub)
commit b8db949560
3 changed files with 43 additions and 25 deletions

First file: the e2e framework's scheduler metrics summary, where `SchedulingLatency` is renamed to `SchedulingMetrics` and gains a `ThroughputSamples` field.

```diff
@@ -206,21 +206,22 @@ func (l *PodStartupLatency) PrintJSON() string {
     return PrettyPrintJSON(PodStartupLatencyToPerfData(l))
 }
-type SchedulingLatency struct {
-    Scheduling LatencyMetric `json:"scheduling"`
-    Binding LatencyMetric `json:"binding"`
-    Total LatencyMetric `json:"total"`
+type SchedulingMetrics struct {
+    SchedulingLatency LatencyMetric `json:"schedulingLatency"`
+    BindingLatency LatencyMetric `json:"bindingLatency"`
+    E2ELatency LatencyMetric `json:"e2eLatency"`
+    ThroughputSamples []float64 `json:"throughputSamples"`
 }
-func (l *SchedulingLatency) SummaryKind() string {
-    return "SchedulingLatency"
+func (l *SchedulingMetrics) SummaryKind() string {
+    return "SchedulingMetrics"
 }
-func (l *SchedulingLatency) PrintHumanReadable() string {
+func (l *SchedulingMetrics) PrintHumanReadable() string {
     return PrettyPrintJSON(l)
 }
-func (l *SchedulingLatency) PrintJSON() string {
+func (l *SchedulingMetrics) PrintJSON() string {
     return PrettyPrintJSON(l)
 }
@@ -438,9 +439,9 @@ func getMetrics(c clientset.Interface) (string, error) {
     return string(body), nil
 }
-// Retrieves scheduler metrics information.
-func getSchedulingLatency(c clientset.Interface) (*SchedulingLatency, error) {
-    result := SchedulingLatency{}
+// Retrieves scheduler latency metrics.
+func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
+    result := SchedulingMetrics{}
     // Check if master Node is registered
     nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
@@ -491,11 +492,11 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingLatency, error) {
         var metric *LatencyMetric = nil
         switch sample.Metric[model.MetricNameLabel] {
         case "scheduler_scheduling_algorithm_latency_microseconds":
-            metric = &result.Scheduling
+            metric = &result.SchedulingLatency
         case "scheduler_binding_latency_microseconds":
-            metric = &result.Binding
+            metric = &result.BindingLatency
         case "scheduler_e2e_scheduling_latency_microseconds":
-            metric = &result.Total
+            metric = &result.E2ELatency
         }
         if metric == nil {
             continue
@@ -512,7 +513,7 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingLatency, error) {
 }
 // Verifies (currently just by logging them) the scheduling latencies.
-func VerifySchedulerLatency(c clientset.Interface) (*SchedulingLatency, error) {
+func VerifySchedulerLatency(c clientset.Interface) (*SchedulingMetrics, error) {
     latency, err := getSchedulingLatency(c)
     if err != nil {
         return nil, err
```
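
Since the summary is emitted with `PrettyPrintJSON`, these field names are exactly what ends up in the test artifacts. Below is a rough, stand-alone sketch of the resulting JSON shape; the `LatencyMetric` type and the sample values are simplified placeholders, not the framework's definitions.

```go
// Illustrative only: approximate shape of the SchedulingMetrics summary JSON.
package main

import (
    "encoding/json"
    "fmt"
    "time"
)

// Simplified stand-in for the framework's LatencyMetric.
type LatencyMetric struct {
    Perc50 time.Duration `json:"Perc50"`
    Perc90 time.Duration `json:"Perc90"`
    Perc99 time.Duration `json:"Perc99"`
}

// Field names and JSON tags taken from the diff above.
type SchedulingMetrics struct {
    SchedulingLatency LatencyMetric `json:"schedulingLatency"`
    BindingLatency    LatencyMetric `json:"bindingLatency"`
    E2ELatency        LatencyMetric `json:"e2eLatency"`
    ThroughputSamples []float64     `json:"throughputSamples"`
}

func main() {
    m := SchedulingMetrics{ThroughputSamples: []float64{4.8, 5.1, 5.0}} // placeholder samples
    out, _ := json.MarshalIndent(m, "", "  ")
    fmt.Println(string(out))
}
```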

Second file: the density test. `logPodStartupStatus` now computes a scheduling-throughput sample on every poll interval, and the collected samples are attached to the scheduler metrics summary.

```diff
@@ -224,32 +224,42 @@ func density30AddonResourceVerifier(numNodes int) map[string]framework.ResourceC
     return constraints
 }
-func logPodStartupStatus(c clientset.Interface, expectedPods int, observedLabels map[string]string, period time.Duration, stopCh chan struct{}) {
+func logPodStartupStatus(
+    c clientset.Interface,
+    expectedPods int,
+    observedLabels map[string]string,
+    period time.Duration,
+    scheduleThroughputs []float64,
+    stopCh chan struct{}) {
     label := labels.SelectorFromSet(labels.Set(observedLabels))
     podStore, err := testutils.NewPodStore(c, metav1.NamespaceAll, label, fields.Everything())
     framework.ExpectNoError(err)
     defer podStore.Stop()
     ticker := time.NewTicker(period)
+    startupStatus := testutils.ComputeRCStartupStatus(podStore.List(), expectedPods)
+    lastScheduledCount := startupStatus.Scheduled
     defer ticker.Stop()
     for {
         select {
         case <-ticker.C:
-            pods := podStore.List()
-            startupStatus := testutils.ComputeRCStartupStatus(pods, expectedPods)
-            framework.Logf(startupStatus.String("Density"))
         case <-stopCh:
             pods := podStore.List()
             startupStatus := testutils.ComputeRCStartupStatus(pods, expectedPods)
             framework.Logf(startupStatus.String("Density"))
             return
         }
+        // Log status of the pods.
+        startupStatus := testutils.ComputeRCStartupStatus(podStore.List(), expectedPods)
+        framework.Logf(startupStatus.String("Density"))
+        // Compute scheduling throughput for the latest time period.
+        throughput := float64(startupStatus.Scheduled-lastScheduledCount) / float64(period/time.Second)
+        scheduleThroughputs = append(scheduleThroughputs, throughput)
+        lastScheduledCount = startupStatus.Scheduled
     }
 }
 // runDensityTest will perform a density test and return the time it took for
 // all pods to start
-func runDensityTest(dtc DensityTestConfig, testPhaseDurations *timer.TestPhaseTimer) time.Duration {
+func runDensityTest(dtc DensityTestConfig, testPhaseDurations *timer.TestPhaseTimer, scheduleThroughputs []float64) time.Duration {
     defer GinkgoRecover()
     // Create all secrets, configmaps and daemons.
@@ -274,7 +284,7 @@ func runDensityTest(dtc DensityTestConfig, testPhaseDurations *timer.TestPhaseTi
         }()
     }
     logStopCh := make(chan struct{})
-    go logPodStartupStatus(dtc.ClientSets[0], dtc.PodCount, map[string]string{"type": "densityPod"}, dtc.PollInterval, logStopCh)
+    go logPodStartupStatus(dtc.ClientSets[0], dtc.PodCount, map[string]string{"type": "densityPod"}, dtc.PollInterval, scheduleThroughputs, logStopCh)
     wg.Wait()
     startupTime := time.Since(startTime)
     close(logStopCh)
@@ -355,6 +365,7 @@ var _ = SIGDescribe("Density", func() {
     var nodeCpuCapacity int64
     var nodeMemCapacity int64
     var nodes *v1.NodeList
+    var scheduleThroughputs []float64
     testCaseBaseName := "density"
     missingMeasurements := 0
@@ -397,6 +408,7 @@ var _ = SIGDescribe("Density", func() {
         latency, err := framework.VerifySchedulerLatency(c)
         framework.ExpectNoError(err)
         if err == nil {
+            latency.ThroughputSamples = scheduleThroughputs
             summaries = append(summaries, latency)
         }
         summaries = append(summaries, testPhaseDurations)
@@ -643,7 +655,7 @@ var _ = SIGDescribe("Density", func() {
             LogFunc: framework.Logf,
         })
     }
-    e2eStartupTime = runDensityTest(dConfig, testPhaseDurations)
+    e2eStartupTime = runDensityTest(dConfig, testPhaseDurations, scheduleThroughputs)
     if itArg.runLatencyTest {
         By("Scheduling additional Pods to measure startup latencies")
```

Third file: the test utilities. `RCStartupStatus` gains a `Scheduled` count, incremented for every pod that has already been bound to a node.

```diff
@@ -655,6 +655,7 @@ type RCStartupStatus struct {
     RunningButNotReady int
     Waiting int
     Pending int
+    Scheduled int
     Unknown int
     Inactive int
     FailedContainers int
@@ -708,6 +709,10 @@ func ComputeRCStartupStatus(pods []*v1.Pod, expected int) RCStartupStatus {
         } else if p.Status.Phase == v1.PodUnknown {
             startupStatus.Unknown++
         }
+        // Record count of scheduled pods (useful for computing scheduler throughput).
+        if p.Spec.NodeName != "" {
+            startupStatus.Scheduled++
+        }
     }
     return startupStatus
 }
```
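
For reference, the "scheduled" condition used above is simply that the scheduler has set the pod's `Spec.NodeName`. A small stand-alone sketch of that check (assumes the `k8s.io/api` module is available):

```go
// Sketch: count pods that the scheduler has already bound to a node.
package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

func countScheduled(pods []*v1.Pod) int {
    n := 0
    for _, p := range pods {
        if p.Spec.NodeName != "" { // same condition as the diff above
            n++
        }
    }
    return n
}

func main() {
    pods := []*v1.Pod{
        {Spec: v1.PodSpec{NodeName: "node-1"}}, // already scheduled
        {},                                     // not yet scheduled
    }
    fmt.Println("scheduled pods:", countScheduled(pods))
}
```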