Merge pull request #117230 from pohly/scheduler-perf-throughput

scheduler_perf: update throughputCollector
Kubernetes Prow Robot authored 2023-04-29 12:12:17 -07:00, committed by GitHub
commit b4c6a70927
2 changed files with 60 additions and 15 deletions


@@ -845,7 +845,7 @@ func runWorkload(ctx context.Context, b *testing.B, tc *testCase, w *workload) [
if concreteOp.CollectMetrics {
collectorCtx, collectorCancel = context.WithCancel(ctx)
defer collectorCancel()
collectors = getTestDataCollectors(podInformer, fmt.Sprintf("%s/%s", b.Name(), namespace), namespace, tc.MetricsCollectorConfig)
collectors = getTestDataCollectors(b, podInformer, fmt.Sprintf("%s/%s", b.Name(), namespace), namespace, tc.MetricsCollectorConfig)
for _, collector := range collectors {
// Need loop-local variable for function below.
collector := collector
@@ -1035,12 +1035,12 @@ type testDataCollector interface {
collect() []DataItem
}
func getTestDataCollectors(podInformer coreinformers.PodInformer, name, namespace string, mcc *metricsCollectorConfig) []testDataCollector {
func getTestDataCollectors(tb testing.TB, podInformer coreinformers.PodInformer, name, namespace string, mcc *metricsCollectorConfig) []testDataCollector {
if mcc == nil {
mcc = &defaultMetricsCollectorConfig
}
return []testDataCollector{
newThroughputCollector(podInformer, map[string]string{"Name": name}, []string{namespace}),
newThroughputCollector(tb, podInformer, map[string]string{"Name": name}, []string{namespace}),
newMetricsCollector(mcc, map[string]string{"Name": name}),
}
}
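
With testing.TB now threaded from the benchmark through getTestDataCollectors into each collector, a collector can report problems as benchmark failures instead of only logging them. Below is a minimal sketch of that pattern as a standalone _test.go file; the package name sketch, the toyCollector type, and checkInterval are illustrative inventions, not part of scheduler_perf.

package sketch

import (
	"math"
	"testing"
)

// toyCollector is a hypothetical stand-in for a scheduler_perf data collector
// that holds a testing.TB so it can report problems as benchmark failures.
type toyCollector struct {
	tb testing.TB
}

// checkInterval compares a measured sampling interval (in seconds) against the
// expected one and reports deviations above 5% via tb.Errorf, which marks the
// benchmark as failed but lets it keep running.
func (c *toyCollector) checkInterval(measured, expected float64) {
	deviation := (measured - expected) / expected * 100
	if math.Abs(deviation) > 5 {
		c.tb.Errorf("sampling interval off by %0.1f%%", deviation)
	}
}

func BenchmarkToyCollector(b *testing.B) {
	c := &toyCollector{tb: b}
	for i := 0; i < b.N; i++ {
		c.checkInterval(1.02, 1.00) // within 5%, so no failure is reported
	}
}

Running this with go test -bench BenchmarkToyCollector passes; a deviation above 5% would make tb.Errorf mark the benchmark as failed while still letting it finish, which is the behavior the updated throughputCollector relies on via tc.tb.Errorf below.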


@@ -50,10 +50,10 @@ import (
)
const (
dateFormat = "2006-01-02T15:04:05Z"
testNamespace = "sched-test"
setupNamespace = "sched-setup"
throughputSampleFrequency = time.Second
dateFormat = "2006-01-02T15:04:05Z"
testNamespace = "sched-test"
setupNamespace = "sched-setup"
throughputSampleInterval = time.Second
)
var dataItemsDir = flag.String("data-items-dir", "", "destination directory for storing generated data items for perf dashboard")
@@ -286,14 +286,16 @@ func collectHistogramVec(metric string, labels map[string]string, lvMap map[stri
}
type throughputCollector struct {
tb testing.TB
podInformer coreinformers.PodInformer
schedulingThroughputs []float64
labels map[string]string
namespaces []string
}
func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[string]string, namespaces []string) *throughputCollector {
func newThroughputCollector(tb testing.TB, podInformer coreinformers.PodInformer, labels map[string]string, namespaces []string) *throughputCollector {
return &throughputCollector{
tb: tb,
podInformer: podInformer,
labels: labels,
namespaces: namespaces,
@@ -306,28 +308,71 @@ func (tc *throughputCollector) run(ctx context.Context) {
klog.Fatalf("%v", err)
}
lastScheduledCount := len(podsScheduled)
ticker := time.NewTicker(throughputSampleFrequency)
ticker := time.NewTicker(throughputSampleInterval)
defer ticker.Stop()
lastSampleTime := time.Now()
started := false
skipped := 0
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
now := time.Now()
podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
if err != nil {
klog.Fatalf("%v", err)
}
scheduled := len(podsScheduled)
// Only do sampling if number of scheduled pods is greater than zero
if scheduled > 0 {
samplingRatioSeconds := float64(throughputSampleFrequency) / float64(time.Second)
throughput := float64(scheduled-lastScheduledCount) / samplingRatioSeconds
tc.schedulingThroughputs = append(tc.schedulingThroughputs, throughput)
// Only do sampling if number of scheduled pods is greater than zero.
if scheduled == 0 {
continue
}
if !started {
started = true
// Skip the initial sample. It's likely to be an outlier because
// sampling and creating pods get started independently.
lastScheduledCount = scheduled
klog.Infof("%d pods scheduled", lastScheduledCount)
lastSampleTime = now
continue
}
newScheduled := scheduled - lastScheduledCount
if newScheduled == 0 {
// Throughput would be zero for the interval.
// Instead of recording 0 pods/s, keep waiting
// until we see at least one additional pod
// being scheduled.
skipped++
continue
}
// This should be roughly equal to
// throughputSampleInterval * (skipped + 1), but we
// don't count on that because the goroutine might not
// be scheduled immediately when the timer
// triggers. Instead we track the actual time stamps.
duration := now.Sub(lastSampleTime)
durationInSeconds := duration.Seconds()
throughput := float64(newScheduled) / durationInSeconds
expectedDuration := throughputSampleInterval * time.Duration(skipped+1)
errorMargin := (duration - expectedDuration).Seconds() / expectedDuration.Seconds() * 100
if math.Abs(errorMargin) > 5 {
// This might affect the result, report it.
tc.tb.Errorf("ERROR: Expected throughput collector to sample at regular time intervals. The %d most recent intervals took %s instead of %s, a difference of %0.1f%%.", skipped+1, duration, expectedDuration, errorMargin)
}
// To keep percentiles accurate, we have to record multiple samples with the same
// throughput value if we skipped some intervals.
for i := 0; i <= skipped; i++ {
tc.schedulingThroughputs = append(tc.schedulingThroughputs, throughput)
}
lastScheduledCount = scheduled
klog.Infof("%d pods scheduled", lastScheduledCount)
skipped = 0
lastSampleTime = now
}
}
}
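
The updated run loop changes the throughput math in three ways: it derives throughput from the measured elapsed time instead of the nominal ticker interval, it skips intervals in which no new pod was scheduled, and it records the resulting value once per covered interval so percentiles are not skewed by the skipping. A self-contained sketch of that arithmetic, assuming a hypothetical helper named sample that is not part of the real file:

package main

import (
	"fmt"
	"math"
	"time"
)

// sample mirrors the arithmetic of the updated collector loop for one recorded
// data point: newScheduled pods were observed since the last recorded sample,
// duration is the measured wall-clock time since then, and skipped counts the
// intervals in which nothing new was scheduled. It returns the samples to
// append and the deviation (in percent) from the nominal sampling cadence.
func sample(newScheduled, skipped int, duration, interval time.Duration) ([]float64, float64) {
	throughput := float64(newScheduled) / duration.Seconds()

	// duration should be roughly interval*(skipped+1); a larger gap means the
	// goroutine ran late, which may distort the result.
	expected := interval * time.Duration(skipped+1)
	errorMargin := (duration - expected).Seconds() / expected.Seconds() * 100

	// Record the same throughput once per covered interval so that percentiles
	// over the samples stay representative despite the skipping.
	samples := make([]float64, 0, skipped+1)
	for i := 0; i <= skipped; i++ {
		samples = append(samples, throughput)
	}
	return samples, errorMargin
}

func main() {
	// Example: two empty intervals were skipped, then 300 newly scheduled pods
	// were observed 3.03s after the last recorded sample.
	samples, margin := sample(300, 2, 3030*time.Millisecond, time.Second)
	fmt.Printf("samples: %v\n", samples) // three entries of ~99.0 pods/s
	fmt.Printf("deviation: %.1f%%, within 5%%: %v\n", margin, math.Abs(margin) <= 5)
}

In this example, 300 newly scheduled pods seen 3.03s after the last recorded sample, with two empty intervals skipped, yield three samples of roughly 99 pods/s and a 1.0% deviation from the expected 3s, well under the 5% threshold that triggers tb.Errorf in the real collector.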