From 3047ab73f5ac8b711b55884f55ab809e96e68fbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Skocze=C5=84?=
Date: Thu, 5 Sep 2024 13:39:32 +0000
Subject: [PATCH 1/2] Reset only metrics configured in collector before the createPodsOp

---
 .../scheduler_perf/scheduler_perf.go          | 14 +++++---------
 test/integration/scheduler_perf/util.go       | 20 +++++++++++++++++---
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
index d08c643b172..183f56bc3bb 100644
--- a/test/integration/scheduler_perf/scheduler_perf.go
+++ b/test/integration/scheduler_perf/scheduler_perf.go
@@ -1139,7 +1139,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
 	for _, collector := range collectors {
 		// Need loop-local variable for function below.
 		collector := collector
-		collector.init()
+		err = collector.init()
+		if err != nil {
+			tCtx.Fatalf("op %d: Failed to initialize data collector: %v", opIndex, err)
+		}
 		collectorWG.Add(1)
 		go func() {
 			defer collectorWG.Done()
@@ -1205,13 +1208,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact
 			}()
 		}
 
-		if !concreteOp.SkipWaitToCompletion {
-			// SkipWaitToCompletion=false indicates this step has waited for the Pods to be scheduled.
-			// So we reset the metrics in global registry; otherwise metrics gathered in this step
-			// will be carried over to next step.
-			legacyregistry.Reset()
-		}
-
 	case *churnOp:
 		var namespace string
 		if concreteOp.Namespace != nil {
@@ -1376,7 +1372,7 @@ func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsP
 }
 
 type testDataCollector interface {
-	init()
+	init() error
 	run(tCtx ktesting.TContext)
 	collect() []DataItem
 }
diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go
index 1994b299e4a..2e93d42a62c 100644
--- a/test/integration/scheduler_perf/util.go
+++ b/test/integration/scheduler_perf/util.go
@@ -263,9 +263,22 @@ func newMetricsCollector(config *metricsCollectorConfig, labels map[string]strin
 	}
 }
 
-func (mc *metricsCollector) init() {
+func (mc *metricsCollector) init() error {
 	// Reset the metrics so that the measurements do not interfere with those collected during the previous steps.
-	legacyregistry.Reset()
+	m, err := legacyregistry.DefaultGatherer.Gather()
+	if err != nil {
+		return fmt.Errorf("failed to gather metrics to reset: %w", err)
+	}
+	for _, mFamily := range m {
+		// Reset only metrics defined in the collector.
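+		// mc.Metrics is keyed by the metric family name, so only the
+		// families this collector is configured to gather are reset;
+		// every other metric in the registry keeps its accumulated values.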
+		if _, ok := mc.Metrics[mFamily.GetName()]; ok {
+			mFamily.Reset()
+		}
+	}
+	return nil
 }
 
 func (*metricsCollector) run(tCtx ktesting.TContext) {
@@ -381,7 +391,8 @@ func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[st
 	}
 }
 
-func (tc *throughputCollector) init() {
+func (tc *throughputCollector) init() error {
+	return nil
 }
 
 func (tc *throughputCollector) run(tCtx ktesting.TContext) {

From 7d4c713520d93500868b2bf61d7789dee977dbcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Skocze=C5=84?=
Date: Thu, 5 Sep 2024 13:40:36 +0000
Subject: [PATCH 2/2] Check if InFlightEvents is empty after scheduler_perf workload

---
 pkg/scheduler/framework/events.go             | 36 +++++++++++++++++++
 .../scheduler_perf/scheduler_perf.go          | 30 ++++++++++++++++
 .../scheduler_perf/scheduler_test.go          | 14 ++++++++
 3 files changed, 80 insertions(+)

diff --git a/pkg/scheduler/framework/events.go b/pkg/scheduler/framework/events.go
index 9284c6bb022..333502c8259 100644
--- a/pkg/scheduler/framework/events.go
+++ b/pkg/scheduler/framework/events.go
@@ -105,6 +105,42 @@ var (
 	WildCardEvent = ClusterEvent{Resource: WildCard, ActionType: All, Label: "WildCardEvent"}
 	// UnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
 	UnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, Label: "UnschedulableTimeout"}
+	// AllEvents contains all events defined above.
+	AllEvents = []ClusterEvent{
+		AssignedPodAdd,
+		NodeAdd,
+		NodeDelete,
+		AssignedPodUpdate,
+		UnscheduledPodAdd,
+		UnscheduledPodUpdate,
+		UnscheduledPodDelete,
+		assignedPodOtherUpdate,
+		AssignedPodDelete,
+		PodRequestScaledDown,
+		PodLabelChange,
+		PodTolerationChange,
+		PodSchedulingGateEliminatedChange,
+		NodeSpecUnschedulableChange,
+		NodeAllocatableChange,
+		NodeLabelChange,
+		NodeAnnotationChange,
+		NodeTaintChange,
+		NodeConditionChange,
+		PvAdd,
+		PvUpdate,
+		PvcAdd,
+		PvcUpdate,
+		StorageClassAdd,
+		StorageClassUpdate,
+		CSINodeAdd,
+		CSINodeUpdate,
+		CSIDriverAdd,
+		CSIDriverUpdate,
+		CSIStorageCapacityAdd,
+		CSIStorageCapacityUpdate,
+		WildCardEvent,
+		UnschedulableTimeout,
+	}
 )
 
 // PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).
diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
index 183f56bc3bb..abc5319422a 100644
--- a/test/integration/scheduler_perf/scheduler_perf.go
+++ b/test/integration/scheduler_perf/scheduler_perf.go
@@ -52,10 +52,13 @@ import (
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	logsapi "k8s.io/component-base/logs/api/v1"
 	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/component-base/metrics/testutil"
 	"k8s.io/klog/v2"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
+	schedframework "k8s.io/kubernetes/pkg/scheduler/framework"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
 	"k8s.io/kubernetes/pkg/scheduler/metrics"
@@ -927,6 +930,16 @@ func RunBenchmarkPerfScheduling(b *testing.B, outOfTreePluginRegistry frameworkr
 			}
 		}
 
+	if tc.FeatureGates[features.SchedulerQueueingHints] {
+		// In any case, we should make sure InFlightEvents is empty after running the scenario.
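+		// A non-zero InFlightEvents gauge for any event label at this point
+		// would mean the scheduling queue still considers some event to be
+		// in flight, i.e. the queueing-hints bookkeeping leaked during the run.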
+		if err := checkEmptyInFlightEvents(); err != nil {
+			tCtx.Errorf("%s: %s", w.Name, err)
+		}
+	}
+
 	// Reset metrics to prevent metrics generated in current workload gets
 	// carried over to the next workload.
 	legacyregistry.Reset()
@@ -1027,6 +1037,23 @@ func compareMetricWithThreshold(items []DataItem, threshold float64, metricSelec
 	return nil
 }
 
+func checkEmptyInFlightEvents() error {
+	labels := []string{metrics.PodPoppedInFlightEvent}
+	for _, event := range schedframework.AllEvents {
+		labels = append(labels, event.Label)
+	}
+	for _, label := range labels {
+		value, err := testutil.GetGaugeMetricValue(metrics.InFlightEvents.WithLabelValues(label))
+		if err != nil {
+			return fmt.Errorf("failed to get InFlightEvents metric for label %s: %w", label, err)
+		}
+		if value > 0 {
+			return fmt.Errorf("InFlightEvents for label %s should be empty, but has %v items", label, value)
+		}
+	}
+	return nil
+}
+
 func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem {
 	b, benchmarking := tCtx.TB().(*testing.B)
 	if benchmarking {
diff --git a/test/integration/scheduler_perf/scheduler_test.go b/test/integration/scheduler_perf/scheduler_test.go
index fc1dd5cb89e..2142d009dfc 100644
--- a/test/integration/scheduler_perf/scheduler_test.go
+++ b/test/integration/scheduler_perf/scheduler_test.go
@@ -18,6 +18,9 @@ package benchmark
 
 import (
 	"testing"
+
+	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/kubernetes/pkg/features"
 )
 
 func TestScheduling(t *testing.T) {
@@ -43,6 +46,17 @@ func TestScheduling(t *testing.T) {
 
 			informerFactory, tCtx := setupTestCase(t, tc, nil, nil)
 			runWorkload(tCtx, tc, w, informerFactory)
+
+			if tc.FeatureGates[features.SchedulerQueueingHints] {
+				// In any case, we should make sure InFlightEvents is empty after running the scenario.
+				if err := checkEmptyInFlightEvents(); err != nil {
+					tCtx.Errorf("%s: %s", w.Name, err)
+				}
+			}
+
+			// Reset metrics to prevent metrics generated in the current workload
+			// from being carried over to the next workload.
+			legacyregistry.Reset()
 		})
 	}
 })