From 659804b765c4be5f0a9ee5b9c055f21c8b19493e Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sun, 26 Jan 2025 19:39:38 +0900 Subject: [PATCH 01/25] refactor runWorkloads --- .../scheduler_perf/scheduler_perf.go | 691 ++++++++++-------- 1 file changed, 378 insertions(+), 313 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index a69100bff9e..0ce499443bc 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1450,6 +1450,40 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex return dataItems } +type MetricsCollectionData struct { + Collectors []testDataCollector + // This needs a separate context and wait group because + // the metrics collecting needs to be sure that the goroutines + // are stopped. + CollectorCtx ktesting.TContext + CollectorWG *sync.WaitGroup + // Disable error checking of the sampling interval length in the + // throughput collector by default. When running benchmarks, report + // it as test failure when samples are not taken regularly. + ThroughputErrorMargin float64 +} + +type WorkloadState struct { + DataItems []DataItem + NextNodeIndex int + // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. + // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. + NumPodsScheduledPerNamespace map[string]int +} + +type SharedOperationData struct { + // Additional informers needed for testing. The pod informer was + // already created before (scheduler.NewInformerFactory) and the + // factory was started for it (mustSetupCluster), therefore we don't + // need to start again. + PodInformer coreinformers.PodInformer + MetricsData *MetricsCollectionData + WorkloadState *WorkloadState + TCtx ktesting.TContext + WG sync.WaitGroup + CancelFunc context.CancelFunc +} + func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem { b, benchmarking := tCtx.TB().(*testing.B) if benchmarking { @@ -1463,9 +1497,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact }) } - // Disable error checking of the sampling interval length in the - // throughput collector by default. When running benchmarks, report - // it as test failure when samples are not taken regularly. var throughputErrorMargin float64 if benchmarking { // TODO: To prevent the perf-test failure, we increased the error margin, if still not enough @@ -1473,12 +1504,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact throughputErrorMargin = 30 } - // Additional informers needed for testing. The pod informer was - // already created before (scheduler.NewInformerFactory) and the - // factory was started for it (mustSetupCluster), therefore we don't - // need to start again. - podInformer := informerFactory.Core().V1().Pods() - // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) var wg sync.WaitGroup @@ -1486,315 +1511,25 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact defer tCtx.Cancel("workload is done") var dataItems []DataItem - nextNodeIndex := 0 - // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. - // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. - numPodsScheduledPerNamespace := make(map[string]int) - var collectors []testDataCollector - // This needs a separate context and wait group because - // the metrics collecting needs to be sure that the goroutines - // are stopped. - var collectorCtx ktesting.TContext var collectorWG sync.WaitGroup defer collectorWG.Wait() + sharedOperationData := SharedOperationData{ + TCtx: tCtx, + WG: wg, + MetricsData: &MetricsCollectionData{ + CollectorWG: &sync.WaitGroup{}, + ThroughputErrorMargin: throughputErrorMargin, + }, + WorkloadState: &WorkloadState{ + NumPodsScheduledPerNamespace: make(map[string]int), + }, + PodInformer: informerFactory.Core().V1().Pods(), + } + for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { - realOp, err := op.realOp.patchParams(w) - if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - select { - case <-tCtx.Done(): - tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx)) - default: - } - switch concreteOp := realOp.(type) { - case *createNodesOp: - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, tCtx.Client()) - if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - if err := nodePreparer.PrepareNodes(tCtx, nextNodeIndex); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - nextNodeIndex += concreteOp.Count - - case *createNamespacesOp: - nsPreparer, err := newNamespacePreparer(tCtx, concreteOp) - if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - if err := nsPreparer.prepare(tCtx); err != nil { - err2 := nsPreparer.cleanup(tCtx) - if err2 != nil { - err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) - } - tCtx.Fatalf("op %d: %v", opIndex, err) - } - for _, n := range nsPreparer.namespaces() { - if _, ok := numPodsScheduledPerNamespace[n]; ok { - // this namespace has been already created. - continue - } - numPodsScheduledPerNamespace[n] = 0 - } - - case *createPodsOp: - var namespace string - // define Pod's namespace automatically, and create that namespace. - namespace = fmt.Sprintf("namespace-%d", opIndex) - if concreteOp.Namespace != nil { - namespace = *concreteOp.Namespace - } - createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) - if concreteOp.PodTemplatePath == nil { - concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath - } - - if concreteOp.CollectMetrics { - if collectorCtx != nil { - tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) - } - collectorCtx, collectors = startCollectingMetrics(tCtx, &collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) - defer collectorCtx.Cancel("cleaning up") - } - if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - switch { - case concreteOp.SkipWaitToCompletion: - // Only record those namespaces that may potentially require barriers - // in the future. - numPodsScheduledPerNamespace[namespace] += concreteOp.Count - case concreteOp.SteadyState: - if err := createPodsSteadily(tCtx, namespace, podInformer, concreteOp); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - default: - if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, nil, namespace, concreteOp.Count); err != nil { - tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) - } - } - if concreteOp.CollectMetrics { - // CollectMetrics and SkipWaitToCompletion can never be true at the - // same time, so if we're here, it means that all pods have been - // scheduled. - items := stopCollectingMetrics(tCtx, collectorCtx, &collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors) - dataItems = append(dataItems, items...) - collectorCtx = nil - } - - case *deletePodsOp: - labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) - - podsToDelete, err := podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) - if err != nil { - tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) - } - - deletePods := func(opIndex int) { - if concreteOp.DeletePodsPerSecond > 0 { - ticker := time.NewTicker(time.Second / time.Duration(concreteOp.DeletePodsPerSecond)) - defer ticker.Stop() - - for i := 0; i < len(podsToDelete); i++ { - select { - case <-ticker.C: - if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { - if errors.Is(err, context.Canceled) { - return - } - tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) - } - case <-tCtx.Done(): - return - } - } - return - } - listOpts := metav1.ListOptions{ - LabelSelector: labelSelector.String(), - } - if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil { - if errors.Is(err, context.Canceled) { - return - } - tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) - } - } - - if concreteOp.SkipWaitToCompletion { - wg.Add(1) - go func(opIndex int) { - defer wg.Done() - deletePods(opIndex) - }(opIndex) - } else { - deletePods(opIndex) - } - - case *churnOp: - var namespace string - if concreteOp.Namespace != nil { - namespace = *concreteOp.Namespace - } else { - namespace = fmt.Sprintf("namespace-%d", opIndex) - } - restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(tCtx.Client().Discovery())) - // Ensure the namespace exists. - nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} - if _, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) - } - - var churnFns []func(name string) string - - for i, path := range concreteOp.TemplatePaths { - unstructuredObj, gvk, err := getUnstructuredFromFile(path) - if err != nil { - tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) - } - // Obtain GVR. - mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) - if err != nil { - tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) - } - gvr := mapping.Resource - // Distinguish cluster-scoped with namespaced API objects. - var dynRes dynamic.ResourceInterface - if mapping.Scope.Name() == meta.RESTScopeNameNamespace { - dynRes = tCtx.Dynamic().Resource(gvr).Namespace(namespace) - } else { - dynRes = tCtx.Dynamic().Resource(gvr) - } - - churnFns = append(churnFns, func(name string) string { - if name != "" { - if err := dynRes.Delete(tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { - tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) - } - return "" - } - - live, err := dynRes.Create(tCtx, unstructuredObj, metav1.CreateOptions{}) - if err != nil { - return "" - } - return live.GetName() - }) - } - - var interval int64 = 500 - if concreteOp.IntervalMilliseconds != 0 { - interval = concreteOp.IntervalMilliseconds - } - ticker := time.NewTicker(time.Duration(interval) * time.Millisecond) - defer ticker.Stop() - - switch concreteOp.Mode { - case Create: - wg.Add(1) - go func() { - defer wg.Done() - count, threshold := 0, concreteOp.Number - if threshold == 0 { - threshold = math.MaxInt32 - } - for count < threshold { - select { - case <-ticker.C: - for i := range churnFns { - churnFns[i]("") - } - count++ - case <-tCtx.Done(): - return - } - } - }() - case Recreate: - wg.Add(1) - go func() { - defer wg.Done() - retVals := make([][]string, len(churnFns)) - // For each churn function, instantiate a slice of strings with length "concreteOp.Number". - for i := range retVals { - retVals[i] = make([]string, concreteOp.Number) - } - - count := 0 - for { - select { - case <-ticker.C: - for i := range churnFns { - retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number]) - } - count++ - case <-tCtx.Done(): - return - } - } - }() - } - - case *barrierOp: - for _, namespace := range concreteOp.Namespaces { - if _, ok := numPodsScheduledPerNamespace[namespace]; !ok { - tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) - } - } - switch concreteOp.StageRequirement { - case Attempted: - if err := waitUntilPodsAttempted(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - case Scheduled: - // Default should be treated like "Scheduled", so handling both in the same way. - fallthrough - default: - if err := waitUntilPodsScheduled(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - // At the end of the barrier, we can be sure that there are no pods - // pending scheduling in the namespaces that we just blocked on. - if len(concreteOp.Namespaces) == 0 { - numPodsScheduledPerNamespace = make(map[string]int) - } else { - for _, namespace := range concreteOp.Namespaces { - delete(numPodsScheduledPerNamespace, namespace) - } - } - } - - case *sleepOp: - select { - case <-tCtx.Done(): - case <-time.After(concreteOp.Duration.Duration): - } - - case *startCollectingMetricsOp: - if collectorCtx != nil { - tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) - } - collectorCtx, collectors = startCollectingMetrics(tCtx, &collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) - defer collectorCtx.Cancel("cleaning up") - - case *stopCollectingMetricsOp: - items := stopCollectingMetrics(tCtx, collectorCtx, &collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors) - dataItems = append(dataItems, items...) - collectorCtx = nil - - default: - runable, ok := concreteOp.(runnableOp) - if !ok { - tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) - } - for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) - } - runable.run(tCtx) - } + runOperation(tc, opIndex, op, w, &sharedOperationData) } // check unused params and inform users @@ -1808,6 +1543,336 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return dataItems } +func runCreateNodesOp(opIndex int, concreteOp *createNodesOp, sharedOperationData *SharedOperationData) { + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, sharedOperationData.TCtx.Client()) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + if err := nodePreparer.PrepareNodes(sharedOperationData.TCtx, sharedOperationData.WorkloadState.NextNodeIndex); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + sharedOperationData.WorkloadState.NextNodeIndex += concreteOp.Count +} + +func runCreateNamespacesOp(opIndex int, concreteOp *createNamespacesOp, sharedOperationData *SharedOperationData) { + nsPreparer, err := newNamespacePreparer(sharedOperationData.TCtx, concreteOp) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + if err := nsPreparer.prepare(sharedOperationData.TCtx); err != nil { + err2 := nsPreparer.cleanup(sharedOperationData.TCtx) + if err2 != nil { + err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) + } + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + for _, n := range nsPreparer.namespaces() { + if _, ok := sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[n]; ok { + // this namespace has been already created. + continue + } + sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[n] = 0 + } +} + +func runCreatePodsOp(tc *testCase, w *workload, opIndex int, concreteOp *createPodsOp, sharedOperationData *SharedOperationData) { + var namespace string + // define Pod's namespace automatically, and create that namespace. + namespace = fmt.Sprintf("namespace-%d", opIndex) + if concreteOp.Namespace != nil { + namespace = *concreteOp.Namespace + } + createNamespaceIfNotPresent(sharedOperationData.TCtx, namespace, &sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace) + if concreteOp.PodTemplatePath == nil { + concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath + } + + if concreteOp.CollectMetrics { + if sharedOperationData.MetricsData.CollectorCtx != nil { + sharedOperationData.TCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + } + sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.Collectors = startCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorWG, sharedOperationData.PodInformer, tc.MetricsCollectorConfig, sharedOperationData.MetricsData.ThroughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + defer sharedOperationData.MetricsData.CollectorCtx.Cancel("cleaning up") + } + if err := createPodsRapidly(sharedOperationData.TCtx, namespace, concreteOp); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + switch { + case concreteOp.SkipWaitToCompletion: + // Only record those namespaces that may potentially require barriers + // in the future. + sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[namespace] += concreteOp.Count + case concreteOp.SteadyState: + if err := createPodsSteadily(sharedOperationData.TCtx, namespace, sharedOperationData.PodInformer, concreteOp); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + default: + if err := waitUntilPodsScheduledInNamespace(sharedOperationData.TCtx, sharedOperationData.PodInformer, nil, namespace, concreteOp.Count); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + } + } + if concreteOp.CollectMetrics { + // CollectMetrics and SkipWaitToCompletion can never be true at the + // same time, so if we're here, it means that all pods have been + // scheduled. + items := stopCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.CollectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.MetricsData.Collectors) + sharedOperationData.WorkloadState.DataItems = append(sharedOperationData.WorkloadState.DataItems, items...) + sharedOperationData.MetricsData.CollectorCtx = nil + } +} + +func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData *SharedOperationData) { + labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) + + podsToDelete, err := sharedOperationData.PodInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) + } + + deletePods := func(opIndex int) { + if concreteOp.DeletePodsPerSecond > 0 { + ticker := time.NewTicker(time.Second / time.Duration(concreteOp.DeletePodsPerSecond)) + defer ticker.Stop() + + for i := 0; i < len(podsToDelete); i++ { + select { + case <-ticker.C: + if err := sharedOperationData.TCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(sharedOperationData.TCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if errors.Is(err, context.Canceled) { + return + } + sharedOperationData.TCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) + } + case <-sharedOperationData.TCtx.Done(): + return + } + } + return + } + listOpts := metav1.ListOptions{ + LabelSelector: labelSelector.String(), + } + if err := sharedOperationData.TCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(sharedOperationData.TCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if errors.Is(err, context.Canceled) { + return + } + sharedOperationData.TCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) + } + } + + if concreteOp.SkipWaitToCompletion { + sharedOperationData.WG.Add(1) + go func(opIndex int) { + defer sharedOperationData.WG.Done() + deletePods(opIndex) + }(opIndex) + } else { + deletePods(opIndex) + } +} + +func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOperationData) { + var namespace string + if concreteOp.Namespace != nil { + namespace = *concreteOp.Namespace + } else { + namespace = fmt.Sprintf("namespace-%d", opIndex) + } + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(sharedOperationData.TCtx.Client().Discovery())) + // Ensure the namespace exists. + nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} + if _, err := sharedOperationData.TCtx.Client().CoreV1().Namespaces().Create(sharedOperationData.TCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + sharedOperationData.TCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + } + + var churnFns []func(name string) string + + for i, path := range concreteOp.TemplatePaths { + unstructuredObj, gvk, err := getUnstructuredFromFile(path) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + } + // Obtain GVR. + mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + } + gvr := mapping.Resource + // Distinguish cluster-scoped with namespaced API objects. + var dynRes dynamic.ResourceInterface + if mapping.Scope.Name() == meta.RESTScopeNameNamespace { + dynRes = sharedOperationData.TCtx.Dynamic().Resource(gvr).Namespace(namespace) + } else { + dynRes = sharedOperationData.TCtx.Dynamic().Resource(gvr) + } + + churnFns = append(churnFns, func(name string) string { + if name != "" { + if err := dynRes.Delete(sharedOperationData.TCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { + sharedOperationData.TCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) + } + return "" + } + + live, err := dynRes.Create(sharedOperationData.TCtx, unstructuredObj, metav1.CreateOptions{}) + if err != nil { + return "" + } + return live.GetName() + }) + } + + var interval int64 = 500 + if concreteOp.IntervalMilliseconds != 0 { + interval = concreteOp.IntervalMilliseconds + } + ticker := time.NewTicker(time.Duration(interval) * time.Millisecond) + defer ticker.Stop() + + switch concreteOp.Mode { + case Create: + sharedOperationData.WG.Add(1) + go func() { + defer sharedOperationData.WG.Done() + count, threshold := 0, concreteOp.Number + if threshold == 0 { + threshold = math.MaxInt32 + } + for count < threshold { + select { + case <-ticker.C: + for i := range churnFns { + churnFns[i]("") + } + count++ + case <-sharedOperationData.TCtx.Done(): + return + } + } + }() + case Recreate: + sharedOperationData.WG.Add(1) + go func() { + defer sharedOperationData.WG.Done() + retVals := make([][]string, len(churnFns)) + // For each churn function, instantiate a slice of strings with length "concreteOp.Number". + for i := range retVals { + retVals[i] = make([]string, concreteOp.Number) + } + + count := 0 + for { + select { + case <-ticker.C: + for i := range churnFns { + retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number]) + } + count++ + case <-sharedOperationData.TCtx.Done(): + return + } + } + }() + } +} + +func runBarrierOp(opIndex int, concreteOp *barrierOp, sharedOperationData *SharedOperationData) { + for _, namespace := range concreteOp.Namespaces { + if _, ok := sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[namespace]; !ok { + sharedOperationData.TCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + } + } + switch concreteOp.StageRequirement { + case Attempted: + if err := waitUntilPodsAttempted(sharedOperationData.TCtx, sharedOperationData.PodInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + case Scheduled: + // Default should be treated like "Scheduled", so handling both in the same way. + fallthrough + default: + if err := waitUntilPodsScheduled(sharedOperationData.TCtx, sharedOperationData.PodInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace); err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + // At the end of the barrier, we can be sure that there are no pods + // pending scheduling in the namespaces that we just blocked on. + if len(concreteOp.Namespaces) == 0 { + sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace = make(map[string]int) + } else { + for _, namespace := range concreteOp.Namespaces { + delete(sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace, namespace) + } + } + } +} + +func runSleepOp(concreteOp *sleepOp, sharedOperationData *SharedOperationData) { + select { + case <-sharedOperationData.TCtx.Done(): + case <-time.After(concreteOp.Duration.Duration): + } +} + +func runStartCollectingMetricsOp(opIndex int, tc *testCase, concreteOp *startCollectingMetricsOp, sharedOperationData *SharedOperationData) { + if sharedOperationData.MetricsData.CollectorCtx != nil { + sharedOperationData.TCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + } + sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.Collectors = startCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorWG, sharedOperationData.PodInformer, tc.MetricsCollectorConfig, sharedOperationData.MetricsData.ThroughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) + + defer sharedOperationData.MetricsData.CollectorCtx.Cancel("cleaning up") +} + +func runStopCollectingMetricsOp(opIndex int, w *workload, sharedOperationData *SharedOperationData) { + items := stopCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.CollectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.MetricsData.Collectors) + sharedOperationData.WorkloadState.DataItems = append(sharedOperationData.WorkloadState.DataItems, items...) + sharedOperationData.MetricsData.CollectorCtx = nil +} + +func runDefault(opIndex int, concreteOp realOp, sharedOperationData *SharedOperationData) { + runable, ok := concreteOp.(runnableOp) + if !ok { + sharedOperationData.TCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) + } + for _, namespace := range runable.requiredNamespaces() { + createNamespaceIfNotPresent(sharedOperationData.TCtx, namespace, &sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace) + } + runable.run(sharedOperationData.TCtx) +} + +func runOperation(tc *testCase, opIndex int, op op, w *workload, sharedOperationData *SharedOperationData) { + realOp, err := op.realOp.patchParams(w) + if err != nil { + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + } + select { + case <-sharedOperationData.TCtx.Done(): + sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, context.Cause(sharedOperationData.TCtx)) + default: + } + switch concreteOp := realOp.(type) { + case *createNodesOp: + runCreateNodesOp(opIndex, concreteOp, sharedOperationData) + case *createNamespacesOp: + runCreateNamespacesOp(opIndex, concreteOp, sharedOperationData) + case *createPodsOp: + runCreatePodsOp(tc, w, opIndex, concreteOp, sharedOperationData) + case *deletePodsOp: + runDeletePodsOp(opIndex, concreteOp, sharedOperationData) + case *churnOp: + runChurnOp(opIndex, concreteOp, sharedOperationData) + case *barrierOp: + runBarrierOp(opIndex, concreteOp, sharedOperationData) + case *sleepOp: + runSleepOp(concreteOp, sharedOperationData) + case *startCollectingMetricsOp: + runStartCollectingMetricsOp(opIndex, tc, concreteOp, sharedOperationData) + case *stopCollectingMetricsOp: + runStopCollectingMetricsOp(opIndex, w, sharedOperationData) + default: + runDefault(opIndex, concreteOp, sharedOperationData) + } +} + func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { if _, ok := (*podsPerNamespace)[namespace]; !ok { // The namespace has not created yet. From 1b0ad78718eb813e7b007ec1f1158cdaa9423cfe Mon Sep 17 00:00:00 2001 From: YamasouA Date: Wed, 29 Jan 2025 23:58:51 +0900 Subject: [PATCH 02/25] fix --- .../scheduler_perf/scheduler_perf.go | 245 ++++++++++-------- 1 file changed, 131 insertions(+), 114 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 0ce499443bc..064dcc6965e 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1450,38 +1450,55 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex return dataItems } -type MetricsCollectionData struct { - Collectors []testDataCollector +// metricsCollectionData manages the state and synchronization of metrics collection +// during workload execution. +type metricsCollectionData struct { + // collectors holds a list of test data collectors used to gather performance metrics. + collectors []testDataCollector + // collectorCtx is a separate context specifically for managing the lifecycle + // of metrics collection goroutines. // This needs a separate context and wait group because // the metrics collecting needs to be sure that the goroutines // are stopped. - CollectorCtx ktesting.TContext - CollectorWG *sync.WaitGroup - // Disable error checking of the sampling interval length in the + collectorCtx ktesting.TContext + collectorWG *sync.WaitGroup + // disable error checking of the sampling interval length in the // throughput collector by default. When running benchmarks, report // it as test failure when samples are not taken regularly. - ThroughputErrorMargin float64 + throughputErrorMargin float64 } -type WorkloadState struct { - DataItems []DataItem - NextNodeIndex int +// WorkloadState holds the state information about the workload being executed. +type workloadState struct { + // dataItems stores the collected data from the workload execution. + dataItems []DataItem + // nextNodeIndex keeps track of the next node index to be used when creating nodes. + nextNodeIndex int // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. - NumPodsScheduledPerNamespace map[string]int + numPodsScheduledPerNamespace map[string]int } -type SharedOperationData struct { +// sharedOperationData encapsulates all shared state and dependencies used during workload execution. +type sharedOperationData struct { + // podInformer provides access to the informer for monitoring Pod events in the cluster. // Additional informers needed for testing. The pod informer was // already created before (scheduler.NewInformerFactory) and the // factory was started for it (mustSetupCluster), therefore we don't // need to start again. - PodInformer coreinformers.PodInformer - MetricsData *MetricsCollectionData - WorkloadState *WorkloadState - TCtx ktesting.TContext - WG sync.WaitGroup - CancelFunc context.CancelFunc + podInformer coreinformers.PodInformer + // metricsData contains information and synchronization primitives for managing + // metrics collection during workload execution. + metricsData *metricsCollectionData + // workloadState holds information about the current state of the workload, + // including scheduled pods and created namespaces. + workloadState *workloadState + // tCtx is the root test context, used for cancellation and logging throughout + // the execution of workload operations. + tCtx ktesting.TContext + // wg is a wait group that tracks all ongoing goroutines in the workload execution. + // Ensures proper synchronization and prevents premature termination of operations. + wg *sync.WaitGroup } func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem { @@ -1507,25 +1524,27 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) var wg sync.WaitGroup - defer wg.Wait() - defer tCtx.Cancel("workload is done") + defer func() { + wg.Wait() + tCtx.Cancel("workload is done") + }() var dataItems []DataItem var collectorWG sync.WaitGroup defer collectorWG.Wait() - sharedOperationData := SharedOperationData{ - TCtx: tCtx, - WG: wg, - MetricsData: &MetricsCollectionData{ - CollectorWG: &sync.WaitGroup{}, - ThroughputErrorMargin: throughputErrorMargin, + sharedOperationData := sharedOperationData{ + tCtx: tCtx, + wg: &wg, + metricsData: &metricsCollectionData{ + collectorWG: &sync.WaitGroup{}, + throughputErrorMargin: throughputErrorMargin, }, - WorkloadState: &WorkloadState{ - NumPodsScheduledPerNamespace: make(map[string]int), + workloadState: &workloadState{ + numPodsScheduledPerNamespace: make(map[string]int), }, - PodInformer: informerFactory.Core().V1().Pods(), + podInformer: informerFactory.Core().V1().Pods(), } for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { @@ -1543,90 +1562,90 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return dataItems } -func runCreateNodesOp(opIndex int, concreteOp *createNodesOp, sharedOperationData *SharedOperationData) { - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, sharedOperationData.TCtx.Client()) +func runCreateNodesOp(opIndex int, concreteOp *createNodesOp, sharedOperationData *sharedOperationData) { + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, sharedOperationData.tCtx.Client()) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nodePreparer.PrepareNodes(sharedOperationData.TCtx, sharedOperationData.WorkloadState.NextNodeIndex); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + if err := nodePreparer.PrepareNodes(sharedOperationData.tCtx, sharedOperationData.workloadState.nextNodeIndex); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } - sharedOperationData.WorkloadState.NextNodeIndex += concreteOp.Count + sharedOperationData.workloadState.nextNodeIndex += concreteOp.Count } -func runCreateNamespacesOp(opIndex int, concreteOp *createNamespacesOp, sharedOperationData *SharedOperationData) { - nsPreparer, err := newNamespacePreparer(sharedOperationData.TCtx, concreteOp) +func runCreateNamespacesOp(opIndex int, concreteOp *createNamespacesOp, sharedOperationData *sharedOperationData) { + nsPreparer, err := newNamespacePreparer(sharedOperationData.tCtx, concreteOp) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nsPreparer.prepare(sharedOperationData.TCtx); err != nil { - err2 := nsPreparer.cleanup(sharedOperationData.TCtx) + if err := nsPreparer.prepare(sharedOperationData.tCtx); err != nil { + err2 := nsPreparer.cleanup(sharedOperationData.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) } - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { - if _, ok := sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[n]; ok { + if _, ok := sharedOperationData.workloadState.numPodsScheduledPerNamespace[n]; ok { // this namespace has been already created. continue } - sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[n] = 0 + sharedOperationData.workloadState.numPodsScheduledPerNamespace[n] = 0 } } -func runCreatePodsOp(tc *testCase, w *workload, opIndex int, concreteOp *createPodsOp, sharedOperationData *SharedOperationData) { +func runCreatePodsOp(tc *testCase, w *workload, opIndex int, concreteOp *createPodsOp, sharedOperationData *sharedOperationData) { var namespace string // define Pod's namespace automatically, and create that namespace. namespace = fmt.Sprintf("namespace-%d", opIndex) if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace } - createNamespaceIfNotPresent(sharedOperationData.TCtx, namespace, &sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace) + createNamespaceIfNotPresent(sharedOperationData.tCtx, namespace, &sharedOperationData.workloadState.numPodsScheduledPerNamespace) if concreteOp.PodTemplatePath == nil { concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath } if concreteOp.CollectMetrics { - if sharedOperationData.MetricsData.CollectorCtx != nil { - sharedOperationData.TCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + if sharedOperationData.metricsData.collectorCtx != nil { + sharedOperationData.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.Collectors = startCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorWG, sharedOperationData.PodInformer, tc.MetricsCollectorConfig, sharedOperationData.MetricsData.ThroughputErrorMargin, opIndex, namespace, []string{namespace}, nil) - defer sharedOperationData.MetricsData.CollectorCtx.Cancel("cleaning up") + sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectors = startCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorWG, sharedOperationData.podInformer, tc.MetricsCollectorConfig, sharedOperationData.metricsData.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + defer sharedOperationData.metricsData.collectorCtx.Cancel("cleaning up") } - if err := createPodsRapidly(sharedOperationData.TCtx, namespace, concreteOp); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsRapidly(sharedOperationData.tCtx, namespace, concreteOp); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } switch { case concreteOp.SkipWaitToCompletion: // Only record those namespaces that may potentially require barriers // in the future. - sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[namespace] += concreteOp.Count + sharedOperationData.workloadState.numPodsScheduledPerNamespace[namespace] += concreteOp.Count case concreteOp.SteadyState: - if err := createPodsSteadily(sharedOperationData.TCtx, namespace, sharedOperationData.PodInformer, concreteOp); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsSteadily(sharedOperationData.tCtx, namespace, sharedOperationData.podInformer, concreteOp); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(sharedOperationData.TCtx, sharedOperationData.PodInformer, nil, namespace, concreteOp.Count); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + if err := waitUntilPodsScheduledInNamespace(sharedOperationData.tCtx, sharedOperationData.podInformer, nil, namespace, concreteOp.Count); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } if concreteOp.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.CollectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.MetricsData.Collectors) - sharedOperationData.WorkloadState.DataItems = append(sharedOperationData.WorkloadState.DataItems, items...) - sharedOperationData.MetricsData.CollectorCtx = nil + items := stopCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.metricsData.collectors) + sharedOperationData.workloadState.dataItems = append(sharedOperationData.workloadState.dataItems, items...) + sharedOperationData.metricsData.collectorCtx = nil } } -func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData *SharedOperationData) { +func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData *sharedOperationData) { labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) - podsToDelete, err := sharedOperationData.PodInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) + podsToDelete, err := sharedOperationData.podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) + sharedOperationData.tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) } deletePods := func(opIndex int) { @@ -1637,13 +1656,13 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData for i := 0; i < len(podsToDelete); i++ { select { case <-ticker.C: - if err := sharedOperationData.TCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(sharedOperationData.TCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if err := sharedOperationData.tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(sharedOperationData.tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { if errors.Is(err, context.Canceled) { return } - sharedOperationData.TCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) + sharedOperationData.tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) } - case <-sharedOperationData.TCtx.Done(): + case <-sharedOperationData.tCtx.Done(): return } } @@ -1652,18 +1671,18 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData listOpts := metav1.ListOptions{ LabelSelector: labelSelector.String(), } - if err := sharedOperationData.TCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(sharedOperationData.TCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if err := sharedOperationData.tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(sharedOperationData.tCtx, metav1.DeleteOptions{}, listOpts); err != nil { if errors.Is(err, context.Canceled) { return } - sharedOperationData.TCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) + sharedOperationData.tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) } } if concreteOp.SkipWaitToCompletion { - sharedOperationData.WG.Add(1) + sharedOperationData.wg.Add(1) go func(opIndex int) { - defer sharedOperationData.WG.Done() + defer sharedOperationData.wg.Done() deletePods(opIndex) }(opIndex) } else { @@ -1671,18 +1690,18 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData } } -func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOperationData) { +func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOperationData) { var namespace string if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace } else { namespace = fmt.Sprintf("namespace-%d", opIndex) } - restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(sharedOperationData.TCtx.Client().Discovery())) + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(sharedOperationData.tCtx.Client().Discovery())) // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} - if _, err := sharedOperationData.TCtx.Client().CoreV1().Namespaces().Create(sharedOperationData.TCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - sharedOperationData.TCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + if _, err := sharedOperationData.tCtx.Client().CoreV1().Namespaces().Create(sharedOperationData.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + sharedOperationData.tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1690,31 +1709,31 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOpe for i, path := range concreteOp.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + sharedOperationData.tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + sharedOperationData.tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. var dynRes dynamic.ResourceInterface if mapping.Scope.Name() == meta.RESTScopeNameNamespace { - dynRes = sharedOperationData.TCtx.Dynamic().Resource(gvr).Namespace(namespace) + dynRes = sharedOperationData.tCtx.Dynamic().Resource(gvr).Namespace(namespace) } else { - dynRes = sharedOperationData.TCtx.Dynamic().Resource(gvr) + dynRes = sharedOperationData.tCtx.Dynamic().Resource(gvr) } churnFns = append(churnFns, func(name string) string { if name != "" { - if err := dynRes.Delete(sharedOperationData.TCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { - sharedOperationData.TCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) + if err := dynRes.Delete(sharedOperationData.tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { + sharedOperationData.tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) } return "" } - live, err := dynRes.Create(sharedOperationData.TCtx, unstructuredObj, metav1.CreateOptions{}) + live, err := dynRes.Create(sharedOperationData.tCtx, unstructuredObj, metav1.CreateOptions{}) if err != nil { return "" } @@ -1731,9 +1750,9 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOpe switch concreteOp.Mode { case Create: - sharedOperationData.WG.Add(1) + sharedOperationData.wg.Add(1) go func() { - defer sharedOperationData.WG.Done() + defer sharedOperationData.wg.Done() count, threshold := 0, concreteOp.Number if threshold == 0 { threshold = math.MaxInt32 @@ -1745,15 +1764,15 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOpe churnFns[i]("") } count++ - case <-sharedOperationData.TCtx.Done(): + case <-sharedOperationData.tCtx.Done(): return } } }() case Recreate: - sharedOperationData.WG.Add(1) + sharedOperationData.wg.Add(1) go func() { - defer sharedOperationData.WG.Done() + defer sharedOperationData.wg.Done() retVals := make([][]string, len(churnFns)) // For each churn function, instantiate a slice of strings with length "concreteOp.Number". for i := range retVals { @@ -1768,7 +1787,7 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOpe retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number]) } count++ - case <-sharedOperationData.TCtx.Done(): + case <-sharedOperationData.tCtx.Done(): return } } @@ -1776,77 +1795,75 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *SharedOpe } } -func runBarrierOp(opIndex int, concreteOp *barrierOp, sharedOperationData *SharedOperationData) { +func runBarrierOp(opIndex int, concreteOp *barrierOp, sharedOperationData *sharedOperationData) { for _, namespace := range concreteOp.Namespaces { - if _, ok := sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace[namespace]; !ok { - sharedOperationData.TCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + if _, ok := sharedOperationData.workloadState.numPodsScheduledPerNamespace[namespace]; !ok { + sharedOperationData.tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) } } switch concreteOp.StageRequirement { case Attempted: - if err := waitUntilPodsAttempted(sharedOperationData.TCtx, sharedOperationData.PodInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsAttempted(sharedOperationData.tCtx, sharedOperationData.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.workloadState.numPodsScheduledPerNamespace); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: - if err := waitUntilPodsScheduled(sharedOperationData.TCtx, sharedOperationData.PodInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace); err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsScheduled(sharedOperationData.tCtx, sharedOperationData.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.workloadState.numPodsScheduledPerNamespace); err != nil { + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. if len(concreteOp.Namespaces) == 0 { - sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace = make(map[string]int) + sharedOperationData.workloadState.numPodsScheduledPerNamespace = make(map[string]int) } else { for _, namespace := range concreteOp.Namespaces { - delete(sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace, namespace) + delete(sharedOperationData.workloadState.numPodsScheduledPerNamespace, namespace) } } } } -func runSleepOp(concreteOp *sleepOp, sharedOperationData *SharedOperationData) { +func runSleepOp(concreteOp *sleepOp, sharedOperationData *sharedOperationData) { select { - case <-sharedOperationData.TCtx.Done(): + case <-sharedOperationData.tCtx.Done(): case <-time.After(concreteOp.Duration.Duration): } } -func runStartCollectingMetricsOp(opIndex int, tc *testCase, concreteOp *startCollectingMetricsOp, sharedOperationData *SharedOperationData) { - if sharedOperationData.MetricsData.CollectorCtx != nil { - sharedOperationData.TCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) +func runStartCollectingMetricsOp(opIndex int, tc *testCase, concreteOp *startCollectingMetricsOp, sharedOperationData *sharedOperationData) { + if sharedOperationData.metricsData.collectorCtx != nil { + sharedOperationData.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.Collectors = startCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorWG, sharedOperationData.PodInformer, tc.MetricsCollectorConfig, sharedOperationData.MetricsData.ThroughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) - - defer sharedOperationData.MetricsData.CollectorCtx.Cancel("cleaning up") + sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectors = startCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorWG, sharedOperationData.podInformer, tc.MetricsCollectorConfig, sharedOperationData.metricsData.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) } -func runStopCollectingMetricsOp(opIndex int, w *workload, sharedOperationData *SharedOperationData) { - items := stopCollectingMetrics(sharedOperationData.TCtx, sharedOperationData.MetricsData.CollectorCtx, sharedOperationData.MetricsData.CollectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.MetricsData.Collectors) - sharedOperationData.WorkloadState.DataItems = append(sharedOperationData.WorkloadState.DataItems, items...) - sharedOperationData.MetricsData.CollectorCtx = nil +func runStopCollectingMetricsOp(opIndex int, w *workload, sharedOperationData *sharedOperationData) { + items := stopCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.metricsData.collectors) + sharedOperationData.workloadState.dataItems = append(sharedOperationData.workloadState.dataItems, items...) + sharedOperationData.metricsData.collectorCtx = nil } -func runDefault(opIndex int, concreteOp realOp, sharedOperationData *SharedOperationData) { +func runDefault(opIndex int, concreteOp realOp, sharedOperationData *sharedOperationData) { runable, ok := concreteOp.(runnableOp) if !ok { - sharedOperationData.TCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) + sharedOperationData.tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(sharedOperationData.TCtx, namespace, &sharedOperationData.WorkloadState.NumPodsScheduledPerNamespace) + createNamespaceIfNotPresent(sharedOperationData.tCtx, namespace, &sharedOperationData.workloadState.numPodsScheduledPerNamespace) } - runable.run(sharedOperationData.TCtx) + runable.run(sharedOperationData.tCtx) } -func runOperation(tc *testCase, opIndex int, op op, w *workload, sharedOperationData *SharedOperationData) { +func runOperation(tc *testCase, opIndex int, op op, w *workload, sharedOperationData *sharedOperationData) { realOp, err := op.realOp.patchParams(w) if err != nil { - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, err) + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) } select { - case <-sharedOperationData.TCtx.Done(): - sharedOperationData.TCtx.Fatalf("op %d: %v", opIndex, context.Cause(sharedOperationData.TCtx)) + case <-sharedOperationData.tCtx.Done(): + sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, context.Cause(sharedOperationData.tCtx)) default: } switch concreteOp := realOp.(type) { From 479f9cd898d2b0070ff7dbff7df37fef12769165 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Tue, 11 Feb 2025 09:26:47 +0900 Subject: [PATCH 03/25] can pass all testcase --- .../scheduler_perf/scheduler_perf.go | 439 +++++++++--------- 1 file changed, 224 insertions(+), 215 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 064dcc6965e..9b9f6e58e0a 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1410,6 +1410,17 @@ func checkEmptyInFlightEvents() error { return nil } +type workloadExecutor struct { + tCtx ktesting.TContext + wg *sync.WaitGroup + collectorCtx *ktesting.TContext + collectorWG *sync.WaitGroup + collectors *[]testDataCollector + numPodsScheduledPerNamespace map[string]int + podInformer coreinformers.PodInformer + throughputErrorMargin float64 +} + func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, mcc *metricsCollectorConfig, throughputErrorMargin float64, opIndex int, name string, namespaces []string, labelSelector map[string]string) (ktesting.TContext, []testDataCollector) { collectorCtx := ktesting.WithCancel(tCtx) workloadName := tCtx.Name() @@ -1450,57 +1461,6 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex return dataItems } -// metricsCollectionData manages the state and synchronization of metrics collection -// during workload execution. -type metricsCollectionData struct { - // collectors holds a list of test data collectors used to gather performance metrics. - collectors []testDataCollector - // collectorCtx is a separate context specifically for managing the lifecycle - // of metrics collection goroutines. - // This needs a separate context and wait group because - // the metrics collecting needs to be sure that the goroutines - // are stopped. - collectorCtx ktesting.TContext - collectorWG *sync.WaitGroup - // disable error checking of the sampling interval length in the - // throughput collector by default. When running benchmarks, report - // it as test failure when samples are not taken regularly. - throughputErrorMargin float64 -} - -// WorkloadState holds the state information about the workload being executed. -type workloadState struct { - // dataItems stores the collected data from the workload execution. - dataItems []DataItem - // nextNodeIndex keeps track of the next node index to be used when creating nodes. - nextNodeIndex int - // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. - // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. - numPodsScheduledPerNamespace map[string]int -} - -// sharedOperationData encapsulates all shared state and dependencies used during workload execution. -type sharedOperationData struct { - // podInformer provides access to the informer for monitoring Pod events in the cluster. - // Additional informers needed for testing. The pod informer was - // already created before (scheduler.NewInformerFactory) and the - // factory was started for it (mustSetupCluster), therefore we don't - // need to start again. - podInformer coreinformers.PodInformer - // metricsData contains information and synchronization primitives for managing - // metrics collection during workload execution. - metricsData *metricsCollectionData - // workloadState holds information about the current state of the workload, - // including scheduled pods and created namespaces. - workloadState *workloadState - // tCtx is the root test context, used for cancellation and logging throughout - // the execution of workload operations. - tCtx ktesting.TContext - // wg is a wait group that tracks all ongoing goroutines in the workload execution. - // Ensures proper synchronization and prevents premature termination of operations. - wg *sync.WaitGroup -} - func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem { b, benchmarking := tCtx.TB().(*testing.B) if benchmarking { @@ -1514,6 +1474,9 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact }) } + // Disable error checking of the sampling interval length in the + // throughput collector by default. When running benchmarks, report + // it as test failure when samples are not taken regularly. var throughputErrorMargin float64 if benchmarking { // TODO: To prevent the perf-test failure, we increased the error margin, if still not enough @@ -1521,35 +1484,46 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact throughputErrorMargin = 30 } + // Additional informers needed for testing. The pod informer was + // already created before (scheduler.NewInformerFactory) and the + // factory was started for it (mustSetupCluster), therefore we don't + // need to start again. + // podInformer := informerFactory.Core().V1().Pods() + // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) - var wg sync.WaitGroup - defer func() { - wg.Wait() - tCtx.Cancel("workload is done") - }() + // var wg sync.WaitGroup + // defer wg.Wait() + defer tCtx.Cancel("workload is done") var dataItems []DataItem + // nextNodeIndex := 0 + // // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. + // // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. + // numPodsScheduledPerNamespace := make(map[string]int) - var collectorWG sync.WaitGroup - defer collectorWG.Wait() + // sharedOperationData := sharedOperationData{ + // tCtx: tCtx, + // wg: &wg, + // metricsData: &metricsCollectionData{ + // collectorWG: &sync.WaitGroup{}, + // throughputErrorMargin: throughputErrorMargin, + // }, + // workloadState: &workloadState{ + // numPodsScheduledPerNamespace: make(map[string]int), + // }, + // podInformer: informerFactory.Core().V1().Pods(), + // } - sharedOperationData := sharedOperationData{ - tCtx: tCtx, - wg: &wg, - metricsData: &metricsCollectionData{ - collectorWG: &sync.WaitGroup{}, - throughputErrorMargin: throughputErrorMargin, - }, - workloadState: &workloadState{ - numPodsScheduledPerNamespace: make(map[string]int), - }, - podInformer: informerFactory.Core().V1().Pods(), - } + // var collectors []testDataCollector + // // This needs a separate context and wait group because + // // the metrics collecting needs to be sure that the goroutines + // // are stopped. + // var collectorCtx ktesting.TContext + // var collectorWG sync.WaitGroup + // defer collectorWG.Wait() - for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { - runOperation(tc, opIndex, op, w, &sharedOperationData) - } + runJobs(tCtx, tc, w, informerFactory, throughputErrorMargin) // check unused params and inform users unusedParams := w.unusedParams() @@ -1562,90 +1536,137 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return dataItems } -func runCreateNodesOp(opIndex int, concreteOp *createNodesOp, sharedOperationData *sharedOperationData) { - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, sharedOperationData.tCtx.Client()) +func doCreateNodesOp(tCtx ktesting.TContext, opIndex int, concreteOp *createNodesOp, nextNodeIndex *int) { + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, tCtx.Client()) if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nodePreparer.PrepareNodes(sharedOperationData.tCtx, sharedOperationData.workloadState.nextNodeIndex); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + if err := nodePreparer.PrepareNodes(tCtx, *nextNodeIndex); err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) } - sharedOperationData.workloadState.nextNodeIndex += concreteOp.Count + *nextNodeIndex += concreteOp.Count } -func runCreateNamespacesOp(opIndex int, concreteOp *createNamespacesOp, sharedOperationData *sharedOperationData) { - nsPreparer, err := newNamespacePreparer(sharedOperationData.tCtx, concreteOp) +func doCreateNamespaceOp(tCtx ktesting.TContext, opIndex int, concreteOp *createNamespacesOp, numPodsScheduledPerNamespace map[string]int) { + nsPreparer, err := newNamespacePreparer(tCtx, concreteOp) if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nsPreparer.prepare(sharedOperationData.tCtx); err != nil { - err2 := nsPreparer.cleanup(sharedOperationData.tCtx) + if err := nsPreparer.prepare(tCtx); err != nil { + err2 := nsPreparer.cleanup(tCtx) if err2 != nil { err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) } - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + tCtx.Fatalf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { - if _, ok := sharedOperationData.workloadState.numPodsScheduledPerNamespace[n]; ok { + if _, ok := numPodsScheduledPerNamespace[n]; ok { // this namespace has been already created. continue } - sharedOperationData.workloadState.numPodsScheduledPerNamespace[n] = 0 + numPodsScheduledPerNamespace[n] = 0 } } -func runCreatePodsOp(tc *testCase, w *workload, opIndex int, concreteOp *createPodsOp, sharedOperationData *sharedOperationData) { +func doBarrierOp(tCtx ktesting.TContext, opIndex int, concreteOp *barrierOp, numPodsScheduledPerNamespace map[string]int, podInformer coreinformers.PodInformer) { + for _, namespace := range concreteOp.Namespaces { + if _, ok := numPodsScheduledPerNamespace[namespace]; !ok { + tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + } + } + switch concreteOp.StageRequirement { + case Attempted: + if err := waitUntilPodsAttempted(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) + } + case Scheduled: + // Default should be treated like "Scheduled", so handling both in the same way. + fallthrough + default: + if err := waitUntilPodsScheduled(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) + } + // At the end of the barrier, we can be sure that there are no pods + // pending scheduling in the namespaces that we just blocked on. + if len(concreteOp.Namespaces) == 0 { + numPodsScheduledPerNamespace = make(map[string]int) + } else { + for _, namespace := range concreteOp.Namespaces { + delete(numPodsScheduledPerNamespace, namespace) + } + } + } +} + +func doStopCollectingMetrics(tCtx ktesting.TContext, collectorCtx *ktesting.TContext, opIndex int, dataItems *[]DataItem, w *workload, collectors []testDataCollector, collectorWG *sync.WaitGroup) { + items := stopCollectingMetrics(tCtx, *collectorCtx, collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors) + *dataItems = append(*dataItems, items...) + *collectorCtx = nil +} + +func doCreatePodsOp( + tCtx ktesting.TContext, + opIndex int, + concreteOp *createPodsOp, + numPodsScheduledPerNamespace map[string]int, + dataItems *[]DataItem, + w *workload, + collectors *[]testDataCollector, + collectorWG *sync.WaitGroup, + throughputErrorMargin float64, + podInformer coreinformers.PodInformer, + tc *testCase, + collectorCtx *ktesting.TContext) { var namespace string // define Pod's namespace automatically, and create that namespace. namespace = fmt.Sprintf("namespace-%d", opIndex) if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace } - createNamespaceIfNotPresent(sharedOperationData.tCtx, namespace, &sharedOperationData.workloadState.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) if concreteOp.PodTemplatePath == nil { concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath } if concreteOp.CollectMetrics { - if sharedOperationData.metricsData.collectorCtx != nil { - sharedOperationData.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + if *collectorCtx != nil { + tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectors = startCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorWG, sharedOperationData.podInformer, tc.MetricsCollectorConfig, sharedOperationData.metricsData.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) - defer sharedOperationData.metricsData.collectorCtx.Cancel("cleaning up") + *collectorCtx, *collectors = startCollectingMetrics(tCtx, collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) } - if err := createPodsRapidly(sharedOperationData.tCtx, namespace, concreteOp); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) } switch { case concreteOp.SkipWaitToCompletion: // Only record those namespaces that may potentially require barriers // in the future. - sharedOperationData.workloadState.numPodsScheduledPerNamespace[namespace] += concreteOp.Count + numPodsScheduledPerNamespace[namespace] += concreteOp.Count case concreteOp.SteadyState: - if err := createPodsSteadily(sharedOperationData.tCtx, namespace, sharedOperationData.podInformer, concreteOp); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsSteadily(tCtx, namespace, podInformer, concreteOp); err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(sharedOperationData.tCtx, sharedOperationData.podInformer, nil, namespace, concreteOp.Count); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, nil, namespace, concreteOp.Count); err != nil { + tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } if concreteOp.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.metricsData.collectors) - sharedOperationData.workloadState.dataItems = append(sharedOperationData.workloadState.dataItems, items...) - sharedOperationData.metricsData.collectorCtx = nil + items := stopCollectingMetrics(tCtx, *collectorCtx, collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, *collectors) + *dataItems = append(*dataItems, items...) + *collectorCtx = nil } } -func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData *sharedOperationData) { +func doDeletePodsOp(tCtx ktesting.TContext, opIndex int, concreteOp *deletePodsOp, podInformer coreinformers.PodInformer, wg *sync.WaitGroup) { labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) - podsToDelete, err := sharedOperationData.podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) + podsToDelete, err := podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) + tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) } deletePods := func(opIndex int) { @@ -1656,13 +1677,13 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData for i := 0; i < len(podsToDelete); i++ { select { case <-ticker.C: - if err := sharedOperationData.tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(sharedOperationData.tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { if errors.Is(err, context.Canceled) { return } - sharedOperationData.tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) + tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) } - case <-sharedOperationData.tCtx.Done(): + case <-tCtx.Done(): return } } @@ -1671,18 +1692,18 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData listOpts := metav1.ListOptions{ LabelSelector: labelSelector.String(), } - if err := sharedOperationData.tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(sharedOperationData.tCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil { if errors.Is(err, context.Canceled) { return } - sharedOperationData.tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) + tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) } } if concreteOp.SkipWaitToCompletion { - sharedOperationData.wg.Add(1) + wg.Add(1) go func(opIndex int) { - defer sharedOperationData.wg.Done() + defer wg.Done() deletePods(opIndex) }(opIndex) } else { @@ -1690,18 +1711,18 @@ func runDeletePodsOp(opIndex int, concreteOp *deletePodsOp, sharedOperationData } } -func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOperationData) { +func doChurnOp(tCtx ktesting.TContext, opIndex int, concreteOp *churnOp, wg *sync.WaitGroup) { var namespace string if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace } else { namespace = fmt.Sprintf("namespace-%d", opIndex) } - restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(sharedOperationData.tCtx.Client().Discovery())) + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(tCtx.Client().Discovery())) // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} - if _, err := sharedOperationData.tCtx.Client().CoreV1().Namespaces().Create(sharedOperationData.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - sharedOperationData.tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + if _, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1709,31 +1730,31 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOpe for i, path := range concreteOp.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. var dynRes dynamic.ResourceInterface if mapping.Scope.Name() == meta.RESTScopeNameNamespace { - dynRes = sharedOperationData.tCtx.Dynamic().Resource(gvr).Namespace(namespace) + dynRes = tCtx.Dynamic().Resource(gvr).Namespace(namespace) } else { - dynRes = sharedOperationData.tCtx.Dynamic().Resource(gvr) + dynRes = tCtx.Dynamic().Resource(gvr) } churnFns = append(churnFns, func(name string) string { if name != "" { - if err := dynRes.Delete(sharedOperationData.tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { - sharedOperationData.tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) + if err := dynRes.Delete(tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { + tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) } return "" } - live, err := dynRes.Create(sharedOperationData.tCtx, unstructuredObj, metav1.CreateOptions{}) + live, err := dynRes.Create(tCtx, unstructuredObj, metav1.CreateOptions{}) if err != nil { return "" } @@ -1746,13 +1767,13 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOpe interval = concreteOp.IntervalMilliseconds } ticker := time.NewTicker(time.Duration(interval) * time.Millisecond) - defer ticker.Stop() switch concreteOp.Mode { case Create: - sharedOperationData.wg.Add(1) + wg.Add(1) go func() { - defer sharedOperationData.wg.Done() + defer wg.Done() + defer ticker.Stop() count, threshold := 0, concreteOp.Number if threshold == 0 { threshold = math.MaxInt32 @@ -1764,15 +1785,16 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOpe churnFns[i]("") } count++ - case <-sharedOperationData.tCtx.Done(): + case <-tCtx.Done(): return } } }() case Recreate: - sharedOperationData.wg.Add(1) + wg.Add(1) go func() { - defer sharedOperationData.wg.Done() + defer wg.Done() + defer ticker.Stop() retVals := make([][]string, len(churnFns)) // For each churn function, instantiate a slice of strings with length "concreteOp.Number". for i := range retVals { @@ -1787,7 +1809,7 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOpe retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number]) } count++ - case <-sharedOperationData.tCtx.Done(): + case <-tCtx.Done(): return } } @@ -1795,98 +1817,85 @@ func runChurnOp(opIndex int, concreteOp *churnOp, sharedOperationData *sharedOpe } } -func runBarrierOp(opIndex int, concreteOp *barrierOp, sharedOperationData *sharedOperationData) { - for _, namespace := range concreteOp.Namespaces { - if _, ok := sharedOperationData.workloadState.numPodsScheduledPerNamespace[namespace]; !ok { - sharedOperationData.tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) - } - } - switch concreteOp.StageRequirement { - case Attempted: - if err := waitUntilPodsAttempted(sharedOperationData.tCtx, sharedOperationData.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.workloadState.numPodsScheduledPerNamespace); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) - } - case Scheduled: - // Default should be treated like "Scheduled", so handling both in the same way. - fallthrough - default: - if err := waitUntilPodsScheduled(sharedOperationData.tCtx, sharedOperationData.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, sharedOperationData.workloadState.numPodsScheduledPerNamespace); err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) - } - // At the end of the barrier, we can be sure that there are no pods - // pending scheduling in the namespaces that we just blocked on. - if len(concreteOp.Namespaces) == 0 { - sharedOperationData.workloadState.numPodsScheduledPerNamespace = make(map[string]int) - } else { - for _, namespace := range concreteOp.Namespaces { - delete(sharedOperationData.workloadState.numPodsScheduledPerNamespace, namespace) - } - } - } -} - -func runSleepOp(concreteOp *sleepOp, sharedOperationData *sharedOperationData) { - select { - case <-sharedOperationData.tCtx.Done(): - case <-time.After(concreteOp.Duration.Duration): - } -} - -func runStartCollectingMetricsOp(opIndex int, tc *testCase, concreteOp *startCollectingMetricsOp, sharedOperationData *sharedOperationData) { - if sharedOperationData.metricsData.collectorCtx != nil { - sharedOperationData.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) - } - sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectors = startCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorWG, sharedOperationData.podInformer, tc.MetricsCollectorConfig, sharedOperationData.metricsData.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) -} - -func runStopCollectingMetricsOp(opIndex int, w *workload, sharedOperationData *sharedOperationData) { - items := stopCollectingMetrics(sharedOperationData.tCtx, sharedOperationData.metricsData.collectorCtx, sharedOperationData.metricsData.collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, sharedOperationData.metricsData.collectors) - sharedOperationData.workloadState.dataItems = append(sharedOperationData.workloadState.dataItems, items...) - sharedOperationData.metricsData.collectorCtx = nil -} - -func runDefault(opIndex int, concreteOp realOp, sharedOperationData *sharedOperationData) { +func doDefaultOp(tCtx ktesting.TContext, opIndex int, concreteOp realOp, numPodsScheduledPerNamespace map[string]int) { runable, ok := concreteOp.(runnableOp) if !ok { - sharedOperationData.tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) + tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(sharedOperationData.tCtx, namespace, &sharedOperationData.workloadState.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) } - runable.run(sharedOperationData.tCtx) + runable.run(tCtx) } -func runOperation(tc *testCase, opIndex int, op op, w *workload, sharedOperationData *sharedOperationData) { - realOp, err := op.realOp.patchParams(w) - if err != nil { - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, err) +func doStartCollectingMetricsOp(tCtx ktesting.TContext, opIndex int, concreteOp *startCollectingMetricsOp, collectorCtx *ktesting.TContext, collectors *[]testDataCollector, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, tc *testCase, throughputErrorMargin float64) { + if *collectorCtx != nil { + tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - select { - case <-sharedOperationData.tCtx.Done(): - sharedOperationData.tCtx.Fatalf("op %d: %v", opIndex, context.Cause(sharedOperationData.tCtx)) - default: - } - switch concreteOp := realOp.(type) { - case *createNodesOp: - runCreateNodesOp(opIndex, concreteOp, sharedOperationData) - case *createNamespacesOp: - runCreateNamespacesOp(opIndex, concreteOp, sharedOperationData) - case *createPodsOp: - runCreatePodsOp(tc, w, opIndex, concreteOp, sharedOperationData) - case *deletePodsOp: - runDeletePodsOp(opIndex, concreteOp, sharedOperationData) - case *churnOp: - runChurnOp(opIndex, concreteOp, sharedOperationData) - case *barrierOp: - runBarrierOp(opIndex, concreteOp, sharedOperationData) - case *sleepOp: - runSleepOp(concreteOp, sharedOperationData) - case *startCollectingMetricsOp: - runStartCollectingMetricsOp(opIndex, tc, concreteOp, sharedOperationData) - case *stopCollectingMetricsOp: - runStopCollectingMetricsOp(opIndex, w, sharedOperationData) - default: - runDefault(opIndex, concreteOp, sharedOperationData) + *collectorCtx, *collectors = startCollectingMetrics(tCtx, collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) +} + +func runJobs(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory, throughputErrorMargin float64) { + var wg sync.WaitGroup + defer wg.Wait() + defer tCtx.Cancel("workload is done") + numPodsScheduledPerNamespace := make(map[string]int) + nextNodeIndex := 0 + podInformer := informerFactory.Core().V1().Pods() + var collectors []testDataCollector + // This needs a separate context and wait group because + // the metrics collecting needs to be sure that the goroutines + // are stopped. + var collectorCtx ktesting.TContext + var collectorWG sync.WaitGroup + defer collectorWG.Wait() + var dataItems []DataItem + for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { + realOp, err := op.realOp.patchParams(w) + if err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) + } + select { + case <-tCtx.Done(): + tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx)) + default: + } + switch concreteOp := realOp.(type) { + case *createNodesOp: + doCreateNodesOp(tCtx, opIndex, concreteOp, &nextNodeIndex) + + case *createNamespacesOp: + doCreateNamespaceOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace) + case *createPodsOp: + doCreatePodsOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace, &dataItems, w, &collectors, &collectorWG, throughputErrorMargin, podInformer, tc, &collectorCtx) + if collectorCtx != nil { + defer collectorCtx.Cancel("cleaning up") + } + case *deletePodsOp: + doDeletePodsOp(tCtx, opIndex, concreteOp, podInformer, &wg) + + case *churnOp: + doChurnOp(tCtx, opIndex, concreteOp, &wg) + + case *barrierOp: + doBarrierOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace, podInformer) + + case *sleepOp: + select { + case <-tCtx.Done(): + case <-time.After(concreteOp.Duration.Duration): + } + + case *startCollectingMetricsOp: + doStartCollectingMetricsOp(tCtx, opIndex, concreteOp, &collectorCtx, &collectors, &collectorWG, podInformer, tc, throughputErrorMargin) + defer collectorCtx.Cancel("cleaning up") + + case *stopCollectingMetricsOp: + doStopCollectingMetrics(tCtx, &collectorCtx, opIndex, &dataItems, w, collectors, &collectorWG) + + default: + doDefaultOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace) + } } } From 297b35873fe504938a4773773717c2d656ce755c Mon Sep 17 00:00:00 2001 From: YamasouA Date: Tue, 11 Feb 2025 12:18:14 +0900 Subject: [PATCH 04/25] use workloadExecutor --- .../scheduler_perf/scheduler_perf.go | 282 ++++++++---------- 1 file changed, 131 insertions(+), 151 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 9b9f6e58e0a..ed86a789dba 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1410,17 +1410,6 @@ func checkEmptyInFlightEvents() error { return nil } -type workloadExecutor struct { - tCtx ktesting.TContext - wg *sync.WaitGroup - collectorCtx *ktesting.TContext - collectorWG *sync.WaitGroup - collectors *[]testDataCollector - numPodsScheduledPerNamespace map[string]int - podInformer coreinformers.PodInformer - throughputErrorMargin float64 -} - func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, mcc *metricsCollectorConfig, throughputErrorMargin float64, opIndex int, name string, namespaces []string, labelSelector map[string]string) (ktesting.TContext, []testDataCollector) { collectorCtx := ktesting.WithCancel(tCtx) workloadName := tCtx.Name() @@ -1461,6 +1450,21 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex return dataItems } +type WorkloadExecutor struct { + tCtx *ktesting.TContext + wg *sync.WaitGroup + collectorCtx *ktesting.TContext + collectorWG *sync.WaitGroup + collectors []testDataCollector + dataItems []DataItem + numPodsScheduledPerNamespace map[string]int + podInformer coreinformers.PodInformer + throughputErrorMargin float64 + tc *testCase + w *workload + nextNodeIndex int +} + func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem { b, benchmarking := tCtx.TB().(*testing.B) if benchmarking { @@ -1488,42 +1492,84 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // already created before (scheduler.NewInformerFactory) and the // factory was started for it (mustSetupCluster), therefore we don't // need to start again. - // podInformer := informerFactory.Core().V1().Pods() + podInformer := informerFactory.Core().V1().Pods() // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) - // var wg sync.WaitGroup - // defer wg.Wait() + var wg sync.WaitGroup + defer wg.Wait() defer tCtx.Cancel("workload is done") var dataItems []DataItem - // nextNodeIndex := 0 + // // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. // // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. // numPodsScheduledPerNamespace := make(map[string]int) - // sharedOperationData := sharedOperationData{ - // tCtx: tCtx, - // wg: &wg, - // metricsData: &metricsCollectionData{ - // collectorWG: &sync.WaitGroup{}, - // throughputErrorMargin: throughputErrorMargin, - // }, - // workloadState: &workloadState{ - // numPodsScheduledPerNamespace: make(map[string]int), - // }, - // podInformer: informerFactory.Core().V1().Pods(), - // } - - // var collectors []testDataCollector + var collectors []testDataCollector // // This needs a separate context and wait group because // // the metrics collecting needs to be sure that the goroutines // // are stopped. - // var collectorCtx ktesting.TContext - // var collectorWG sync.WaitGroup - // defer collectorWG.Wait() + var collectorCtx ktesting.TContext + var collectorWG sync.WaitGroup + defer collectorWG.Wait() - runJobs(tCtx, tc, w, informerFactory, throughputErrorMargin) + executor := WorkloadExecutor{ + tCtx: &tCtx, + wg: &wg, + collectorCtx: &collectorCtx, + collectorWG: &collectorWG, + collectors: collectors, + numPodsScheduledPerNamespace: make(map[string]int), + podInformer: podInformer, + throughputErrorMargin: throughputErrorMargin, + tc: tc, + w: w, + nextNodeIndex: 0, + dataItems: dataItems, + } + + for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { + realOp, err := op.realOp.patchParams(w) + if err != nil { + tCtx.Fatalf("op %d: %v", opIndex, err) + } + select { + case <-tCtx.Done(): + tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx)) + default: + } + switch concreteOp := realOp.(type) { + case *createNodesOp: + executor.doCreateNodesOp(opIndex, concreteOp) + + case *createNamespacesOp: + executor.doCreateNamespaceOp(opIndex, concreteOp) + case *createPodsOp: + executor.doCreatePodsOp(opIndex, concreteOp) + if *executor.collectorCtx != nil { + defer (*executor.collectorCtx).Cancel("cleaning up") + } + case *deletePodsOp: + executor.doDeletePodsOp(opIndex, concreteOp) + case *churnOp: + executor.doChurnOp(opIndex, concreteOp) + case *barrierOp: + executor.doBarrierOp(opIndex, concreteOp) + case *sleepOp: + select { + case <-tCtx.Done(): + case <-time.After(concreteOp.Duration.Duration): + } + case *startCollectingMetricsOp: + executor.doStartCollectingMetricsOp(opIndex, concreteOp) + defer (*executor.collectorCtx).Cancel("cleaning up") + case *stopCollectingMetricsOp: + executor.doStopCollectingMetrics(opIndex) + default: + executor.doDefaultOp(opIndex, concreteOp) + } + } // check unused params and inform users unusedParams := w.unusedParams() @@ -1536,18 +1582,20 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return dataItems } -func doCreateNodesOp(tCtx ktesting.TContext, opIndex int, concreteOp *createNodesOp, nextNodeIndex *int) { +func (e *WorkloadExecutor) doCreateNodesOp(opIndex int, concreteOp *createNodesOp) { + tCtx := *e.tCtx nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, tCtx.Client()) if err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nodePreparer.PrepareNodes(tCtx, *nextNodeIndex); err != nil { + if err := nodePreparer.PrepareNodes((*e.tCtx), e.nextNodeIndex); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } - *nextNodeIndex += concreteOp.Count + e.nextNodeIndex += concreteOp.Count } -func doCreateNamespaceOp(tCtx ktesting.TContext, opIndex int, concreteOp *createNamespacesOp, numPodsScheduledPerNamespace map[string]int) { +func (e *WorkloadExecutor) doCreateNamespaceOp(opIndex int, concreteOp *createNamespacesOp) { + tCtx := *e.tCtx nsPreparer, err := newNamespacePreparer(tCtx, concreteOp) if err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) @@ -1560,79 +1608,72 @@ func doCreateNamespaceOp(tCtx ktesting.TContext, opIndex int, concreteOp *create tCtx.Fatalf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { - if _, ok := numPodsScheduledPerNamespace[n]; ok { + if _, ok := e.numPodsScheduledPerNamespace[n]; ok { // this namespace has been already created. continue } - numPodsScheduledPerNamespace[n] = 0 + e.numPodsScheduledPerNamespace[n] = 0 } } -func doBarrierOp(tCtx ktesting.TContext, opIndex int, concreteOp *barrierOp, numPodsScheduledPerNamespace map[string]int, podInformer coreinformers.PodInformer) { +func (e *WorkloadExecutor) doBarrierOp(opIndex int, concreteOp *barrierOp) { + tCtx := *e.tCtx for _, namespace := range concreteOp.Namespaces { - if _, ok := numPodsScheduledPerNamespace[namespace]; !ok { + if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) } } switch concreteOp.StageRequirement { case Attempted: - if err := waitUntilPodsAttempted(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { + if err := waitUntilPodsAttempted(tCtx, e.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, e.numPodsScheduledPerNamespace); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: - if err := waitUntilPodsScheduled(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { + if err := waitUntilPodsScheduled(tCtx, e.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, e.numPodsScheduledPerNamespace); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. if len(concreteOp.Namespaces) == 0 { - numPodsScheduledPerNamespace = make(map[string]int) + e.numPodsScheduledPerNamespace = make(map[string]int) } else { for _, namespace := range concreteOp.Namespaces { - delete(numPodsScheduledPerNamespace, namespace) + delete(e.numPodsScheduledPerNamespace, namespace) } } } } -func doStopCollectingMetrics(tCtx ktesting.TContext, collectorCtx *ktesting.TContext, opIndex int, dataItems *[]DataItem, w *workload, collectors []testDataCollector, collectorWG *sync.WaitGroup) { - items := stopCollectingMetrics(tCtx, *collectorCtx, collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors) - *dataItems = append(*dataItems, items...) - *collectorCtx = nil +func (e *WorkloadExecutor) doStopCollectingMetrics(opIndex int) { + tCtx := *e.tCtx + collectorCtx := *e.collectorCtx + items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.w.Threshold, *e.w.ThresholdMetricSelector, opIndex, e.collectors) + e.dataItems = append(e.dataItems, items...) + collectorCtx = nil } -func doCreatePodsOp( - tCtx ktesting.TContext, - opIndex int, - concreteOp *createPodsOp, - numPodsScheduledPerNamespace map[string]int, - dataItems *[]DataItem, - w *workload, - collectors *[]testDataCollector, - collectorWG *sync.WaitGroup, - throughputErrorMargin float64, - podInformer coreinformers.PodInformer, - tc *testCase, - collectorCtx *ktesting.TContext) { +func (e *WorkloadExecutor) doCreatePodsOp(opIndex int, concreteOp *createPodsOp) { + tCtx := *e.tCtx + collectorCtx := *e.tCtx var namespace string // define Pod's namespace automatically, and create that namespace. namespace = fmt.Sprintf("namespace-%d", opIndex) if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace } - createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) if concreteOp.PodTemplatePath == nil { - concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath + concreteOp.PodTemplatePath = e.tc.DefaultPodTemplatePath } if concreteOp.CollectMetrics { - if *collectorCtx != nil { + if *e.collectorCtx != nil { tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *collectorCtx, *collectors = startCollectingMetrics(tCtx, collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + *e.collectorCtx, e.collectors = startCollectingMetrics(tCtx, e.collectorWG, e.podInformer, e.tc.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) } if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) @@ -1641,13 +1682,13 @@ func doCreatePodsOp( case concreteOp.SkipWaitToCompletion: // Only record those namespaces that may potentially require barriers // in the future. - numPodsScheduledPerNamespace[namespace] += concreteOp.Count + e.numPodsScheduledPerNamespace[namespace] += concreteOp.Count case concreteOp.SteadyState: - if err := createPodsSteadily(tCtx, namespace, podInformer, concreteOp); err != nil { + if err := createPodsSteadily(tCtx, namespace, e.podInformer, concreteOp); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, nil, namespace, concreteOp.Count); err != nil { + if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, concreteOp.Count); err != nil { tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } @@ -1655,16 +1696,17 @@ func doCreatePodsOp( // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(tCtx, *collectorCtx, collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, *collectors) - *dataItems = append(*dataItems, items...) - *collectorCtx = nil + items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.w.Threshold, *e.w.ThresholdMetricSelector, opIndex, e.collectors) + e.dataItems = append(e.dataItems, items...) + *e.collectorCtx = nil } } -func doDeletePodsOp(tCtx ktesting.TContext, opIndex int, concreteOp *deletePodsOp, podInformer coreinformers.PodInformer, wg *sync.WaitGroup) { +func (e *WorkloadExecutor) doDeletePodsOp(opIndex int, concreteOp *deletePodsOp) { + tCtx := *e.tCtx labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) - podsToDelete, err := podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) + podsToDelete, err := e.podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) if err != nil { tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) } @@ -1701,9 +1743,9 @@ func doDeletePodsOp(tCtx ktesting.TContext, opIndex int, concreteOp *deletePodsO } if concreteOp.SkipWaitToCompletion { - wg.Add(1) + e.wg.Add(1) go func(opIndex int) { - defer wg.Done() + defer e.wg.Done() deletePods(opIndex) }(opIndex) } else { @@ -1711,7 +1753,8 @@ func doDeletePodsOp(tCtx ktesting.TContext, opIndex int, concreteOp *deletePodsO } } -func doChurnOp(tCtx ktesting.TContext, opIndex int, concreteOp *churnOp, wg *sync.WaitGroup) { +func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { + tCtx := *e.tCtx var namespace string if concreteOp.Namespace != nil { namespace = *concreteOp.Namespace @@ -1770,9 +1813,9 @@ func doChurnOp(tCtx ktesting.TContext, opIndex int, concreteOp *churnOp, wg *syn switch concreteOp.Mode { case Create: - wg.Add(1) + e.wg.Add(1) go func() { - defer wg.Done() + defer e.wg.Done() defer ticker.Stop() count, threshold := 0, concreteOp.Number if threshold == 0 { @@ -1791,9 +1834,9 @@ func doChurnOp(tCtx ktesting.TContext, opIndex int, concreteOp *churnOp, wg *syn } }() case Recreate: - wg.Add(1) + e.wg.Add(1) go func() { - defer wg.Done() + defer e.wg.Done() defer ticker.Stop() retVals := make([][]string, len(churnFns)) // For each churn function, instantiate a slice of strings with length "concreteOp.Number". @@ -1817,86 +1860,23 @@ func doChurnOp(tCtx ktesting.TContext, opIndex int, concreteOp *churnOp, wg *syn } } -func doDefaultOp(tCtx ktesting.TContext, opIndex int, concreteOp realOp, numPodsScheduledPerNamespace map[string]int) { +func (e *WorkloadExecutor) doDefaultOp(opIndex int, concreteOp realOp) { + tCtx := *e.tCtx runable, ok := concreteOp.(runnableOp) if !ok { tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) } runable.run(tCtx) } -func doStartCollectingMetricsOp(tCtx ktesting.TContext, opIndex int, concreteOp *startCollectingMetricsOp, collectorCtx *ktesting.TContext, collectors *[]testDataCollector, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, tc *testCase, throughputErrorMargin float64) { - if *collectorCtx != nil { - tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) - } - *collectorCtx, *collectors = startCollectingMetrics(tCtx, collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) -} - -func runJobs(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory, throughputErrorMargin float64) { - var wg sync.WaitGroup - defer wg.Wait() - defer tCtx.Cancel("workload is done") - numPodsScheduledPerNamespace := make(map[string]int) - nextNodeIndex := 0 - podInformer := informerFactory.Core().V1().Pods() - var collectors []testDataCollector - // This needs a separate context and wait group because - // the metrics collecting needs to be sure that the goroutines - // are stopped. - var collectorCtx ktesting.TContext - var collectorWG sync.WaitGroup - defer collectorWG.Wait() - var dataItems []DataItem - for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { - realOp, err := op.realOp.patchParams(w) - if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) - } - select { - case <-tCtx.Done(): - tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx)) - default: - } - switch concreteOp := realOp.(type) { - case *createNodesOp: - doCreateNodesOp(tCtx, opIndex, concreteOp, &nextNodeIndex) - - case *createNamespacesOp: - doCreateNamespaceOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace) - case *createPodsOp: - doCreatePodsOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace, &dataItems, w, &collectors, &collectorWG, throughputErrorMargin, podInformer, tc, &collectorCtx) - if collectorCtx != nil { - defer collectorCtx.Cancel("cleaning up") - } - case *deletePodsOp: - doDeletePodsOp(tCtx, opIndex, concreteOp, podInformer, &wg) - - case *churnOp: - doChurnOp(tCtx, opIndex, concreteOp, &wg) - - case *barrierOp: - doBarrierOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace, podInformer) - - case *sleepOp: - select { - case <-tCtx.Done(): - case <-time.After(concreteOp.Duration.Duration): - } - - case *startCollectingMetricsOp: - doStartCollectingMetricsOp(tCtx, opIndex, concreteOp, &collectorCtx, &collectors, &collectorWG, podInformer, tc, throughputErrorMargin) - defer collectorCtx.Cancel("cleaning up") - - case *stopCollectingMetricsOp: - doStopCollectingMetrics(tCtx, &collectorCtx, opIndex, &dataItems, w, collectors, &collectorWG) - - default: - doDefaultOp(tCtx, opIndex, concreteOp, numPodsScheduledPerNamespace) - } +func (e *WorkloadExecutor) doStartCollectingMetricsOp(opIndex int, concreteOp *startCollectingMetricsOp) { + if *e.collectorCtx != nil { + (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } + *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.tc.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From c40e69bb4cc20dcaf5f0835c69233768919e8af4 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:02:41 +0900 Subject: [PATCH 05/25] remove double comments --- test/integration/scheduler_perf/scheduler_perf.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index ed86a789dba..4cbcf6f52c7 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1502,14 +1502,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact var dataItems []DataItem - // // numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have. - // // All namespaces listed in numPodsScheduledPerNamespace will be cleaned up. - // numPodsScheduledPerNamespace := make(map[string]int) - var collectors []testDataCollector - // // This needs a separate context and wait group because - // // the metrics collecting needs to be sure that the goroutines - // // are stopped. + // This needs a separate context and wait group because + // the metrics collecting needs to be sure that the goroutines + // are stopped. var collectorCtx ktesting.TContext var collectorWG sync.WaitGroup defer collectorWG.Wait() From d202a683f54f67788771272fe15d6df27704a6e4 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:04:54 +0900 Subject: [PATCH 06/25] rename workloadExecutor member name --- .../scheduler_perf/scheduler_perf.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 4cbcf6f52c7..941b92344c6 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1460,8 +1460,8 @@ type WorkloadExecutor struct { numPodsScheduledPerNamespace map[string]int podInformer coreinformers.PodInformer throughputErrorMargin float64 - tc *testCase - w *workload + testCase *testCase + workload *workload nextNodeIndex int } @@ -1519,8 +1519,8 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact numPodsScheduledPerNamespace: make(map[string]int), podInformer: podInformer, throughputErrorMargin: throughputErrorMargin, - tc: tc, - w: w, + testCase: tc, + workload: w, nextNodeIndex: 0, dataItems: dataItems, } @@ -1646,7 +1646,7 @@ func (e *WorkloadExecutor) doBarrierOp(opIndex int, concreteOp *barrierOp) { func (e *WorkloadExecutor) doStopCollectingMetrics(opIndex int) { tCtx := *e.tCtx collectorCtx := *e.collectorCtx - items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.w.Threshold, *e.w.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) collectorCtx = nil } @@ -1662,14 +1662,14 @@ func (e *WorkloadExecutor) doCreatePodsOp(opIndex int, concreteOp *createPodsOp) } createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) if concreteOp.PodTemplatePath == nil { - concreteOp.PodTemplatePath = e.tc.DefaultPodTemplatePath + concreteOp.PodTemplatePath = e.testCase.DefaultPodTemplatePath } if concreteOp.CollectMetrics { if *e.collectorCtx != nil { tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics(tCtx, e.collectorWG, e.podInformer, e.tc.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + *e.collectorCtx, e.collectors = startCollectingMetrics(tCtx, e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) } if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) @@ -1692,7 +1692,7 @@ func (e *WorkloadExecutor) doCreatePodsOp(opIndex int, concreteOp *createPodsOp) // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.w.Threshold, *e.w.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) *e.collectorCtx = nil } @@ -1872,7 +1872,7 @@ func (e *WorkloadExecutor) doStartCollectingMetricsOp(opIndex int, concreteOp *s if *e.collectorCtx != nil { (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.tc.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) + *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From 3ce36b3b3cedaae7db93f558eb3312a16b0598ac Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:11:43 +0900 Subject: [PATCH 07/25] rename doXXX to runXXX --- .../scheduler_perf/scheduler_perf.go | 122 +++++++++--------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 941b92344c6..cc11f3d0fbb 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1537,33 +1537,33 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact } switch concreteOp := realOp.(type) { case *createNodesOp: - executor.doCreateNodesOp(opIndex, concreteOp) + executor.runCreateNodesOp(opIndex, concreteOp) case *createNamespacesOp: - executor.doCreateNamespaceOp(opIndex, concreteOp) + executor.runCreateNamespaceOp(opIndex, concreteOp) case *createPodsOp: - executor.doCreatePodsOp(opIndex, concreteOp) + executor.runCreatePodsOp(opIndex, concreteOp) if *executor.collectorCtx != nil { defer (*executor.collectorCtx).Cancel("cleaning up") } case *deletePodsOp: - executor.doDeletePodsOp(opIndex, concreteOp) + executor.runDeletePodsOp(opIndex, concreteOp) case *churnOp: - executor.doChurnOp(opIndex, concreteOp) + executor.runChurnOp(opIndex, concreteOp) case *barrierOp: - executor.doBarrierOp(opIndex, concreteOp) + executor.runBarrierOp(opIndex, concreteOp) case *sleepOp: select { case <-tCtx.Done(): case <-time.After(concreteOp.Duration.Duration): } case *startCollectingMetricsOp: - executor.doStartCollectingMetricsOp(opIndex, concreteOp) + executor.runStartCollectingMetricsOp(opIndex, concreteOp) defer (*executor.collectorCtx).Cancel("cleaning up") case *stopCollectingMetricsOp: - executor.doStopCollectingMetrics(opIndex) + executor.runStopCollectingMetrics(opIndex) default: - executor.doDefaultOp(opIndex, concreteOp) + executor.runDefaultOp(opIndex, concreteOp) } } @@ -1578,21 +1578,21 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return dataItems } -func (e *WorkloadExecutor) doCreateNodesOp(opIndex int, concreteOp *createNodesOp) { +func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) { tCtx := *e.tCtx - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, tCtx.Client()) + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, tCtx.Client()) if err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } if err := nodePreparer.PrepareNodes((*e.tCtx), e.nextNodeIndex); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } - e.nextNodeIndex += concreteOp.Count + e.nextNodeIndex += op.Count } -func (e *WorkloadExecutor) doCreateNamespaceOp(opIndex int, concreteOp *createNamespacesOp) { +func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) { tCtx := *e.tCtx - nsPreparer, err := newNamespacePreparer(tCtx, concreteOp) + nsPreparer, err := newNamespacePreparer(tCtx, op) if err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } @@ -1612,38 +1612,38 @@ func (e *WorkloadExecutor) doCreateNamespaceOp(opIndex int, concreteOp *createNa } } -func (e *WorkloadExecutor) doBarrierOp(opIndex int, concreteOp *barrierOp) { +func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { tCtx := *e.tCtx - for _, namespace := range concreteOp.Namespaces { + for _, namespace := range op.Namespaces { if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) } } - switch concreteOp.StageRequirement { + switch op.StageRequirement { case Attempted: - if err := waitUntilPodsAttempted(tCtx, e.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + if err := waitUntilPodsAttempted(tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: - if err := waitUntilPodsScheduled(tCtx, e.podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + if err := waitUntilPodsScheduled(tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. - if len(concreteOp.Namespaces) == 0 { + if len(op.Namespaces) == 0 { e.numPodsScheduledPerNamespace = make(map[string]int) } else { - for _, namespace := range concreteOp.Namespaces { + for _, namespace := range op.Namespaces { delete(e.numPodsScheduledPerNamespace, namespace) } } } } -func (e *WorkloadExecutor) doStopCollectingMetrics(opIndex int) { +func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { tCtx := *e.tCtx collectorCtx := *e.collectorCtx items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) @@ -1651,44 +1651,44 @@ func (e *WorkloadExecutor) doStopCollectingMetrics(opIndex int) { collectorCtx = nil } -func (e *WorkloadExecutor) doCreatePodsOp(opIndex int, concreteOp *createPodsOp) { +func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { tCtx := *e.tCtx collectorCtx := *e.tCtx var namespace string // define Pod's namespace automatically, and create that namespace. namespace = fmt.Sprintf("namespace-%d", opIndex) - if concreteOp.Namespace != nil { - namespace = *concreteOp.Namespace + if op.Namespace != nil { + namespace = *op.Namespace } createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) - if concreteOp.PodTemplatePath == nil { - concreteOp.PodTemplatePath = e.testCase.DefaultPodTemplatePath + if op.PodTemplatePath == nil { + op.PodTemplatePath = e.testCase.DefaultPodTemplatePath } - if concreteOp.CollectMetrics { + if op.CollectMetrics { if *e.collectorCtx != nil { tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } *e.collectorCtx, e.collectors = startCollectingMetrics(tCtx, e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) } - if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil { + if err := createPodsRapidly(tCtx, namespace, op); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } switch { - case concreteOp.SkipWaitToCompletion: + case op.SkipWaitToCompletion: // Only record those namespaces that may potentially require barriers // in the future. - e.numPodsScheduledPerNamespace[namespace] += concreteOp.Count - case concreteOp.SteadyState: - if err := createPodsSteadily(tCtx, namespace, e.podInformer, concreteOp); err != nil { + e.numPodsScheduledPerNamespace[namespace] += op.Count + case op.SteadyState: + if err := createPodsSteadily(tCtx, namespace, e.podInformer, op); err != nil { tCtx.Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, concreteOp.Count); err != nil { + if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, op.Count); err != nil { tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } - if concreteOp.CollectMetrics { + if op.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. @@ -1698,24 +1698,24 @@ func (e *WorkloadExecutor) doCreatePodsOp(opIndex int, concreteOp *createPodsOp) } } -func (e *WorkloadExecutor) doDeletePodsOp(opIndex int, concreteOp *deletePodsOp) { +func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { tCtx := *e.tCtx - labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector) + labelSelector := labels.ValidatedSetSelector(op.LabelSelector) - podsToDelete, err := e.podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector) + podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err) + tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) } deletePods := func(opIndex int) { - if concreteOp.DeletePodsPerSecond > 0 { - ticker := time.NewTicker(time.Second / time.Duration(concreteOp.DeletePodsPerSecond)) + if op.DeletePodsPerSecond > 0 { + ticker := time.NewTicker(time.Second / time.Duration(op.DeletePodsPerSecond)) defer ticker.Stop() for i := 0; i < len(podsToDelete); i++ { select { case <-ticker.C: - if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if err := tCtx.Client().CoreV1().Pods(op.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { if errors.Is(err, context.Canceled) { return } @@ -1730,15 +1730,15 @@ func (e *WorkloadExecutor) doDeletePodsOp(opIndex int, concreteOp *deletePodsOp) listOpts := metav1.ListOptions{ LabelSelector: labelSelector.String(), } - if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if err := tCtx.Client().CoreV1().Pods(op.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil { if errors.Is(err, context.Canceled) { return } - tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err) + tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, op.Namespace, err) } } - if concreteOp.SkipWaitToCompletion { + if op.SkipWaitToCompletion { e.wg.Add(1) go func(opIndex int) { defer e.wg.Done() @@ -1749,11 +1749,11 @@ func (e *WorkloadExecutor) doDeletePodsOp(opIndex int, concreteOp *deletePodsOp) } } -func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { +func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { tCtx := *e.tCtx var namespace string - if concreteOp.Namespace != nil { - namespace = *concreteOp.Namespace + if op.Namespace != nil { + namespace = *op.Namespace } else { namespace = fmt.Sprintf("namespace-%d", opIndex) } @@ -1766,7 +1766,7 @@ func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { var churnFns []func(name string) string - for i, path := range concreteOp.TemplatePaths { + for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) @@ -1802,18 +1802,18 @@ func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { } var interval int64 = 500 - if concreteOp.IntervalMilliseconds != 0 { - interval = concreteOp.IntervalMilliseconds + if op.IntervalMilliseconds != 0 { + interval = op.IntervalMilliseconds } ticker := time.NewTicker(time.Duration(interval) * time.Millisecond) - switch concreteOp.Mode { + switch op.Mode { case Create: e.wg.Add(1) go func() { defer e.wg.Done() defer ticker.Stop() - count, threshold := 0, concreteOp.Number + count, threshold := 0, op.Number if threshold == 0 { threshold = math.MaxInt32 } @@ -1835,9 +1835,9 @@ func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { defer e.wg.Done() defer ticker.Stop() retVals := make([][]string, len(churnFns)) - // For each churn function, instantiate a slice of strings with length "concreteOp.Number". + // For each churn function, instantiate a slice of strings with length "op.Number". for i := range retVals { - retVals[i] = make([]string, concreteOp.Number) + retVals[i] = make([]string, op.Number) } count := 0 @@ -1845,7 +1845,7 @@ func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { select { case <-ticker.C: for i := range churnFns { - retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number]) + retVals[i][count%op.Number] = churnFns[i](retVals[i][count%op.Number]) } count++ case <-tCtx.Done(): @@ -1856,11 +1856,11 @@ func (e *WorkloadExecutor) doChurnOp(opIndex int, concreteOp *churnOp) { } } -func (e *WorkloadExecutor) doDefaultOp(opIndex int, concreteOp realOp) { +func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) { tCtx := *e.tCtx - runable, ok := concreteOp.(runnableOp) + runable, ok := op.(runnableOp) if !ok { - tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp) + tCtx.Fatalf("op %d: invalid op %v", opIndex, op) } for _, namespace := range runable.requiredNamespaces() { createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) @@ -1868,11 +1868,11 @@ func (e *WorkloadExecutor) doDefaultOp(opIndex int, concreteOp realOp) { runable.run(tCtx) } -func (e *WorkloadExecutor) doStartCollectingMetricsOp(opIndex int, concreteOp *startCollectingMetricsOp) { +func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) { if *e.collectorCtx != nil { (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector) + *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From cc87cb54ab7cd3677bbc0cf5280f4b15f82b445f Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:13:57 +0900 Subject: [PATCH 08/25] delete unneccesary define --- test/integration/scheduler_perf/scheduler_perf.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index cc11f3d0fbb..4a952971921 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1654,9 +1654,8 @@ func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { tCtx := *e.tCtx collectorCtx := *e.tCtx - var namespace string // define Pod's namespace automatically, and create that namespace. - namespace = fmt.Sprintf("namespace-%d", opIndex) + namespace := fmt.Sprintf("namespace-%d", opIndex) if op.Namespace != nil { namespace = *op.Namespace } From a9ee6bdf81ae364cdc732704dd09923bb78286a9 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:33:13 +0900 Subject: [PATCH 09/25] use *e.tCtx --- .../scheduler_perf/scheduler_perf.go | 101 ++++++++---------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 4a952971921..1a126c8de48 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1579,29 +1579,27 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact } func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) { - tCtx := *e.tCtx - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, tCtx.Client()) + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, (*e.tCtx).Client()) if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } - if err := nodePreparer.PrepareNodes((*e.tCtx), e.nextNodeIndex); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + if err := nodePreparer.PrepareNodes(*e.tCtx, e.nextNodeIndex); err != nil { + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } e.nextNodeIndex += op.Count } func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) { - tCtx := *e.tCtx - nsPreparer, err := newNamespacePreparer(tCtx, op) + nsPreparer, err := newNamespacePreparer(*e.tCtx, op) if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } - if err := nsPreparer.prepare(tCtx); err != nil { - err2 := nsPreparer.cleanup(tCtx) + if err := nsPreparer.prepare(*e.tCtx); err != nil { + err2 := nsPreparer.cleanup(*e.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) } - tCtx.Fatalf("op %d: %v", opIndex, err) + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { if _, ok := e.numPodsScheduledPerNamespace[n]; ok { @@ -1613,23 +1611,22 @@ func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespace } func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { - tCtx := *e.tCtx for _, namespace := range op.Namespaces { if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { - tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + (*e.tCtx).Fatalf("op %d: unknown namespace %s", opIndex, namespace) } } switch op.StageRequirement { case Attempted: - if err := waitUntilPodsAttempted(tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsAttempted(*e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: - if err := waitUntilPodsScheduled(tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsScheduled(*e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. @@ -1644,34 +1641,31 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { } func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { - tCtx := *e.tCtx collectorCtx := *e.collectorCtx - items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(*e.tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) collectorCtx = nil } func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { - tCtx := *e.tCtx - collectorCtx := *e.tCtx // define Pod's namespace automatically, and create that namespace. namespace := fmt.Sprintf("namespace-%d", opIndex) if op.Namespace != nil { namespace = *op.Namespace } - createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(*e.tCtx, namespace, &e.numPodsScheduledPerNamespace) if op.PodTemplatePath == nil { op.PodTemplatePath = e.testCase.DefaultPodTemplatePath } if op.CollectMetrics { if *e.collectorCtx != nil { - tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics(tCtx, e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + *e.collectorCtx, e.collectors = startCollectingMetrics(*e.tCtx, e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) } - if err := createPodsRapidly(tCtx, namespace, op); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsRapidly(*e.tCtx, namespace, op); err != nil { + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } switch { case op.SkipWaitToCompletion: @@ -1679,31 +1673,30 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { // in the future. e.numPodsScheduledPerNamespace[namespace] += op.Count case op.SteadyState: - if err := createPodsSteadily(tCtx, namespace, e.podInformer, op); err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + if err := createPodsSteadily(*e.tCtx, namespace, e.podInformer, op); err != nil { + (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, op.Count); err != nil { - tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + if err := waitUntilPodsScheduledInNamespace(*e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { + (*e.tCtx).Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } if op.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics((*e.tCtx), (*e.collectorCtx), e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) *e.collectorCtx = nil } } func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { - tCtx := *e.tCtx labelSelector := labels.ValidatedSetSelector(op.LabelSelector) podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) + (*e.tCtx).Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) } deletePods := func(opIndex int) { @@ -1714,13 +1707,13 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { for i := 0; i < len(podsToDelete); i++ { select { case <-ticker.C: - if err := tCtx.Client().CoreV1().Pods(op.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if err := (*e.tCtx).Client().CoreV1().Pods(op.Namespace).Delete(*e.tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { if errors.Is(err, context.Canceled) { return } - tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) + (*e.tCtx).Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) } - case <-tCtx.Done(): + case <-(*e.tCtx).Done(): return } } @@ -1729,11 +1722,11 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { listOpts := metav1.ListOptions{ LabelSelector: labelSelector.String(), } - if err := tCtx.Client().CoreV1().Pods(op.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if err := (*e.tCtx).Client().CoreV1().Pods(op.Namespace).DeleteCollection(*e.tCtx, metav1.DeleteOptions{}, listOpts); err != nil { if errors.Is(err, context.Canceled) { return } - tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, op.Namespace, err) + (*e.tCtx).Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, op.Namespace, err) } } @@ -1749,18 +1742,17 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { } func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { - tCtx := *e.tCtx var namespace string if op.Namespace != nil { namespace = *op.Namespace } else { namespace = fmt.Sprintf("namespace-%d", opIndex) } - restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(tCtx.Client().Discovery())) + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient((*e.tCtx).Client().Discovery())) // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} - if _, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + if _, err := (*e.tCtx).Client().CoreV1().Namespaces().Create(*e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + (*e.tCtx).Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1768,31 +1760,31 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + (*e.tCtx).Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + (*e.tCtx).Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. var dynRes dynamic.ResourceInterface if mapping.Scope.Name() == meta.RESTScopeNameNamespace { - dynRes = tCtx.Dynamic().Resource(gvr).Namespace(namespace) + dynRes = (*e.tCtx).Dynamic().Resource(gvr).Namespace(namespace) } else { - dynRes = tCtx.Dynamic().Resource(gvr) + dynRes = (*e.tCtx).Dynamic().Resource(gvr) } churnFns = append(churnFns, func(name string) string { if name != "" { - if err := dynRes.Delete(tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { - tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) + if err := dynRes.Delete(*e.tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { + (*e.tCtx).Errorf("op %d: unable to delete %v: %v", opIndex, name, err) } return "" } - live, err := dynRes.Create(tCtx, unstructuredObj, metav1.CreateOptions{}) + live, err := dynRes.Create(*e.tCtx, unstructuredObj, metav1.CreateOptions{}) if err != nil { return "" } @@ -1823,7 +1815,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { churnFns[i]("") } count++ - case <-tCtx.Done(): + case <-(*e.tCtx).Done(): return } } @@ -1847,7 +1839,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { retVals[i][count%op.Number] = churnFns[i](retVals[i][count%op.Number]) } count++ - case <-tCtx.Done(): + case <-(*e.tCtx).Done(): return } } @@ -1856,15 +1848,14 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { } func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) { - tCtx := *e.tCtx runable, ok := op.(runnableOp) if !ok { - tCtx.Fatalf("op %d: invalid op %v", opIndex, op) + (*e.tCtx).Fatalf("op %d: invalid op %v", opIndex, op) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(tCtx, namespace, &e.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(*e.tCtx, namespace, &e.numPodsScheduledPerNamespace) } - runable.run(tCtx) + runable.run(*e.tCtx) } func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) { From 6d291ddc21b7420e4385bbe2008b0ae9e4760707 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:41:00 +0900 Subject: [PATCH 10/25] fix lint --- test/integration/scheduler_perf/scheduler_perf.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 1a126c8de48..318d181c0a5 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1597,7 +1597,7 @@ func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespace if err := nsPreparer.prepare(*e.tCtx); err != nil { err2 := nsPreparer.cleanup(*e.tCtx) if err2 != nil { - err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2) + err = fmt.Errorf("prepare: %w; cleanup: %w", err, err2) } (*e.tCtx).Fatalf("op %d: %v", opIndex, err) } @@ -1641,10 +1641,9 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { } func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { - collectorCtx := *e.collectorCtx - items := stopCollectingMetrics(*e.tCtx, collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(*e.tCtx, *e.collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) - collectorCtx = nil + *e.collectorCtx = nil } func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { From ca8a0f5f1be9ca615297a7fe682d0c5aa3cfb4bc Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 13 Feb 2025 23:46:43 +0900 Subject: [PATCH 11/25] separete sleep func --- test/integration/scheduler_perf/scheduler_perf.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 318d181c0a5..312184fe7be 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1553,10 +1553,7 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact case *barrierOp: executor.runBarrierOp(opIndex, concreteOp) case *sleepOp: - select { - case <-tCtx.Done(): - case <-time.After(concreteOp.Duration.Duration): - } + executor.runSleepOp(concreteOp) case *startCollectingMetricsOp: executor.runStartCollectingMetricsOp(opIndex, concreteOp) defer (*executor.collectorCtx).Cancel("cleaning up") @@ -1640,6 +1637,13 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { } } +func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { + select { + case <-(*e.tCtx).Done(): + case <-time.After(op.Duration.Duration): + } +} + func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { items := stopCollectingMetrics(*e.tCtx, *e.collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) From fcce8aaad8a69d0c680c088c79b8a159e2cdbc97 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sun, 16 Feb 2025 23:42:20 +0900 Subject: [PATCH 12/25] workloadExecutor's member use value not pointer --- .../scheduler_perf/scheduler_perf.go | 134 +++++++++--------- 1 file changed, 70 insertions(+), 64 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 312184fe7be..59581470d30 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1451,10 +1451,10 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex } type WorkloadExecutor struct { - tCtx *ktesting.TContext - wg *sync.WaitGroup - collectorCtx *ktesting.TContext - collectorWG *sync.WaitGroup + tCtx ktesting.TContext + wg sync.WaitGroup + collectorCtx ktesting.TContext + collectorWG sync.WaitGroup collectors []testDataCollector dataItems []DataItem numPodsScheduledPerNamespace map[string]int @@ -1511,10 +1511,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact defer collectorWG.Wait() executor := WorkloadExecutor{ - tCtx: &tCtx, - wg: &wg, - collectorCtx: &collectorCtx, - collectorWG: &collectorWG, + tCtx: tCtx, + wg: wg, + collectorCtx: collectorCtx, + collectorWG: collectorWG, collectors: collectors, numPodsScheduledPerNamespace: make(map[string]int), podInformer: podInformer, @@ -1543,8 +1543,8 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact executor.runCreateNamespaceOp(opIndex, concreteOp) case *createPodsOp: executor.runCreatePodsOp(opIndex, concreteOp) - if *executor.collectorCtx != nil { - defer (*executor.collectorCtx).Cancel("cleaning up") + if executor.collectorCtx != nil { + executor.collectorCtx.Cancel("cleaning up") } case *deletePodsOp: executor.runDeletePodsOp(opIndex, concreteOp) @@ -1556,7 +1556,7 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact executor.runSleepOp(concreteOp) case *startCollectingMetricsOp: executor.runStartCollectingMetricsOp(opIndex, concreteOp) - defer (*executor.collectorCtx).Cancel("cleaning up") + defer executor.collectorCtx.Cancel("cleaning up") case *stopCollectingMetricsOp: executor.runStopCollectingMetrics(opIndex) default: @@ -1576,27 +1576,27 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact } func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) { - nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, (*e.tCtx).Client()) + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, e.tCtx.Client()) if err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + e.tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nodePreparer.PrepareNodes(*e.tCtx, e.nextNodeIndex); err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + if err := nodePreparer.PrepareNodes(e.tCtx, e.nextNodeIndex); err != nil { + e.tCtx.Fatalf("op %d: %v", opIndex, err) } e.nextNodeIndex += op.Count } func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) { - nsPreparer, err := newNamespacePreparer(*e.tCtx, op) + nsPreparer, err := newNamespacePreparer(e.tCtx, op) if err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + e.tCtx.Fatalf("op %d: %v", opIndex, err) } - if err := nsPreparer.prepare(*e.tCtx); err != nil { - err2 := nsPreparer.cleanup(*e.tCtx) + if err := nsPreparer.prepare(e.tCtx); err != nil { + err2 := nsPreparer.cleanup(e.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %w; cleanup: %w", err, err2) } - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + e.tCtx.Fatalf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { if _, ok := e.numPodsScheduledPerNamespace[n]; ok { @@ -1610,20 +1610,20 @@ func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespace func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { for _, namespace := range op.Namespaces { if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { - (*e.tCtx).Fatalf("op %d: unknown namespace %s", opIndex, namespace) + e.tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) } } switch op.StageRequirement { case Attempted: - if err := waitUntilPodsAttempted(*e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsAttempted(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + e.tCtx.Fatalf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: - if err := waitUntilPodsScheduled(*e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + if err := waitUntilPodsScheduled(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { + e.tCtx.Fatalf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. @@ -1639,15 +1639,15 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { select { - case <-(*e.tCtx).Done(): + case <-(e.tCtx).Done(): case <-time.After(op.Duration.Duration): } } func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { - items := stopCollectingMetrics(*e.tCtx, *e.collectorCtx, e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) - *e.collectorCtx = nil + e.collectorCtx = nil } func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { @@ -1656,19 +1656,22 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { if op.Namespace != nil { namespace = *op.Namespace } - createNamespaceIfNotPresent(*e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) if op.PodTemplatePath == nil { op.PodTemplatePath = e.testCase.DefaultPodTemplatePath } if op.CollectMetrics { - if *e.collectorCtx != nil { - (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + if e.collectorCtx != nil { + e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics(*e.tCtx, e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + // e.collectorCtx.Cleanup(func() { + // e.collectorCtx.Cancel("cleaning up") + // }) } - if err := createPodsRapidly(*e.tCtx, namespace, op); err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { + e.tCtx.Fatalf("op %d: %v", opIndex, err) } switch { case op.SkipWaitToCompletion: @@ -1676,21 +1679,21 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { // in the future. e.numPodsScheduledPerNamespace[namespace] += op.Count case op.SteadyState: - if err := createPodsSteadily(*e.tCtx, namespace, e.podInformer, op); err != nil { - (*e.tCtx).Fatalf("op %d: %v", opIndex, err) + if err := createPodsSteadily(e.tCtx, namespace, e.podInformer, op); err != nil { + e.tCtx.Fatalf("op %d: %v", opIndex, err) } default: - if err := waitUntilPodsScheduledInNamespace(*e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { - (*e.tCtx).Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + if err := waitUntilPodsScheduledInNamespace(e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { + e.tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } if op.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics((*e.tCtx), (*e.collectorCtx), e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) e.dataItems = append(e.dataItems, items...) - *e.collectorCtx = nil + e.collectorCtx = nil } } @@ -1699,7 +1702,7 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - (*e.tCtx).Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) + e.tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) } deletePods := func(opIndex int) { @@ -1710,13 +1713,13 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { for i := 0; i < len(podsToDelete); i++ { select { case <-ticker.C: - if err := (*e.tCtx).Client().CoreV1().Pods(op.Namespace).Delete(*e.tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { + if err := e.tCtx.Client().CoreV1().Pods(op.Namespace).Delete(e.tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil { if errors.Is(err, context.Canceled) { return } - (*e.tCtx).Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) + e.tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) } - case <-(*e.tCtx).Done(): + case <-(e.tCtx).Done(): return } } @@ -1725,11 +1728,11 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { listOpts := metav1.ListOptions{ LabelSelector: labelSelector.String(), } - if err := (*e.tCtx).Client().CoreV1().Pods(op.Namespace).DeleteCollection(*e.tCtx, metav1.DeleteOptions{}, listOpts); err != nil { + if err := e.tCtx.Client().CoreV1().Pods(op.Namespace).DeleteCollection(e.tCtx, metav1.DeleteOptions{}, listOpts); err != nil { if errors.Is(err, context.Canceled) { return } - (*e.tCtx).Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, op.Namespace, err) + e.tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, op.Namespace, err) } } @@ -1751,11 +1754,11 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { } else { namespace = fmt.Sprintf("namespace-%d", opIndex) } - restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient((*e.tCtx).Client().Discovery())) + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(e.tCtx.Client().Discovery())) // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} - if _, err := (*e.tCtx).Client().CoreV1().Namespaces().Create(*e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - (*e.tCtx).Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + if _, err := e.tCtx.Client().CoreV1().Namespaces().Create(e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + e.tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1763,31 +1766,31 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - (*e.tCtx).Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + e.tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - (*e.tCtx).Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + e.tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. var dynRes dynamic.ResourceInterface if mapping.Scope.Name() == meta.RESTScopeNameNamespace { - dynRes = (*e.tCtx).Dynamic().Resource(gvr).Namespace(namespace) + dynRes = e.tCtx.Dynamic().Resource(gvr).Namespace(namespace) } else { - dynRes = (*e.tCtx).Dynamic().Resource(gvr) + dynRes = e.tCtx.Dynamic().Resource(gvr) } churnFns = append(churnFns, func(name string) string { if name != "" { - if err := dynRes.Delete(*e.tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { - (*e.tCtx).Errorf("op %d: unable to delete %v: %v", opIndex, name, err) + if err := dynRes.Delete(e.tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) { + e.tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err) } return "" } - live, err := dynRes.Create(*e.tCtx, unstructuredObj, metav1.CreateOptions{}) + live, err := dynRes.Create(e.tCtx, unstructuredObj, metav1.CreateOptions{}) if err != nil { return "" } @@ -1818,7 +1821,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { churnFns[i]("") } count++ - case <-(*e.tCtx).Done(): + case <-(e.tCtx).Done(): return } } @@ -1842,7 +1845,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { retVals[i][count%op.Number] = churnFns[i](retVals[i][count%op.Number]) } count++ - case <-(*e.tCtx).Done(): + case <-(e.tCtx).Done(): return } } @@ -1853,19 +1856,22 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) { runable, ok := op.(runnableOp) if !ok { - (*e.tCtx).Fatalf("op %d: invalid op %v", opIndex, op) + e.tCtx.Fatalf("op %d: invalid op %v", opIndex, op) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(*e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) } - runable.run(*e.tCtx) + runable.run(e.tCtx) } func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) { - if *e.collectorCtx != nil { - (*e.tCtx).Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + if e.collectorCtx != nil { + e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } - *e.collectorCtx, e.collectors = startCollectingMetrics((*e.tCtx), e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) + e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) + // e.collectorCtx.Cleanup(func() { + // collectorCtx.Cancel("cleaning up") + // }) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From b1f6cfcfae0448e94857c792067f154194ac9adb Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sat, 22 Feb 2025 20:17:27 +0900 Subject: [PATCH 13/25] change defer order to pass test --- .../scheduler_perf/scheduler_perf.go | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 59581470d30..c05df5bfb20 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1496,9 +1496,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) - var wg sync.WaitGroup - defer wg.Wait() - defer tCtx.Cancel("workload is done") var dataItems []DataItem @@ -1507,14 +1504,12 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // the metrics collecting needs to be sure that the goroutines // are stopped. var collectorCtx ktesting.TContext - var collectorWG sync.WaitGroup - defer collectorWG.Wait() executor := WorkloadExecutor{ tCtx: tCtx, - wg: wg, + wg: sync.WaitGroup{}, collectorCtx: collectorCtx, - collectorWG: collectorWG, + collectorWG: sync.WaitGroup{}, collectors: collectors, numPodsScheduledPerNamespace: make(map[string]int), podInformer: podInformer, @@ -1525,6 +1520,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact dataItems: dataItems, } + defer executor.wg.Wait() + defer executor.collectorWG.Wait() + defer tCtx.Cancel("workload is done") + for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { realOp, err := op.realOp.patchParams(w) if err != nil { @@ -1538,14 +1537,10 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact switch concreteOp := realOp.(type) { case *createNodesOp: executor.runCreateNodesOp(opIndex, concreteOp) - case *createNamespacesOp: executor.runCreateNamespaceOp(opIndex, concreteOp) case *createPodsOp: executor.runCreatePodsOp(opIndex, concreteOp) - if executor.collectorCtx != nil { - executor.collectorCtx.Cancel("cleaning up") - } case *deletePodsOp: executor.runDeletePodsOp(opIndex, concreteOp) case *churnOp: @@ -1666,9 +1661,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) - // e.collectorCtx.Cleanup(func() { - // e.collectorCtx.Cancel("cleaning up") - // }) + defer e.collectorCtx.Cancel("cleaning up") } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { e.tCtx.Fatalf("op %d: %v", opIndex, err) @@ -1869,9 +1862,6 @@ func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCol e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) - // e.collectorCtx.Cleanup(func() { - // collectorCtx.Cancel("cleaning up") - // }) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From 45b323d6a5fdc8da28a5dca0f9e695d00ce647dd Mon Sep 17 00:00:00 2001 From: YamasouA Date: Wed, 26 Feb 2025 23:45:42 +0900 Subject: [PATCH 14/25] use Cleanup func --- test/integration/scheduler_perf/scheduler_perf.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index c05df5bfb20..c01f4f62fbf 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1551,7 +1551,6 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact executor.runSleepOp(concreteOp) case *startCollectingMetricsOp: executor.runStartCollectingMetricsOp(opIndex, concreteOp) - defer executor.collectorCtx.Cancel("cleaning up") case *stopCollectingMetricsOp: executor.runStopCollectingMetrics(opIndex) default: @@ -1661,7 +1660,11 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) - defer e.collectorCtx.Cancel("cleaning up") + e.tCtx.TB().Cleanup(func() { + if e.collectorCtx != nil { + e.collectorCtx.Cancel("cleaning up") + } + }) } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { e.tCtx.Fatalf("op %d: %v", opIndex, err) @@ -1862,6 +1865,11 @@ func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCol e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) } e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) + e.tCtx.TB().Cleanup(func() { + if e.collectorCtx != nil { + e.collectorCtx.Cancel("cleaning up") + } + }) } func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { From f214d8e27a0323a333bc0f7c8b52460e8f58f546 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Thu, 27 Feb 2025 00:10:24 +0900 Subject: [PATCH 15/25] delete unnecessary init --- .../scheduler_perf/scheduler_perf.go | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index c01f4f62fbf..dc8a00dd9b4 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1497,27 +1497,13 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // Everything else started by this function gets stopped before it returns. tCtx = ktesting.WithCancel(tCtx) - var dataItems []DataItem - - var collectors []testDataCollector - // This needs a separate context and wait group because - // the metrics collecting needs to be sure that the goroutines - // are stopped. - var collectorCtx ktesting.TContext - executor := WorkloadExecutor{ tCtx: tCtx, - wg: sync.WaitGroup{}, - collectorCtx: collectorCtx, - collectorWG: sync.WaitGroup{}, - collectors: collectors, numPodsScheduledPerNamespace: make(map[string]int), podInformer: podInformer, throughputErrorMargin: throughputErrorMargin, testCase: tc, workload: w, - nextNodeIndex: 0, - dataItems: dataItems, } defer executor.wg.Wait() @@ -1566,7 +1552,7 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact // Some tests have unschedulable pods. Do not add an implicit barrier at the // end as we do not want to wait for them. - return dataItems + return executor.dataItems } func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) { @@ -1715,7 +1701,7 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { } e.tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err) } - case <-(e.tCtx).Done(): + case <-e.tCtx.Done(): return } } From 038b90d475ee6f1bed43c3c8db12d586e0ba9772 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Fri, 28 Feb 2025 00:08:07 +0900 Subject: [PATCH 16/25] return error instead of fatalf --- .../scheduler_perf/scheduler_perf.go | 136 +++++++++++------- 1 file changed, 83 insertions(+), 53 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index dc8a00dd9b4..0cda120ff2f 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1194,7 +1194,10 @@ func RunBenchmarkPerfScheduling(b *testing.B, configFile string, topicName strin b.Fatalf("workload %s is not valid: %v", w.Name, err) } - results := runWorkload(tCtx, tc, w, informerFactory) + results, err := runWorkload(tCtx, tc, w, informerFactory) + if err != nil { + tCtx.Fatalf("%w: %s", w.Name, err) + } dataItems.DataItems = append(dataItems.DataItems, results...) if len(results) > 0 { @@ -1410,7 +1413,7 @@ func checkEmptyInFlightEvents() error { return nil } -func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, mcc *metricsCollectorConfig, throughputErrorMargin float64, opIndex int, name string, namespaces []string, labelSelector map[string]string) (ktesting.TContext, []testDataCollector) { +func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, mcc *metricsCollectorConfig, throughputErrorMargin float64, opIndex int, name string, namespaces []string, labelSelector map[string]string) (ktesting.TContext, []testDataCollector, error) { collectorCtx := ktesting.WithCancel(tCtx) workloadName := tCtx.Name() // The first part is the same for each workload, therefore we can strip it. @@ -1421,7 +1424,7 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, collector := collector err := collector.init() if err != nil { - tCtx.Fatalf("op %d: Failed to initialize data collector: %v", opIndex, err) + return nil, nil, fmt.Errorf("op %d: Failed to initialize data collector: %v", opIndex, err) } collectorWG.Add(1) go func() { @@ -1429,12 +1432,12 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, collector.run(collectorCtx) }() } - return collectorCtx, collectors + return collectorCtx, collectors, nil } -func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContext, collectorWG *sync.WaitGroup, threshold float64, tms thresholdMetricSelector, opIndex int, collectors []testDataCollector) []DataItem { +func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContext, collectorWG *sync.WaitGroup, threshold float64, tms thresholdMetricSelector, opIndex int, collectors []testDataCollector) ([]DataItem, error) { if collectorCtx == nil { - tCtx.Fatalf("op %d: Missing startCollectingMetrics operation before stopping", opIndex) + return nil, fmt.Errorf("op %d: Missing startCollectingMetrics operation before stopping", opIndex) } collectorCtx.Cancel("collecting metrics, collector must stop first") collectorWG.Wait() @@ -1447,7 +1450,7 @@ func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContex tCtx.Errorf("op %d: %s", opIndex, err) } } - return dataItems + return dataItems, nil } type WorkloadExecutor struct { @@ -1465,7 +1468,7 @@ type WorkloadExecutor struct { nextNodeIndex int } -func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem { +func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) ([]DataItem, error) { b, benchmarking := tCtx.TB().(*testing.B) if benchmarking { start := time.Now() @@ -1513,70 +1516,74 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { realOp, err := op.realOp.patchParams(w) if err != nil { - tCtx.Fatalf("op %d: %v", opIndex, err) + return nil, fmt.Errorf("op %d: %v", opIndex, err) } select { case <-tCtx.Done(): - tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx)) + return nil, fmt.Errorf("op %d: %v", opIndex, context.Cause(tCtx)) default: } switch concreteOp := realOp.(type) { case *createNodesOp: - executor.runCreateNodesOp(opIndex, concreteOp) + err = executor.runCreateNodesOp(opIndex, concreteOp) case *createNamespacesOp: - executor.runCreateNamespaceOp(opIndex, concreteOp) + err = executor.runCreateNamespaceOp(opIndex, concreteOp) case *createPodsOp: - executor.runCreatePodsOp(opIndex, concreteOp) + err = executor.runCreatePodsOp(opIndex, concreteOp) case *deletePodsOp: - executor.runDeletePodsOp(opIndex, concreteOp) + err = executor.runDeletePodsOp(opIndex, concreteOp) case *churnOp: - executor.runChurnOp(opIndex, concreteOp) + err = executor.runChurnOp(opIndex, concreteOp) case *barrierOp: - executor.runBarrierOp(opIndex, concreteOp) + err = executor.runBarrierOp(opIndex, concreteOp) case *sleepOp: executor.runSleepOp(concreteOp) case *startCollectingMetricsOp: - executor.runStartCollectingMetricsOp(opIndex, concreteOp) + err = executor.runStartCollectingMetricsOp(opIndex, concreteOp) case *stopCollectingMetricsOp: - executor.runStopCollectingMetrics(opIndex) + err = executor.runStopCollectingMetrics(opIndex) default: - executor.runDefaultOp(opIndex, concreteOp) + err = executor.runDefaultOp(opIndex, concreteOp) + } + if err != nil { + return nil, err } } // check unused params and inform users unusedParams := w.unusedParams() if len(unusedParams) != 0 { - tCtx.Fatalf("the parameters %v are defined on workload %s, but unused.\nPlease make sure there are no typos.", unusedParams, w.Name) + return nil, fmt.Errorf("the parameters %v are defined on workload %s, but unused.\nPlease make sure there are no typos.", unusedParams, w.Name) } // Some tests have unschedulable pods. Do not add an implicit barrier at the // end as we do not want to wait for them. - return executor.dataItems + return executor.dataItems, nil } -func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) { +func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) error { nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, e.tCtx.Client()) if err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } if err := nodePreparer.PrepareNodes(e.tCtx, e.nextNodeIndex); err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } e.nextNodeIndex += op.Count + return nil } -func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) { +func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) error { nsPreparer, err := newNamespacePreparer(e.tCtx, op) if err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } if err := nsPreparer.prepare(e.tCtx); err != nil { err2 := nsPreparer.cleanup(e.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %w; cleanup: %w", err, err2) } - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } for _, n := range nsPreparer.namespaces() { if _, ok := e.numPodsScheduledPerNamespace[n]; ok { @@ -1585,25 +1592,26 @@ func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespace } e.numPodsScheduledPerNamespace[n] = 0 } + return nil } -func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { +func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) error { for _, namespace := range op.Namespaces { if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { - e.tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + return fmt.Errorf("op %d: unknown namespace %s", opIndex, namespace) } } switch op.StageRequirement { case Attempted: if err := waitUntilPodsAttempted(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: if err := waitUntilPodsScheduled(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. @@ -1615,6 +1623,7 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) { } } } + return nil } func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { @@ -1624,13 +1633,17 @@ func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { } } -func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) { - items := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) +func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) error { + items, err := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + if err != nil { + return err + } e.dataItems = append(e.dataItems, items...) e.collectorCtx = nil + return nil } -func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { +func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error { // define Pod's namespace automatically, and create that namespace. namespace := fmt.Sprintf("namespace-%d", opIndex) if op.Namespace != nil { @@ -1643,9 +1656,13 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { if op.CollectMetrics { if e.collectorCtx != nil { - e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + return fmt.Errorf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + } + var err error + e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) + if err != nil { + return err } - e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) e.tCtx.TB().Cleanup(func() { if e.collectorCtx != nil { e.collectorCtx.Cancel("cleaning up") @@ -1653,7 +1670,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { }) } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } switch { case op.SkipWaitToCompletion: @@ -1662,29 +1679,33 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) { e.numPodsScheduledPerNamespace[namespace] += op.Count case op.SteadyState: if err := createPodsSteadily(e.tCtx, namespace, e.podInformer, op); err != nil { - e.tCtx.Fatalf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %v", opIndex, err) } default: if err := waitUntilPodsScheduledInNamespace(e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { - e.tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + return fmt.Errorf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) } } if op.CollectMetrics { // CollectMetrics and SkipWaitToCompletion can never be true at the // same time, so if we're here, it means that all pods have been // scheduled. - items := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + items, err := stopCollectingMetrics(e.tCtx, e.collectorCtx, &e.collectorWG, e.workload.Threshold, *e.workload.ThresholdMetricSelector, opIndex, e.collectors) + if err != nil { + return err + } e.dataItems = append(e.dataItems, items...) e.collectorCtx = nil } + return nil } -func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { +func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) error { labelSelector := labels.ValidatedSetSelector(op.LabelSelector) podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - e.tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) + return fmt.Errorf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) } deletePods := func(opIndex int) { @@ -1727,9 +1748,10 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) { } else { deletePods(opIndex) } + return nil } -func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { +func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { var namespace string if op.Namespace != nil { namespace = *op.Namespace @@ -1740,7 +1762,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} if _, err := e.tCtx.Client().CoreV1().Namespaces().Create(e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - e.tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + return fmt.Errorf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1748,12 +1770,12 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - e.tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + return fmt.Errorf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - e.tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + return fmt.Errorf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. @@ -1833,41 +1855,49 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) { } }() } + return nil } -func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) { +func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) error { runable, ok := op.(runnableOp) if !ok { - e.tCtx.Fatalf("op %d: invalid op %v", opIndex, op) + return fmt.Errorf("op %d: invalid op %v", opIndex, op) } for _, namespace := range runable.requiredNamespaces() { createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) } runable.run(e.tCtx) + return nil } -func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) { +func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) error { if e.collectorCtx != nil { - e.tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + return fmt.Errorf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + } + var err error + e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) + if err != nil { + return err } - e.collectorCtx, e.collectors = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) e.tCtx.TB().Cleanup(func() { if e.collectorCtx != nil { e.collectorCtx.Cancel("cleaning up") } }) + return nil } -func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) { +func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) error { if _, ok := (*podsPerNamespace)[namespace]; !ok { // The namespace has not created yet. // So, create that and register it. _, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}}, metav1.CreateOptions{}) if err != nil { - tCtx.Fatalf("failed to create namespace for Pod: %v", namespace) + return fmt.Errorf("failed to create namespace for Pod: %v", namespace) } (*podsPerNamespace)[namespace] = 0 } + return nil } type testDataCollector interface { From bee19638f198131e8041e8792040a419f3c544fb Mon Sep 17 00:00:00 2001 From: YamasouA Date: Fri, 28 Feb 2025 22:45:01 +0900 Subject: [PATCH 17/25] tweak --- test/integration/scheduler_perf/scheduler_perf.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 0cda120ff2f..192c6974b91 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1628,7 +1628,7 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) error { func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { select { - case <-(e.tCtx).Done(): + case <-e.tCtx.Done(): case <-time.After(op.Duration.Duration): } } @@ -1825,7 +1825,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { churnFns[i]("") } count++ - case <-(e.tCtx).Done(): + case <-e.tCtx.Done(): return } } @@ -1849,7 +1849,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { retVals[i][count%op.Number] = churnFns[i](retVals[i][count%op.Number]) } count++ - case <-(e.tCtx).Done(): + case <-e.tCtx.Done(): return } } From 486d12efc5f28a9116b2e4cb31fb9f0404faf19e Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sat, 1 Mar 2025 00:14:49 +0900 Subject: [PATCH 18/25] call cleanup func position change --- test/integration/scheduler_perf/scheduler_perf.go | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 192c6974b91..1cc7f7d752a 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1426,6 +1426,9 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, if err != nil { return nil, nil, fmt.Errorf("op %d: Failed to initialize data collector: %v", opIndex, err) } + tCtx.TB().Cleanup(func() { + collectorCtx.Cancel("cleaning up") + }) collectorWG.Add(1) go func() { defer collectorWG.Done() @@ -1663,11 +1666,6 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error if err != nil { return err } - e.tCtx.TB().Cleanup(func() { - if e.collectorCtx != nil { - e.collectorCtx.Cancel("cleaning up") - } - }) } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { return fmt.Errorf("op %d: %v", opIndex, err) @@ -1879,11 +1877,6 @@ func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCol if err != nil { return err } - e.tCtx.TB().Cleanup(func() { - if e.collectorCtx != nil { - e.collectorCtx.Cancel("cleaning up") - } - }) return nil } From 75b09b405405abaa5035c3a34dbba12417fe0415 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sat, 1 Mar 2025 11:24:23 +0900 Subject: [PATCH 19/25] separete runOp --- .../scheduler_perf/scheduler_perf.go | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 1cc7f7d752a..bca71f9b061 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1526,30 +1526,9 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return nil, fmt.Errorf("op %d: %v", opIndex, context.Cause(tCtx)) default: } - switch concreteOp := realOp.(type) { - case *createNodesOp: - err = executor.runCreateNodesOp(opIndex, concreteOp) - case *createNamespacesOp: - err = executor.runCreateNamespaceOp(opIndex, concreteOp) - case *createPodsOp: - err = executor.runCreatePodsOp(opIndex, concreteOp) - case *deletePodsOp: - err = executor.runDeletePodsOp(opIndex, concreteOp) - case *churnOp: - err = executor.runChurnOp(opIndex, concreteOp) - case *barrierOp: - err = executor.runBarrierOp(opIndex, concreteOp) - case *sleepOp: - executor.runSleepOp(concreteOp) - case *startCollectingMetricsOp: - err = executor.runStartCollectingMetricsOp(opIndex, concreteOp) - case *stopCollectingMetricsOp: - err = executor.runStopCollectingMetrics(opIndex) - default: - err = executor.runDefaultOp(opIndex, concreteOp) - } + err = executor.runOp(realOp, opIndex) if err != nil { - return nil, err + return nil, fmt.Errorf("op %d: %v", opIndex, err) } } @@ -1564,6 +1543,31 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact return executor.dataItems, nil } +func (e *WorkloadExecutor) runOp(op realOp, opIndex int) error { + switch concreteOp := op.(type) { + case *createNodesOp: + return e.runCreateNodesOp(opIndex, concreteOp) + case *createNamespacesOp: + return e.runCreateNamespaceOp(opIndex, concreteOp) + case *createPodsOp: + return e.runCreatePodsOp(opIndex, concreteOp) + case *deletePodsOp: + return e.runDeletePodsOp(opIndex, concreteOp) + case *churnOp: + return e.runChurnOp(opIndex, concreteOp) + case *barrierOp: + return e.runBarrierOp(opIndex, concreteOp) + case *sleepOp: + return e.runSleepOp(concreteOp) + case *startCollectingMetricsOp: + return e.runStartCollectingMetricsOp(opIndex, concreteOp) + case *stopCollectingMetricsOp: + return e.runStopCollectingMetrics(opIndex) + default: + return e.runDefaultOp(opIndex, concreteOp) + } +} + func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) error { nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, e.tCtx.Client()) if err != nil { @@ -1629,11 +1633,12 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) error { return nil } -func (e *WorkloadExecutor) runSleepOp(op *sleepOp) { +func (e *WorkloadExecutor) runSleepOp(op *sleepOp) error { select { case <-e.tCtx.Done(): case <-time.After(op.Duration.Duration): } + return nil } func (e *WorkloadExecutor) runStopCollectingMetrics(opIndex int) error { From 2bed3333bc5806387691559d3af8fde08a755cf0 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sat, 1 Mar 2025 12:07:40 +0900 Subject: [PATCH 20/25] fix lint error --- .../scheduler_perf/scheduler_perf.go | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index bca71f9b061..3cc6bba406a 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1196,7 +1196,7 @@ func RunBenchmarkPerfScheduling(b *testing.B, configFile string, topicName strin results, err := runWorkload(tCtx, tc, w, informerFactory) if err != nil { - tCtx.Fatalf("%w: %s", w.Name, err) + tCtx.Fatalf("Error running workload %s: %s", w.Name, err) } dataItems.DataItems = append(dataItems.DataItems, results...) @@ -1295,7 +1295,10 @@ func RunIntegrationPerfScheduling(t *testing.T, configFile string) { t.Fatalf("workload %s is not valid: %v", w.Name, err) } - runWorkload(tCtx, tc, w, informerFactory) + _, err = runWorkload(tCtx, tc, w, informerFactory) + if err != nil { + tCtx.Fatalf("Error running workload %s: %s", w.Name, err) + } if featureGates[features.SchedulerQueueingHints] { // In any case, we should make sure InFlightEvents is empty after running the scenario. @@ -1424,7 +1427,7 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, collector := collector err := collector.init() if err != nil { - return nil, nil, fmt.Errorf("op %d: Failed to initialize data collector: %v", opIndex, err) + return nil, nil, fmt.Errorf("op %d: Failed to initialize data collector: %w", opIndex, err) } tCtx.TB().Cleanup(func() { collectorCtx.Cancel("cleaning up") @@ -1519,7 +1522,7 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { realOp, err := op.realOp.patchParams(w) if err != nil { - return nil, fmt.Errorf("op %d: %v", opIndex, err) + return nil, fmt.Errorf("op %d: %w", opIndex, err) } select { case <-tCtx.Done(): @@ -1528,14 +1531,14 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact } err = executor.runOp(realOp, opIndex) if err != nil { - return nil, fmt.Errorf("op %d: %v", opIndex, err) + return nil, fmt.Errorf("op %d: %w", opIndex, err) } } // check unused params and inform users unusedParams := w.unusedParams() if len(unusedParams) != 0 { - return nil, fmt.Errorf("the parameters %v are defined on workload %s, but unused.\nPlease make sure there are no typos.", unusedParams, w.Name) + return nil, fmt.Errorf("the parameters %v are defined on workload %s, but unused.\nPlease make sure there are no typos", unusedParams, w.Name) } // Some tests have unschedulable pods. Do not add an implicit barrier at the @@ -1571,10 +1574,10 @@ func (e *WorkloadExecutor) runOp(op realOp, opIndex int) error { func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) error { nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, e.tCtx.Client()) if err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } if err := nodePreparer.PrepareNodes(e.tCtx, e.nextNodeIndex); err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } e.nextNodeIndex += op.Count return nil @@ -1583,14 +1586,14 @@ func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) erro func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) error { nsPreparer, err := newNamespacePreparer(e.tCtx, op) if err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } if err := nsPreparer.prepare(e.tCtx); err != nil { err2 := nsPreparer.cleanup(e.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %w; cleanup: %w", err, err2) } - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } for _, n := range nsPreparer.namespaces() { if _, ok := e.numPodsScheduledPerNamespace[n]; ok { @@ -1611,14 +1614,14 @@ func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) error { switch op.StageRequirement { case Attempted: if err := waitUntilPodsAttempted(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: if err := waitUntilPodsScheduled(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. @@ -1657,7 +1660,10 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error if op.Namespace != nil { namespace = *op.Namespace } - createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + err := createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + if err != nil { + return err + } if op.PodTemplatePath == nil { op.PodTemplatePath = e.testCase.DefaultPodTemplatePath } @@ -1673,7 +1679,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error } } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } switch { case op.SkipWaitToCompletion: @@ -1682,11 +1688,11 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error e.numPodsScheduledPerNamespace[namespace] += op.Count case op.SteadyState: if err := createPodsSteadily(e.tCtx, namespace, e.podInformer, op); err != nil { - return fmt.Errorf("op %d: %v", opIndex, err) + return fmt.Errorf("op %d: %w", opIndex, err) } default: if err := waitUntilPodsScheduledInNamespace(e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { - return fmt.Errorf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + return fmt.Errorf("op %d: error in waiting for pods to get scheduled: %w", opIndex, err) } } if op.CollectMetrics { @@ -1708,7 +1714,7 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) error podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - return fmt.Errorf("op %d: error in listing pods in the namespace %s: %v", opIndex, op.Namespace, err) + return fmt.Errorf("op %d: error in listing pods in the namespace %s: %w", opIndex, op.Namespace, err) } deletePods := func(opIndex int) { @@ -1765,7 +1771,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} if _, err := e.tCtx.Client().CoreV1().Namespaces().Create(e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("op %d: unable to create namespace %v: %v", opIndex, namespace, err) + return fmt.Errorf("op %d: unable to create namespace %v: %w", opIndex, namespace, err) } var churnFns []func(name string) string @@ -1773,12 +1779,12 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - return fmt.Errorf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err) + return fmt.Errorf("op %d: unable to parse the %v-th template path: %w", opIndex, i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - return fmt.Errorf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err) + return fmt.Errorf("op %d: unable to find GVR for %v: %w", opIndex, gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. @@ -1867,7 +1873,10 @@ func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) error { return fmt.Errorf("op %d: invalid op %v", opIndex, op) } for _, namespace := range runable.requiredNamespaces() { - createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + err := createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) + if err != nil { + return err + } } runable.run(e.tCtx) return nil From 2c062117d009b814037285df0681071902d54c29 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Sat, 1 Mar 2025 12:32:07 +0900 Subject: [PATCH 21/25] fix lint --- test/integration/scheduler_perf/scheduler_perf.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 3cc6bba406a..f30ff807398 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1526,7 +1526,7 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact } select { case <-tCtx.Done(): - return nil, fmt.Errorf("op %d: %v", opIndex, context.Cause(tCtx)) + return nil, fmt.Errorf("op %d: %w", opIndex, context.Cause(tCtx)) default: } err = executor.runOp(realOp, opIndex) From 41577dea1b090d78babe2c15e4d3e471a968dd96 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Tue, 4 Mar 2025 23:57:02 +0900 Subject: [PATCH 22/25] delete opIndex wrapping --- .../scheduler_perf/scheduler_perf.go | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index f30ff807398..c907cf51bb6 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1574,10 +1574,10 @@ func (e *WorkloadExecutor) runOp(op realOp, opIndex int) error { func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) error { nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), op, e.tCtx.Client()) if err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } if err := nodePreparer.PrepareNodes(e.tCtx, e.nextNodeIndex); err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } e.nextNodeIndex += op.Count return nil @@ -1586,14 +1586,14 @@ func (e *WorkloadExecutor) runCreateNodesOp(opIndex int, op *createNodesOp) erro func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespacesOp) error { nsPreparer, err := newNamespacePreparer(e.tCtx, op) if err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } if err := nsPreparer.prepare(e.tCtx); err != nil { err2 := nsPreparer.cleanup(e.tCtx) if err2 != nil { err = fmt.Errorf("prepare: %w; cleanup: %w", err, err2) } - return fmt.Errorf("op %d: %w", opIndex, err) + return err } for _, n := range nsPreparer.namespaces() { if _, ok := e.numPodsScheduledPerNamespace[n]; ok { @@ -1608,20 +1608,20 @@ func (e *WorkloadExecutor) runCreateNamespaceOp(opIndex int, op *createNamespace func (e *WorkloadExecutor) runBarrierOp(opIndex int, op *barrierOp) error { for _, namespace := range op.Namespaces { if _, ok := e.numPodsScheduledPerNamespace[namespace]; !ok { - return fmt.Errorf("op %d: unknown namespace %s", opIndex, namespace) + return fmt.Errorf("unknown namespace %s", namespace) } } switch op.StageRequirement { case Attempted: if err := waitUntilPodsAttempted(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } case Scheduled: // Default should be treated like "Scheduled", so handling both in the same way. fallthrough default: if err := waitUntilPodsScheduled(e.tCtx, e.podInformer, op.LabelSelector, op.Namespaces, e.numPodsScheduledPerNamespace); err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } // At the end of the barrier, we can be sure that there are no pods // pending scheduling in the namespaces that we just blocked on. @@ -1670,7 +1670,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error if op.CollectMetrics { if e.collectorCtx != nil { - return fmt.Errorf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + return fmt.Errorf("Metrics collection is overlapping. Probably second collector was started before stopping a previous one") } var err error e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) @@ -1679,7 +1679,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error } } if err := createPodsRapidly(e.tCtx, namespace, op); err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } switch { case op.SkipWaitToCompletion: @@ -1688,11 +1688,11 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error e.numPodsScheduledPerNamespace[namespace] += op.Count case op.SteadyState: if err := createPodsSteadily(e.tCtx, namespace, e.podInformer, op); err != nil { - return fmt.Errorf("op %d: %w", opIndex, err) + return err } default: if err := waitUntilPodsScheduledInNamespace(e.tCtx, e.podInformer, nil, namespace, op.Count); err != nil { - return fmt.Errorf("op %d: error in waiting for pods to get scheduled: %w", opIndex, err) + return fmt.Errorf("error in waiting for pods to get scheduled: %w", err) } } if op.CollectMetrics { @@ -1714,7 +1714,7 @@ func (e *WorkloadExecutor) runDeletePodsOp(opIndex int, op *deletePodsOp) error podsToDelete, err := e.podInformer.Lister().Pods(op.Namespace).List(labelSelector) if err != nil { - return fmt.Errorf("op %d: error in listing pods in the namespace %s: %w", opIndex, op.Namespace, err) + return fmt.Errorf("error in listing pods in the namespace %s: %w", op.Namespace, err) } deletePods := func(opIndex int) { @@ -1771,7 +1771,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { // Ensure the namespace exists. nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} if _, err := e.tCtx.Client().CoreV1().Namespaces().Create(e.tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("op %d: unable to create namespace %v: %w", opIndex, namespace, err) + return fmt.Errorf("unable to create namespace %v: %w", namespace, err) } var churnFns []func(name string) string @@ -1779,12 +1779,12 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { for i, path := range op.TemplatePaths { unstructuredObj, gvk, err := getUnstructuredFromFile(path) if err != nil { - return fmt.Errorf("op %d: unable to parse the %v-th template path: %w", opIndex, i, err) + return fmt.Errorf("unable to parse the %v-th template path: %w", i, err) } // Obtain GVR. mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { - return fmt.Errorf("op %d: unable to find GVR for %v: %w", opIndex, gvk, err) + return fmt.Errorf("unable to find GVR for %v: %w", gvk, err) } gvr := mapping.Resource // Distinguish cluster-scoped with namespaced API objects. @@ -1870,7 +1870,7 @@ func (e *WorkloadExecutor) runChurnOp(opIndex int, op *churnOp) error { func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) error { runable, ok := op.(runnableOp) if !ok { - return fmt.Errorf("op %d: invalid op %v", opIndex, op) + return fmt.Errorf("invalid op %v", op) } for _, namespace := range runable.requiredNamespaces() { err := createNamespaceIfNotPresent(e.tCtx, namespace, &e.numPodsScheduledPerNamespace) @@ -1884,7 +1884,7 @@ func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) error { func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) error { if e.collectorCtx != nil { - return fmt.Errorf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex) + return fmt.Errorf("Metrics collection is overlapping. Probably second collector was started before stopping a previous one") } var err error e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) From 8e4b00e9494973e4516e8c611fd568b10d173490 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Wed, 5 Mar 2025 09:24:47 +0900 Subject: [PATCH 23/25] use Cleanup instead of defer --- test/integration/scheduler_perf/scheduler_perf.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index c907cf51bb6..7754ef7a75d 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1515,9 +1515,11 @@ func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFact workload: w, } - defer executor.wg.Wait() - defer executor.collectorWG.Wait() - defer tCtx.Cancel("workload is done") + tCtx.TB().Cleanup(func() { + tCtx.Cancel("workload is done") + executor.collectorWG.Wait() + executor.wg.Wait() + }) for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) { realOp, err := op.realOp.patchParams(w) From 32fd0de21c52233619b119cb8645d27ae1ba3d9e Mon Sep 17 00:00:00 2001 From: YamasouA Date: Wed, 5 Mar 2025 12:59:17 +0900 Subject: [PATCH 24/25] tweak --- test/integration/scheduler_perf/scheduler_perf.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 7754ef7a75d..9d2e2118052 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1672,7 +1672,7 @@ func (e *WorkloadExecutor) runCreatePodsOp(opIndex int, op *createPodsOp) error if op.CollectMetrics { if e.collectorCtx != nil { - return fmt.Errorf("Metrics collection is overlapping. Probably second collector was started before stopping a previous one") + return fmt.Errorf("metrics collection is overlapping. Probably second collector was started before stopping a previous one") } var err error e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, namespace, []string{namespace}, nil) @@ -1886,7 +1886,7 @@ func (e *WorkloadExecutor) runDefaultOp(opIndex int, op realOp) error { func (e *WorkloadExecutor) runStartCollectingMetricsOp(opIndex int, op *startCollectingMetricsOp) error { if e.collectorCtx != nil { - return fmt.Errorf("Metrics collection is overlapping. Probably second collector was started before stopping a previous one") + return fmt.Errorf("metrics collection is overlapping. Probably second collector was started before stopping a previous one") } var err error e.collectorCtx, e.collectors, err = startCollectingMetrics(e.tCtx, &e.collectorWG, e.podInformer, e.testCase.MetricsCollectorConfig, e.throughputErrorMargin, opIndex, op.Name, op.Namespaces, op.LabelSelector) From 4001c819f09e541b4d84feb74f3ad1d142cd6fc6 Mon Sep 17 00:00:00 2001 From: YamasouA Date: Wed, 5 Mar 2025 21:54:54 +0900 Subject: [PATCH 25/25] fix --- test/integration/scheduler_perf/scheduler_perf.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go index 9d2e2118052..cfd9830874d 100644 --- a/test/integration/scheduler_perf/scheduler_perf.go +++ b/test/integration/scheduler_perf/scheduler_perf.go @@ -1427,7 +1427,7 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, collector := collector err := collector.init() if err != nil { - return nil, nil, fmt.Errorf("op %d: Failed to initialize data collector: %w", opIndex, err) + return nil, nil, fmt.Errorf("failed to initialize data collector: %w", err) } tCtx.TB().Cleanup(func() { collectorCtx.Cancel("cleaning up") @@ -1443,7 +1443,7 @@ func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContext, collectorWG *sync.WaitGroup, threshold float64, tms thresholdMetricSelector, opIndex int, collectors []testDataCollector) ([]DataItem, error) { if collectorCtx == nil { - return nil, fmt.Errorf("op %d: Missing startCollectingMetrics operation before stopping", opIndex) + return nil, fmt.Errorf("missing startCollectingMetrics operation before stopping") } collectorCtx.Cancel("collecting metrics, collector must stop first") collectorWG.Wait()