diff --git a/test/integration/servicecidr/perf_test.go b/test/integration/servicecidr/perf_test.go new file mode 100644 index 00000000000..07a9f66f3e1 --- /dev/null +++ b/test/integration/servicecidr/perf_test.go @@ -0,0 +1,140 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servicecidr + +import ( + "context" + "fmt" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + clientset "k8s.io/client-go/kubernetes" + featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" + "k8s.io/kubernetes/cmd/kube-apiserver/app/options" + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/test/integration/framework" + "k8s.io/kubernetes/test/utils/ktesting" + netutils "k8s.io/utils/net" +) + +// TestServiceAllocPerformance measure the latency to create N services with a parallelism of K +// using the old and the new ClusterIP allocators. +// The test is skipped to run on CI and is left to execute manually to check for possible regressions. +// The current results with 100 works and 15k services on a (n2-standard-48) vCPU: 48 RAM: 192 GB are: +// legacy perf_test.go:139: [RESULT] Duration 1m9.646167533s: [quantile:0.5 value:0.462886801 quantile:0.9 value:0.496662838 quantile:0.99 value:0.725845905] +// new perf_test.go:139: [RESULT] Duration 2m12.900694343s: [quantile:0.5 value:0.481814448 quantile:0.9 value:1.3867615469999999 quantile:0.99 value:1.888190671] +func TestServiceAllocPerformance(t *testing.T) { + t.Skip("KEP-1880 performance comparison") + serviceCreation := metrics.NewHistogram(&metrics.HistogramOpts{ + Name: "service_duration_seconds", + Help: "A summary of the Service creation durations in seconds.", + Buckets: metrics.DefBuckets, + }) + legacyregistry.MustRegister(serviceCreation) + + svc := func(i, j int) *v1.Service { + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("svc-%v-%v", i, j), + }, + Spec: v1.ServiceSpec{ + Type: v1.ServiceTypeClusterIP, + Ports: []v1.ServicePort{ + {Port: 80}, + }, + }, + } + } + + worker := func(client clientset.Interface, id int, jobs <-chan int, results chan<- error) { + for j := range jobs { + t.Logf("Worker: %d Job: %d", id, j) + func() { + now := time.Now() + defer func() { + t.Logf("worker %d job %d took %v", id, j, time.Since(now)) + serviceCreation.Observe(time.Since(now).Seconds()) + }() + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _, err := client.CoreV1().Services(metav1.NamespaceDefault).Create(ctx, svc(id, j), metav1.CreateOptions{}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + results <- err + }() + } + } + + for _, gate := range []bool{false, true} { + t.Run(fmt.Sprintf("feature-gate=%v", gate), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.MultiCIDRServiceAllocator, gate) + + tCtx := ktesting.Init(t) + client, _, tearDownFn := framework.StartTestServer(tCtx, t, framework.TestServerSetup{ + ModifyServerRunOptions: func(opts *options.ServerRunOptions) { + // use the larget range possible , this is limited by the old allocator + opts.ServiceClusterIPRanges = "10.0.0.0/12" + opts.GenericServerRunOptions.AdvertiseAddress = netutils.ParseIPSloppy("10.0.0.1") + opts.APIEnablement.RuntimeConfig.Set("networking.k8s.io/v1alpha1=true") // nolint: errcheck + }, + }) + defer tearDownFn() + + legacyregistry.Reset() + + // 100 workers for 15k services + nworkers := 100 + nservices := 15000 + jobs := make(chan int, nservices) + results := make(chan error, nservices) + t.Log("Starting workers to create ClusterIP Service") + now := time.Now() + for w := 0; w < nworkers; w++ { + t.Logf("Starting worker %d", w) + go worker(client, w, jobs, results) + } + for i := 0; i < nservices; i++ { + t.Logf("Sending job %d", i) + jobs <- i + } + t.Log("All jobs processed") + close(jobs) + + for c := 0; c < nservices; c++ { + t.Logf("Getting results %d", c) + err := <-results + if err != nil { + t.Errorf("error creating service: %v", err) + } + } + + vec, err := testutil.GetHistogramVecFromGatherer(legacyregistry.DefaultGatherer, serviceCreation.Name, map[string]string{}) + if err != nil { + t.Error(err) + } + + t.Logf("[RESULT] feature-gate=%v Duration: %v Avg: %.4f p95: %.4f p99: %.4f", gate, time.Since(now), vec.Average(), vec.Quantile(0.95), vec.Quantile(0.99)) + }) + } +}