performance test to compare allocators

The current results with 100 workers and 15k Services on an n2-standard-48 (48 vCPUs, 192 GB RAM) are:
Old allocator:

perf_test.go:139: [RESULT] Duration 1m9.646167533s: [quantile:0.5  value:0.462886801 quantile:0.9  value:0.496662838 quantile:0.99  value:0.725845905]

New allocator:
perf_test.go:139: [RESULT] Duration 2m12.900694343s: [quantile:0.5  value:0.481814448 quantile:0.9  value:1.3867615469999999 quantile:0.99  value:1.888190671]

The new allocator has higher latency (p99 roughly 2.6x higher in the runs
above) but, in exchange, supports a much larger number of Services: when
tested with 65k Services, the old allocator crashes etcd with a "storage
exceeded" error.
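
For rough intuition, here is a back-of-the-envelope sketch (an assumption,
not a measurement: it supposes the legacy allocator rewrites a single bitmap
object covering the whole range on every allocation, and that etcd retains
every revision until compaction):

    package main

    import "fmt"

    func main() {
        // 10.0.0.0/12 contains 2^(32-12) = 1,048,576 addresses.
        ips := 1 << 20
        bitmapBytes := ips / 8 // ~128 KiB bitmap written per allocation
        services := 65000
        fmt.Printf("~%d KiB per write, ~%d GiB of revisions for %d Services\n",
            bitmapBytes/1024, bitmapBytes*services>>30, services)
    }

Roughly 7 GiB of accumulated bitmap revisions comfortably exceeds etcd's
default 2 GiB storage quota, whereas the new allocator creates one small
IPAddress object per Service.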

The scenario is also not realistic, as a sustained high rate of Service
creation is not expected in practice.
Author: Antonio Ojea
Date:   2024-05-21 10:29:11 +00:00
Parent: 6639411231
Commit: 8490273979

test/integration/servicecidr/perf_test.go

@@ -0,0 +1,140 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package servicecidr
import (
"context"
"fmt"
"testing"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientset "k8s.io/client-go/kubernetes"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/component-base/metrics/testutil"
"k8s.io/kubernetes/cmd/kube-apiserver/app/options"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/test/integration/framework"
"k8s.io/kubernetes/test/utils/ktesting"
netutils "k8s.io/utils/net"
)
// TestServiceAllocPerformance measures the latency to create N Services with a
// parallelism of K workers, using the old and the new ClusterIP allocators.
// The test is skipped on CI and is meant to be run manually to check for possible regressions.
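// To run it manually, remove the t.Skip call below and invoke it like any other
// integration test, for example (assuming a local etcd is available, as for the
// rest of the integration suite):
//
//	go test -v -run TestServiceAllocPerformance ./test/integration/servicecidr/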
// The current results with 100 workers and 15k Services on an n2-standard-48 (48 vCPUs, 192 GB RAM) are:
// legacy perf_test.go:139: [RESULT] Duration 1m9.646167533s: [quantile:0.5 value:0.462886801 quantile:0.9 value:0.496662838 quantile:0.99 value:0.725845905]
// new perf_test.go:139: [RESULT] Duration 2m12.900694343s: [quantile:0.5 value:0.481814448 quantile:0.9 value:1.3867615469999999 quantile:0.99 value:1.888190671]
func TestServiceAllocPerformance(t *testing.T) {
t.Skip("KEP-1880 performance comparison")
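	// DefBuckets spans 5ms to 10s, which brackets the per-create latencies
	// reported in the results above.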
serviceCreation := metrics.NewHistogram(&metrics.HistogramOpts{
Name: "service_duration_seconds",
		Help:    "A histogram of Service creation durations in seconds.",
Buckets: metrics.DefBuckets,
})
legacyregistry.MustRegister(serviceCreation)
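	// Every Service is type ClusterIP with no clusterIP set, so each create
	// forces the apiserver to allocate an IP.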
svc := func(i, j int) *v1.Service {
return &v1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("svc-%v-%v", i, j),
},
Spec: v1.ServiceSpec{
Type: v1.ServiceTypeClusterIP,
Ports: []v1.ServicePort{
{Port: 80},
},
},
}
}
worker := func(client clientset.Interface, id int, jobs <-chan int, results chan<- error) {
for j := range jobs {
t.Logf("Worker: %d Job: %d", id, j)
func() {
now := time.Now()
defer func() {
t.Logf("worker %d job %d took %v", id, j, time.Since(now))
serviceCreation.Observe(time.Since(now).Seconds())
}()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
_, err := client.CoreV1().Services(metav1.NamespaceDefault).Create(ctx, svc(id, j), metav1.CreateOptions{})
if err != nil {
t.Errorf("unexpected error: %v", err)
}
results <- err
}()
}
}
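	// gate=false exercises the legacy bitmap-based allocator; gate=true exercises
	// the KEP-1880 MultiCIDRServiceAllocator backed by IPAddress objects.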
for _, gate := range []bool{false, true} {
t.Run(fmt.Sprintf("feature-gate=%v", gate), func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.MultiCIDRServiceAllocator, gate)
tCtx := ktesting.Init(t)
client, _, tearDownFn := framework.StartTestServer(tCtx, t, framework.TestServerSetup{
ModifyServerRunOptions: func(opts *options.ServerRunOptions) {
					// Use the largest range possible; the legacy bitmap allocator caps
					// the primary range at 2^20 addresses (a /12 for IPv4).
opts.ServiceClusterIPRanges = "10.0.0.0/12"
opts.GenericServerRunOptions.AdvertiseAddress = netutils.ParseIPSloppy("10.0.0.1")
opts.APIEnablement.RuntimeConfig.Set("networking.k8s.io/v1alpha1=true") // nolint: errcheck
},
})
defer tearDownFn()
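			// Reset the registry so each subtest's histogram reflects only its own runs.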
legacyregistry.Reset()
// 100 workers for 15k services
nworkers := 100
nservices := 15000
jobs := make(chan int, nservices)
results := make(chan error, nservices)
t.Log("Starting workers to create ClusterIP Service")
now := time.Now()
for w := 0; w < nworkers; w++ {
t.Logf("Starting worker %d", w)
go worker(client, w, jobs, results)
}
for i := 0; i < nservices; i++ {
t.Logf("Sending job %d", i)
jobs <- i
}
t.Log("All jobs processed")
close(jobs)
for c := 0; c < nservices; c++ {
t.Logf("Getting results %d", c)
err := <-results
if err != nil {
t.Errorf("error creating service: %v", err)
}
}
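			// Gather the recorded histogram from the legacy registry to compute
			// aggregate latency statistics for this allocator.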
vec, err := testutil.GetHistogramVecFromGatherer(legacyregistry.DefaultGatherer, serviceCreation.Name, map[string]string{})
if err != nil {
t.Error(err)
}
t.Logf("[RESULT] feature-gate=%v Duration: %v Avg: %.4f p95: %.4f p99: %.4f", gate, time.Since(now), vec.Average(), vec.Quantile(0.95), vec.Quantile(0.99))
})
}
}