Merge pull request #50913 from shyamjvs/list-call-slo

Automatic merge from submit-queue (batch tested with PRs 50893, 50913, 50963, 50629, 50640)

Increase latency threshold for list api calls

This is only a short-term fix to keep our density test green. In the long term, we should measure against our new SLIs.
From @wojtek-t's [doc](https://docs.google.com/document/d/1Q5qxdeBPgTTIXZxdsFILg7kgqWhvOwY8uROEf0j5YBw) on the new SLIs/SLOs, we have the following SLO for list calls:

```
SLO1: In default Kubernetes installation, 99th percentile of SLI2 per cluster-day:
<= 1s if total number of objects of the same type as resource in the system <= X
<= 5s if total number of objects of the same type as resource in the system <= Y
<= 30s if total number of objects of the same type as resource in the system <= Z
```

I would guess that 170,000 pods would fall into the 2nd bracket (at least), hence the new value of 5s. WDYT?
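
For reference, the bracket selection quoted above can be written out explicitly. This is an illustrative sketch only; X, Y, and Z are the still-unspecified object-count cut-offs from the doc and are passed in as parameters rather than invented here:

```go
package slo

import "time"

// listLatencySLO is a hypothetical helper, not part of this PR: given the
// number of objects of the listed type and the doc's cut-offs X, Y, Z, it
// returns the promised 99th-percentile bound and whether any bound applies.
func listLatencySLO(objectCount, x, y, z int) (bound time.Duration, ok bool) {
	switch {
	case objectCount <= x:
		return 1 * time.Second, true
	case objectCount <= y:
		return 5 * time.Second, true
	case objectCount <= z:
		return 30 * time.Second, true
	default:
		return 0, false // beyond Z the doc promises nothing
	}
}
```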

cc @kubernetes/sig-scalability-misc @wojtek-t @gmarek
Kubernetes Submit Queue authored 2017-08-22 05:31:07 -07:00; committed by GitHub
commit fdf14b8218
3 changed files with 17 additions and 12 deletions


```diff
@@ -51,9 +51,10 @@ const (
 	// Increasing threshold to 1s is within our SLO and should solve this problem.
 	apiCallLatencyThreshold time.Duration = 1 * time.Second
-	// We set a higher threshold for list apicalls as they can take more time when
-	// the list is really big. For eg. list nodes in a 5000-node cluster.
-	apiListCallLatencyThreshold time.Duration = 2 * time.Second
+	// We use a higher threshold for list apicalls if the cluster is big (i.e having > 500 nodes)
+	// as list response sizes are bigger in general for big clusters.
+	apiListCallLatencyThreshold time.Duration = 5 * time.Second
+	bigClusterNodeCountThreshold = 500
 	// Cluster Autoscaler metrics names
 	caFunctionMetric = "cluster_autoscaler_function_duration_seconds_bucket"
```
```diff
@@ -354,8 +355,10 @@ func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {
 }
 
 // Prints top five summary metrics for request types with latency and returns
-// number of such request types above threshold.
-func HighLatencyRequests(c clientset.Interface) (int, *APIResponsiveness, error) {
+// number of such request types above threshold. We use a higher threshold for
+// list calls if nodeCount is above a given threshold (i.e. cluster is big).
+func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIResponsiveness, error) {
+	isBigCluster := (nodeCount > bigClusterNodeCountThreshold)
 	metrics, err := readLatencyMetrics(c)
 	if err != nil {
 		return 0, metrics, err
@@ -364,12 +367,14 @@ func HighLatencyRequests(c clientset.Interface) (int, *APIResponsiveness, error)
 	badMetrics := 0
 	top := 5
 	for i := range metrics.APICalls {
+		latency := metrics.APICalls[i].Latency.Perc99
+		isListCall := (metrics.APICalls[i].Verb == "LIST")
 		isBad := false
-		verb := metrics.APICalls[i].Verb
-		if verb != "LIST" && metrics.APICalls[i].Latency.Perc99 > apiCallLatencyThreshold ||
-			verb == "LIST" && metrics.APICalls[i].Latency.Perc99 > apiListCallLatencyThreshold {
-			badMetrics++
-			isBad = true
+		if latency > apiCallLatencyThreshold {
+			if !isListCall || !isBigCluster || (latency > apiListCallLatencyThreshold) {
+				isBad = true
+				badMetrics++
+			}
 		}
 		if top > 0 || isBad {
 			top--
```
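
Read another way, the nested ifs pick one effective threshold per call: 5s for LIST calls in big clusters, 1s for everything else. A minimal sketch of that equivalence, assuming the constants from the first hunk (the helper names below are made up for illustration and do not exist in the framework):

```go
package sketch

import "time"

const (
	apiCallLatencyThreshold      time.Duration = 1 * time.Second
	apiListCallLatencyThreshold  time.Duration = 5 * time.Second
	bigClusterNodeCountThreshold               = 500
)

// effectiveThreshold is a hypothetical helper: LIST calls in clusters with
// more than 500 nodes are allowed 5s at the 99th percentile, everything else
// keeps the 1s threshold.
func effectiveThreshold(verb string, nodeCount int) time.Duration {
	if verb == "LIST" && nodeCount > bigClusterNodeCountThreshold {
		return apiListCallLatencyThreshold
	}
	return apiCallLatencyThreshold
}

// isBadCall expresses the same condition as the nested ifs above for a
// single API call.
func isBadCall(verb string, perc99 time.Duration, nodeCount int) bool {
	return perc99 > effectiveThreshold(verb, nodeCount)
}
```

Note that a LIST call slower than 1s in a small (<= 500 node) cluster is still flagged; only big clusters get the relaxed 5s bound.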


```diff
@@ -328,7 +328,7 @@ var _ = SIGDescribe("Density", func() {
 		summaries := make([]framework.TestDataSummary, 0, 2)
 		// Verify latency metrics.
-		highLatencyRequests, metrics, err := framework.HighLatencyRequests(c)
+		highLatencyRequests, metrics, err := framework.HighLatencyRequests(c, nodeCount)
 		framework.ExpectNoError(err)
 		if err == nil {
 			summaries = append(summaries, metrics)
```


```diff
@@ -92,7 +92,7 @@ var _ = SIGDescribe("Load capacity", func() {
 	// TODO add flag that allows to skip cleanup on failure
 	AfterEach(func() {
 		// Verify latency metrics
-		highLatencyRequests, metrics, err := framework.HighLatencyRequests(clientset)
+		highLatencyRequests, metrics, err := framework.HighLatencyRequests(clientset, nodeCount)
 		framework.ExpectNoError(err)
 		if err == nil {
 			summaries := make([]framework.TestDataSummary, 0, 1)
```
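
Both call sites follow the same pattern. Below is a hedged sketch of how the surrounding test code consumes the result; the wrapper function is hypothetical and the final gomega assertion is an assumption about the unchanged surrounding code, not something taken from this diff:

```go
package scalability

import (
	. "github.com/onsi/gomega"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// verifyAPIResponsiveness is a hypothetical wrapper around the pattern used
// in both the density and load tests after this change.
func verifyAPIResponsiveness(c clientset.Interface, nodeCount int) {
	highLatencyRequests, metrics, err := framework.HighLatencyRequests(c, nodeCount)
	framework.ExpectNoError(err)
	if err == nil {
		summaries := make([]framework.TestDataSummary, 0, 1)
		summaries = append(summaries, metrics)
		_ = summaries // the real tests go on to print/persist these summaries
	}
	Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests")
}
```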