Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-29 22:46:12 +00:00)
Merge pull request #26959 from wojtek-t/fix_performance_flakes
Automatic merge from submit-queue

Fix scalability flakes in small clusters

Fix #26185

@xiang90 @hongchaodeng @gmarek
commit 457d42e70e
@@ -42,10 +42,12 @@ const (
 	NodeStartupThreshold = 4 * time.Second
 
 	podStartupThreshold time.Duration = 5 * time.Second
-	// TODO: Decrease the small threshold to 250ms once tests are fixed.
-	apiCallLatencySmallThreshold time.Duration = 500 * time.Millisecond
-	apiCallLatencyMediumThreshold time.Duration = 500 * time.Millisecond
-	apiCallLatencyLargeThreshold time.Duration = 1 * time.Second
+	// We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
+	// The problem is that if long GC is happening in small clusters (where we have e.g.
+	// 1-core master machines) and tests are pretty short, it may consume significant
+	// portion of CPU and basically stop all the real work.
+	// Increasing threshold to 1s is within our SLO and should solve this problem.
+	apiCallLatencyThreshold time.Duration = 1 * time.Second
 )
 
 type MetricsForE2E metrics.MetricsCollection
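To make the behavioral change concrete, here is a small standalone sketch (not part of the commit) that contrasts the old cluster-size-dependent threshold, reconstructed from the constants above and the apiCallLatencyThreshold(numNodes) helper deleted further down in this diff, with the new single 1s constant:

package main

import (
	"fmt"
	"time"
)

// Old constants, as removed by this commit.
const (
	apiCallLatencySmallThreshold  = 500 * time.Millisecond
	apiCallLatencyMediumThreshold = 500 * time.Millisecond
	apiCallLatencyLargeThreshold  = 1 * time.Second
)

// New single constant, as introduced by this commit.
const apiCallLatencyThreshold = 1 * time.Second

// oldThreshold reproduces the cluster-size-dependent lookup that this commit deletes.
func oldThreshold(numNodes int) time.Duration {
	if numNodes <= 250 {
		return apiCallLatencySmallThreshold
	}
	if numNodes <= 500 {
		return apiCallLatencyMediumThreshold
	}
	return apiCallLatencyLargeThreshold
}

func main() {
	for _, n := range []int{100, 250, 500, 1000} {
		fmt.Printf("%4d nodes: old threshold %v -> new threshold %v\n",
			n, oldThreshold(n), apiCallLatencyThreshold)
	}
}

In other words, for clusters of up to 500 nodes the effective threshold moves from 500ms to 1s, while larger clusters keep the 1s threshold they already had.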
@@ -225,31 +227,9 @@ func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) {
 	return a, err
 }
 
-// Returns threshold for API call depending on the size of the cluster.
-// In general our goal is 1s, but for smaller clusters, we want to enforce
-// smaller limits, to allow noticing regressions.
-func apiCallLatencyThreshold(numNodes int) time.Duration {
-	if numNodes <= 250 {
-		return apiCallLatencySmallThreshold
-	}
-	if numNodes <= 500 {
-		return apiCallLatencyMediumThreshold
-	}
-	return apiCallLatencyLargeThreshold
-}
-
-func listNodesLatencyThreshold(numNodes int) time.Duration {
-	return apiCallLatencyLargeThreshold
-}
-
 // Prints top five summary metrics for request types with latency and returns
 // number of such request types above threshold.
 func HighLatencyRequests(c *client.Client) (int, error) {
-	nodes, err := c.Nodes().List(api.ListOptions{})
-	if err != nil {
-		return 0, err
-	}
-	numNodes := len(nodes.Items)
 	metrics, err := readLatencyMetrics(c)
 	if err != nil {
 		return 0, err
@@ -258,13 +238,8 @@ func HighLatencyRequests(c *client.Client) (int, error) {
 	badMetrics := 0
 	top := 5
 	for _, metric := range metrics.APICalls {
-		threshold := apiCallLatencyThreshold(numNodes)
-		if metric.Verb == "LIST" && metric.Resource == "nodes" {
-			threshold = listNodesLatencyThreshold(numNodes)
-		}
-
 		isBad := false
-		if metric.Latency.Perc99 > threshold {
+		if metric.Latency.Perc99 > apiCallLatencyThreshold {
 			badMetrics++
 			isBad = true
 		}
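After this change, HighLatencyRequests no longer lists nodes to pick a threshold; it simply counts request types whose 99th-percentile latency exceeds the single 1s cutoff. Below is a minimal standalone sketch of that counting logic; the APICall struct, the countHighLatencyRequests helper, and the sample values are illustrative stand-ins for the framework's metrics types, not code from this commit:

package main

import (
	"fmt"
	"time"
)

// Illustrative stand-in for the framework's per-request-type latency metric.
type APICall struct {
	Resource string
	Verb     string
	Perc99   time.Duration // 99th-percentile latency
}

// Single threshold introduced by this commit.
const apiCallLatencyThreshold = 1 * time.Second

// countHighLatencyRequests mirrors the simplified loop in HighLatencyRequests
// after this diff: every API call type is checked against the same threshold.
func countHighLatencyRequests(calls []APICall) int {
	badMetrics := 0
	for _, metric := range calls {
		if metric.Perc99 > apiCallLatencyThreshold {
			badMetrics++
		}
	}
	return badMetrics
}

func main() {
	sample := []APICall{
		{Resource: "pods", Verb: "LIST", Perc99: 850 * time.Millisecond},
		{Resource: "nodes", Verb: "LIST", Perc99: 1200 * time.Millisecond},
	}
	// With the 1s threshold, only the second entry counts as a high-latency request type.
	fmt.Printf("%d request types above threshold\n", countHighLatencyRequests(sample))
}

The point of the commit is that in small clusters a long GC pause on a 1-core master could push p99 latencies over the old 500ms cutoff and make this count non-zero, failing otherwise healthy runs; the single 1s threshold stays within the SLO while avoiding those flakes.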