Merge pull request #44092 from MaciekPytel/ca_overscaleup_e2e

Automatic merge from submit-queue

Cluster-Autoscaler e2e to catch unnecessary scale-ups

This e2e catches the unnecessary scale-up bug in Cluster Autoscaler 0.5.0 with reasonable probability.
```release-note
```
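The added test works by watching the `cluster-autoscaler-status` ConfigMap in `kube-system`: it reserves more memory than the cluster can hold (passing a 1-second timeout to `ReserveMemory` so the helper does not wait for the pods to actually run), waits for the status to report an ongoing scale-up, records the scale-up target, and after all pods are scheduled asserts that the status is back to `NoActivity` with the node count equal to that original target, i.e. no extra scale-up was triggered. The new `getScaleUpStatus` helper matches lines of roughly this shape in the ConfigMap's `status` value, one per node group (the layout is inferred from the regexp in this diff and may vary between CA versions):

```
ScaleUp:     InProgress (ready=3 cloudProviderTarget=5)
```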
Kubernetes Submit Queue 2017-04-05 08:50:18 -07:00 committed by GitHub
commit 17a6a1cb5e


@@ -48,12 +48,13 @@ import (
 )
 
 const (
-    defaultTimeout      = 3 * time.Minute
-    resizeTimeout       = 5 * time.Minute
-    scaleUpTimeout      = 5 * time.Minute
-    scaleDownTimeout    = 15 * time.Minute
-    podTimeout          = 2 * time.Minute
-    nodesRecoverTimeout = 5 * time.Minute
+    defaultTimeout        = 3 * time.Minute
+    resizeTimeout         = 5 * time.Minute
+    scaleUpTimeout        = 5 * time.Minute
+    scaleUpTriggerTimeout = 2 * time.Minute
+    scaleDownTimeout      = 15 * time.Minute
+    podTimeout            = 2 * time.Minute
+    nodesRecoverTimeout   = 5 * time.Minute
 
     gkeEndpoint      = "https://test-container.sandbox.googleapis.com"
     gkeUpdateTimeout = 15 * time.Minute
@@ -61,6 +62,9 @@ const (
     disabledTaint             = "DisabledForAutoscalingTest"
     newNodesForScaledownTests = 2
     unhealthyClusterThreshold = 4
+
+    caNoScaleUpStatus      = "NoActivity"
+    caOngoingScaleUpStatus = "InProgress"
 )
 
 var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
@@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         By("Creating unschedulable pod")
-        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         By("Waiting for scale up hoping it won't happen")
@@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     })
 
     It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Verify, that cluster size is increased
@@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
     })
 
+    It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
+        status, err := getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        unmanagedNodes := nodeCount - status.ready
+
+        By("Schedule more pods than can fit and wait for cluster to scale-up")
+        ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second)
+        defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
+
+        status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+        framework.ExpectNoError(err)
+        target := status.target
+        framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+        By("Expect no more scale-up to be happening after all pods are scheduled")
+        status, err = getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        if status.target != target {
+            glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
+        }
+        Expect(status.status).Should(Equal(caNoScaleUpStatus))
+        Expect(status.ready).Should(Equal(status.target))
+        Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
+    })
+
     It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         framework.SkipUnlessProviderIs("gke")
@@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout))
         glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
 
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Verify, that cluster size is increased
@@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
 
         By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
-        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Apparently GKE master is restarted couple minutes after the node pool is added
@@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
                 nodesToBreak = nodesToBreak[1:]
                 framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
             } else {
-                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
                 defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
                 time.Sleep(scaleUpTimeout)
                 currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod
     }
 }
 
-func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) {
+func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) {
     By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
     request := int64(1024 * 1024 * megabytes / replicas)
     config := &testutils.RCConfig{
@@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e
         InternalClient: f.InternalClientset,
         Name:           id,
         Namespace:      f.Namespace.Name,
-        Timeout:        defaultTimeout,
+        Timeout:        timeout,
         Image:          framework.GetPauseImageName(f.ClientSet),
         Replicas:       replicas,
         MemRequest:     request,
@@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
     }
     return result[1], nil
 }
+
+type scaleUpStatus struct {
+    status string
+    ready  int
+    target int
+}
+
+// Try to get scale-up statuses of all node groups.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
+    configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+    if err != nil {
+        return nil, err
+    }
+    status, ok := configMap.Data["status"]
+    if !ok {
+        return nil, fmt.Errorf("Status information not found in configmap")
+    }
+    matcher, err := regexp.Compile("\\s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
+    if err != nil {
+        return nil, err
+    }
+    matches := matcher.FindAllStringSubmatch(status, -1)
+    if len(matches) < 1 {
+        return nil, fmt.Errorf("Failed to parse CA status configmap")
+    }
+    result := scaleUpStatus{
+        status: caNoScaleUpStatus,
+        ready:  0,
+        target: 0,
+    }
+    for _, match := range matches {
+        if match[1] == caOngoingScaleUpStatus {
+            result.status = caOngoingScaleUpStatus
+        }
+        newReady, err := strconv.Atoi(match[2])
+        if err != nil {
+            return nil, err
+        }
+        result.ready += newReady
+        newTarget, err := strconv.Atoi(match[3])
+        if err != nil {
+            return nil, err
+        }
+        result.target += newTarget
+    }
+    glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
+    return &result, nil
+}
+
+func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
+    for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
+        status, err := getScaleUpStatus(c)
+        if err != nil {
+            return nil, err
+        }
+        if status.status == expected {
+            return status, nil
+        }
+    }
+    return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+}
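For illustration, here is a minimal, self-contained sketch of the parsing and aggregation that `getScaleUpStatus` performs across node groups. The sample text and the counts in it are hypothetical; only the `ScaleUp: <status> (ready=N cloudProviderTarget=N)` shape comes from the regexp in this diff:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Hypothetical fragment of the cluster-autoscaler-status ConfigMap's "status"
// value; the real text carries more fields per node group.
const sampleStatus = `
  ScaleUp:     InProgress (ready=3 cloudProviderTarget=5)
  ScaleUp:     NoActivity (ready=2 cloudProviderTarget=2)
`

func main() {
	// Same expression the new helper compiles; each match is one node group.
	matcher := regexp.MustCompile(`\s*ScaleUp:\s*([A-Za-z]+)\s*\(ready=([0-9]+)\s*cloudProviderTarget=([0-9]+)\s*\)`)

	status, ready, target := "NoActivity", 0, 0
	for _, m := range matcher.FindAllStringSubmatch(sampleStatus, -1) {
		// Any group still scaling up marks the whole cluster as "InProgress".
		if m[1] == "InProgress" {
			status = "InProgress"
		}
		r, _ := strconv.Atoi(m[2]) // ready nodes in this group
		t, _ := strconv.Atoi(m[3]) // cloud-provider target size of this group
		ready += r
		target += t
	}
	fmt.Printf("scale-up status: %v (ready=%v, target=%v)\n", status, ready, target)
	// Output: scale-up status: InProgress (ready=5, target=7)
}
```

A single node group reporting `InProgress` is enough to treat the whole cluster as scaling up, which is what `waitForScaleUpStatus` polls for every 5 seconds until its timeout expires.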