Cluster-Autoscaler e2e to catch unnecessary scale-ups
This fails with high probability on 0.5.0 (the issue is not fully deterministic), catching the bug in that version.
commit 67f5ba0004 (parent 8667d7c4f1)
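The new test drives its assertions off the Cluster Autoscaler status ConfigMap ("cluster-autoscaler-status" in kube-system): it records how many nodes the autoscaler already manages, creates more pods than fit, waits for the ScaleUp status to reach InProgress, and then, once every pod is scheduled, checks that the status is back to NoActivity with ready equal to cloudProviderTarget, so an unnecessary extra scale-up shows up as a failure. For reference, a ScaleUp entry in the ConfigMap's status text has roughly this shape (illustrative only; the exact formatting depends on the Cluster Autoscaler version):

    ScaleUp: InProgress (ready=3 cloudProviderTarget=5)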
@@ -51,6 +51,7 @@ const (
     defaultTimeout = 3 * time.Minute
     resizeTimeout = 5 * time.Minute
     scaleUpTimeout = 5 * time.Minute
+    scaleUpTriggerTimeout = 2 * time.Minute
     scaleDownTimeout = 15 * time.Minute
     podTimeout = 2 * time.Minute
     nodesRecoverTimeout = 5 * time.Minute
@@ -61,6 +62,9 @@ const (
     disabledTaint = "DisabledForAutoscalingTest"
     newNodesForScaledownTests = 2
     unhealthyClusterThreshold = 4
+
+    caNoScaleUpStatus = "NoActivity"
+    caOngoingScaleUpStatus = "InProgress"
 )

 var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
@@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {

     It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         By("Creating unschedulable pod")
-        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         By("Waiting for scale up hoping it won't happen")
@@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     })

     It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Verify, that cluster size is increased
@@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
     })

+    It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
+        status, err := getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        unmanagedNodes := nodeCount - status.ready
+
+        By("Schedule more pods than can fit and wait for claster to scale-up")
+        ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second)
+        defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
+
+        status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+        framework.ExpectNoError(err)
+        target := status.target
+        framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+        By("Expect no more scale-up to be happening after all pods are scheduled")
+        status, err = getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        if status.target != target {
+            glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
+        }
+        Expect(status.status).Should(Equal(caNoScaleUpStatus))
+        Expect(status.ready).Should(Equal(status.target))
+        Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
+    })
+
     It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         framework.SkipUnlessProviderIs("gke")

@@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout))
         glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")

-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Verify, that cluster size is increased
@@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))

         By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
-        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Apparently GKE master is restarted couple minutes after the node pool is added
@@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
                 nodesToBreak = nodesToBreak[1:]
                 framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
             } else {
-                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
                 defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
                 time.Sleep(scaleUpTimeout)
                 currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod
     }
 }

-func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) {
+func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) {
     By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
     request := int64(1024 * 1024 * megabytes / replicas)
     config := &testutils.RCConfig{
@@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e
         InternalClient: f.InternalClientset,
         Name: id,
         Namespace: f.Namespace.Name,
-        Timeout: defaultTimeout,
+        Timeout: timeout,
         Image: framework.GetPauseImageName(f.ClientSet),
         Replicas: replicas,
         MemRequest: request,
@@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
     }
     return result[1], nil
 }
+
+type scaleUpStatus struct {
+    status string
+    ready int
+    target int
+}
+
+// Try to get scaleup statuses of all node groups.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
+    configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+    if err != nil {
+        return nil, err
+    }
+    status, ok := configMap.Data["status"]
+    if !ok {
+        return nil, fmt.Errorf("Status information not found in configmap")
+    }
+    matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
+    if err != nil {
+        return nil, err
+    }
+    matches := matcher.FindAllStringSubmatch(status, -1)
+    if len(matches) < 1 {
+        return nil, fmt.Errorf("Failed to parse CA status configmap")
+    }
+    result := scaleUpStatus{
+        status: caNoScaleUpStatus,
+        ready: 0,
+        target: 0,
+    }
+    for _, match := range matches {
+        if match[1] == caOngoingScaleUpStatus {
+            result.status = caOngoingScaleUpStatus
+        }
+        newReady, err := strconv.Atoi(match[2])
+        if err != nil {
+            return nil, err
+        }
+        result.ready += newReady
+        newTarget, err := strconv.Atoi(match[3])
+        if err != nil {
+            return nil, err
+        }
+        result.target += newTarget
+    }
+    glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
+    return &result, nil
+}
+
+func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
+    for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
+        status, err := getScaleUpStatus(c)
+        if err != nil {
+            return nil, err
+        }
+        if status.status == expected {
+            return status, nil
+        }
+    }
+    return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+}