diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go index a51117fbf8d..c5cb7a60738 100644 --- a/test/e2e/autoscaling/cluster_size_autoscaling.go +++ b/test/e2e/autoscaling/cluster_size_autoscaling.go @@ -48,12 +48,13 @@ import ( ) const ( - defaultTimeout = 3 * time.Minute - resizeTimeout = 5 * time.Minute - scaleUpTimeout = 5 * time.Minute - scaleDownTimeout = 15 * time.Minute - podTimeout = 2 * time.Minute - nodesRecoverTimeout = 5 * time.Minute + defaultTimeout = 3 * time.Minute + resizeTimeout = 5 * time.Minute + scaleUpTimeout = 5 * time.Minute + scaleUpTriggerTimeout = 2 * time.Minute + scaleDownTimeout = 15 * time.Minute + podTimeout = 2 * time.Minute + nodesRecoverTimeout = 5 * time.Minute gkeEndpoint = "https://test-container.sandbox.googleapis.com" gkeUpdateTimeout = 15 * time.Minute @@ -61,6 +62,9 @@ const ( disabledTaint = "DisabledForAutoscalingTest" newNodesForScaledownTests = 2 unhealthyClusterThreshold = 4 + + caNoScaleUpStatus = "NoActivity" + caOngoingScaleUpStatus = "InProgress" ) var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { @@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() { By("Creating unschedulable pod") - ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false) + ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout) defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") By("Waiting for scale up hoping it won't happen") @@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { }) It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() { - ReserveMemory(f, "memory-reservation", 100, 
nodeCount*memCapacityMb, false) + ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout) defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") // Verify, that cluster size is increased @@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c)) }) + It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() { + status, err := getScaleUpStatus(c) + framework.ExpectNoError(err) + unmanagedNodes := nodeCount - status.ready + + By("Schedule more pods than can fit and wait for cluster to scale-up") + ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second) + defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") + + status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout) + framework.ExpectNoError(err) + target := status.target + framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c)) + + By("Expect no more scale-up to be happening after all pods are scheduled") + status, err = getScaleUpStatus(c) + framework.ExpectNoError(err) + if status.target != target { + glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target) + } + Expect(status.status).Should(Equal(caNoScaleUpStatus)) + Expect(status.ready).Should(Equal(status.target)) + Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes)) + }) + It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() { framework.SkipUnlessProviderIs("gke") @@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout)) glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).") - ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false) + ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout) defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") // Verify, that cluster size is increased @@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2)) By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool") - ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false) + ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout) defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") // Apparently GKE master is restarted couple minutes after the node pool is added @@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() { nodesToBreak = nodesToBreak[1:] framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction) } else { - ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false) + ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout) defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation") time.Sleep(scaleUpTimeout) currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet) @@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod } } -func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) { +func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout 
time.Duration) { By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes)) request := int64(1024 * 1024 * megabytes / replicas) config := &testutils.RCConfig{ @@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e InternalClient: f.InternalClientset, Name: id, Namespace: f.Namespace.Name, - Timeout: defaultTimeout, + Timeout: timeout, Image: framework.GetPauseImageName(f.ClientSet), Replicas: replicas, MemRequest: request, @@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) { } return result[1], nil } + +type scaleUpStatus struct { + status string + ready int + target int +} + +// Try to get scaleup statuses of all node groups. +// Status configmap is not parsing-friendly, so evil regexpery follows. +func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) { + configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{}) + if err != nil { + return nil, err + } + status, ok := configMap.Data["status"] + if !ok { + return nil, fmt.Errorf("Status information not found in configmap") + } + matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)") + if err != nil { + return nil, err + } + matches := matcher.FindAllStringSubmatch(status, -1) + if len(matches) < 1 { + return nil, fmt.Errorf("Failed to parse CA status configmap") + } + result := scaleUpStatus{ + status: caNoScaleUpStatus, + ready: 0, + target: 0, + } + for _, match := range matches { + if match[1] == caOngoingScaleUpStatus { + result.status = caOngoingScaleUpStatus + } + newReady, err := strconv.Atoi(match[2]) + if err != nil { + return nil, err + } + result.ready += newReady + newTarget, err := strconv.Atoi(match[3]) + if err != nil { + return nil, err + } + result.target += newTarget + } + glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target) 
+ return &result, nil +} + +func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) { + for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) { + status, err := getScaleUpStatus(c) + if err != nil { + return nil, err + } + if status.status == expected { + return status, nil + } + } + return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected) +}