diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go index a3f0894fb1e..896a7d7d6cc 100644 --- a/test/e2e/autoscaling/cluster_size_autoscaling.go +++ b/test/e2e/autoscaling/cluster_size_autoscaling.go @@ -130,16 +130,10 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() { err = enableAutoscaler("default-pool", 3, 5) framework.ExpectNoError(err) } - Expect(getNAPNodePoolsNumber()).Should(Equal(0)) } }) AfterEach(func() { - if framework.ProviderIs("gke") { - By("Remove changes introduced by NAP tests") - removeNAPNodePools() - disableAutoprovisioning() - } By(fmt.Sprintf("Restoring initial size of the cluster")) setMigSizes(originalSizes) expectedNodes := 0 @@ -326,29 +320,6 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() { Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0)) }) - // TODO consider moving to [Feature:ClusterSizeAutoscalingGpu] as soon as NAP goes out of beta. Currently - // project needed to run the NAP tests require whitelisting for NAP alpha - It("NAP should add a pool with GPUs if unschedulable POD which require GPU exists [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() { - framework.SkipUnlessProviderIs("gke") - - installNvidiaDriversDaemonSet() - framework.ExpectNoError(enableAutoprovisioning(` -"resource_limits":{"resource_type":"nvidia-tesla-k80", "minimum":0, "maximum": 3}, -"resource_limits":{"resource_type":"cpu", "minimum":0, "maximum":64}, -"resource_limits":{"resource_type":"memory", "minimum":0, "maximum":1000000000000}`)) - - By("Schedule a pod which requires GPU and wait until it is started") - framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc")) - defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc") - - By("Verify cluster size increased") - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout)) - - By("Check if NAP group was created") - Expect(getNAPNodePoolsNumber()).Should(Equal(1)) - }) - It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]", func() { framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) }) @@ -934,103 +905,6 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() { framework.ExpectNoError(framework.WaitForReadyNodes(c, len(nodes.Items), nodesRecoverTimeout)) }) - It("should add new node and new node pool on too big pod, scale down to 1 and scale down to 0 [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() { - framework.SkipUnlessProviderIs("gke") - framework.ExpectNoError(enableAutoprovisioning("")) - By("Create first pod") - cleanupFunc1 := ReserveMemory(f, "memory-reservation1", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout) - defer func() { - if cleanupFunc1 != nil { - cleanupFunc1() - } - }() - By("Waiting for scale up") - // Verify that cluster size increased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+1 }, defaultTimeout)) - By("Check if NAP group was created") - Expect(getNAPNodePoolsNumber()).Should(Equal(1)) - By("Create second pod") - cleanupFunc2 := ReserveMemory(f, "memory-reservation2", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout) - defer func() { - if cleanupFunc2 != nil { - cleanupFunc2() - } - }() - By("Waiting for scale up") - // Verify that cluster size increased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+2 }, defaultTimeout)) - By("Delete first pod") - cleanupFunc1() - cleanupFunc1 = nil - By("Waiting for scale down to 1") - // Verify that cluster size decreased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+1 }, scaleDownTimeout)) - By("Delete second pod") - cleanupFunc2() - cleanupFunc2 = nil - By("Waiting for scale down to 0") - // Verify that cluster size decreased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount }, scaleDownTimeout)) - By("Waiting for NAP group remove") - framework.ExpectNoError(waitTillAllNAPNodePoolsAreRemoved()) - By("Check if NAP group was removeed") - Expect(getNAPNodePoolsNumber()).Should(Equal(0)) - }) - - It("shouldn't add new node group if not needed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() { - framework.SkipUnlessProviderIs("gke") - framework.ExpectNoError(enableAutoprovisioning("")) - By("Create pods") - // Create nodesCountAfterResize+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created. - cleanupFunc := ReserveMemory(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout) - defer cleanupFunc() - By("Waiting for scale up") - // Verify that cluster size increased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout)) - By("Check if NAP group was created hoping id didn't happen") - Expect(getNAPNodePoolsNumber()).Should(Equal(0)) - }) - - It("shouldn't scale up if cores limit too low, should scale up after limit is changed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() { - framework.SkipUnlessProviderIs("gke") - By(fmt.Sprintf("Set core limit to %d", coreCount)) - framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"resource_type":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"resource_type":"memory", "minimum":0, "maximum":10000000}`, coreCount))) - // Create pod allocating 1.1 allocatable for present nodes. Bigger node will have to be created. - cleanupFunc := ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, time.Second) - defer cleanupFunc() - By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String())) - time.Sleep(scaleUpTimeout) - // Verify that cluster size is not changed - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount }, time.Second)) - By("Change resource limits") - framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"resource_type":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"resource_type":"memory", "minimum":0, "maximum":10000000}`, coreCount+5))) - By("Wait for scale up") - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout)) - By("Check if NAP group was created") - Expect(getNAPNodePoolsNumber()).Should(Equal(1)) - }) - - It("should create new node if there is no node for node selector [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() { - framework.SkipUnlessProviderIs("gke") - framework.ExpectNoError(enableAutoprovisioning("")) - // Create pod allocating 0.7 allocatable for present nodes with node selector. - cleanupFunc := ReserveMemoryWithSelector(f, "memory-reservation", 1, int(0.7*float64(memAllocatableMb)), true, scaleUpTimeout, map[string]string{"test": "test"}) - defer cleanupFunc() - By("Waiting for scale up") - // Verify that cluster size increased. - framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, - func(size int) bool { return size == nodeCount+1 }, defaultTimeout)) - By("Check if NAP group was created") - Expect(getNAPNodePoolsNumber()).Should(Equal(1)) - }) - It("shouldn't scale up when expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() { // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta. framework.SkipUnlessProviderIs("gce") @@ -1288,17 +1162,6 @@ func disableAutoscaler(nodePool string, minCount, maxCount int) error { return fmt.Errorf("autoscaler still enabled, last error: %v", finalErr) } -func isAutoprovisioningEnabled() (bool, error) { - strBody, err := getCluster("v1alpha1") - if err != nil { - return false, err - } - if strings.Contains(strBody, "\"enableNodeAutoprovisioning\": true") { - return true, nil - } - return false, nil -} - func executeHTTPRequest(method string, url string, body string) (string, error) { client := &http.Client{} req, err := http.NewRequest(method, url, strings.NewReader(body)) @@ -1318,126 +1181,6 @@ func executeHTTPRequest(method string, url string, body string) (string, error) return string(respBody), nil } -func enableAutoprovisioning(resourceLimits string) error { - By("Using API to enable autoprovisioning.") - var body string - if resourceLimits != "" { - body = fmt.Sprintf(`{"update": {"desired_cluster_autoscaling": {"enable_node_autoprovisioning": true, %s}}}`, resourceLimits) - } else { - body = `{"update": {"desired_cluster_autoscaling": {"enable_node_autoprovisioning": true, "resource_limits":{"resource_type":"cpu", "minimum":0, "maximum":100}, "resource_limits":{"resource_type":"memory", "minimum":0, "maximum":10000000}}}}` - } - _, err := executeHTTPRequest(http.MethodPut, getGKEClusterURL("v1alpha1"), body) - if err != nil { - glog.Errorf("Request error: %s", err.Error()) - return err - } - glog.Infof("Wait for enabling autoprovisioning.") - for start := time.Now(); time.Since(start) < gkeUpdateTimeout; time.Sleep(30 * time.Second) { - enabled, err := isAutoprovisioningEnabled() - if err != nil { - glog.Errorf("Error: %s", err.Error()) - return err - } - if enabled { - By("Autoprovisioning enabled.") - return nil - } - glog.Infof("Waiting for enabling autoprovisioning") - } - return fmt.Errorf("autoprovisioning wasn't enabled (timeout).") -} - -func disableAutoprovisioning() error { - enabled, err := isAutoprovisioningEnabled() - if err != nil { - glog.Errorf("Error: %s", err.Error()) - return err - } - if !enabled { - By("Autoprovisioning disabled.") - return nil - } - By("Using API to disable autoprovisioning.") - _, err = executeHTTPRequest(http.MethodPut, getGKEClusterURL("v1alpha1"), "{\"update\": {\"desired_cluster_autoscaling\": {}}}") - if err != nil { - glog.Errorf("Request error: %s", err.Error()) - return err - } - By("Wait for disabling autoprovisioning.") - for start := time.Now(); time.Since(start) < gkeUpdateTimeout; time.Sleep(30 * time.Second) { - enabled, err := isAutoprovisioningEnabled() - if err != nil { - glog.Errorf("Error: %s", err.Error()) - return err - } - if !enabled { - By("Autoprovisioning disabled.") - return nil - } - By("Waiting for disabling autoprovisioning") - } - return fmt.Errorf("autoprovisioning wasn't disabled (timeout).") -} - -func getNAPNodePools() ([]string, error) { - if framework.ProviderIs("gke") { - args := []string{"container", "node-pools", "list", "--cluster=" + framework.TestContext.CloudConfig.Cluster} - output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() - if err != nil { - glog.Errorf("Failed to get instance groups: %v", string(output)) - return nil, err - } - re := regexp.MustCompile("nap.* ") - lines := re.FindAllString(string(output), -1) - for i, line := range lines { - lines[i] = line[:strings.Index(line, " ")] - } - return lines, nil - } else { - return nil, fmt.Errorf("provider does not support NAP") - } -} - -func removeNAPNodePools() error { - By("Remove NAP node pools") - pools, err := getNAPNodePools() - if err != nil { - return err - } - for _, pool := range pools { - By("Remove node pool: " + pool) - suffix := fmt.Sprintf("projects/%s/zones/%s/clusters/%s/nodePools/%s", - framework.TestContext.CloudConfig.ProjectID, - framework.TestContext.CloudConfig.Zone, - framework.TestContext.CloudConfig.Cluster, - pool) - _, err := executeHTTPRequest(http.MethodDelete, getGKEURL("v1alpha1", suffix), "") - if err != nil { - glog.Errorf("Request error: %s", err.Error()) - return err - } - } - err = waitTillAllNAPNodePoolsAreRemoved() - if err != nil { - glog.Errorf(fmt.Sprintf("Couldn't remove NAP groups: %s", err.Error())) - } - return err -} - -func getNAPNodePoolsNumber() int { - groups, err := getNAPNodePools() - framework.ExpectNoError(err) - return len(groups) -} - -func waitTillAllNAPNodePoolsAreRemoved() error { - By("Wait till all NAP node pools are removed") - err := wait.PollImmediate(5*time.Second, defaultTimeout, func() (bool, error) { - return getNAPNodePoolsNumber() == 0, nil - }) - return err -} - func addNodePool(name string, machineType string, numNodes int) { args := []string{"container", "node-pools", "create", name, "--quiet", "--machine-type=" + machineType,