From 1b0f981f82ce7ba9fe906d29332aa78f2a0d842d Mon Sep 17 00:00:00 2001
From: Andrzej Wasylkowski
Date: Thu, 1 Jun 2017 13:12:27 +0200
Subject: [PATCH 1/3] Made the GetReplicas function count only ready replicas.

Counting all replicas makes functions dependent on GetReplicas proceed
with trying to use replicas that are not ready yet.
---
 test/e2e/common/autoscaling_utils.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/e2e/common/autoscaling_utils.go b/test/e2e/common/autoscaling_utils.go
index a2abe0a3906..aa16ef4e207 100644
--- a/test/e2e/common/autoscaling_utils.go
+++ b/test/e2e/common/autoscaling_utils.go
@@ -321,21 +321,21 @@ func (rc *ResourceConsumer) GetReplicas() int {
 		if replicationController == nil {
 			framework.Failf(rcIsNil)
 		}
-		return int(replicationController.Status.Replicas)
+		return int(replicationController.Status.ReadyReplicas)
 	case KindDeployment:
 		deployment, err := rc.framework.ClientSet.Extensions().Deployments(rc.framework.Namespace.Name).Get(rc.name, metav1.GetOptions{})
 		framework.ExpectNoError(err)
 		if deployment == nil {
 			framework.Failf(deploymentIsNil)
 		}
-		return int(deployment.Status.Replicas)
+		return int(deployment.Status.ReadyReplicas)
 	case KindReplicaSet:
 		rs, err := rc.framework.ClientSet.Extensions().ReplicaSets(rc.framework.Namespace.Name).Get(rc.name, metav1.GetOptions{})
 		framework.ExpectNoError(err)
 		if rs == nil {
 			framework.Failf(rsIsNil)
 		}
-		return int(rs.Status.Replicas)
+		return int(rs.Status.ReadyReplicas)
 	default:
 		framework.Failf(invalidKind)
 	}

From 38f175f11583d2957eaed28827698600ff4c2d57 Mon Sep 17 00:00:00 2001
From: Andrzej Wasylkowski
Date: Sat, 10 Jun 2017 10:47:31 +0200
Subject: [PATCH 2/3] Made WaitForReplicas take a "how long to wait" parameter
 instead of it being hardcoded.

---
 test/e2e/autoscaling/horizontal_pod_autoscaling.go | 5 +++--
 test/e2e/common/autoscaling_utils.go               | 3 +--
 test/e2e/stackdriver_monitoring.go                 | 2 +-
 test/e2e/upgrades/horizontal_pod_autoscalers.go    | 8 +++++---
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/test/e2e/autoscaling/horizontal_pod_autoscaling.go b/test/e2e/autoscaling/horizontal_pod_autoscaling.go
index 826215ca803..1e76cea54f1 100644
--- a/test/e2e/autoscaling/horizontal_pod_autoscaling.go
+++ b/test/e2e/autoscaling/horizontal_pod_autoscaling.go
@@ -114,17 +114,18 @@ type HPAScaleTest struct {
 // The second state change (optional) is due to the CPU burst parameter, which HPA again responds to.
 // TODO The use of 3 states is arbitrary, we could eventually make this test handle "n" states once this test stabilizes.
 func (scaleTest *HPAScaleTest) run(name, kind string, rc *common.ResourceConsumer, f *framework.Framework) {
+	const timeToWait = 15 * time.Minute
 	rc = common.NewDynamicResourceConsumer(name, kind, int(scaleTest.initPods), int(scaleTest.totalInitialCPUUsage), 0, 0, scaleTest.perPodCPURequest, 200, f)
 	defer rc.CleanUp()
 	hpa := common.CreateCPUHorizontalPodAutoscaler(rc, scaleTest.targetCPUUtilizationPercent, scaleTest.minPods, scaleTest.maxPods)
 	defer common.DeleteHorizontalPodAutoscaler(rc, hpa.Name)
-	rc.WaitForReplicas(int(scaleTest.firstScale))
+	rc.WaitForReplicas(int(scaleTest.firstScale), timeToWait)
 	if scaleTest.firstScaleStasis > 0 {
 		rc.EnsureDesiredReplicas(int(scaleTest.firstScale), scaleTest.firstScaleStasis)
 	}
 	if scaleTest.cpuBurst > 0 && scaleTest.secondScale > 0 {
 		rc.ConsumeCPU(scaleTest.cpuBurst)
-		rc.WaitForReplicas(int(scaleTest.secondScale))
+		rc.WaitForReplicas(int(scaleTest.secondScale), timeToWait)
 	}
 }
 
diff --git a/test/e2e/common/autoscaling_utils.go b/test/e2e/common/autoscaling_utils.go
index aa16ef4e207..8d15bc0295a 100644
--- a/test/e2e/common/autoscaling_utils.go
+++ b/test/e2e/common/autoscaling_utils.go
@@ -342,8 +342,7 @@ func (rc *ResourceConsumer) GetReplicas() int {
 	return 0
 }
 
-func (rc *ResourceConsumer) WaitForReplicas(desiredReplicas int) {
-	duration := 15 * time.Minute
+func (rc *ResourceConsumer) WaitForReplicas(desiredReplicas int, duration time.Duration) {
 	interval := 20 * time.Second
 	err := wait.PollImmediate(interval, duration, func() (bool, error) {
 		replicas := rc.GetReplicas()
diff --git a/test/e2e/stackdriver_monitoring.go b/test/e2e/stackdriver_monitoring.go
index f8060d90854..f175c13e7d3 100644
--- a/test/e2e/stackdriver_monitoring.go
+++ b/test/e2e/stackdriver_monitoring.go
@@ -89,7 +89,7 @@ func testStackdriverMonitoring(f *framework.Framework, pods, allPodsCPU int, per
 	rc := common.NewDynamicResourceConsumer(rcName, common.KindDeployment, pods, allPodsCPU, memoryUsed, 0, perPodCPU, memoryLimit, f)
 	defer rc.CleanUp()
 
-	rc.WaitForReplicas(pods)
+	rc.WaitForReplicas(pods, 15*time.Minute)
 
 	metricsMap := map[string]bool{}
 	pollingFunction := checkForMetrics(projectId, gcmService, time.Now(), metricsMap, allPodsCPU, perPodCPU)
diff --git a/test/e2e/upgrades/horizontal_pod_autoscalers.go b/test/e2e/upgrades/horizontal_pod_autoscalers.go
index 51074e91a98..bb5110ebb38 100644
--- a/test/e2e/upgrades/horizontal_pod_autoscalers.go
+++ b/test/e2e/upgrades/horizontal_pod_autoscalers.go
@@ -18,6 +18,7 @@ package upgrades
 
 import (
 	"fmt"
+	"time"
 
 	autoscalingv1 "k8s.io/kubernetes/pkg/apis/autoscaling/v1"
 	"k8s.io/kubernetes/test/e2e/common"
@@ -72,22 +73,23 @@ func (t *HPAUpgradeTest) Teardown(f *framework.Framework) {
 }
 
 func (t *HPAUpgradeTest) test() {
+	const timeToWait = 15 * time.Minute
 	t.rc.Resume()
 
 	By(fmt.Sprintf("HPA scales to 1 replica: consume 10 millicores, target per pod 100 millicores, min pods 1."))
 	t.rc.ConsumeCPU(10) /* millicores */
 	By(fmt.Sprintf("HPA waits for 1 replica"))
-	t.rc.WaitForReplicas(1)
+	t.rc.WaitForReplicas(1, timeToWait)
 
 	By(fmt.Sprintf("HPA scales to 3 replicas: consume 250 millicores, target per pod 100 millicores."))
 	t.rc.ConsumeCPU(250) /* millicores */
 	By(fmt.Sprintf("HPA waits for 3 replicas"))
-	t.rc.WaitForReplicas(3)
+	t.rc.WaitForReplicas(3, timeToWait)
 
 	By(fmt.Sprintf("HPA scales to 5 replicas: consume 700 millicores, target per pod 100 millicores, max pods 5."))
 	t.rc.ConsumeCPU(700) /* millicores */
 	By(fmt.Sprintf("HPA waits for 5 replicas"))
-	t.rc.WaitForReplicas(5)
+	t.rc.WaitForReplicas(5, timeToWait)
 
 	// We need to pause background goroutines as during upgrade master is unavailable and requests issued by them fail.
 	t.rc.Pause()

From ce9f3bcfefdbbaaec2c2c3f71f81c617dbd1e242 Mon Sep 17 00:00:00 2001
From: Andrzej Wasylkowski
Date: Thu, 1 Jun 2017 13:13:18 +0200
Subject: [PATCH 3/3] Added an end-to-end test measuring autoscaling's
 efficiency.

---
 test/e2e/autoscaling/BUILD                |   1 +
 test/e2e/autoscaling/autoscaling_timer.go | 111 ++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 test/e2e/autoscaling/autoscaling_timer.go

diff --git a/test/e2e/autoscaling/BUILD b/test/e2e/autoscaling/BUILD
index 1517a3c440f..d257c00fbac 100644
--- a/test/e2e/autoscaling/BUILD
+++ b/test/e2e/autoscaling/BUILD
@@ -10,6 +10,7 @@ load(
 go_library(
     name = "go_default_library",
     srcs = [
+        "autoscaling_timer.go",
         "cluster_size_autoscaling.go",
        "dns_autoscaling.go",
        "horizontal_pod_autoscaling.go",
diff --git a/test/e2e/autoscaling/autoscaling_timer.go b/test/e2e/autoscaling/autoscaling_timer.go
new file mode 100644
index 00000000000..c0b71930c42
--- /dev/null
+++ b/test/e2e/autoscaling/autoscaling_timer.go
@@ -0,0 +1,111 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package autoscaling
+
+import (
+	"strings"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/test/e2e/common"
+	"k8s.io/kubernetes/test/e2e/framework"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+var _ = framework.KubeDescribe("[Feature:ClusterSizeAutoscalingScaleUp] [Slow] Autoscaling", func() {
+	f := framework.NewDefaultFramework("autoscaling")
+
+	framework.KubeDescribe("Autoscaling a service", func() {
+		BeforeEach(func() {
+			// Check if Cloud Autoscaler is enabled by trying to get its ConfigMap.
+			_, err := f.ClientSet.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+			if err != nil {
+				framework.Skipf("test expects Cluster Autoscaler to be enabled")
+			}
+		})
+
+		Context("from 1 pod and 3 nodes to 8 pods and >=4 nodes", func() {
+			const nodesNum = 3       // Expect there to be 3 nodes before and after the test.
+			var nodeGroupName string // Set by BeforeEach, used by AfterEach to scale this node group down after the test.
+			var nodes *v1.NodeList   // Set by BeforeEach, used by Measure to calculate CPU request based on node's sizes.
+
+			BeforeEach(func() {
+				// Make sure there is only 1 node group, otherwise this test becomes useless.
+				nodeGroups := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")
+				if len(nodeGroups) != 1 {
+					framework.Skipf("test expects 1 node group, found %d", len(nodeGroups))
+				}
+				nodeGroupName = nodeGroups[0]
+
+				// Make sure the node group has exactly 'nodesNum' nodes, otherwise this test becomes useless.
+				nodeGroupSize, err := framework.GroupSize(nodeGroupName)
+				framework.ExpectNoError(err)
+				if nodeGroupSize != nodesNum {
+					framework.Skipf("test expects %d nodes, found %d", nodesNum, nodeGroupSize)
+				}
+
+				// Make sure all nodes are schedulable, otherwise we are in some kind of a problem state.
+				nodes = framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+				schedulableCount := len(nodes.Items)
+				Expect(schedulableCount).To(Equal(nodeGroupSize), "not all nodes are schedulable")
+			})
+
+			AfterEach(func() {
+				// Scale down back to only 'nodesNum' nodes, as expected at the start of the test.
+				framework.ExpectNoError(framework.ResizeGroup(nodeGroupName, nodesNum))
+				framework.ExpectNoError(framework.WaitForClusterSize(f.ClientSet, nodesNum, 15*time.Minute))
+			})
+
+			Measure("takes less than 15 minutes", func(b Benchmarker) {
+				// Measured over multiple samples, scaling takes 10 +/- 2 minutes, so 15 minutes should be fully sufficient.
+				const timeToWait = 15 * time.Minute
+
+				// Calculate the CPU request of the service.
+				// This test expects that 8 pods will not fit in 'nodesNum' nodes, but will fit in >='nodesNum'+1 nodes.
+				// Make it so that 'nodesNum' pods fit perfectly per node (in practice other things take space, so less than that will fit).
+				nodeCpus := nodes.Items[0].Status.Capacity[v1.ResourceCPU]
+				nodeCpuMillis := (&nodeCpus).MilliValue()
+				cpuRequestMillis := int64(nodeCpuMillis / nodesNum)
+
+				// Start the service we want to scale and wait for it to be up and running.
+				nodeMemoryBytes := nodes.Items[0].Status.Capacity[v1.ResourceMemory]
+				nodeMemoryMB := (&nodeMemoryBytes).Value() / 1024 / 1024
+				memRequestMB := nodeMemoryMB / 10 // Ensure each pod takes not more than 10% of node's total memory.
+				replicas := 1
+				resourceConsumer := common.NewDynamicResourceConsumer("resource-consumer", common.KindDeployment, replicas, 0, 0, 0, cpuRequestMillis, memRequestMB, f)
+				defer resourceConsumer.CleanUp()
+				resourceConsumer.WaitForReplicas(replicas, 1*time.Minute) // Should finish ~immediately, so 1 minute is more than enough.
+
+				// Enable Horizontal Pod Autoscaler with 50% target utilization and
+				// scale up the CPU usage to trigger autoscaling to 8 pods for target to be satisfied.
+				targetCpuUtilizationPercent := int32(50)
+				hpa := common.CreateCPUHorizontalPodAutoscaler(resourceConsumer, targetCpuUtilizationPercent, 1, 10)
+				defer common.DeleteHorizontalPodAutoscaler(resourceConsumer, hpa.Name)
+				cpuLoad := 8 * cpuRequestMillis * int64(targetCpuUtilizationPercent) / 100 // 8 pods utilized to the target level
+				resourceConsumer.ConsumeCPU(int(cpuLoad))
+
+				// Measure the time it takes for the service to scale to 8 pods with 50% CPU utilization each.
+				b.Time("total scale-up time", func() {
+					resourceConsumer.WaitForReplicas(8, timeToWait)
+				})
+			}, 1) // Increase to run the test more than once.
+		})
+	})
+})
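
Usage note (illustrative sketch, not part of the patch series): after patch 1, WaitForReplicas only returns once the requested number of replicas is ready, and after patch 2 every caller passes its own timeout instead of relying on a hardcoded 15 minutes. The function name exampleScaleUpWait, the consumer parameters, and the 10-minute timeout below are assumptions made for this example, not values taken from the patches.

    package autoscaling

    import (
        "time"

        "k8s.io/kubernetes/test/e2e/common"
        "k8s.io/kubernetes/test/e2e/framework"
    )

    // exampleScaleUpWait sketches the post-patch calling convention.
    func exampleScaleUpWait(f *framework.Framework) {
        // Deployment-backed consumer: 1 replica, 250 millicores total initial CPU,
        // 500m per-pod CPU limit, 200 MB memory limit (illustrative values).
        rc := common.NewDynamicResourceConsumer("example-consumer", common.KindDeployment, 1, 250, 0, 0, 500, 200, f)
        defer rc.CleanUp()

        // Drive CPU usage up, then block until 3 replicas are ready (patch 1)
        // or the caller-chosen timeout expires (patch 2); the old signature
        // took no timeout and always waited up to 15 minutes.
        rc.ConsumeCPU(500)
        rc.WaitForReplicas(3, 10*time.Minute)
    }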