Merge pull request #44092 from MaciekPytel/ca_overscaleup_e2e

Automatic merge from submit-queue

Cluster-Autoscaler e2e to catch unnecessary scale-ups

This e2e catches the unnecessary scale-up bug in Cluster Autoscaler 0.5.0 with reasonable probability.
```release-note
```
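The added test works by watching the `cluster-autoscaler-status` ConfigMap in `kube-system`: it reserves more memory than the cluster can hold (passing a 1-second timeout to `ReserveMemory` so the helper does not wait for the pods to actually run), waits for the status to report an ongoing scale-up, records the scale-up target, and after all pods are scheduled asserts that the status is back to `NoActivity` with the node count equal to that original target, i.e. no extra scale-up was triggered. The new `getScaleUpStatus` helper matches lines of roughly this shape in the ConfigMap's `status` value, one per node group (the layout is inferred from the regexp in this diff and may vary between CA versions):

```
ScaleUp:     InProgress (ready=3 cloudProviderTarget=5)
```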
Kubernetes Submit Queue 2017-04-05 08:50:18 -07:00 committed by GitHub
commit 17a6a1cb5e


@@ -48,12 +48,13 @@ import (
 )
 
 const (
-    defaultTimeout      = 3 * time.Minute
-    resizeTimeout       = 5 * time.Minute
-    scaleUpTimeout      = 5 * time.Minute
-    scaleDownTimeout    = 15 * time.Minute
-    podTimeout          = 2 * time.Minute
-    nodesRecoverTimeout = 5 * time.Minute
+    defaultTimeout        = 3 * time.Minute
+    resizeTimeout         = 5 * time.Minute
+    scaleUpTimeout        = 5 * time.Minute
+    scaleUpTriggerTimeout = 2 * time.Minute
+    scaleDownTimeout      = 15 * time.Minute
+    podTimeout            = 2 * time.Minute
+    nodesRecoverTimeout   = 5 * time.Minute
 
     gkeEndpoint      = "https://test-container.sandbox.googleapis.com"
     gkeUpdateTimeout = 15 * time.Minute
@@ -61,6 +62,9 @@ const (
     disabledTaint             = "DisabledForAutoscalingTest"
     newNodesForScaledownTests = 2
     unhealthyClusterThreshold = 4
+
+    caNoScaleUpStatus      = "NoActivity"
+    caOngoingScaleUpStatus = "InProgress"
 )
 
 var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
@@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         By("Creating unschedulable pod")
-        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         By("Waiting for scale up hoping it won't happen")
@@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     })
 
     It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Verify, that cluster size is increased
@@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
     })
 
+    It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
+        status, err := getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        unmanagedNodes := nodeCount - status.ready
+
+        By("Schedule more pods than can fit and wait for cluster to scale-up")
+        ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second)
+        defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
+
+        status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+        framework.ExpectNoError(err)
+        target := status.target
+        framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+        By("Expect no more scale-up to be happening after all pods are scheduled")
+        status, err = getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        if status.target != target {
+            glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
+        }
+        Expect(status.status).Should(Equal(caNoScaleUpStatus))
+        Expect(status.ready).Should(Equal(status.target))
+        Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
+    })
+
     It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         framework.SkipUnlessProviderIs("gke")
@@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout))
         glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
 
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Verify, that cluster size is increased
@@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
 
         By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
-        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
         // Apparently GKE master is restarted couple minutes after the node pool is added
@@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
                 nodesToBreak = nodesToBreak[1:]
                 framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
             } else {
-                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
                 defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
                 time.Sleep(scaleUpTimeout)
                 currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod
     }
 }
 
-func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) {
+func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) {
     By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
     request := int64(1024 * 1024 * megabytes / replicas)
     config := &testutils.RCConfig{
@@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e
         InternalClient: f.InternalClientset,
         Name:           id,
         Namespace:      f.Namespace.Name,
-        Timeout:        defaultTimeout,
+        Timeout:        timeout,
         Image:          framework.GetPauseImageName(f.ClientSet),
         Replicas:       replicas,
         MemRequest:     request,
@@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
     }
     return result[1], nil
 }
+
+type scaleUpStatus struct {
+    status string
+    ready  int
+    target int
+}
+
+// Try to get scale-up statuses of all node groups.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
+    configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+    if err != nil {
+        return nil, err
+    }
+    status, ok := configMap.Data["status"]
+    if !ok {
+        return nil, fmt.Errorf("Status information not found in configmap")
+    }
+    matcher, err := regexp.Compile("\\s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
+    if err != nil {
+        return nil, err
+    }
+    matches := matcher.FindAllStringSubmatch(status, -1)
+    if len(matches) < 1 {
+        return nil, fmt.Errorf("Failed to parse CA status configmap")
+    }
+    result := scaleUpStatus{
+        status: caNoScaleUpStatus,
+        ready:  0,
+        target: 0,
+    }
+    for _, match := range matches {
+        if match[1] == caOngoingScaleUpStatus {
+            result.status = caOngoingScaleUpStatus
+        }
+        newReady, err := strconv.Atoi(match[2])
+        if err != nil {
+            return nil, err
+        }
+        result.ready += newReady
+        newTarget, err := strconv.Atoi(match[3])
+        if err != nil {
+            return nil, err
+        }
+        result.target += newTarget
+    }
+    glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
+    return &result, nil
+}
+
+func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
+    for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
+        status, err := getScaleUpStatus(c)
+        if err != nil {
+            return nil, err
+        }
+        if status.status == expected {
+            return status, nil
+        }
+    }
+    return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+}
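For illustration, here is a minimal, self-contained sketch of the parsing and aggregation that `getScaleUpStatus` performs across node groups. The sample text and the counts in it are hypothetical; only the `ScaleUp: <status> (ready=N cloudProviderTarget=N)` shape comes from the regexp in this diff:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Hypothetical fragment of the cluster-autoscaler-status ConfigMap's "status"
// value; the real text carries more fields per node group.
const sampleStatus = `
  ScaleUp:     InProgress (ready=3 cloudProviderTarget=5)
  ScaleUp:     NoActivity (ready=2 cloudProviderTarget=2)
`

func main() {
	// Same expression the new helper compiles; each match is one node group.
	matcher := regexp.MustCompile(`\s*ScaleUp:\s*([A-Za-z]+)\s*\(ready=([0-9]+)\s*cloudProviderTarget=([0-9]+)\s*\)`)

	status, ready, target := "NoActivity", 0, 0
	for _, m := range matcher.FindAllStringSubmatch(sampleStatus, -1) {
		// Any group still scaling up marks the whole cluster as "InProgress".
		if m[1] == "InProgress" {
			status = "InProgress"
		}
		r, _ := strconv.Atoi(m[2]) // ready nodes in this group
		t, _ := strconv.Atoi(m[3]) // cloud-provider target size of this group
		ready += r
		target += t
	}
	fmt.Printf("scale-up status: %v (ready=%v, target=%v)\n", status, ready, target)
	// Output: scale-up status: InProgress (ready=5, target=7)
}
```

A single node group reporting `InProgress` is enough to treat the whole cluster as scaling up, which is what `waitForScaleUpStatus` polls for every 5 seconds until its timeout expires.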