From 9b0f4c9f7ce4988cfd599b71fc6122842ea1b706 Mon Sep 17 00:00:00 2001
From: Andrzej Wasylkowski
Date: Fri, 18 Aug 2017 14:30:42 +0200
Subject: [PATCH] Added an end-to-end test ensuring that Cluster Autoscaler
 does not scale up when all pending pods are unschedulable.

---
 .../cluster_autoscaler_scalability.go | 74 +++++++++++++++----
 test/e2e/framework/util.go            | 17 +++++
 2 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/test/e2e/autoscaling/cluster_autoscaler_scalability.go b/test/e2e/autoscaling/cluster_autoscaler_scalability.go
index 3081c037b2d..54255fb609d 100644
--- a/test/e2e/autoscaling/cluster_autoscaler_scalability.go
+++ b/test/e2e/autoscaling/cluster_autoscaler_scalability.go
@@ -19,6 +19,7 @@ package autoscaling
 import (
 	"encoding/json"
 	"fmt"
+	"math"
 	"strings"
 	"time"
 
@@ -159,8 +160,11 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 	It("should scale up twice [Feature:ClusterAutoscalerScalability2]", func() {
 		perNodeReservation := int(float64(memCapacityMb) * 0.95)
 		replicasPerNode := 10
-		additionalNodes1 := int(0.7 * maxNodes)
-		additionalNodes2 := int(0.25 * maxNodes)
+		additionalNodes1 := int(math.Ceil(0.7 * maxNodes))
+		additionalNodes2 := int(math.Ceil(0.25 * maxNodes))
+		if additionalNodes1+additionalNodes2 > maxNodes {
+			additionalNodes2 = maxNodes - additionalNodes1
+		}
 		replicas1 := additionalNodes1 * replicasPerNode
 		replicas2 := additionalNodes2 * replicasPerNode
 
@@ -168,7 +172,8 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 		glog.Infof("cores per node: %v", coresPerNode)
 
 		// saturate cluster
-		reservationCleanup := ReserveMemory(f, "some-pod", nodeCount, nodeCount*perNodeReservation, true, memoryReservationTimeout)
+		initialReplicas := nodeCount
+		reservationCleanup := ReserveMemory(f, "some-pod", initialReplicas, nodeCount*perNodeReservation, true, memoryReservationTimeout)
 		defer reservationCleanup()
 		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
 
@@ -179,10 +184,10 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 
 		expectedResult := createClusterPredicates(nodeCount + additionalNodes1)
 		config := createScaleUpTestConfig(nodeCount, nodeCount, rcConfig, expectedResult)
-		epsilon := 0.05
-
 		// run test #1
-		testCleanup1 := simpleScaleUpTestWithEpsilon(f, config, epsilon)
+		tolerateUnreadyNodes := additionalNodes1 / 20
+		tolerateUnreadyPods := (initialReplicas + replicas1) / 20
+		testCleanup1 := simpleScaleUpTestWithTolerance(f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
 		defer testCleanup1()
 
 		glog.Infof("Scaled up once")
@@ -193,7 +198,9 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 		config2 := createScaleUpTestConfig(nodeCount+additionalNodes1, nodeCount+additionalNodes2, rcConfig2, expectedResult2)
 
 		// run test #2
-		testCleanup2 := simpleScaleUpTestWithEpsilon(f, config2, epsilon)
+		tolerateUnreadyNodes = maxNodes / 20
+		tolerateUnreadyPods = (initialReplicas + replicas1 + replicas2) / 20
+		testCleanup2 := simpleScaleUpTestWithTolerance(f, config2, tolerateUnreadyNodes, tolerateUnreadyPods)
 		defer testCleanup2()
 
 		glog.Infof("Scaled up twice")
@@ -201,7 +208,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 
 	It("should scale down empty nodes [Feature:ClusterAutoscalerScalability3]", func() {
 		perNodeReservation := int(float64(memCapacityMb) * 0.7)
-		replicas := int(float64(maxNodes) * 0.7)
+		replicas := int(math.Ceil(maxNodes * 0.7))
 		totalNodes := maxNodes
 
 		// resize cluster to totalNodes
@@ -215,7 +222,9 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 		rcConfig := reserveMemoryRCConfig(f, "some-pod", replicas, replicas*perNodeReservation, largeScaleUpTimeout)
 		expectedResult := createClusterPredicates(totalNodes)
 		config := createScaleUpTestConfig(totalNodes, totalNodes, rcConfig, expectedResult)
-		testCleanup := simpleScaleUpTestWithEpsilon(f, config, 0.1)
+		tolerateUnreadyNodes := totalNodes / 10
+		tolerateUnreadyPods := replicas / 10
+		testCleanup := simpleScaleUpTestWithTolerance(f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
 		defer testCleanup()
 
 		// check if empty nodes are scaled down
@@ -321,6 +330,41 @@ var _ = framework.KubeDescribe("Cluster size autoscaler scalability [Slow]", fun
 		Expect(len(nodes.Items)).Should(Equal(totalNodes))
 	})
 
+	Specify("CA ignores unschedulable pods while scheduling schedulable pods [Feature:ClusterAutoscalerScalability6]", func() {
+		// Start a number of pods saturating existing nodes.
+		perNodeReservation := int(float64(memCapacityMb) * 0.80)
+		replicasPerNode := 10
+		initialPodReplicas := nodeCount * replicasPerNode
+		initialPodsTotalMemory := nodeCount * perNodeReservation
+		reservationCleanup := ReserveMemory(f, "initial-pod", initialPodReplicas, initialPodsTotalMemory, true /* wait for pods to run */, memoryReservationTimeout)
+		defer reservationCleanup()
+		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+		// Configure a number of unschedulable pods.
+		unschedulableMemReservation := memCapacityMb * 2
+		unschedulablePodReplicas := 1000
+		totalMemReservation := unschedulableMemReservation * unschedulablePodReplicas
+		timeToWait := 5 * time.Minute
+		podsConfig := reserveMemoryRCConfig(f, "unschedulable-pod", unschedulablePodReplicas, totalMemReservation, timeToWait)
+		framework.RunRC(*podsConfig) // Ignore error (it will occur because pods are unschedulable)
+		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, podsConfig.Name)
+
+		// Ensure that no new nodes have been added so far.
+		Expect(framework.ClusterSize(f.ClientSet)).To(Equal(nodeCount))
+
+		// Start a number of schedulable pods to ensure CA reacts.
+		additionalNodes := maxNodes - nodeCount
+		replicas := additionalNodes * replicasPerNode
+		totalMemory := additionalNodes * perNodeReservation
+		rcConfig := reserveMemoryRCConfig(f, "extra-pod", replicas, totalMemory, largeScaleUpTimeout)
+		expectedResult := createClusterPredicates(nodeCount + additionalNodes)
+		config := createScaleUpTestConfig(nodeCount, initialPodReplicas, rcConfig, expectedResult)
+
+		// Test that scale up happens, allowing 1000 unschedulable pods not to be scheduled.
+		testCleanup := simpleScaleUpTestWithTolerance(f, config, 0, unschedulablePodReplicas)
+		defer testCleanup()
+	})
+
 })
 
 func makeUnschedulable(f *framework.Framework, nodes []v1.Node) error {
@@ -350,24 +394,24 @@ func anyKey(input map[string]int) string {
 	return ""
 }
 
-func simpleScaleUpTestWithEpsilon(f *framework.Framework, config *scaleUpTestConfig, epsilon float64) func() error {
+func simpleScaleUpTestWithTolerance(f *framework.Framework, config *scaleUpTestConfig, tolerateMissingNodeCount int, tolerateMissingPodCount int) func() error {
 	// resize cluster to start size
 	// run rc based on config
 	By(fmt.Sprintf("Running RC %v from config", config.extraPods.Name))
 	start := time.Now()
 	framework.ExpectNoError(framework.RunRC(*config.extraPods))
 	// check results
-	if epsilon > 0 && epsilon < 1 {
+	if tolerateMissingNodeCount > 0 {
 		// Tolerate some number of nodes not to be created.
-		minExpectedNodeCount := int(float64(config.expectedResult.nodes) - epsilon*float64(config.expectedResult.nodes))
+		minExpectedNodeCount := config.expectedResult.nodes - tolerateMissingNodeCount
 		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 			func(size int) bool { return size >= minExpectedNodeCount }, scaleUpTimeout))
 	} else {
 		framework.ExpectNoError(framework.WaitForClusterSize(f.ClientSet, config.expectedResult.nodes, scaleUpTimeout))
 	}
 	glog.Infof("cluster is increased")
-	if epsilon > 0 && epsilon < 0 {
-		framework.ExpectNoError(waitForCaPodsReadyInNamespace(f, f.ClientSet, int(epsilon*float64(config.extraPods.Replicas)+1)))
+	if tolerateMissingPodCount > 0 {
+		framework.ExpectNoError(waitForCaPodsReadyInNamespace(f, f.ClientSet, tolerateMissingPodCount))
 	} else {
 		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, f.ClientSet))
 	}
@@ -378,7 +422,7 @@ func simpleScaleUpTestWithEpsilon(f *framework.Framework, config *scaleUpTestCon
 }
 
 func simpleScaleUpTest(f *framework.Framework, config *scaleUpTestConfig) func() error {
-	return simpleScaleUpTestWithEpsilon(f, config, 0)
+	return simpleScaleUpTestWithTolerance(f, config, 0, 0)
 }
 
 func reserveMemoryRCConfig(f *framework.Framework, id string, replicas, megabytes int, timeout time.Duration) *testutils.RCConfig {
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 6afa232f188..206192b4c74 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -3892,6 +3892,23 @@ func WaitForControllerManagerUp() error {
 	return fmt.Errorf("waiting for controller-manager timed out")
 }
 
+// Returns cluster size (number of ready Nodes excluding Master Node).
+func ClusterSize(c clientset.Interface) (int, error) {
+	nodes, err := c.Core().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
+		"spec.unschedulable": "false",
+	}.AsSelector().String()})
+	if err != nil {
+		Logf("Failed to list nodes: %v", err)
+		return 0, err
+	}
+
+	// Filter out not-ready nodes.
+	FilterNodes(nodes, func(node v1.Node) bool {
+		return IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
+	})
+	return len(nodes.Items), nil
+}
+
 // WaitForClusterSize waits until the cluster has desired size and there is no not-ready nodes in it.
 // By cluster size we mean number of Nodes excluding Master Node.
 func WaitForClusterSize(c clientset.Interface, size int, timeout time.Duration) error {