From 0d666e1fcd9bc05a4340a23644729dfc83393bc6 Mon Sep 17 00:00:00 2001
From: Maciej Pytel
Date: Tue, 21 Mar 2017 14:02:55 +0100
Subject: [PATCH 1/2] Cluster-autoscaler multistep node drain e2e

---
 .../autoscaling/cluster_size_autoscaling.go | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go
index 155d64339ba..33a609e5ad8 100644
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -330,7 +330,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 	})

 	It("should be able to scale down when rescheduling a pod is required and pdb allows for it[Feature:ClusterSizeAutoscalingScaleDown]", func() {
-		runDrainTest(f, originalSizes, 1, func(increasedSize int) {
+		runDrainTest(f, originalSizes, 1, 1, func(increasedSize int) {
 			By("Some node should be removed")
 			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 				func(size int) bool { return size < increasedSize }, scaleDownTimeout))
@@ -338,7 +338,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 	})

 	It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain[Feature:ClusterSizeAutoscalingScaleDown]", func() {
-		runDrainTest(f, originalSizes, 0, func(increasedSize int) {
+		runDrainTest(f, originalSizes, 1, 0, func(increasedSize int) {
 			By("No nodes should be removed")
 			time.Sleep(scaleDownTimeout)
 			nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -346,9 +346,17 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
 		})
 	})

+	It("should be able to scale down by draining multiple pods one by one as dictated by pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
+		runDrainTest(f, originalSizes, 2, 1, func(increasedSize int) {
+			By("Some node should be removed")
+			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+				func(size int) bool { return size < increasedSize }, scaleDownTimeout))
+		})
+	})
+
 })

-func runDrainTest(f *framework.Framework, migSizes map[string]int, pdbSize int, verifyFunction func(int)) {
+func runDrainTest(f *framework.Framework, migSizes map[string]int, podsPerNode, pdbSize int, verifyFunction func(int)) {
 	increasedSize := manuallyIncreaseClusterSize(f, migSizes)

 	nodes, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
@@ -356,10 +364,10 @@ func runDrainTest(f *framework.Framework, migSizes map[string]int, pdbSize int,
 	}.AsSelector().String()})
 	framework.ExpectNoError(err)
 	namespace := f.Namespace.Name
-	numPods := len(nodes.Items)
+	numPods := len(nodes.Items) * podsPerNode
 	testId := string(uuid.NewUUID()) // So that we can label and find pods
 	labelMap := map[string]string{"test_id": testId}
-	framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, "reschedulable-pods", labelMap))
+	framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, podsPerNode, "reschedulable-pods", labelMap))

 	defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "reschedulable-pods")

@@ -705,16 +713,16 @@ func makeNodeSchedulable(c clientset.Interface, node *v1.Node) error {
 	return nil
 }

-// Creat an RC running a single pod on each node without adding any constraint forcing such
-// pod distribution. This is meant to create a bunch of underutilized (but not unused) nodes
+// Create an RC running a given number of pods on each node without adding any constraint forcing
+// such pod distribution. This is meant to create a bunch of underutilized (but not unused) nodes
 // with pods that can be rescheduled on different nodes.
 // This is achieved using the following method:
 // 1. disable scheduling on each node
 // 2. create an empty RC
 // 3. for each node:
 // 3a. enable scheduling on that node
-// 3b. increase number of replicas in RC by 1
-func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, id string, labels map[string]string) error {
+// 3b. increase number of replicas in RC by podsPerNode
+func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, podsPerNode int, id string, labels map[string]string) error {
 	By("Run a pod on each node")
 	for _, node := range nodes {
 		err := makeNodeUnschedulable(f.ClientSet, &node)
@@ -754,7 +762,7 @@ func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, id stri
 		// Update replicas count, to create new pods that will be allocated on node
 		// (we retry 409 errors in case rc reference got out of sync)
 		for j := 0; j < 3; j++ {
-			*rc.Spec.Replicas = int32(i + 1)
+			*rc.Spec.Replicas = int32((i + 1) * podsPerNode)
 			rc, err = f.ClientSet.Core().ReplicationControllers(f.Namespace.Name).Update(rc)
 			if err == nil {
 				break
@@ -771,7 +779,7 @@ func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, id stri

 		err = wait.PollImmediate(5*time.Second, podTimeout, func() (bool, error) {
 			rc, err = f.ClientSet.Core().ReplicationControllers(f.Namespace.Name).Get(id, metav1.GetOptions{})
-			if err != nil || rc.Status.ReadyReplicas < int32(i+1) {
+			if err != nil || rc.Status.ReadyReplicas < int32((i+1)*podsPerNode) {
				return false, nil
 			}
 			return true, nil
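
The first patch threads a podsPerNode count through runDrainTest so the new test can put two reschedulable pods on every node while the PodDisruptionBudget lets only one of them be unavailable, which forces cluster-autoscaler to drain a node in several steps rather than all at once. The PDB itself is created further down in runDrainTest and is not visible in the hunks above; the sketch below only illustrates the assumed arithmetic (minAvailable = numPods - pdbSize) behind the three parameter combinations used in the tests, and every name in it is an illustrative stand-in rather than code from the patch.

package main

import "fmt"

// drainBudget mirrors the assumed relationship between the test parameters and the
// PodDisruptionBudget: with numPods pods selected by the PDB and
// minAvailable = numPods - pdbSize, at most pdbSize pods may be disrupted at once.
func drainBudget(numNodes, podsPerNode, pdbSize int) (numPods, minAvailable int) {
	numPods = numNodes * podsPerNode
	minAvailable = numPods - pdbSize
	return numPods, minAvailable
}

func main() {
	// runDrainTest(f, originalSizes, 1, 1, ...): one pod per node, one disruption allowed,
	// so a node can be drained in a single step and scale-down succeeds.
	fmt.Println(drainBudget(3, 1, 1))

	// runDrainTest(f, originalSizes, 1, 0, ...): no disruption allowed, scale-down is blocked.
	fmt.Println(drainBudget(3, 1, 0))

	// runDrainTest(f, originalSizes, 2, 1, ...): two pods per node but only one may be down,
	// so the autoscaler has to evict a node's pods one by one before removing the node.
	fmt.Println(drainBudget(3, 2, 1))
}

Under that assumption, pdbSize is effectively the number of test pods that may be disrupted at any one time, which is why pdbSize = 0 is expected to block scale-down entirely.
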
From 2630cefcce316e84ce0ca17205a60757dced5978 Mon Sep 17 00:00:00 2001
From: Maciej Pytel
Date: Tue, 28 Mar 2017 17:10:46 +0200
Subject: [PATCH 2/2] Add retries in cluster-autoscaler e2e

---
 .../autoscaling/cluster_size_autoscaling.go | 68 ++++++++++++-------
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go
index 33a609e5ad8..0abc33ca1e0 100644
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -674,43 +674,61 @@ func setMigSizes(sizes map[string]int) {
 }

 func makeNodeUnschedulable(c clientset.Interface, node *v1.Node) error {
 	By(fmt.Sprintf("Taint node %s", node.Name))
-	freshNode, err := c.Core().Nodes().Get(node.Name, metav1.GetOptions{})
-	if err != nil {
-		return err
-	}
-	for _, taint := range freshNode.Spec.Taints {
-		if taint.Key == disabledTaint {
+	for j := 0; j < 3; j++ {
+		freshNode, err := c.Core().Nodes().Get(node.Name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		for _, taint := range freshNode.Spec.Taints {
+			if taint.Key == disabledTaint {
+				return nil
+			}
+		}
+		freshNode.Spec.Taints = append(freshNode.Spec.Taints, v1.Taint{
+			Key:    disabledTaint,
+			Value:  "DisabledForTest",
+			Effect: v1.TaintEffectNoSchedule,
+		})
+		_, err = c.Core().Nodes().Update(freshNode)
+		if err == nil {
 			return nil
 		}
+		if !errors.IsConflict(err) {
+			return err
+		}
+		glog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j)
 	}
-	freshNode.Spec.Taints = append(freshNode.Spec.Taints, v1.Taint{
-		Key:    disabledTaint,
-		Value:  "DisabledForTest",
-		Effect: v1.TaintEffectNoSchedule,
-	})
-	_, err = c.Core().Nodes().Update(freshNode)
-	return err
+	return fmt.Errorf("Failed to taint node in allowed number of retries")
 }

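makeNodeUnschedulable now re-reads the node and retries the write whenever the apiserver rejects the Update with a 409 Conflict, since the node object can be modified by another actor between the Get and the Update. Stripped of the e2e framework and of client-go, the shape of that loop is roughly the following; retryOnConflict, errConflict and the attempt count are illustrative stand-ins, not code from this patch.

package main

import (
	"errors"
	"fmt"
)

// errConflict stands in for an apiserver 409 (errors.IsConflict in the real helper).
var errConflict = errors.New("conflict: object was modified")

// retryOnConflict runs one read-modify-write attempt up to `attempts` times,
// retrying only when the failure is an optimistic-concurrency conflict; any
// other error (or success) ends the loop immediately.
func retryOnConflict(attempts int, update func() error) error {
	for j := 0; j < attempts; j++ {
		err := update()
		if err == nil {
			return nil
		}
		if !errors.Is(err, errConflict) {
			return err
		}
		fmt.Printf("got conflict, retries left: %d\n", attempts-j-1)
	}
	return fmt.Errorf("update failed after %d conflicting attempts", attempts)
}

func main() {
	calls := 0
	err := retryOnConflict(3, func() error {
		calls++
		if calls == 1 {
			return errConflict // first attempt loses the update race
		}
		return nil
	})
	fmt.Println("attempts:", calls, "err:", err)
}

client-go also ships a retry.RetryOnConflict helper with the same contract; the e2e helper open-codes the loop here.
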
 func makeNodeSchedulable(c clientset.Interface, node *v1.Node) error {
 	By(fmt.Sprintf("Remove taint from node %s", node.Name))
-	freshNode, err := c.Core().Nodes().Get(node.Name, metav1.GetOptions{})
-	if err != nil {
-		return err
-	}
-	newTaints := make([]v1.Taint, 0)
-	for _, taint := range freshNode.Spec.Taints {
-		if taint.Key != disabledTaint {
-			newTaints = append(newTaints, taint)
+	for j := 0; j < 3; j++ {
+		freshNode, err := c.Core().Nodes().Get(node.Name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		newTaints := make([]v1.Taint, 0)
+		for _, taint := range freshNode.Spec.Taints {
+			if taint.Key != disabledTaint {
+				newTaints = append(newTaints, taint)
+			}
 		}
-	}
-	if len(newTaints) != len(freshNode.Spec.Taints) {
+		if len(newTaints) == len(freshNode.Spec.Taints) {
+			return nil
+		}
 		freshNode.Spec.Taints = newTaints
 		_, err = c.Core().Nodes().Update(freshNode)
-		return err
+		if err == nil {
+			return nil
+		}
+		if !errors.IsConflict(err) {
+			return err
+		}
+		glog.Warningf("Got 409 conflict when trying to remove taint from node, retries left: %v", 3-j)
 	}
-	return nil
+	return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
 }

 // Create an RC running a given number of pods on each node without adding any constraint forcing
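
With the second patch both helpers are also idempotent: makeNodeUnschedulable returns early when the disabled taint is already present, and makeNodeSchedulable returns early when there is nothing to remove, so a retried or repeated call cannot corrupt the taint list. A framework-free sketch of that taint bookkeeping follows; the simplified Taint type and the taint key are assumptions made for illustration, and the real disabledTaint constant is defined elsewhere in the test file.

package main

import "fmt"

// Taint is a simplified stand-in for v1.Taint.
type Taint struct {
	Key, Value, Effect string
}

// disabledTaint is an assumed key; the real constant lives elsewhere in the test file.
const disabledTaint = "autoscaling-test-disabled"

// addDisabledTaint is idempotent: it reports changed=false when the taint is already
// present, which corresponds to the early "return nil" in makeNodeUnschedulable.
func addDisabledTaint(taints []Taint) (newTaints []Taint, changed bool) {
	for _, t := range taints {
		if t.Key == disabledTaint {
			return taints, false
		}
	}
	return append(taints, Taint{Key: disabledTaint, Value: "DisabledForTest", Effect: "NoSchedule"}), true
}

// removeDisabledTaint filters the taint out; changed=false means there was nothing to
// remove, the fast path added to makeNodeSchedulable by this patch.
func removeDisabledTaint(taints []Taint) (newTaints []Taint, changed bool) {
	for _, t := range taints {
		if t.Key != disabledTaint {
			newTaints = append(newTaints, t)
		}
	}
	return newTaints, len(newTaints) != len(taints)
}

func main() {
	taints := []Taint{{Key: "unrelated", Value: "x", Effect: "NoSchedule"}}
	taints, added := addDisabledTaint(taints)
	fmt.Println("added:", added, "taints:", len(taints))     // added: true taints: 2
	taints, removed := removeDisabledTaint(taints)
	fmt.Println("removed:", removed, "taints:", len(taints)) // removed: true taints: 1
}

Idempotence is what makes the three-attempt retry loops safe to re-enter after a conflict: a write that actually succeeded despite the racing update is simply detected as already done on the next Get instead of being applied twice.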