From 9808dd9a03ec2528339cf27f1bf3cdc2be5df6fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20K=C5=82obuszewski?= Date: Tue, 31 Aug 2021 11:37:25 +0200 Subject: [PATCH] Wait 15m after instead of before breaking nodes 15m is enough for Cluster Autoscaler to remove empty nodes, so we need to break them sooner than that. Instead, wait 15m after breaking them to ensure Cluster Autoscaler will consider them as unready instead of still starting. --- test/e2e/autoscaling/cluster_size_autoscaling.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go index 1f743307711..00f3daf2a5e 100644 --- a/test/e2e/autoscaling/cluster_size_autoscaling.go +++ b/test/e2e/autoscaling/cluster_size_autoscaling.go @@ -892,7 +892,14 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() { // node to be unhealthy unless it was created more than 15m // ago. Within that 15m window, it'll assume node is just // starting and not unhealthy. - time.Sleep(15 * time.Minute) + // + // However, waiting for 15m would allow scale down to kick in + // and remove recently added nodes, so here we just wait 2m for + // nodes to come up (1m should be enough, another 1m added as + // an extra buffer). Then, we break connectivity to a subset of + // nodes and only after that we wait for 15m, since scale down + // shouldn't happen when the cluster is unhealthy. 
+ time.Sleep(2 * time.Minute) ginkgo.By("Block network connectivity to some nodes to simulate unhealthy cluster") nodesToBreakCount := int(math.Ceil(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize)))) @@ -915,7 +922,8 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() { } else { ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout) defer e2erc.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation") - time.Sleep(scaleUpTimeout) + // Wait for 15m to ensure Cluster Autoscaler won't consider broken nodes as still starting. + time.Sleep(15 * time.Minute) currentNodes, err := e2enode.GetReadySchedulableNodes(f.ClientSet) framework.ExpectNoError(err) framework.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount)