From 6aaabc6f46e4a67df1f958791e164cd05985dd82 Mon Sep 17 00:00:00 2001 From: gmarek Date: Mon, 1 Feb 2016 12:37:38 +0100 Subject: [PATCH] Allow some NotReady nodes in 1000 node clusters --- cluster/validate-cluster.sh | 22 ++++++++++++++++++++-- hack/jenkins/e2e.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/cluster/validate-cluster.sh b/cluster/validate-cluster.sh index 017ba477a54..c6d0622b19f 100755 --- a/cluster/validate-cluster.sh +++ b/cluster/validate-cluster.sh @@ -15,6 +15,10 @@ # limitations under the License. # Validates that the cluster is healthy. +# Error codes are: +# 0 - success +# 1 - fatal (cluster is unlikely to work) +# 2 - non-fatal (encountered some errors, but cluster should be working correctly) set -o errexit set -o nounset @@ -29,11 +33,14 @@ fi source "${KUBE_ROOT}/cluster/kube-env.sh" source "${KUBE_ROOT}/cluster/kube-util.sh" +ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}" + EXPECTED_NUM_NODES="${NUM_NODES}" if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1)) fi # Make several attempts to deal with slow cluster birth. +return_value=0 attempt=0 while true; do # The "kubectl get nodes -o template" exports node information. @@ -59,7 +66,12 @@ while true; do if (( attempt > 100 )); then echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}" "${KUBE_ROOT}/cluster/kubectl.sh" get nodes - exit 2 + if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then + exit 1 + else + return_value=2 + break + fi else echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. 
Retrying.${color_norm}" fi @@ -99,4 +111,10 @@ done echo "Validate output:" "${KUBE_ROOT}/cluster/kubectl.sh" get cs -echo -e "${color_green}Cluster validation succeeded${color_norm}" +if [ "${return_value}" == "0" ]; then + echo -e "${color_green}Cluster validation succeeded${color_norm}" +else + echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}" +fi + +exit "${return_value}" diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index d10b5155cd8..fffd1d790a3 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -508,6 +508,7 @@ case ${JOB_NAME} in # Runs the performance/scalability test on huge 1000-node cluster on GCE. # Flannel is used as network provider. + # Allows a couple of nodes to be NotReady during startup kubernetes-e2e-gce-enormous-cluster) : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-cluster"} : ${E2E_NETWORK:="e2e-enormous-cluster"} @@ -526,6 +527,32 @@ case ${JOB_NAME} in NODE_SIZE="n1-standard-1" NODE_DISK_SIZE="50GB" NUM_NODES="1000" + ALLOWED_NOTREADY_NODES="2" + # Reduce logs verbosity + TEST_CLUSTER_LOG_LEVEL="--v=1" + # Increase resync period to simulate production + TEST_CLUSTER_RESYNC_PERIOD="--min-resync-period=12h" + ;; + + # Starts and tears down 1000-node cluster on GCE using flannel networking + # Requires all 1000 nodes to come up. + kubernetes-e2e-gce-enormous-startup) + : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"} + # TODO: increase the quota for networks in kubernetes-scale and move this test to its own network + : ${E2E_NETWORK:="e2e-enormous-cluster"} + : ${E2E_TEST:="false"} + : ${KUBE_GCE_INSTANCE_PREFIX:="e2e-enormous-startup"} + : ${PROJECT:="kubernetes-scale"} + # Override GCE defaults. + NETWORK_PROVIDER="flannel" + # Temporarily switch off Heapster, as this will not schedule anywhere. + # TODO: Think of a solution to enable it. 
+ ENABLE_CLUSTER_MONITORING="none" + E2E_ZONE="asia-east1-a" + MASTER_SIZE="n1-standard-32" + NODE_SIZE="n1-standard-1" + NODE_DISK_SIZE="50GB" + NUM_NODES="1000" # Reduce logs verbosity TEST_CLUSTER_LOG_LEVEL="--v=1" # Increase resync period to simulate production @@ -900,6 +927,7 @@ export KUBE_GCE_NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-} export KUBE_OS_DISTRIBUTION=${KUBE_OS_DISTRIBUTION:-} export GCE_SERVICE_ACCOUNT=$(gcloud auth list 2> /dev/null | grep active | cut -f3 -d' ') export FAIL_ON_GCP_RESOURCE_LEAK="${FAIL_ON_GCP_RESOURCE_LEAK:-false}" +export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-} # GKE variables export CLUSTER_NAME=${E2E_CLUSTER_NAME}