From 6aaabc6f46e4a67df1f958791e164cd05985dd82 Mon Sep 17 00:00:00 2001 From: gmarek Date: Mon, 1 Feb 2016 12:37:38 +0100 Subject: [PATCH] Allow some NotReady nodes in 1000 node clusters --- cluster/validate-cluster.sh | 22 ++++++++++++++++++++-- hack/jenkins/e2e.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/cluster/validate-cluster.sh b/cluster/validate-cluster.sh index 017ba477a54..c6d0622b19f 100755 --- a/cluster/validate-cluster.sh +++ b/cluster/validate-cluster.sh @@ -15,6 +15,10 @@ # limitations under the License. # Validates that the cluster is healthy. +# Error codes are: +# 0 - success +# 1 - fatal (cluster is unlikely to work) +# 2 - non-fatal (encountered some errors, but cluster should be working correctly) set -o errexit set -o nounset @@ -29,11 +33,14 @@ fi source "${KUBE_ROOT}/cluster/kube-env.sh" source "${KUBE_ROOT}/cluster/kube-util.sh" +ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}" + EXPECTED_NUM_NODES="${NUM_NODES}" if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1)) fi # Make several attempts to deal with slow cluster birth. +return_value=0 attempt=0 while true; do # The "kubectl get nodes -o template" exports node information. @@ -59,7 +66,12 @@ while true; do if (( attempt > 100 )); then echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}" "${KUBE_ROOT}/cluster/kubectl.sh" get nodes - exit 2 + if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then + exit 1 + else + return_value=2 + break + fi else echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. 
Retrying.${color_norm}" fi @@ -99,4 +111,10 @@ done echo "Validate output:" "${KUBE_ROOT}/cluster/kubectl.sh" get cs -echo -e "${color_green}Cluster validation succeeded${color_norm}" +if [ "${return_value}" == "0" ]; then + echo -e "${color_green}Cluster validation succeeded${color_norm}" +else + echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}" +fi + +exit "${return_value}" diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index d10b5155cd8..fffd1d790a3 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -508,6 +508,7 @@ case ${JOB_NAME} in # Runs the performance/scalability test on huge 1000-node cluster on GCE. # Flannel is used as network provider. + # Allows a couple of nodes to be NotReady during startup kubernetes-e2e-gce-enormous-cluster) : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-cluster"} : ${E2E_NETWORK:="e2e-enormous-cluster"} @@ -526,6 +527,32 @@ case ${JOB_NAME} in NODE_SIZE="n1-standard-1" NODE_DISK_SIZE="50GB" NUM_NODES="1000" + ALLOWED_NOTREADY_NODES="2" + # Reduce logs verbosity + TEST_CLUSTER_LOG_LEVEL="--v=1" + # Increase resync period to simulate production + TEST_CLUSTER_RESYNC_PERIOD="--min-resync-period=12h" + ;; + + # Starts and tears down 1000-node cluster on GCE using flannel networking + # Requires all 1000 nodes to come up. + kubernetes-e2e-gce-enormous-startup) + : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"} + # TODO: increase the quota for networks in kubernetes-scale and move this test to its own network + : ${E2E_NETWORK:="e2e-enormous-cluster"} + : ${E2E_TEST:="false"} + : ${KUBE_GCE_INSTANCE_PREFIX:="e2e-enormous-startup"} + : ${PROJECT:="kubernetes-scale"} + # Override GCE defaults. + NETWORK_PROVIDER="flannel" + # Temporarily switch off Heapster, as this will not schedule anywhere. + # TODO: Think of a solution to enable it. 
+ ENABLE_CLUSTER_MONITORING="none" + E2E_ZONE="asia-east1-a" + MASTER_SIZE="n1-standard-32" + NODE_SIZE="n1-standard-1" + NODE_DISK_SIZE="50GB" + NUM_NODES="1000" # Reduce logs verbosity TEST_CLUSTER_LOG_LEVEL="--v=1" # Increase resync period to simulate production @@ -900,6 +927,7 @@ export KUBE_GCE_NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-} export KUBE_OS_DISTRIBUTION=${KUBE_OS_DISTRIBUTION:-} export GCE_SERVICE_ACCOUNT=$(gcloud auth list 2> /dev/null | grep active | cut -f3 -d' ') export FAIL_ON_GCP_RESOURCE_LEAK="${FAIL_ON_GCP_RESOURCE_LEAK:-false}" +export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-} # GKE variables export CLUSTER_NAME=${E2E_CLUSTER_NAME}